In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
import shap
import lime
import lime.lime_tabular

In [None]:
# Read the three raw CSV inputs into DataFrames. The paths are relative to the
# notebook's working directory, and the order of the paths matches the order
# of the names on the left-hand side.
fraud_data, ip_data, creditcard_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/raw/Fraud_Data.csv',
        '../data/raw/IpAddress_to_Country.csv',
        '../data/raw/creditcard.csv',
    )
)

In [None]:
# Parse the two timestamp columns once so their components can be extracted.
signup_ts = pd.to_datetime(fraud_data['signup_time'])
purchase_ts = pd.to_datetime(fraud_data['purchase_time'])

# Append hour-of-day and day-of-week (Monday=0) for both events, then discard
# the raw timestamp columns, which are not used as features.
fraud_data = (
    fraud_data
    .assign(
        signup_hour=signup_ts.dt.hour,
        signup_day=signup_ts.dt.dayofweek,
        purchase_hour=purchase_ts.dt.hour,
        purchase_day=purchase_ts.dt.dayofweek,
    )
    .drop(columns=['signup_time', 'purchase_time'])
)

In [None]:
# Separate the predictors from the target column 'class'.
fraud_X = fraud_data.drop(columns=['class'])
fraud_y = fraud_data['class']

# Hold out 30% of the rows for evaluation. Stratifying on the target keeps the
# class proportions identical in the train and test partitions, so a rare
# class cannot end up under-represented in either one by chance.
# The fixed seed makes the split reproducible.
fraud_X_train, fraud_X_test, fraud_y_train, fraud_y_test = train_test_split(
    fraud_X,
    fraud_y,
    test_size=0.3,
    random_state=42,
    stratify=fraud_y,
)

In [None]:
# Columns that are standardised (zero mean, unit variance).
numeric_features = ['purchase_value', 'age']
# Columns that are one-hot encoded. The hour/day components are treated as
# categories rather than as ordered numbers.
categorical_features = ['source', 'browser', 'sex', 'signup_hour', 'signup_day', 'purchase_hour', 'purchase_day']

# Columns not listed in either group are dropped (ColumnTransformer's default
# remainder='drop').
# handle_unknown='ignore' encodes a category that was not seen during fit as
# an all-zero row instead of raising at transform time, so scoring held-out or
# new data cannot fail on an unseen category value.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
    ])

In [None]:
# Chain the preprocessing step and a Random Forest classifier, then fit on the
# fraud training split. The forest is seeded (same seed as the train/test
# split) so the fitted model, and every explanation derived from it, is
# reproducible across "Restart & Run All".
rf_pipeline_fraud = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42)),
])
rf_pipeline_fraud.fit(fraud_X_train, fraud_y_train)

In [None]:
# SHAP explainability
# The classifier was trained on the *transformed* matrix, so the explanations
# must use that matrix and its column names, not the raw DataFrame columns.
shap.initjs()  # loads the JS bundle the interactive force plot needs to render

fraud_preprocessor = rf_pipeline_fraud.named_steps['preprocessor']
fraud_classifier = rf_pipeline_fraud.named_steps['classifier']

# Transform the test set once and reuse it for every plot. The ColumnTransformer
# may return a scipy sparse matrix when the one-hot block dominates; the plots
# below need a dense array.
fraud_X_test_transformed = fraud_preprocessor.transform(fraud_X_test)
if hasattr(fraud_X_test_transformed, 'toarray'):
    fraud_X_test_transformed = fraud_X_test_transformed.toarray()

# One name per transformed column (e.g. 'num__purchase_value', 'cat__sex_M').
fraud_feature_names = list(fraud_preprocessor.get_feature_names_out())

explainer_fraud = shap.Explainer(fraud_classifier)
shap_values_fraud = explainer_fraud(fraud_X_test_transformed)

# For a classifier the explanation carries one set of SHAP values per class
# (samples x features x classes). The plots need a 2-D matrix, so keep the last
# class only.
# NOTE(review): assumes 'class' is coded 0/1 so the last entry of classes_ is
# the fraud class — confirm against the data.
fraud_shap_matrix = np.asarray(shap_values_fraud.values)
if fraud_shap_matrix.ndim == 3:
    fraud_shap_matrix = fraud_shap_matrix[:, :, -1]
# expected_value is per-class for a classifier; pick the entry that matches.
fraud_expected_value = np.atleast_1d(explainer_fraud.expected_value)[-1]

# After preprocessing the column is prefixed with its transformer name, so look
# the transformed name up instead of hard-coding it.
purchase_value_feature = next(
    name for name in fraud_feature_names if name.endswith('purchase_value')
)

# SHAP plots
shap.summary_plot(fraud_shap_matrix, fraud_X_test_transformed, feature_names=fraud_feature_names)
shap.dependence_plot(purchase_value_feature, fraud_shap_matrix, fraud_X_test_transformed, feature_names=fraud_feature_names)
# Explain the first test row; values and features are both in transformed space.
shap.force_plot(fraud_expected_value, fraud_shap_matrix[0, :], fraud_X_test_transformed[0, :], feature_names=fraud_feature_names)

In [None]:
# LIME explainability
# LIME perturbs rows in the same space the classifier was trained on, i.e. the
# output of the preprocessing step, so both the training data and the feature
# names must come from that transformed space.
fraud_preprocessor = rf_pipeline_fraud.named_steps['preprocessor']
fraud_classifier = rf_pipeline_fraud.named_steps['classifier']

# The ColumnTransformer may return a scipy sparse matrix; give LIME a dense array.
lime_training_data = fraud_preprocessor.transform(fraud_X_train)
if hasattr(lime_training_data, 'toarray'):
    lime_training_data = lime_training_data.toarray()

explainer = lime.lime_tabular.LimeTabularExplainer(
    training_data=lime_training_data,
    # One name per transformed column, so every weight is labelled correctly.
    feature_names=list(fraud_preprocessor.get_feature_names_out()),
    class_names=['Not Fraud', 'Fraud'],
    mode='classification',
    random_state=42,  # LIME samples perturbations; seed it for reproducible output
)

# Explain a single prediction with LIME
i = 0  # Positional index (within the test split) of the instance to explain
# Transform only the row being explained rather than the whole test set.
lime_row = fraud_preprocessor.transform(fraud_X_test.iloc[[i]])
if hasattr(lime_row, 'toarray'):
    lime_row = lime_row.toarray()

exp = explainer.explain_instance(
    data_row=lime_row[0],
    predict_fn=fraud_classifier.predict_proba,
)

# Display LIME explanation
exp.show_in_notebook(show_table=True, show_all=False)
exp.as_pyplot_figure()