# 📍  
Using the `fraudTrain.csv` dataset, create models to predict whether a transaction is fraudulent. After building your model, use it to make predictions on the `fraudTest.csv` dataset. Your goal is not only to make accurate predictions but also to understand how your model makes these decisions. To do this, use Model Interpretability techniques to show which information helps the model decide if a transaction is fraudulent. Your task is to explain your model's decisions in a way that anyone can understand, even without a background in data science. 📍 📍

In [3]:
pip install xgboost

Collecting xgboost
  Downloading xgboost-2.0.3-py3-none-win_amd64.whl (99.8 MB)
     ---------------------------------------- 99.8/99.8 MB 7.8 MB/s eta 0:00:00
Installing collected packages: xgboost
Successfully installed xgboost-2.0.3
Note: you may need to restart the kernel to use updated packages.


In [4]:
# Make necessary imports
%matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
import xgboost

from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer

from sklearn.model_selection import train_test_split
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

from sklearn.linear_model import LogisticRegression

from xgboost import XGBClassifier

from sklearn.metrics import confusion_matrix, confusion_matrix, accuracy_score, classification_report
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score


In [5]:
!pip install shap



In [6]:
# Read the training and hold-out transaction files into DataFrames.
train_csv_path = "dataset/fraudTrain.csv"
test_csv_path = "dataset/fraudTest.csv"
fraud_train = pd.read_csv(train_csv_path)
fraud_test = pd.read_csv(test_csv_path)

In [7]:
# Explore the data: preview the first rows, then summarise columns and dtypes.
print(fraud_train.head())
# DataFrame.info() writes its report to stdout and returns None, so it is
# called directly; wrapping it in print() would append a stray "None" line.
fraud_train.info()

   Unnamed: 0 trans_date_trans_time            cc_num  \
0           0   2019-01-01 00:00:18  2703186189652095   
1           1   2019-01-01 00:00:44      630423337322   
2           2   2019-01-01 00:00:51    38859492057661   
3           3   2019-01-01 00:01:16  3534093764340240   
4           4   2019-01-01 00:03:06   375534208663984   

                             merchant       category     amt      first  \
0          fraud_Rippin, Kub and Mann       misc_net    4.97   Jennifer   
1     fraud_Heller, Gutmann and Zieme    grocery_pos  107.23  Stephanie   
2                fraud_Lind-Buckridge  entertainment  220.11     Edward   
3  fraud_Kutch, Hermiston and Farrell  gas_transport   45.00     Jeremy   
4                 fraud_Keeling-Crist       misc_pos   41.96      Tyler   

      last gender                        street  ...      lat      long  \
0    Banks      F                561 Perry Cove  ...  36.0788  -81.1781   
1     Gill      F  43039 Riley Greens Suite 393  ...  48

In [8]:
# Step 1: Data Preprocessing
# Preprocess the data (handle missing values, categorical encoding, etc.)

# Step 2: Model Training
# Train various machine learning models (e.g., logistic regression, random forest, XGBoost) on the fraudTrain.csv dataset

# Step 3: Model Evaluation
# Evaluate the performance of the trained models using appropriate metrics (e.g., accuracy, precision, recall, F1-score, ROC-AUC)

# Step 4: Model Interpretability
# Use model interpretability techniques to understand how the models make decisions and explain their predictions
# Example: Feature Importance Analysis, SHAP (SHapley Additive exPlanations), LIME (Local Interpretable Model-agnostic Explanations)

# Step 5: Prediction on Test Data
# Use the trained models to make predictions on the fraudTest.csv dataset
# Evaluate the performance of the models on the test data

# Example code for training a logistic regression model and interpreting its decisions
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import shap

In [9]:
# Separate the raw (not yet preprocessed) feature columns from the
# `is_fraud` label for both the training and the test frame.
X_train, y_train = fraud_train.drop(columns='is_fraud'), fraud_train['is_fraud']
X_test, y_test = fraud_test.drop(columns='is_fraud'), fraud_test['is_fraud']

In [10]:
# Stack the training rows on top of the test rows (fresh 0..n-1 index) so the
# same preprocessing is applied to both.
frames_to_stack = [fraud_train, fraud_test]
data = pd.concat(frames_to_stack, ignore_index=True)

In [11]:
# Print a summary (index range, column names and dtypes) of the combined frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1852394 entries, 0 to 1852393
Data columns (total 23 columns):
 #   Column                 Dtype  
---  ------                 -----  
 0   Unnamed: 0             int64  
 1   trans_date_trans_time  object 
 2   cc_num                 int64  
 3   merchant               object 
 4   category               object 
 5   amt                    float64
 6   first                  object 
 7   last                   object 
 8   gender                 object 
 9   street                 object 
 10  city                   object 
 11  state                  object 
 12  zip                    int64  
 13  lat                    float64
 14  long                   float64
 15  city_pop               int64  
 16  job                    object 
 17  dob                    object 
 18  trans_num              object 
 19  unix_time              int64  
 20  merch_lat              float64
 21  merch_long             float64
 22  is_fraud          

In [12]:
# Remove the "Unnamed: 0" column from the combined frame.
data = data.drop(columns=["Unnamed: 0"])

In [13]:
# Handle missing values
# Replace missing values in categorical columns with the mode
categorical_cols = data.select_dtypes(include=["object"]).columns
for col in categorical_cols:
    modes = data[col].mode()
    if modes.empty:
        # Column is entirely missing: there is no mode to fill with.
        continue
    # Assign the filled column back instead of calling fillna(inplace=True) on
    # `data[col]`: the in-place call acts on a temporary object, which pandas
    # deprecates and which leaves `data` unchanged under Copy-on-Write.
    data[col] = data[col].fillna(modes.iloc[0])

In [14]:
# Replace missing values in numerical columns with the mean
# The target `is_fraud` is left out on purpose: mean-imputing a class label
# would invent fractional classes, and the label should keep its own dtype.
numerical_cols = (
    data.select_dtypes(include=["int64", "float64"])
    .columns
    .drop("is_fraud", errors="ignore")
)
imputer = SimpleImputer(strategy="mean")
data[numerical_cols] = imputer.fit_transform(data[numerical_cols])

In [15]:
# Turn every categorical column into integer codes. A single encoder object is
# re-fitted column by column, so afterwards it only holds the classes of the
# last column it processed.
label_encoder = LabelEncoder()
data[categorical_cols] = data[categorical_cols].apply(label_encoder.fit_transform)

In [16]:
# Undo the earlier concatenation: the first len(fraud_train) rows are the
# training set, everything after them is the test set.
n_train_rows = len(fraud_train)
fraud_train_preprocessed = data.iloc[:n_train_rows]
fraud_test_preprocessed = data.iloc[n_train_rows:]

In [17]:
# Separate the `is_fraud` label from the feature columns. Each frame is copied
# first so that popping the label leaves the preprocessed frames untouched.
X_train = fraud_train_preprocessed.copy()
y_train = X_train.pop("is_fraud")
X_test = fraud_test_preprocessed.copy()
y_test = X_test.pop("is_fraud")

In [18]:
# Further splitting for validation
# Hold out 20% of the training rows. Stratifying on the label keeps the
# fraud / non-fraud ratio the same in both parts (fraud is presumably a rare
# class here, so an unstratified split could skew the validation metrics).
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    random_state=42,
    stratify=y_train,
)

In [None]:
# Imported locally so this cell brings its own extra dependencies into scope.
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Train logistic regression model
# The features live on very different scales, so they are standardised before
# the linear model sees them, and the iteration budget is raised so the solver
# has room to converge. The pipeline exposes the same predict / predict_proba
# interface as the bare estimator.
logistic_regression_model = make_pipeline(
    StandardScaler(),
    LogisticRegression(max_iter=1000),
)
logistic_regression_model.fit(X_train, y_train)

# Train random forest model
# A fixed seed makes the forest reproducible; n_jobs=-1 builds trees on all cores.
random_forest_model = RandomForestClassifier(n_jobs=-1, random_state=42)
random_forest_model.fit(X_train, y_train)


In [None]:
# Fit a gradient-boosted tree classifier with the library's default settings.
xgboost_model = XGBClassifier()
xgboost_model.fit(X=X_train, y=y_train)

In [None]:
# Define a function to evaluate the models
def evaluate_model(model, X_val, y_val):
    """Print classification metrics for a fitted binary classifier.

    Args:
        model: Fitted estimator exposing ``predict`` and ``predict_proba``.
        X_val: Feature rows to score.
        y_val: True labels for ``X_val``.

    Returns:
        None. Accuracy, precision, recall, F1 and ROC-AUC are printed.
    """
    y_pred = model.predict(X_val)
    # ROC-AUC is a ranking metric, so it is computed from the predicted
    # probability of the second class (column 1 of predict_proba) rather
    # than from the hard labels.
    y_pred_proba = model.predict_proba(X_val)

    accuracy = accuracy_score(y_val, y_pred)
    precision = precision_score(y_val, y_pred)
    recall = recall_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred)
    roc_auc = roc_auc_score(y_val, y_pred_proba[:, 1])

    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1-score: {f1:.4f}")
    print(f"ROC-AUC: {roc_auc:.4f}")

# Report validation metrics for every fitted model, in training order, with a
# blank line between consecutive reports.
validation_reports = [
    ("Logistic Regression Model:", logistic_regression_model),
    ("Random Forest Model:", random_forest_model),
    ("XGBoost Model:", xgboost_model),
]
for position, (heading, fitted_model) in enumerate(validation_reports):
    if position:
        print()  # separator goes before every report except the first
    print(heading)
    evaluate_model(fitted_model, X_val, y_val)

In [None]:
import shap
from sklearn.inspection import plot_partial_dependence
import matplotlib.pyplot as plt
from lime import lime_tabular
from lime.lime_tabular import LimeTabularExplainer

# 1. Feature Importance Analysis (for Random Forest and XGBoost models)
def _plot_top_importances(fitted_model, chart_title):
    """Plot the ten largest feature importances of a model and return all of them.

    The importances are read from ``fitted_model.feature_importances_`` and
    labelled with the columns of the global ``X_train``.
    """
    plt.figure(figsize=(10, 6))
    plt.title(chart_title)
    importances = pd.Series(fitted_model.feature_importances_, index=X_train.columns)
    top_ten = importances.nlargest(10)
    top_ten.plot(kind='barh')
    plt.show()
    return importances


# Random Forest Feature Importance
feat_importances_rf = _plot_top_importances(random_forest_model, "Random Forest Feature Importance")

# XGBoost Feature Importance
feat_importances_xgb = _plot_top_importances(xgboost_model, "XGBoost Feature Importance")

# 2. SHAP (SHapley Additive exPlanations)
# Explaining every validation row with a tree ensemble is very slow, so a fixed
# random sample of rows is explained instead; the bar plot only needs average
# magnitudes, which a sample estimates well.
shap_sample = X_val.sample(n=min(1000, len(X_val)), random_state=42)


def _fraud_class_shap(raw_shap_values):
    """Return the 2-D SHAP matrix for class index 1 (the fraud class).

    Depending on the SHAP version and the model, a classifier's SHAP values
    come back as a list with one array per class, as a 3-D array whose last
    axis is the class, or already as a single 2-D array.
    """
    if isinstance(raw_shap_values, list):
        return raw_shap_values[1]
    if getattr(raw_shap_values, "ndim", 2) == 3:
        return raw_shap_values[:, :, 1]
    return raw_shap_values


# SHAP Summary Plot for Random Forest
explainer_rf = shap.TreeExplainer(random_forest_model)
shap_values_rf = _fraud_class_shap(explainer_rf.shap_values(shap_sample))
shap.summary_plot(shap_values_rf, shap_sample, plot_type="bar")

# SHAP Summary Plot for XGBoost
explainer_xgb = shap.TreeExplainer(xgboost_model)
shap_values_xgb = _fraud_class_shap(explainer_xgb.shap_values(shap_sample))
shap.summary_plot(shap_values_xgb, shap_sample, plot_type="bar")

# 3. LIME (Local Interpretable Model-agnostic Explanations)
# Initialize LIME Explainer
# LIME works on plain NumPy arrays and Python lists, so the training frame and
# its column index are converted explicitly.
lime_explainer = LimeTabularExplainer(
    X_train.values,
    mode="classification",
    feature_names=X_train.columns.tolist(),
    # Assumes the model's classes are ordered [0, 1] = [legitimate, fraud].
    class_names=["not fraud", "fraud"],
)


def _rf_predict_proba(rows):
    """Score LIME's perturbed NumPy rows with the random forest.

    The rows are wrapped in a DataFrame carrying the training column names so
    the model receives input in the same form it was fitted on.
    """
    return random_forest_model.predict_proba(pd.DataFrame(rows, columns=X_train.columns))


# Explain individual predictions using LIME
# Example: Explain the first instance in the validation data for the Random Forest model
# The row is passed as a 1-D NumPy array: LIME indexes the instance by integer
# position, which is not reliable on a label-indexed pandas Series.
lime_exp_rf = lime_explainer.explain_instance(
    X_val.iloc[0].values,
    _rf_predict_proba,
    num_features=5,
)
lime_exp_rf.show_in_notebook(show_table=True)

In [None]:
# Predictions on test data
y_pred_lr = logistic_regression_model.predict(X_test)
y_pred_rf = random_forest_model.predict(X_test)
y_pred_xgb = xgboost_model.predict(X_test)

# Predicted probability of the second class (column 1 of predict_proba) for
# each model; used for ROC-AUC, which ranks rows by score.
y_score_lr = logistic_regression_model.predict_proba(X_test)[:, 1]
y_score_rf = random_forest_model.predict_proba(X_test)[:, 1]
y_score_xgb = xgboost_model.predict_proba(X_test)[:, 1]


# Evaluation metrics
def evaluate_performance(y_true, y_pred, y_score=None):
    """Print classification metrics for one model's predictions.

    Args:
        y_true: True labels.
        y_pred: Predicted hard labels.
        y_score: Optional scores or probabilities for the positive class. When
            given, ROC-AUC is computed from them; when omitted, ROC-AUC falls
            back to the hard labels, as this function did before the
            parameter existed.

    Returns:
        None. Accuracy, precision, recall, F1 and ROC-AUC are printed.
    """
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    # Hard labels collapse the ranking to a single threshold, so continuous
    # scores are preferred whenever the caller supplies them.
    roc_auc = roc_auc_score(y_true, y_pred if y_score is None else y_score)

    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1-score: {f1:.4f}")
    print(f"ROC-AUC: {roc_auc:.4f}")


print("Logistic Regression Model:")
evaluate_performance(y_test, y_pred_lr, y_score_lr)
print()

print("Random Forest Model:")
evaluate_performance(y_test, y_pred_rf, y_score_rf)
print()

print("XGBoost Model:")
evaluate_performance(y_test, y_pred_xgb, y_score_xgb)