# Fraud Prediction

## Artificial Financial Dataset
- https://www.kaggle.com/datasets/ealaxi/paysim1


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the PaySim transaction log, then inspect its schema and first rows
df = pd.read_csv(filepath_or_buffer='data/PS_20174392719_1491204439457_log.csv')
df.info()
df.head()

In [None]:
# Share of transactions carrying the pre-existing isFlaggedFraud flag.
# NOTE(review): this is not the modeling target; the balance of isFraud is shown in the EDA section.
df['isFlaggedFraud'].value_counts(normalize=True)

In [None]:
# Count distinct values in every string (object-dtype) column
df.select_dtypes(include='object').nunique()

In [None]:
# Mark transactions whose origin and destination account names are identical and
# show how often that happens, to judge whether it is worth keeping as a feature
orig_is_dest = df['nameOrig'].eq(df['nameDest'])
orig_is_dest.value_counts(normalize=True)

## EDA

### How common is fraud?

In [None]:
# Plot the proportion of each isFraud class, then display the same proportions as numbers
sns.countplot(data=df, x='isFraud', hue='isFraud', stat='probability')
df['isFraud'].value_counts(normalize=True)

### How do fraudulent transactions compare to non-fraudulent ones?

In [None]:
# Distribution of transaction amount for each isFraud class
sns.boxplot(data=df, x='isFraud', y='amount', hue='isFraud')

In [None]:
# Average transaction amount for each isFraud class, with a 68% confidence interval
sns.barplot(data=df, x='isFraud', y='amount', hue='isFraud', errorbar=('ci', 68))

In [None]:
# sns.barplot(df, y='amount', x='isFraud', hue='isFraud', errorbar=('ci',68), estimator='median')

##### Transaction Type 

In [None]:
# Fraud rate per transaction type (the mean of the 0/1 isFraud column within each type)
df.groupby('type')['isFraud'].mean()

### How accurate was the pre-existing fraud flag?

In [None]:
from sklearn.metrics import classification_report, ConfusionMatrixDisplay, confusion_matrix


In [None]:
# Score the pre-existing isFlaggedFraud flag as if it were a classifier predicting isFraud
print(classification_report(y_true=df['isFraud'], y_pred=df['isFlaggedFraud']))
ConfusionMatrixDisplay.from_predictions(
    y_true=df['isFraud'],
    y_pred=df['isFlaggedFraud'],
    cmap='Greens',
)

> Recall for the fraud class is terrible: the pre-existing `isFlaggedFraud` flag misses most fraudulent transactions.

# Modeling

In [None]:
# Columns excluded from modeling: the two account-name columns and the pre-existing flag
drop_cols = ['nameOrig', 'nameDest', 'isFlaggedFraud']
# errors='ignore' keeps this cell re-runnable even if some of the columns are already gone
df_ml = df.drop(drop_cols, axis=1, errors='ignore')
df_ml

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

from sklearn.compose import make_column_selector
from sklearn import set_config
# Make transformers return pandas DataFrames, so transformed data keeps its column names
set_config(transform_output='pandas')

In [None]:
# Numeric columns: fill missing values with the column mean, then standardize
num_selector = make_column_selector(dtype_include="number")
num_pipe = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Categorical columns (object or category dtype): fill missing values with the mode,
# then one-hot encode; categories unseen during fit are ignored at transform time
cat_selector = make_column_selector(dtype_include=["object", 'category'])
cat_pipe = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])

# Route each column group through its pipeline; keep the plain feature names (no "num__"/"cat__" prefix)
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_pipe, num_selector),
        ('cat', cat_pipe, cat_selector),
    ],
    verbose_feature_names_out=False,
)


In [None]:
### Split the data
X = df_ml.drop(columns='isFraud')
y = df_ml['isFraud']

# The target is imbalanced, so stratify on it: both splits then keep the same
# fraud share instead of leaving the number of test-set frauds to chance.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
print(f"{X_train.shape=}, {X_test.shape=}")
print(y_train.value_counts(1))

In [None]:
# Learn imputation, scaling and encoding from the training split only,
# then apply that same fitted transformation to both splits
X_train_tf = preprocessor.fit(X_train).transform(X_train)
X_test_tf = preprocessor.transform(X_test)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression


In [None]:
!pip install dojo_ds
import dojo_ds as ds

In [None]:
# Baseline: class-weighted random forest on the original (non-resampled) training data.
# random_state is pinned, like the split and the other seeded models in this notebook,
# so the reported metrics are reproducible between runs.
clf = RandomForestClassifier(class_weight='balanced', n_jobs=-1, random_state=42)
clf_pipe = Pipeline(
    [
        ('preprocessor', preprocessor),
        ('clf', clf)
    ]
)
clf_pipe.fit(X_train, y_train)
ds.evaluate.evaluate_classification(clf_pipe, X_train=X_train, y_train=y_train,
                                    X_test=X_test, y_test=y_test)

In [None]:
# !pip install imbalanced-learn

In [None]:
# cat_features = cat_selector(X_train)
# cat_features

# cat_features_mask = np.zeros_like(X_train_tf.columns, dtype=bool)
# cat_features_mask

In [None]:
# ## Make a mask for the cat features
# for cat in cat_features:
#     for i in range(len(X_train_tf.columns)):
#         if X_train_tf.columns[i].startswith(cat):
#             cat_features_mask[i] = True
# cat_features_mask

In [None]:
# Display the preprocessed training features
X_train_tf

## Resampling

In [None]:
from imblearn.over_sampling import SMOTENC
# from imblearn.pipeline import Pipeline as imbpipeline

In [None]:
# Converting categories to category dtype for SMOTENC to work
cat_cols = cat_selector(X_train)

# Rebind the converted frames instead of assigning columns in place:
# X_train/X_test are slices produced by train_test_split, and writing columns
# into them triggers pandas' SettingWithCopyWarning.
category_dtypes = dict.fromkeys(cat_cols, 'category')
X_train = X_train.astype(category_dtypes)
X_test = X_test.astype(category_dtypes)
X_train.dtypes

In [None]:
# Pull the one-hot encoder step out of the fitted preprocessor's categorical pipeline
# so SMOTENC can be given the same encoder configuration
ohe = preprocessor.named_transformers_['cat']['encoder']
ohe

In [None]:
# SMOTENC oversampler for mixed numeric/categorical data; 'auto' relies on the
# category-dtype columns prepared earlier to identify the categorical features
smote = SMOTENC(
    categorical_features='auto',
    categorical_encoder=ohe,
    random_state=42,
)

In [None]:
# Oversample the training split only; the test split is left untouched
X_train_smote, y_train_smote = smote.fit_resample(X=X_train, y=y_train)
X_train_smote

In [None]:
# Class proportions of the target after resampling
y_train_smote.value_counts(normalize=True)

In [None]:
# Preprocess the resampled training data and the test data.
# A clone is fitted here instead of refitting `preprocessor` itself: Pipeline does not
# copy its steps, so refitting the shared object would also change the preprocessor
# inside the already-trained `clf_pipe`.
from sklearn.base import clone

preprocessor_smote = clone(preprocessor)
X_train_smote_tf = preprocessor_smote.fit_transform(X_train_smote)
# NOTE: X_test_tf now holds the test set as transformed by the SMOTE-fitted preprocessor
X_test_tf = preprocessor_smote.transform(X_test)
X_train_smote_tf.head()

### RandomForest (Smote)

In [None]:
from sklearn.metrics import roc_auc_score, RocCurveDisplay

# Random forest trained on the SMOTE-resampled, preprocessed training data
clf_rf = RandomForestClassifier(n_jobs=-1, random_state=42).fit(X_train_smote_tf, y_train_smote)

ds.evaluate.evaluate_classification(
    clf_rf,
    X_train=X_train_smote_tf, y_train=y_train_smote,
    X_test=X_test_tf, y_test=y_test,
)

# ROC AUC on the test split, computed from the predicted probability of class 1
y_pred = clf_rf.predict_proba(X_test_tf)[:, 1]
roc_auc_score(y_test, y_pred)

### LogReg (Smote)

In [None]:
# Logistic regression on the same SMOTE-resampled features; max_iter raised to 1000
clf = LogisticRegression(max_iter=1000).fit(X_train_smote_tf, y_train_smote)
ds.evaluate.evaluate_classification(
    clf,
    X_train=X_train_smote_tf, y_train=y_train_smote,
    X_test=X_test_tf, y_test=y_test,
)

In [None]:
# from sklearn.metrics import roc_auc_score, RocCurveDisplay

# y_pred = clf.predict_proba(X_test_tf)[:,1]
# roc_auc_score(y_test, y_pred)

### SVM

In [None]:
# from sklearn.svm import SVC, LinearSVC

# svm = LinearSVC()
# svm.fit(X_train_smote_tf, y_train_smote)
# ds.evaluate.evaluate_classification(svm, X_train=X_train_smote_tf, y_train=y_train_smote,
#                                     X_test=X_test_tf, y_test=y_test)

## Would isolation forest identify the correct fraudulent transactions?

In [None]:
# Display the preprocessed (non-resampled) training features
X_train_tf

In [None]:
# import IsolatonForest
from sklearn.ensemble import IsolationForest

iso_forest = IsolationForest(contamination=0.001, random_state=42)
iso_forest.fit(X_train_tf)


In [None]:

pred_train = iso_forest.predict(X_train_tf)

print(f"Classification Report for Train Data")
print(classification_report(y_train, pred_train==-1))
ConfusionMatrixDisplay.from_predictions(y_train, pred_train==-1, cmap='Greens')
plt.show()

print(f"Classification Report for Test Data")
pred_test = iso_forest.predict(X_test_tf)
print(classification_report(y_test, pred_test==-1))
ConfusionMatrixDisplay.from_predictions(y_test, pred_test==-1, cmap='Reds')
plt.show()

In [None]:
# ds.evaluate.evaluate_classification(iso_forest, X_train=X_train_tf, y_train=y_train,
#                                     X_test=X_test_tf, y_test=y_test)

## Explaining the Best Model

In [None]:
import shap
# Load SHAP's JavaScript so interactive plots (such as force_plot) can render in the notebook
shap.initjs()

In [None]:
# Draw 500 rows for SHAP from the SMOTE-resampled training features, chosen so that
# fraud and non-fraud are equally represented, and print the sample's class proportions
X_shap = shap.sample(X_train_smote_tf, nsamples=500, random_state=321)
y_shap = y_train_smote.loc[X_shap.index]
print(y_shap.value_counts(normalize=True))

In [None]:
# Explain the SMOTE-trained random forest, using the SHAP sample both as the
# background data and as the rows to explain
explainer = shap.Explainer(model=clf_rf, masker=X_shap)
shap_values = explainer(X_shap)

In [None]:
# Check the dimensions of the SHAP output; the [:,:,1] slices used in the following
# cells assume a trailing per-class axis — TODO confirm against this output
shap_values.shape

In [None]:
# Summary plot for index 1 of the last axis (presumably the fraud class — verify
# against clf_rf.classes_)
shap.summary_plot(shap_values[:, :, 1], features=X_shap)

In [None]:
# Interactive force plot over the sampled rows, using the same index-1 slice as the summary plot.
# NOTE(review): an Explanation object is passed where force_plot's first parameter is the
# base value — confirm the installed shap version accepts this form
shap.force_plot(shap_values[:,:,1], X_shap)

In [None]:
# Display the full, unprocessed feature matrix (before the train/test split)
X

In [None]:
# # force plot
# shap.force_plot(explainer.expected_value, shap_values[:,:,1], X_shap)