# Modeling of Fraud Detection

### Initiation

In [69]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import datetime, date, time, timedelta
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve
from sklearn.model_selection import train_test_split
#importing over and undersampling algorithms from imblearn (you will have to manually install it in your envoirenment with pip install imblearn) 
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import confusion_matrix
import itertools
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
import xgboost as xgb
import pickle

In [70]:
# important functions
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Oranges):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    Source: http://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html
    """
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    # Plot the confusion matrix
    plt.figure(figsize = (10, 10))
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title, size = 24)
    plt.colorbar(aspect=4)
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45, size = 14)
    plt.yticks(tick_marks, classes, size = 14)

    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    
    # Labeling the plot
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt), fontsize = 20,
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")
        
    plt.grid(None)
    plt.tight_layout()
    plt.ylabel('True label', size = 18)
    plt.xlabel('Predicted label', size = 18)

def print_evaluations(y,predictions):
    print(f"Accuracy: {accuracy_score(y, predictions)}")
    print(f"Precision: {precision_score(y, predictions)}")
    print(f"Recall: {recall_score(y, predictions)}")
    print(f"F1: {f1_score(y, predictions)}")


In [78]:
X = pd.read_csv('./data/X.csv', index_col='Unnamed: 0')
y = pd.read_csv('./data/y.csv', index_col='Unnamed: 0')


In [79]:
X.columns

Index(['PricingStrategy_0', 'PricingStrategy_1', 'PricingStrategy_2',
       'PricingStrategy_4', 'ProviderId_ProviderId_1',
       'ProviderId_ProviderId_2', 'ProviderId_ProviderId_3',
       'ProviderId_ProviderId_4', 'ProviderId_ProviderId_5',
       'ProviderId_ProviderId_6', 'ProductId_ProductId_1',
       'ProductId_ProductId_10', 'ProductId_ProductId_11',
       'ProductId_ProductId_13', 'ProductId_ProductId_14',
       'ProductId_ProductId_15', 'ProductId_ProductId_16',
       'ProductId_ProductId_19', 'ProductId_ProductId_2',
       'ProductId_ProductId_20', 'ProductId_ProductId_21',
       'ProductId_ProductId_22', 'ProductId_ProductId_23',
       'ProductId_ProductId_24', 'ProductId_ProductId_27',
       'ProductId_ProductId_3', 'ProductId_ProductId_4',
       'ProductId_ProductId_5', 'ProductId_ProductId_6',
       'ProductId_ProductId_7', 'ProductId_ProductId_8',
       'ProductId_ProductId_9', 'ChannelId_ChannelId_1',
       'ChannelId_ChannelId_2', 'ChannelId_ChannelId_3

In [64]:
len(X.columns)

75

### Split into train and test

In [80]:
#splitting data into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

### Resample the data

#### 1. Oversample Frauds

In [81]:
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=42)
X_over, y_over = ros.fit_resample(X_train, y_train)

np.mean(y_over)

  return mean(axis=axis, dtype=dtype, out=out, **kwargs)


FraudResult    0.5
dtype: float64

#### 2. Create Synthetic Fraud Data

In [6]:
#used smote algorithm (synthetic oversampling) to oversample fradulent class
# dataframes of synthetic datapoints: smote_data_X, smote_data_Y
smote_algo = SMOTE(random_state=50)
X_syn, y_syn = smote_algo.fit_resample(X_train, y_train)
X_syn = pd.DataFrame(data=X_syn, columns=X_train.columns)
y_syn = pd.DataFrame(data=y_syn, columns=['FraudResult'])
np.mean(y_syn)

  return mean(axis=axis, dtype=dtype, out=out, **kwargs)


FraudResult    0.5
dtype: float64

#### 3. Undersample Non-Frauds

In [7]:
# used randomundersampler algorithm to undersample non fradulent class
# dataframes for undersampled data: X_res, y_res
rus = RandomUnderSampler(random_state=50)
X_under, y_under = rus.fit_resample(X_train, y_train)
np.mean(y_under)

  return mean(axis=axis, dtype=dtype, out=out, **kwargs)


FraudResult    0.5
dtype: float64

# Model fitting

In [14]:
X_input = X_syn
y_input = y_syn

# random forest
model_rf = RandomForestClassifier(n_estimators=100, 
                               random_state=50, 
                               max_features = 'sqrt',
                               n_jobs=-1, verbose = 1)
model_rf.fit(X_input, y_input)

# logistic
model_log = LogisticRegression(random_state=50).fit(X_input, y_input)

# knn
model_knn = KNeighborsClassifier(n_neighbors=5)
model_knn.fit(X_input, y_input)

# naive
model_naive = MultinomialNB()
model_naive.fit(X_input, y_input)

# svm
model_svm = SVC(random_state = 50, probability = True)
model_svm.fit(X_input, y_input)


  model_rf.fit(X_input, y_input)
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    0.8s finished
  y = column_or_1d(y, warn=True)
  return self._fit(X, y)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Evaluation of Train Data

In [10]:
# rf
train_rf_predictions = model_rf.predict(X_input)
train_rf_probs = model_rf.predict_proba(X_input)[:, 1]

# log
train_log_predictions = model_log.predict(X_input)
train_log_probs = model_log.predict_proba(X_input)[:, 1]

# knn
train_knn_predictions = model_knn.predict(X_input)
train_knn_probs = model_knn.predict_proba(X_input)[:, 1]

#naive
train_naive_predictions = model_naive.predict(X_input)
train_naive_probs = model_naive.predict_proba(X_input)[:, 1]

# svm
train_svm_predictions = model_svm.predict(X_input)
train_svm_probs = model_svm.predict_proba(X_input)[:, 1]

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    0.2s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    0.2s finished


In [11]:
for i,p in enumerate([train_rf_predictions, train_log_predictions, train_knn_predictions, train_naive_predictions, train_svm_predictions]):
    print(i)
    print_evaluations(y_input, p)


0
Accuracy: 0.9997185961072461
Precision: 0.9994531164549446
Recall: 0.9999843664504026
F1: 0.9997186708761839
1
Accuracy: 0.9897287579144844
Precision: 0.9913726843500494
Recall: 0.9880559681075588
F1: 0.9897115474960068
2
Accuracy: 0.9960290784022512
Precision: 0.9964792589231226
Recall: 0.9955757054639256
F1: 0.9960272772772772
3
Accuracy: 0.9875087938716486
Precision: 0.9799307425933051
Recall: 0.9954037364183538
F1: 0.9876066387467038
4
Accuracy: 0.9900179785820371
Precision: 0.9883307886454991
Recall: 0.9917454858125537
F1: 0.9900351928584249


In [12]:
# rf
test_rf_predictions = model_rf.predict(X_test)
test_rf_probs = model_rf.predict_proba(X_test)[:, 1]

# log
test_log_predictions = model_log.predict(X_test)
test_log_probs = model_log.predict_proba(X_test)[:, 1]

# knn
test_knn_predictions = model_knn.predict(X_test)
test_knn_probs = model_knn.predict_proba(X_test)[:, 1]

#naive
test_naive_predictions = model_naive.predict(X_test)
test_naive_probs = model_naive.predict_proba(X_test)[:, 1]

# svm
test_svm_predictions = model_svm.predict(X_test)
test_svm_probs = model_svm.predict_proba(X_test)[:, 1]

for i,p in enumerate([test_rf_predictions, test_log_predictions, test_knn_predictions, test_naive_predictions, test_svm_predictions]):
    print(i)
    print_evaluations(y_test, p)

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    0.1s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    0.0s finished


0
Accuracy: 0.9989546707212772
Precision: 0.6904761904761905
Recall: 0.8923076923076924
F1: 0.7785234899328859
1
Accuracy: 0.9915106591909785
Precision: 0.18960244648318042
Recall: 0.9538461538461539
F1: 0.31632653061224486
2
Accuracy: 0.9954385631473914
Precision: 0.29743589743589743
Recall: 0.8923076923076924
F1: 0.4461538461538462
3
Accuracy: 0.9779847318572017
Precision: 0.08333333333333333
Recall: 0.9692307692307692
F1: 0.15347137637028013
4
Accuracy: 0.9885013779340492
Precision: 0.14691943127962084
Recall: 0.9538461538461539
F1: 0.2546201232032854


In [13]:
# Training predictions (to demonstrate overfitting)
train_rf_predictions = model_rf.predict(smote_data_X)
train_rf_probs = model_rf.predict_proba(smote_data_X)[:, 1]

# Testing predictions (to determine performance)
test_rf_predictions = model_rf.predict(X_test)
test_rf_probs = model_rf.predict_proba(X_test)[:, 1]

# Confusion matrix
cm = confusion_matrix(y_test, test_rf_predictions)
plot_confusion_matrix(cm, classes = ['Fraud', 'No Fraud'],
                      title = 'Fraud Confusion Matrix')

print_evaluations(y_test, test_rf_predictions)


NameError: name 'smote_data_X' is not defined

In [None]:
cm = confusion_matrix(smote_data_Y, train_rf_predictions)
plot_confusion_matrix(cm, classes = ['Fraud', 'No Fraud'],
                      title = 'Fraud Confusion Matrix')
print_evaluations(smote_data_Y, train_rf_predictions)

In [None]:
model_svm = SVC(random_state = 50, probability = True)
model_svm.fit(smote_data_X, smote_data_Y)

# Training predictions (to demonstrate overfitting)
train_svm_predictions = model_svm.predict(smote_data_X)
train_svm_probs = model_svm.predict_proba(smote_data_X)[:, 1]

# Testing predictions (to determine performance)
test_svm_predictions = model_svm.predict(X_test)
test_svm_probs = model_svm.predict_proba(X_test)[:, 1]

# Confusion matrix
cm = confusion_matrix(y_test, test_svm_predictions)
plot_confusion_matrix(cm, classes = ['Fraud', 'No Fraud'],
                      title = 'Fraud Confusion Matrix')

print_evaluations(y_test, test_svm_predictions)

In [None]:
model_adaboost =  AdaBoostClassifier(random_state = 50)
model_adaboost.fit(smote_data_X, smote_data_Y)

# Training predictions (to demonstrate overfitting)
train_adaboost_predictions = model_adaboost.predict(smote_data_X)
train_adaboost_probs = model_adaboost.predict_proba(smote_data_X)[:, 1]

# Testing predictions (to determine performance)
test_adaboost_predictions = model_adaboost.predict(X_test)
test_adaboost_probs = model_adaboost.predict_proba(X_test)[:, 1]

# Confusion matrix
cm = confusion_matrix(y_test, test_adaboost_predictions)
plot_confusion_matrix(cm, classes = ['Fraud', 'No Fraud'],
                      title = 'Fraud Confusion Matrix')

print_evaluations(y_test, test_adaboost_predictions)

# Stacking two best models

In [82]:
X_over.columns

Index(['PricingStrategy_0', 'PricingStrategy_1', 'PricingStrategy_2',
       'PricingStrategy_4', 'ProviderId_ProviderId_1',
       'ProviderId_ProviderId_2', 'ProviderId_ProviderId_3',
       'ProviderId_ProviderId_4', 'ProviderId_ProviderId_5',
       'ProviderId_ProviderId_6', 'ProductId_ProductId_1',
       'ProductId_ProductId_10', 'ProductId_ProductId_11',
       'ProductId_ProductId_13', 'ProductId_ProductId_14',
       'ProductId_ProductId_15', 'ProductId_ProductId_16',
       'ProductId_ProductId_19', 'ProductId_ProductId_2',
       'ProductId_ProductId_20', 'ProductId_ProductId_21',
       'ProductId_ProductId_22', 'ProductId_ProductId_23',
       'ProductId_ProductId_24', 'ProductId_ProductId_27',
       'ProductId_ProductId_3', 'ProductId_ProductId_4',
       'ProductId_ProductId_5', 'ProductId_ProductId_6',
       'ProductId_ProductId_7', 'ProductId_ProductId_8',
       'ProductId_ProductId_9', 'ChannelId_ChannelId_1',
       'ChannelId_ChannelId_2', 'ChannelId_ChannelId_3

In [83]:
# best random forest
X_input = X_over
y_input = y_over
model_rf = RandomForestClassifier(bootstrap=False, max_depth=30, n_estimators=500,
                       n_jobs=-1, random_state=50, verbose=1, max_features = 'sqrt')
model_rf.fit(X_input, y_input)
prediction_rf = model_rf.predict(X_test)
print_evaluations(y_test, prediction_rf)
print(confusion_matrix(y_test, prediction_rf))

  model_rf.fit(X_input, y_input)
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.5s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:    2.0s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:    4.5s
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:    5.2s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done 434 tasks      | elapsed:    0.2s


Accuracy: 0.9989546707212772
Precision: 0.7162162162162162
Recall: 0.8153846153846154
F1: 0.762589928057554
[[31483    21]
 [   12    53]]


[Parallel(n_jobs=8)]: Done 500 out of 500 | elapsed:    0.2s finished


In [58]:
analysis = X_test
analysis['predictions'] = prediction_rf
analysis['actual'] = y_test
fp = analysis.query('predictions == 1 & actual == 0')

In [59]:
fp.sum(axis=0)

PricingStrategy_0           0
PricingStrategy_1           0
PricingStrategy_2          19
PricingStrategy_4           2
ProviderId_ProviderId_1     5
                           ..
ProductId_ProductId_18      0
ProductId_ProductId_25      0
ProductId_ProductId_26      0
predictions                21
actual                      0
Length: 77, dtype: int64

In [None]:
print_evaluations(y_test, )

In [84]:
filename = 'finalized_model.sav'
pickle.dump(model_rf, open(filename, 'wb'))



In [17]:
# best xgb boost model
model_xgb = xgb.XGBClassifier(n_estimators=101,
                        random_state=50,
                        n_jobs=-1,
                        max_features = 'sqrt',
                        verbose = 1, max_depth=90, learning_rate=0.5, use_label_encoder=False)
model_xgb.fit(X_input, y_input)

  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


Parameters: { "max_features", "verbose" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              gamma=0, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.5, max_delta_step=0,
              max_depth=90, max_features='sqrt', min_child_weight=1,
              missing=nan, monotone_constraints='()', n_estimators=101,
              n_jobs=-1, num_parallel_tree=1, predictor='auto', random_state=50,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', use_label_encoder=False,
              validate_parameters=1, verbose=1, ...)

In [11]:
from sklearn.ensemble import StackingClassifier

In [23]:
estimators = [('model_rf', model_rf) , ('xg_boost', model_xgb)]
clf = StackingClassifier(estimators=estimators, final_estimator=LogisticRegression(penalty='none'))
clf.fit(X_input, np.ravel(y_input))

predictions_stacked = clf.predict(X_test)
print_evaluations(y_test, predictions_stacked)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    0.7s
[Parallel(n_jobs=-1)]: Done 485 out of 500 | elapsed:    4.1s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:    4.1s finished
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


Parameters: { "max_features", "verbose" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    0.5s
[Parallel(n_jobs=-1)]: Done 485 out of 500 | elapsed:    3.1s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:    3.1s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done 434 tasks      | elapsed:    0.2s
[Parallel(n_jobs=8)]: Done 500 out of 500 | elapsed:    0.2s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    0.5s
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:    3.3s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 

Parameters: { "max_features", "verbose" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


Parameters: { "max_features", "verbose" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


Parameters: { "max_features", "verbose" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


Parameters: { "max_features", "verbose" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


Parameters: { "max_features", "verbose" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    0.2s


Accuracy: 0.9989229940764675
Precision: 0.7627118644067796
Recall: 0.6923076923076923
F1: 0.7258064516129032


[Parallel(n_jobs=8)]: Done 434 tasks      | elapsed:    0.3s
[Parallel(n_jobs=8)]: Done 500 out of 500 | elapsed:    0.3s finished
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
