# Modeling of Fraud Detection

### Initiation

In [32]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import datetime, date, time, timedelta
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve
from sklearn.model_selection import train_test_split
#importing over and undersampling algorithms from imblearn (you will have to manually install it in your envoirenment with pip install imblearn) 
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import confusion_matrix
import itertools
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB

In [21]:
# important functions
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Oranges):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    Source: http://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html
    """
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    # Plot the confusion matrix
    plt.figure(figsize = (10, 10))
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title, size = 24)
    plt.colorbar(aspect=4)
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45, size = 14)
    plt.yticks(tick_marks, classes, size = 14)

    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    
    # Labeling the plot
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt), fontsize = 20,
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")
        
    plt.grid(None)
    plt.tight_layout()
    plt.ylabel('True label', size = 18)
    plt.xlabel('Predicted label', size = 18)

def print_evaluations(y,predictions):
    print(f"Accuracy: {accuracy_score(y, predictions)}")
    print(f"Precision: {precision_score(y, predictions)}")
    print(f"Recall: {recall_score(y, predictions)}")
    print(f"F1: {f1_score(y, predictions)}")


In [19]:
X = pd.read_csv('./data/X.csv', index_col='Unnamed: 0')
y = pd.read_csv('./data/y.csv', index_col='Unnamed: 0')


Unnamed: 0,Value,PricingStrategy,Hour,DirectionOfMoney,ProviderId_ProviderId_1,ProviderId_ProviderId_2,ProviderId_ProviderId_3,ProviderId_ProviderId_4,ProviderId_ProviderId_5,ProviderId_ProviderId_6,...,ProductCategory_airtime,ProductCategory_data_bundles,ProductCategory_financial_services,ProductCategory_movies,ProductCategory_other,ProductCategory_ticket,ProductCategory_transport,ProductCategory_tv,ProductCategory_utility_bill,Valuelog
0,1000,2,2,0.0,0,0,0,0,0,1,...,1,0,0,0,0,0,0,0,0,6.907755
1,20,2,2,1.0,0,0,0,1,0,0,...,0,0,1,0,0,0,0,0,0,2.995732
2,500,2,2,0.0,0,0,0,0,0,1,...,1,0,0,0,0,0,0,0,0,6.214608
3,21800,2,3,0.0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,9.989665
4,644,2,3,1.0,0,0,0,1,0,0,...,0,0,1,0,0,0,0,0,0,6.467699


In [33]:
X.columns

Index(['Value', 'PricingStrategy', 'Hour', 'DirectionOfMoney',
       'ProviderId_ProviderId_1', 'ProviderId_ProviderId_2',
       'ProviderId_ProviderId_3', 'ProviderId_ProviderId_4',
       'ProviderId_ProviderId_5', 'ProviderId_ProviderId_6',
       'ProductId_ProductId_1', 'ProductId_ProductId_10',
       'ProductId_ProductId_11', 'ProductId_ProductId_12',
       'ProductId_ProductId_13', 'ProductId_ProductId_14',
       'ProductId_ProductId_15', 'ProductId_ProductId_16',
       'ProductId_ProductId_19', 'ProductId_ProductId_2',
       'ProductId_ProductId_20', 'ProductId_ProductId_21',
       'ProductId_ProductId_22', 'ProductId_ProductId_23',
       'ProductId_ProductId_24', 'ProductId_ProductId_27',
       'ProductId_ProductId_3', 'ProductId_ProductId_4',
       'ProductId_ProductId_5', 'ProductId_ProductId_6',
       'ProductId_ProductId_7', 'ProductId_ProductId_8',
       'ProductId_ProductId_9', 'ChannelId_ChannelId_1',
       'ChannelId_ChannelId_2', 'ChannelId_ChannelId_3',

### Split into train and test

In [22]:
#splitting data into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

### Resample the data

#### 1. Oversample Frauds

In [27]:
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=42)
X_over, y_over = ros.fit_resample(X_train, y_train)

np.mean(y_over)

FraudResult    0.5
dtype: float64

#### 2. Create Synthetic Fraud Data

In [28]:
#used smote algorithm (synthetic oversampling) to oversample fradulent class
# dataframes of synthetic datapoints: smote_data_X, smote_data_Y
smote_algo = SMOTE(random_state=50)
X_syn, y_syn = smote_algo.fit_resample(X_train, y_train)
X_syn = pd.DataFrame(data=X_syn, columns=X_train.columns)
y_syn = pd.DataFrame(data=y_syn, columns=['FraudResult'])
np.mean(y_syn)

FraudResult    0.5
dtype: float64

#### 3. Undersample Non-Frauds

In [29]:
# used randomundersampler algorithm to undersample non fradulent class
# dataframes for undersampled data: X_res, y_res
rus = RandomUnderSampler(random_state=50)
X_under, y_under = rus.fit_resample(X_train, y_train)
np.mean(y_under)

FraudResult    0.5
dtype: float64

# Model fitting

In [39]:
X_input = X_over
y_input = y_over

# random forest
model_rf = RandomForestClassifier(n_estimators=100, 
                               random_state=50, 
                               max_features = 'sqrt',
                               n_jobs=-1, verbose = 1)
model_rf.fit(X_input, y_input)

# logistic
model_log = LogisticRegression(random_state=50).fit(X_input, y_input)

# knn
model_knn = KNeighborsClassifier(n_neighbors=5)
model_knn.fit(X_input, y_input)

# naive
model_naive = MultinomialNB()
model_naive.fit(X_input, y_input)

# svm
model_svm = SVC(random_state = 50, probability = True)
model_svm.fit(X_input, y_input)


  model_rf.fit(X_input, y_input)
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    1.0s finished
  y = column_or_1d(y, warn=True)
  return self._fit(X, y)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Evaluation of Train Data

In [35]:
# rf
train_rf_predictions = model_rf.predict(X_input)
train_rf_probs = model_rf.predict_proba(X_input)[:, 1]

# log
train_log_predictions = model_log.predict(X_input)
train_log_probs = model_log.predict_proba(X_input)[:, 1]

# knn
train_knn_predictions = model_knn.predict(X_input)
train_knn_probs = model_knn.predict_proba(X_input)[:, 1]

#naive
train_naive_predictions = model_naive.predict(X_input)
train_naive_probs = model_naive.predict_proba(X_input)[:, 1]

# svm
train_svm_predictions = model_svm.predict(X_input)
train_svm_probs = model_svm.predict_proba(X_input)[:, 1]

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    0.0s finished


In [37]:
for i,p in enumerate([train_rf_predictions, train_log_predictions, train_knn_predictions, train_naive_predictions, train_svm_predictions]):
    print(i)
    print_evaluations(y_input, p)


0
Accuracy: 0.99609375
Precision: 0.9922480620155039
Recall: 1.0
F1: 0.9961089494163424
1
Accuracy: 0.984375
Precision: 0.984375
Recall: 0.984375
F1: 0.984375
2
Accuracy: 0.984375
Precision: 0.984375
Recall: 0.984375
F1: 0.984375
3
Accuracy: 0.984375
Precision: 0.9769230769230769
Recall: 0.9921875
F1: 0.9844961240310077
4
Accuracy: 0.95703125
Precision: 1.0
Recall: 0.9140625
F1: 0.9551020408163265


In [38]:
# rf
test_rf_predictions = model_rf.predict(X_test)
test_rf_probs = model_rf.predict_proba(X_test)[:, 1]

# log
test_log_predictions = model_log.predict(X_test)
test_log_probs = model_log.predict_proba(X_test)[:, 1]

# knn
test_knn_predictions = model_knn.predict(X_test)
test_knn_probs = model_knn.predict_proba(X_test)[:, 1]

#naive
test_naive_predictions = model_naive.predict(X_test)
test_naive_probs = model_naive.predict_proba(X_test)[:, 1]

# svm
test_svm_predictions = model_svm.predict(X_test)
test_svm_probs = model_svm.predict_proba(X_test)[:, 1]

for i,p in enumerate([test_rf_predictions, test_log_predictions, test_knn_predictions, test_naive_predictions, test_svm_predictions]):
    print(i)
    print_evaluations(y_test, p)

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    0.1s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    0.0s finished


0
Accuracy: 0.9915423358357883
Precision: 0.19207317073170732
Recall: 0.9692307692307692
F1: 0.3206106870229008
1
Accuracy: 0.9856188032563591
Precision: 0.12085769980506822
Recall: 0.9538461538461539
F1: 0.21453287197231832
2
Accuracy: 0.9892299407646742
Precision: 0.15538847117794485
Recall: 0.9538461538461539
F1: 0.2672413793103448
3
Accuracy: 0.9731698818461149
Precision: 0.07032967032967033
Recall: 0.9846153846153847
F1: 0.13128205128205128
4
Accuracy: 0.9979093414425544
Precision: 0.49572649572649574
Recall: 0.8923076923076924
F1: 0.6373626373626374


In [None]:
# Training predictions (to demonstrate overfitting)
train_rf_predictions = model_rf.predict(smote_data_X)
train_rf_probs = model_rf.predict_proba(smote_data_X)[:, 1]

# Testing predictions (to determine performance)
test_rf_predictions = model_rf.predict(X_test)
test_rf_probs = model_rf.predict_proba(X_test)[:, 1]

# Confusion matrix
cm = confusion_matrix(y_test, test_rf_predictions)
plot_confusion_matrix(cm, classes = ['Fraud', 'No Fraud'],
                      title = 'Fraud Confusion Matrix')

print_evaluations(y_test, test_rf_predictions)


In [None]:
cm = confusion_matrix(smote_data_Y, train_rf_predictions)
plot_confusion_matrix(cm, classes = ['Fraud', 'No Fraud'],
                      title = 'Fraud Confusion Matrix')
print_evaluations(smote_data_Y, train_rf_predictions)

In [None]:
model_svm = SVC(random_state = 50, probability = True)
model_svm.fit(smote_data_X, smote_data_Y)

# Training predictions (to demonstrate overfitting)
train_svm_predictions = model_svm.predict(smote_data_X)
train_svm_probs = model_svm.predict_proba(smote_data_X)[:, 1]

# Testing predictions (to determine performance)
test_svm_predictions = model_svm.predict(X_test)
test_svm_probs = model_svm.predict_proba(X_test)[:, 1]

# Confusion matrix
cm = confusion_matrix(y_test, test_svm_predictions)
plot_confusion_matrix(cm, classes = ['Fraud', 'No Fraud'],
                      title = 'Fraud Confusion Matrix')

print_evaluations(y_test, test_svm_predictions)

In [None]:
model_adaboost =  AdaBoostClassifier(random_state = 50)
model_adaboost.fit(smote_data_X, smote_data_Y)

# Training predictions (to demonstrate overfitting)
train_adaboost_predictions = model_adaboost.predict(smote_data_X)
train_adaboost_probs = model_adaboost.predict_proba(smote_data_X)[:, 1]

# Testing predictions (to determine performance)
test_adaboost_predictions = model_adaboost.predict(X_test)
test_adaboost_probs = model_adaboost.predict_proba(X_test)[:, 1]

# Confusion matrix
cm = confusion_matrix(y_test, test_adaboost_predictions)
plot_confusion_matrix(cm, classes = ['Fraud', 'No Fraud'],
                      title = 'Fraud Confusion Matrix')

print_evaluations(y_test, test_adaboost_predictions)