# Modeling of Fraud Detection

### Initiation

In [172]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import datetime, date, time, timedelta
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve
from sklearn.model_selection import train_test_split
#importing over and undersampling algorithms from imblearn (you will have to manually install it in your envoirenment with pip install imblearn) 
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import confusion_matrix
import itertools
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB

In [173]:
# important functions
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Oranges):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    Source: http://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html
    """
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    # Plot the confusion matrix
    plt.figure(figsize = (10, 10))
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title, size = 24)
    plt.colorbar(aspect=4)
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45, size = 14)
    plt.yticks(tick_marks, classes, size = 14)

    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    
    # Labeling the plot
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt), fontsize = 20,
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")
        
    plt.grid(None)
    plt.tight_layout()
    plt.ylabel('True label', size = 18)
    plt.xlabel('Predicted label', size = 18)

def print_evaluations(y,predictions):
    print(f"Accuracy: {accuracy_score(y, predictions)}")
    print(f"Precision: {precision_score(y, predictions)}")
    print(f"Recall: {recall_score(y, predictions)}")
    print(f"F1: {f1_score(y, predictions)}")


In [174]:
X = pd.read_csv('./data/X.csv', index_col='Unnamed: 0')
y = pd.read_csv('./data/y.csv', index_col='Unnamed: 0')


In [175]:
X.pop('Valuelog')

0        6.907755
1        2.995732
2        6.214608
3        9.989665
4        6.467699
           ...   
95657    6.907755
95658    6.907755
95659    2.995732
95660    8.006368
95661    4.094345
Name: Valuelog, Length: 95662, dtype: float64

In [176]:
X.columns

Index(['Value', 'PricingStrategy', 'Hour', 'DirectionOfMoney',
       'ProviderId_ProviderId_1', 'ProviderId_ProviderId_2',
       'ProviderId_ProviderId_3', 'ProviderId_ProviderId_4',
       'ProviderId_ProviderId_5', 'ProviderId_ProviderId_6',
       'ProductId_ProductId_1', 'ProductId_ProductId_10',
       'ProductId_ProductId_11', 'ProductId_ProductId_12',
       'ProductId_ProductId_13', 'ProductId_ProductId_14',
       'ProductId_ProductId_15', 'ProductId_ProductId_16',
       'ProductId_ProductId_19', 'ProductId_ProductId_2',
       'ProductId_ProductId_20', 'ProductId_ProductId_21',
       'ProductId_ProductId_22', 'ProductId_ProductId_23',
       'ProductId_ProductId_24', 'ProductId_ProductId_27',
       'ProductId_ProductId_3', 'ProductId_ProductId_4',
       'ProductId_ProductId_5', 'ProductId_ProductId_6',
       'ProductId_ProductId_7', 'ProductId_ProductId_8',
       'ProductId_ProductId_9', 'ChannelId_ChannelId_1',
       'ChannelId_ChannelId_2', 'ChannelId_ChannelId_3',

### Split into train and test

In [177]:
#splitting data into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

Standardize continuous variables

In [178]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
cont_variables = ['Value']
scaler = MinMaxScaler()
X_train[cont_variables] = scaler.fit_transform(X_train[cont_variables], )
X_test[cont_variables] = scaler.fit_transform(X_test[cont_variables], )


### Resample the data

#### 1. Oversample Frauds

In [179]:
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=42)
X_over, y_over = ros.fit_resample(X_train, y_train)

np.mean(y_over)

FraudResult    0.5
dtype: float64

#### 2. Create Synthetic Fraud Data

In [180]:
#used smote algorithm (synthetic oversampling) to oversample fradulent class
# dataframes of synthetic datapoints: smote_data_X, smote_data_Y
smote_algo = SMOTE(random_state=50)
X_syn, y_syn = smote_algo.fit_resample(X_train, y_train)
X_syn = pd.DataFrame(data=X_syn, columns=X_train.columns)
y_syn = pd.DataFrame(data=y_syn, columns=['FraudResult'])
np.mean(y_syn)

FraudResult    0.5
dtype: float64

#### 3. Undersample Non-Frauds

In [181]:
# used randomundersampler algorithm to undersample non fradulent class
# dataframes for undersampled data: X_res, y_res
rus = RandomUnderSampler(random_state=50)
X_under, y_under = rus.fit_resample(X_train, y_train)
np.mean(y_under)

FraudResult    0.5
dtype: float64

# Model fitting

In [201]:
X_input = X_syn
y_input = y_syn
from sklearn.model_selection import GridSearchCV

# random forest
model_rf = RandomForestClassifier(random_state=50, 
                               n_jobs=-1, verbose = 1)
param_grid = {'bootstrap': [True, False], 'max_depth': [10, 30, 50, 70, 100],
'max_features': ['auto', 'sqrt', 'log2'],
'n_estimators': [100, 500, 1000]}

model_rfcv = GridSearchCV(estimator=model_rf, param_grid=param_grid, cv=5)
model_rfcv.fit(X_input, y_input)


# logistic

#model_log = LogisticRegression(random_state=50).fit(X_input, y_input)

# knn
#model_knn = KNeighborsClassifier(n_neighbors=5)
#model_knn.fit(X_input, y_input)

## naive
#model_naive = MultinomialNB()
#model_naive.fit(X_input, y_input)

# svm
#model_svm = SVC(random_state = 50, probability = True)
#model_svm.fit(X_input, y_input)

  estimator.fit(X_train, y_train, **fit_params)
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    1.3s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    1.7s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    0.0s finished
  estimator.fit(X_train, y_train, **fit_params)
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    0.7s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    0.8s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    0.0s finished
  estimator.fit(X_train, y_train, **fit_params)
[Parallel(n_jobs

KeyboardInterrupt: 

Evaluation of Train Data

In [193]:
# rf
train_rfcv_predictions = model_rfcv.predict(X_input)
#train_rf_probs = model_rf.predict_proba(X_input)[:, 1]


# log
#train_log_predictions = model_log.predict(X_input)
#train_log_probs = model_log.predict_proba(X_input)[:, 1]

# knn
#train_knn_predictions = model_knn.predict(X_input)
#train_knn_probs = model_knn.predict_proba(X_input)[:, 1]

#naive
#train_naive_predictions = model_naive.predict(X_input)
#train_naive_probs = model_naive.predict_proba(X_input)[:, 1]

# svm
#train_svm_predictions = model_svm.predict(X_input)
#train_svm_probs = model_svm.predict_proba(X_input)[:, 1]

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    0.3s
[Parallel(n_jobs=8)]: Done 434 tasks      | elapsed:    0.7s
[Parallel(n_jobs=8)]: Done 500 out of 500 | elapsed:    0.8s finished


In [194]:
for i,p in enumerate([train_rfcv_predictions]):#, train_log_predictions, train_knn_predictions, train_svm_predictions]):
    print(i)#(numstr(p))
    print_evaluations(y_input, p)


0
Accuracy: 0.9998202141796295
Precision: 0.9996405576045508
Recall: 1.0
F1: 0.9998202464967605


In [195]:
# rf
test_rfcv_predictions = model_rfcv.predict(X_test)
#test_rf_probs = model_rf.predict_proba(X_test)[:, 1]

# log
#test_log_predictions = model_log.predict(X_test)
#test_log_probs = model_log.predict_proba(X_test)[:, 1]

# knn
#test_knn_predictions = model_knn.predict(X_test)
#test_knn_probs = model_knn.predict_proba(X_test)[:, 1]

#naive
#test_naive_predictions = model_naive.predict(X_test)
#test_naive_probs = model_naive.predict_proba(X_test)[:, 1]

# svm
#test_svm_predictions = model_svm.predict(X_test)
#test_svm_probs = model_svm.predict_proba(X_test)[:, 1]



[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done 434 tasks      | elapsed:    0.2s
[Parallel(n_jobs=8)]: Done 500 out of 500 | elapsed:    0.2s finished


In [196]:
for i,p in enumerate([test_rfcv_predictions]):#, test_log_predictions, test_knn_predictions, test_svm_predictions]):
    print(i)#(numstr(p))
    print_evaluations(y_test, p)

0
Accuracy: 0.9991447305901359
Precision: 0.7435897435897436
Recall: 0.8923076923076924
F1: 0.8111888111888114


In [197]:
model_rfcv.best_estimator_

RandomForestClassifier(bootstrap=False, max_depth=30, n_estimators=500,
                       n_jobs=-1, random_state=50, verbose=1)

In [199]:
# fit best model
model_rf = RandomForestClassifier(bootstrap=False, max_depth=30, n_estimators=500,
                       n_jobs=-1, random_state=50, verbose=1)
model_rf.fit(X_input, y_input)
train_rf_predictions = model_rf.predict(X_input)
test_rf_predictions = model_rf.predict(X_test)
print_evaluations(y_test, test_rf_predictions)

  model_rf.fit(X_input, y_input)
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:    1.6s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:    3.6s
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:    4.1s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    0.3s
[Parallel(n_jobs=8)]: Done 434 tasks      | elapsed:    0.6s
[Parallel(n_jobs=8)]: Done 500 out of 500 | elapsed:    0.7s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    0.1s


Accuracy: 0.9991447305901359
Precision: 0.7435897435897436
Recall: 0.8923076923076924
F1: 0.8111888111888114


[Parallel(n_jobs=8)]: Done 434 tasks      | elapsed:    0.2s
[Parallel(n_jobs=8)]: Done 500 out of 500 | elapsed:    0.2s finished


In [200]:
features = list(X_train.columns)

fi = pd.DataFrame({'feature': features,
                   'importance': model_rf.feature_importances_}).\
                    sort_values('importance', ascending = False)
fi

Unnamed: 0,feature,importance
0,Value,0.5045278
16,ProductId_ProductId_15,0.09290619
37,ProductCategory_airtime,0.05207417
9,ProviderId_ProviderId_6,0.04756885
29,ProductId_ProductId_6,0.03740968
39,ProductCategory_financial_services,0.03611306
7,ProviderId_ProviderId_4,0.03058698
34,ChannelId_ChannelId_2,0.02885825
3,DirectionOfMoney,0.02759839
35,ChannelId_ChannelId_3,0.02320963
