<a href="https://colab.research.google.com/github/liuzheqi0723/capstone-fraud-detection/blob/main/models/6_Supervised_Machine_Learning_Models.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Application for real-time fraudulent transaction detection**

## Load and preprocessing the datasets

### Load data

In [18]:
### import libraries ###
import numpy as np
import pandas as pd


In [19]:
# # # Run it if it is the first time you running this notebook.

# # # Mount your google drive to colab
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [20]:
X = pd.read_csv('/content/drive/MyDrive/Capstone/Data/X_raw.csv')
X.drop(columns=['Unnamed: 0'], inplace=True) # drop index col

y_df = pd.read_csv('/content/drive/MyDrive/Capstone/Data/y_raw.csv')
y_df.drop(columns=['Unnamed: 0'], inplace=True) # drop index col

X.head()

Unnamed: 0,id_01,id_02,id_05,id_06,id_11,id_12,id_13,id_15,id_16,id_17,...,V330,V331,V332,V333,V334,V335,V336,V337,V338,V339
0,0.0,70787.0,,,100.0,NotFound,,New,NotFound,166.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,-5.0,98945.0,0.0,-5.0,100.0,NotFound,49.0,New,NotFound,166.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-5.0,191631.0,0.0,0.0,100.0,NotFound,52.0,Found,Found,121.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
3,-5.0,221832.0,0.0,-6.0,100.0,NotFound,52.0,New,NotFound,225.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
4,0.0,7460.0,1.0,0.0,100.0,NotFound,,Found,Found,166.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [22]:
y = y_df.to_numpy().reshape(-1) # convert type and reshape y


X = X.drop(columns=['dist1', 'D11'], inplace=False) # all nan values
X.name = 'X'

# print(X.shape)

### Balance the X
Balaced X  does not improve the performance of ML Models. 

So the following part are all comment out. 

But we still keep all these codes for potencially used in the future. 

In [24]:
# X['isFraud']=y
# print(len(X[X['isFraud']==1])) # number of fraud
# print(len(X[X['isFraud']==0])/len(X)) # ratio of non fraud

# # resample to make a balanced dataset
# df_balance = X.groupby('isFraud').apply(lambda x: x.sample(n=10000)).reset_index(drop = True)
# print(df_balance.shape)

# X = df_balance.drop(columns=['isFraud'], inplace=False) # drop id and label
# y = df_balance['isFraud']

### Create encoding for categorical vairables


In [25]:
categorical_columns = X.dtypes[X.dtypes == np.object].index.tolist() # list of columns with categorical variables

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  """Entry point for launching an IPython kernel.


In [26]:
#create ordinal encoders for categorical variables
from sklearn.preprocessing import OrdinalEncoder
oe = OrdinalEncoder( dtype=int)
oe.fit(X[categorical_columns])
X[categorical_columns] = oe.transform(X[categorical_columns]) 

## Split the dataset to train and test

In [27]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test\
   = train_test_split(X, y, test_size=0.2, random_state=697)


##Fill Nan values (Part B)
Fill with mean and most frequent values. fit and train using pipeline.

    1.b fill with **mean** value of the column
>numerical 'id_XX'

    2.b fill with **mean** value of the column using sklearn.impute.

>card1 - card6: payment card information.Such as card type, card category, issue bank, country, etc.

>addr: both addresses are for purchaser.
addr1 as billing region.
addr2 as billing country.

>dist: distances between (not limited) billing address, mailing address, zip code, IP address, phone area, etc.
<br>

    2.c fill with **most frequent value** in the column.
  >C1-C14: counting, such as how many addresses are found to be associated with the payment card, etc. The actual meaning is masked.
  
  >D1-D15: timedelta, such as days between previous transaction, etc.
<br>

In [28]:
X_null = X.isnull().sum(axis=0).to_frame() # count Nans in every col.
X_null.rename(columns={0: '#_Nans'}, inplace=True) # rename cols.

In [29]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_transformer

### Step 1:
# filter out the cols with Nans.
X_null = X.isnull().sum(axis=0).to_frame() # count Nans in every col.
X_null.rename(columns={0: '#_Nans'}, inplace=True) # rename cols.
X_NanCols = X_null[X_null['#_Nans']>0].index # get a series contains all the names of cols with Nan.

X_fullCols = X_null[X_null['#_Nans']==0].index   # column names without NA

# make lists, indicating which stratage will be used in imputing the cols.
cols_fill_mean = []
cols_fill_freq = []

for col in X_NanCols:
  if str(col).startswith('C'): # cols C1-C1
    cols_fill_freq.append(col)
  elif str(col).startswith('D'): # cols D1-D15 and 'Device ...' which has been filled previously.
    cols_fill_freq.append(col)
  else:
    cols_fill_mean.append(col) # cols id_XX and cols has already been filled with other startages earlier.

# make all the cols still included in the following processing
cols_fill_freq.extend(X_fullCols.to_list())

In [30]:
# Step 2:
# instantiate the imputers, within a pipeline
# imputer imputes with the mean
imp_mean = Pipeline(steps=[('imputer', SimpleImputer(missing_values=np.nan, strategy='mean'))])


# imputer imputes with 'most_frequent'
imp_freq = Pipeline(steps=[('imputer',SimpleImputer(missing_values=np.nan, strategy='most_frequent'))])


                                                             

In [31]:
# Step 3:
# put the features list and the transformers together by col transformer.
imp_preprocessor = ColumnTransformer(transformers=[('imp_mean', imp_mean, cols_fill_mean),\
                                                   ('imp_freq',imp_freq,cols_fill_freq)])#,remainder='passthrough' )
    

In [32]:
# Step 4:
# fit and trans the datasets with 'imp_preprocessor'.
imp_preprocessor.fit(X_train)

X_train = imp_preprocessor.transform(X_train)
X_test = imp_preprocessor.transform(X_test)

In [33]:
# print(list(X.columns))

In [34]:
# np.save("X_train.npy", X_train)
# !cp X_train.npy "drive/MyDrive/Capstone/Data/"
# np.save("X_test.npy", X_train)
# !cp X_test.npy "drive/MyDrive/Capstone/Data/"

In [35]:
# np.save("y_train.npy", y_train)
# !cp y_train.npy "drive/MyDrive/Capstone/Data/"
# np.save("y_test.npy", y_test)
# !cp y_test.npy "drive/MyDrive/Capstone/Data/"

In [36]:
# from joblib import dump, load
# dump(imp_preprocessor, "nan_processor.joblib")
# !cp nan_processor.joblib "drive/MyDrive/Capstone/Data/"

## Supervised Machine Learning Models

In [37]:
from sklearn.metrics import accuracy_score

from sklearn.metrics import f1_score 
# F1 = 2 * (precision * recall) / (precision + recall)
# The recall is the ratio tp / (tp + fn)
# The recall is intuitively the ability of the classifier to find all the positive samples
# The precision is the ratio tp / (tp + fp) 
# The precision is intuitively the ability of the classifier not to label as positive a sample that is negative.

from sklearn.metrics import confusion_matrix
# [[true negatives, false negatives], 
# [true positives, false positives]].

### k-NN Classifier
Too slow.</br>
Will not feed this model to our final system.



In [38]:
from sklearn.neighbors import KNeighborsClassifier


def knn_clf(n_neighbors=5):
    
    print("The number of nearest neighbors is: {}".format(n_neighbors))
    
    knn = KNeighborsClassifier(n_neighbors=n_neighbors)
    knn.fit(X_train, y_train)
    y_pred = knn.predict(X_test)
    
    Acu = accuracy_score(y_test, y_pred)
    print("the accuracy score is: {}".format(round(Acu, 4)))

    F1 = f1_score(y_test, y_pred)
    print("the F1 score is: {}".format(round(F1, 4)))

    Mtrx = confusion_matrix(y_test, y_pred, labels = [0, 1])
    print("the confusion matrix score is:\n{}\n".format(Mtrx))
    

    return knn


In [39]:
# neighbors=list(range(1,11,3))

# for n in neighbors:
#     knn_clf(n)

### Decision Tree


In [40]:
from sklearn.tree import DecisionTreeClassifier

def tree_clf(max_depth=5):
    
    print("The number of depth is: {}".format(max_depth))
    
    dtree = DecisionTreeClassifier(max_depth=max_depth, criterion = 'entropy')
    dtree.fit(X_train, y_train)
    y_pred = dtree.predict(X_test)

    Acu = accuracy_score(y_test, y_pred)
    print("the accuracy score is: {}".format(round(Acu, 4)))

    F1 = f1_score(y_test, y_pred)
    print("the F1 score is: {}".format(round(F1, 4)))

    Mtrx = confusion_matrix(y_test, y_pred, labels = [0, 1])
    print("the confusion matrix score is:\n{}\n".format(Mtrx))

    return dtree


In [41]:
# depths=list(range(3,21,3))

# for n in depths:
#     tree_clf(n)

In [42]:
# depths=list(range(12,18,1))

# for n in depths:
#     tree_clf(n)

#### result for unbalanced X:
The number of depth is: 12
the accuracy score is: 0.9612
the F1 score is: 0.7172
the confusion matrix score is:
[[24597   240]
 [  805  1325]]

The number of depth is: 13
the accuracy score is: 0.9607
the F1 score is: 0.7248
the confusion matrix score is:
[[24509   328]
 [  733  1397]]

The number of depth is: 14
the accuracy score is: 0.961
the F1 score is: 0.7273
the confusion matrix score is:
[[24510   327]
 [  726  1404]]

The number of depth is: 15
the accuracy score is: 0.9617
the F1 score is: 0.7344
the confusion matrix score is:
[[24506   331]
 [  702  1428]]

The number of depth is: 16
the accuracy score is: 0.9605
the F1 score is: 0.7292
the confusion matrix score is:
[[24466   371]
 [  695  1435]]

The number of depth is: 17
the accuracy score is: 0.9603
the F1 score is: 0.7337
the confusion matrix score is:
[[24423   414]
 [  656  1474]]


 #### result for balanced X:
 The number of depth is: 12
the accuracy score is: 0.8572
the F1 score is: 0.8528
the confusion matrix score is:
[[1775  213]
 [ 358 1654]]

The number of depth is: 13
the accuracy score is: 0.8528
the F1 score is: 0.8488
the confusion matrix score is:
[[1758  230]
 [ 359 1653]]

The number of depth is: 14
the accuracy score is: 0.855
the F1 score is: 0.8524
the confusion matrix score is:
[[1745  243]
 [ 337 1675]]

The number of depth is: 15
the accuracy score is: 0.8515
the F1 score is: 0.8495
the confusion matrix score is:
[[1729  259]
 [ 335 1677]]

The number of depth is: 16
the accuracy score is: 0.8538
the F1 score is: 0.8524
the confusion matrix score is:
[[1726  262]
 [ 323 1689]]

The number of depth is: 17
the accuracy score is: 0.8492
the F1 score is: 0.8489
the confusion matrix score is:
[[1703  285]
 [ 318 1694]]

In [43]:
# best paras
best_DT = DecisionTreeClassifier(max_depth=16, criterion = 'entropy')
best_DT.fit(X_train, y_train)

DecisionTreeClassifier(criterion='entropy', max_depth=16)

### Random Forest

In [44]:
from sklearn.ensemble import RandomForestClassifier

def rf_clf(max_depth=5):
    
    print("The number of depth is: {}".format(max_depth))
    
    rf = RandomForestClassifier(max_depth = max_depth)
    rf.fit(X_train, y_train)
    y_pred = rf.predict(X_test)

    Acu = accuracy_score(y_test, y_pred)
    print("the accuracy score is: {}".format(round(Acu, 4)))

    F1 = f1_score(y_test, y_pred)
    print("the F1 score is: {}".format(round(F1, 4)))

    Mtrx = confusion_matrix(y_test, y_pred, labels = [0, 1])
    print("the confusion matrix score is:\n{}\n".format(Mtrx))

    return rf

In [45]:
# depths=list(range(5,25,3))

# for n in depths:
#     rf_clf(n)

In [50]:
# depths=list(range(10,36,5))

# for n in depths:
#     rf_clf(n)

In [51]:
#best paras
best_rf= RandomForestClassifier(max_depth = 35)
best_rf.fit(X_train, y_train)

RandomForestClassifier(max_depth=35)

#### Unbalanced X
The number of depth is: 10
/usr/local/lib/python3.7/dist-packages/ipykernel_launcher.py:8: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().
  
the accuracy score is: 0.9585
the F1 score is: 0.6612
the confusion matrix score is:
[[24754    83]
 [ 1037  1093]]

The number of depth is: 15
the accuracy score is: 0.9644
the F1 score is: 0.723
the confusion matrix score is:
[[24752    85]
 [  876  1254]]

The number of depth is: 20
the accuracy score is: 0.967
the F1 score is: 0.7494
the confusion matrix score is:
[[24744    93]
 [  798  1332]]

The number of depth is: 25
the accuracy score is: 0.9681
the F1 score is: 0.7616
the confusion matrix score is:
[[24736   101]
 [  758  1372]]

The number of depth is: 30
the accuracy score is: 0.9693
the F1 score is: 0.7699
the confusion matrix score is:
[[24751    86]
 [  743  1387]]

The number of depth is: 35  
the accuracy score is: 0.969
the F1 score is: 0.7687
the confusion matrix score is:
[[24742    95]
 [  741  1389]]


#### balanced X
The number of depth is: 10
the accuracy score is: 0.8625
the F1 score is: 0.8546
the confusion matrix score is:
[[1834  154]
 [ 396 1616]]

The number of depth is: 15
the accuracy score is: 0.883
the F1 score is: 0.8787
the confusion matrix score is:
[[1837  151]
 [ 317 1695]]

The number of depth is: 20
the accuracy score is: 0.889
the F1 score is: 0.886
the confusion matrix score is:
[[1831  157]
 [ 287 1725]]

The number of depth is: 25
the accuracy score is: 0.8908
the F1 score is: 0.8878
the confusion matrix score is:
[[1834  154]
 [ 283 1729]]

The number of depth is: 30
the accuracy score is: 0.895
the F1 score is: 0.8925
the confusion matrix score is:
[[1836  152]
 [ 268 1744]]

The number of depth is: 35
the accuracy score is: 0.8925
the F1 score is: 0.8897
the confusion matrix score is:
[[1835  153]
 [ 277 1735]]

### Support Vector Machines

In [52]:
from sklearn.svm import LinearSVC, LinearSVR, SVC, SVR

def svc_machine():
    
    svc_m = SVC()
    svc_m.fit(X_train, y_train)
    y_pred = svc_m.predict(X_test)

    Acu = accuracy_score(y_test, y_pred)
    print("the accuracy score is: {}".format(round(Acu, 4)))

    F1 = f1_score(y_test, y_pred)
    print("the F1 score is: {}".format(round(F1, 4)))

    Mtrx = confusion_matrix(y_test, y_pred, labels = [0, 1])
    print("the confusion matrix score is:\n{}\n".format(Mtrx))

    return svc_machine

In [53]:
# svc_machine()

### XGBooster

In [54]:
from xgboost import XGBClassifier


def xgb_clf(max_depth=5):
    
    print("The number of depth is: {}".format(max_depth))
    
    xgb = XGBClassifier(max_depth = max_depth)
    xgb.fit(X_train, y_train)
    y_pred = xgb.predict(X_test)

    Acu = accuracy_score(y_test, y_pred)
    print("the accuracy score is: {}".format(round(Acu, 4)))

    F1 = f1_score(y_test, y_pred)
    print("the F1 score is: {}".format(round(F1, 4)))

    Mtrx = confusion_matrix(y_test, y_pred, labels = [0, 1])
    print("the confusion matrix score is:\n{}\n".format(Mtrx))

    return xgb_clf

In [55]:
# depths=list(range(10,30,5))

# for n in depths:
#     xgb_clf(n)



In [None]:
#best paras
best_xgb = XGBClassifier(max_depth = 30)
best_xgb.fit(X_train, y_train)

#### unbalanced X
The number of depth is: 5
the accuracy score is: 0.9664
the F1 score is: 0.7487
the confusion matrix score is:
[[24714   123]
 [  782  1348]]

The number of depth is: 10
the accuracy score is: 0.9766
the F1 score is: 0.8333
the confusion matrix score is:
[[24755    82]
 [  550  1580]]

The number of depth is: 15
the accuracy score is: 0.9775
the F1 score is: 0.8406
the confusion matrix score is:
[[24756    81]
 [  527  1603]]

The number of depth is: 20
the accuracy score is: 0.9776
the F1 score is: 0.8417
the confusion matrix score is:
[[24757    80]
 [  524  1606]]

#### balanced X

The number of depth is: 5
the accuracy score is: 0.8975
the F1 score is: 0.895
the confusion matrix score is:
[[1843  145]
 [ 265 1747]]

The number of depth is: 10
the accuracy score is: 0.9112
the F1 score is: 0.9097
the confusion matrix score is:
[[1856  132]
 [ 223 1789]]

The number of depth is: 15
the accuracy score is: 0.9168
the F1 score is: 0.9152
the confusion matrix score is:
[[1870  118]
 [ 215 1797]]

The number of depth is: 20
the accuracy score is: 0.9142
the F1 score is: 0.9129
the confusion matrix score is:
[[1859  129]
 [ 214 1798]]

## persist the model for future use without having to retrain.

In [None]:
from joblib import dump, load
dump(best_DT, 'DesicionTree.joblib') 
dump(best_rf, 'RandomForest.joblib')

!cp DesicionTree.joblib "drive/MyDrive/Capstone/Data/"
!cp RandomForest.joblib "drive/MyDrive/Capstone/Data/"

In [None]:
best_xgb.save_model('XGBooster.json')
!cp XGBooster.json "drive/MyDrive/Capstone/Data/"