In [1]:
# Import necessary modules

import pandas as pd
import numpy as np

%matplotlib inline
import matplotlib.pyplot as plt

import pickle

from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

from sklearn.metrics import classification_report,  confusion_matrix
from sklearn.ensemble import RandomForestClassifier

In [2]:
# Load the training frame (`df`) and the test frame (`test`) from pickle files
# (presumably feature-engineered upstream, judging by the file names — confirm).
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
df = pd.read_pickle('dataset/train_engineered_new.pkl')
test = pd.read_pickle('dataset/test_engineered_new.pkl')

## Evaluation Metrics

### Classification Metrics

In [3]:
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.metrics import average_precision_score, precision_recall_curve


def eval_metrics(actual, pred):
    """Print accuracy, the confusion matrix and the classification report.

    Parameters
    ----------
    actual : array-like
        Ground-truth labels.
    pred : array-like
        Predicted labels, aligned element-wise with ``actual``.

    Returns
    -------
    None
        All results are written to stdout.
    """
    accuracy = accuracy_score(actual, pred)
    print("Accuracy: ", accuracy)

    # Both reports share the (actual, pred) signature, so print them in turn.
    for build_report in (confusion_matrix, classification_report):
        print(build_report(actual, pred))
    
def model_results(X_train, y_train, X_test, y_test, model):
    """Fit ``model`` on the training split and print its hold-out metrics.

    Parameters
    ----------
    X_train, y_train
        Features and labels used to fit ``model``.
    X_test, y_test
        Features and labels used for evaluation.
    model
        Estimator exposing ``fit``, ``predict`` and ``predict_proba``.
        It is fitted in place.

    Returns
    -------
    None
        Accuracy, AUC, the confusion matrix and the classification report
        are written to stdout.
    """
    model.fit(X_train, y_train)

    # Hard labels drive accuracy / confusion matrix / report.
    y_hat = model.predict(X_test)
    # Column 1 of predict_proba is passed to roc_auc_score
    # (the positive class when labels are {0, 1} — confirm for other targets).
    positive_scores = model.predict_proba(X_test)[:, 1]

    accuracy = accuracy_score(y_test, y_hat)
    print("Accuracy: ", accuracy)
    auc = roc_auc_score(y_test, positive_scores)
    print("AUC: ", auc)
    print("-----")

    for build_report in (confusion_matrix, classification_report):
        print(build_report(y_test, y_hat))

## Create X and y

### With Missing Values (all columns kept)

In [36]:
# Columns excluded from the feature matrix (only the target here)
remove_list = ['isFraud']

# Target vector, and feature matrix passed through pd.get_dummies
y = df['isFraud']
X = pd.get_dummies(df.drop(columns=remove_list))

# Stratified 70/30 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=.3, random_state=0, stratify=y
)

X.shape

(590540, 109)

### Without Missing Values (sparse columns and incomplete rows dropped)

In [8]:
# Columns excluded from the feature matrix (only the target here)
remove_list = ['isFraud']

# Fraction of missing values per column, computed once and reused below
# (avoids scanning the whole frame twice).
missing_frac = df.isna().mean()

# Columns with more than 20% missing values
remove_missing_cols = missing_frac[missing_frac > 0.2].index.to_list()

# Drop those columns, then drop every row that still has a missing value.
# This builds a new frame; `df` itself is left untouched.
no_missing = df.drop(columns=remove_missing_cols).dropna()

# Create features and target
X = no_missing.drop(columns=remove_list)
y = no_missing['isFraud']

# Stratified 70/30 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=.3, random_state=0, stratify=y
)

X.shape

(346886, 92)

## Pipeline

In [14]:
from sklearn.preprocessing import RobustScaler
from sklearn.decomposition import PCA

# Scale the features, then project them onto 10 principal components.
# NOTE(review): both transforms are fit on all of X (not only X_train) and
# `p_components` is not used by the modelling cells visible here — presumably
# exploratory; confirm before reusing these fitted objects in a model.
scaler = RobustScaler()
X_scaled = scaler.fit_transform(X)

pca = PCA(n_components=10)
p_components = pca.fit_transform(X_scaled)

# Per-component share of explained variance, followed by its running total
variance_ratio = pca.explained_variance_ratio_
print(variance_ratio)
print(variance_ratio.cumsum())

[4.38579482e-01 3.44616903e-01 1.18522384e-01 3.80642733e-02
 3.16319289e-02 1.97857929e-02 7.79176771e-03 6.54208611e-04
 1.22596019e-04 5.79793898e-05]
[0.43857948 0.78319639 0.90171877 0.93978304 0.97141497 0.99120076
 0.99899253 0.99964674 0.99976934 0.99982732]


In [33]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
# from sklearn.neighbors import KNeighborsClassifier as KNN
from imblearn.pipeline import Pipeline
#from sklearn.model_selection import GridSearchCV



# Pipeline steps: SMOTE oversampling of the minority class, then a random forest
smt = SMOTE(sampling_strategy="minority", random_state=42)
rf = RandomForestClassifier(
    n_estimators=200,
    max_depth=30,
    random_state=42,
)

# imblearn's Pipeline is used (rather than sklearn's) so that a sampler can be a step
pipeline = Pipeline(steps=[
    ('smt', smt),
    ('rf', rf),
])

In [20]:
# Fit `pipeline` on the training split and print hold-out metrics (see model_results).
# NOTE(review): this cell's execution count (20) is lower than that of the cell
# defining `pipeline` (33), so the output recorded below presumably comes from an
# earlier pipeline configuration — confirm with Restart Kernel -> Run All.
model_results(X_train, y_train, X_test, y_test, pipeline)

Accuracy:  0.9554897853285415
AUC:  0.881943433442838
-----
[[98261  3723]
 [  909  1173]]
              precision    recall  f1-score   support

           0       0.99      0.96      0.98    101984
           1       0.24      0.56      0.34      2082

    accuracy                           0.96    104066
   macro avg       0.62      0.76      0.66    104066
weighted avg       0.98      0.96      0.96    104066



In [24]:
# Fit `pipeline` on the training split and print hold-out metrics (see model_results).
# NOTE(review): this cell's execution count (24) is lower than that of the cell
# defining `pipeline` (33), so the output recorded below presumably comes from an
# earlier pipeline configuration — confirm with Restart Kernel -> Run All.
model_results(X_train, y_train, X_test, y_test, pipeline)

Accuracy:  0.9850191224799646
AUC:  0.8895722859429533
-----
[[101886     98]
 [  1461    621]]
              precision    recall  f1-score   support

           0       0.99      1.00      0.99    101984
           1       0.86      0.30      0.44      2082

    accuracy                           0.99    104066
   macro avg       0.92      0.65      0.72    104066
weighted avg       0.98      0.99      0.98    104066



In [34]:
# Fit `pipeline` on the training split and print hold-out metrics (see model_results).
# NOTE(review): execution count 34 directly follows the pipeline definition cell (33),
# so the output recorded below presumably reflects the SMOTE + random-forest
# pipeline defined there — confirm with Restart Kernel -> Run All.
model_results(X_train, y_train, X_test, y_test, pipeline)

Accuracy:  0.9853554475044684
AUC:  0.8935515317503233
-----
[[101957     27]
 [  1497    585]]
              precision    recall  f1-score   support

           0       0.99      1.00      0.99    101984
           1       0.96      0.28      0.43      2082

    accuracy                           0.99    104066
   macro avg       0.97      0.64      0.71    104066
weighted avg       0.98      0.99      0.98    104066



In [32]:
# Fit `pipeline` on the training split and print hold-out metrics (see model_results).
# NOTE(review): execution count 32 precedes the pipeline definition cell (33), and the
# output recorded below is identical to that of the In [34] run of the same call, so
# this cell is presumably a leftover duplicate — confirm and remove.
model_results(X_train, y_train, X_test, y_test, pipeline)

Accuracy:  0.9853554475044684
AUC:  0.8935515317503233
-----
[[101957     27]
 [  1497    585]]
              precision    recall  f1-score   support

           0       0.99      1.00      0.99    101984
           1       0.96      0.28      0.43      2082

    accuracy                           0.99    104066
   macro avg       0.97      0.64      0.71    104066
weighted avg       0.98      0.99      0.98    104066



In [36]:
# Stand-alone random forest (no SMOTE step): class_weight='balanced' is set on
# the estimator instead, and n_jobs=-1 is passed for parallel fitting.
# Note that this rebinds the name `rf` used when building `pipeline`.
rf = RandomForestClassifier(
    n_estimators=400,
    max_depth=40,
    class_weight='balanced',
    random_state=42,
    n_jobs=-1,
)

model_results(X_train, y_train, X_test, y_test, rf)

Accuracy:  0.9848173274652624
AUC:  0.9179964673782812
-----
[[101979      5]
 [  1575    507]]
              precision    recall  f1-score   support

           0       0.98      1.00      0.99    101984
           1       0.99      0.24      0.39      2082

    accuracy                           0.98    104066
   macro avg       0.99      0.62      0.69    104066
weighted avg       0.98      0.98      0.98    104066



In [None]:
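# Disabled manual evaluation, kept for reference. model_results() performs the
# same fit / predict / report steps and additionally prints accuracy and AUC.
#
# # Fit the pipeline to the train set
# pipeline.fit(X_train, y_train)
#
# # Predict the labels of the test set
# y_pred = pipeline.predict(X_test)
#
# print(confusion_matrix(y_test, y_pred))
# print(classification_report(y_test, y_pred))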

## LightGBM

In [4]:
import lightgbm as lgb

ModuleNotFoundError: No module named 'lightgbm'