In [45]:
import pandas as pd
import numpy as np
import scipy
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import recall_score, confusion_matrix
from sklearn.metrics import roc_auc_score

Credit card dataset obtained from: https://www.kaggle.com/mlg-ulb/creditcardfraud

In [4]:
creditcard = pd.read_csv(r'/Users/admin/Documents/Supervised_learning/Supervised_learning/creditcard.csv')

In [5]:
creditcard.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [6]:
creditcard.shape

(284807, 31)

In [7]:
#The authors have warned us that the dataset is unbalanced, there are very few fraudulent cases
np.unique(creditcard.Class, return_counts = True)

(array([0, 1]), array([284315,    492]))

# Vanilla Logistic Regression

In [8]:
creditcard.columns

Index(['Time', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10',
       'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20',
       'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount',
       'Class'],
      dtype='object')

In [11]:
Y = creditcard['Class']
X =creditcard.loc[:, ~creditcard.columns.isin(['Class'])]
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=108)

In [24]:
logreg = LogisticRegression(C=1e6)
print(np.mean(cross_val_score(logreg, X_train, Y_train, scoring = 'roc_auc')))

0.9307900756500525


# Ridge Regression

In [28]:
roc_scores = []
Cs = []

for value in [1e-10,1e-3,  1, 5, 20]:
    ridge = LogisticRegression(C=value, penalty= 'l2')
    roc = np.mean(cross_val_score(ridge, X_train, Y_train, scoring = 'roc_auc'))
    roc_scores.append(roc)
    Cs.append(value)

In [30]:
df = pd.DataFrame(roc_scores)
df['params'] = Cs
df.columns=['roc_auc_scores', 'params']

df.sort_values(by = 'roc_auc_scores', ascending=False).reset_index(drop=True)

Unnamed: 0,roc_scores,params
0,0.930786,20.0
1,0.93077,5.0
2,0.930713,1.0
3,0.862225,0.001
4,0.568199,1e-10


# Test Set Validation

In [49]:
ridge = LogisticRegression(C=2.000000e+01, penalty= 'l2')
ridge.fit(X_train, Y_train)
roc_auc_score(Y_test, ridge.predict_proba(X_test)[:, 1])

0.9203227183630716

# Lasso

In [31]:
roc_auc_scores = []
Cs = []

for value in [1e-15, 1e-3,  10]:
    lasso = LogisticRegression(C=value, penalty= 'l1')
    roc = np.mean(cross_val_score(lasso, X_train, Y_train, scoring = 'roc_auc'))
    roc_scores.append(roc)
    Cs.append(value)

In [32]:
df = pd.DataFrame(roc_scores)
df['params'] = Cs
df.columns=['roc_auc_scores', 'params']

df.sort_values(by = 'roc_auc_scores', ascending=False).reset_index(drop=True)

Unnamed: 0,roc_scores,params
0,0.977975,10.0
1,0.757371,0.001
2,0.5,1e-15


# Test Set Validation

In [47]:
lasso = LogisticRegression(C=1.000000e+01, penalty= 'l1')
lasso.fit(X_train, Y_train)
roc_auc_score(Y_test, lasso.predict_proba(X_test)[:, 1])

0.9613223274668515

# Random Forest

In [37]:
roc_auc_scores = []
parameters = []

est_number = [100, 500,700] 

for value in est_number:
    rfc = RandomForestClassifier(n_jobs = -1, n_estimators = value, class_weight = 'balanced')
    roc_auc = np.mean(cross_val_score(rfc, X_train, Y_train, scoring = 'roc_auc', n_jobs=-1))
    roc_auc_scores.append(roc_auc)
    parameters.append(value)

In [38]:
df = pd.DataFrame(roc_auc_scores)
df['params'] = parameters
df.columns=['roc_auc_scores', 'params']

df.sort_values(by = 'roc_auc_scores', ascending=False).reset_index(drop=True)

Unnamed: 0,roc_auc_scores,params
0,0.970863,700
1,0.968972,500
2,0.952507,100


In [40]:
roc_auc_scores = []
parameters = []

depth = [8, 20, 50] 

for value in depth:
    rfc = RandomForestClassifier(
          n_jobs = -1, 
          class_weight = 'balanced',
          n_estimators = 700, 
          max_depth = value)
    
    roc_auc = np.mean(cross_val_score(
                        rfc, 
                        X_train, 
                        Y_train, 
                        scoring = 'roc_auc', 
                        n_jobs=-1))
    
    roc_auc_scores.append(roc_auc)
    parameters.append(value)

In [41]:
df = pd.DataFrame(roc_auc_scores)
df['params'] = parameters
df.columns=['roc_auc_scores', 'params']

df.sort_values(by = 'roc_auc_scores', ascending=False).reset_index(drop=True)

Unnamed: 0,roc_auc_scores,params
0,0.982722,8
1,0.972816,20
2,0.970863,50


In [42]:
rfc = RandomForestClassifier(
          n_jobs = -1, 
          class_weight = 'balanced',
          n_estimators = 1000, 
          max_depth = 10)
    
roc_auc = np.mean(cross_val_score(
                        rfc, 
                        X_train, 
                        Y_train, 
                        scoring = 'roc_auc', 
                        n_jobs=-1))
    
print(roc_auc)

0.9832474308568268


# Test Set Validation

In [46]:
rfc= RandomForestClassifier(n_estimators = 1000, max_depth = 10, n_jobs=-1, class_weight='balanced')
rfc.fit(X_train, Y_train)
roc_auc_score(Y_test, rfc.predict_proba(X_test)[:, 1])

0.9679113488176854

# We can manually set a threshhold that reflects our business objectives

In [67]:
def prediction(classifier, feature_set, prob):
    y_predicted = []
    for i in classifier.predict_proba(feature_set)[:, 1]:
        if i > prob:
            y_predicted.append(1)
        else: 
            y_predicted.append(0)
    return y_predicted

y_predicted = prediction(rfc, X_test, 0.05)   

In [68]:
confusion_matrix(Y_test, y_predicted)

array([[55718,  1163],
       [   10,    71]])

In [65]:
y_predicted = prediction(rfc, X_test, 0.3)   

In [66]:
confusion_matrix(Y_test, y_predicted)

array([[56847,    34],
       [   21,    60]])