In [47]:
from sklearn.feature_selection import mutual_info_classif
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
import seaborn as sns
from sklearn import metrics
import time, pprint
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.utils import resample
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.naive_bayes import GaussianNB

In [10]:
df = pd.read_csv("creditcard.csv", nrows=100000)

In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 31 columns):
Time      100000 non-null int64
V1        100000 non-null float64
V2        100000 non-null float64
V3        100000 non-null float64
V4        100000 non-null float64
V5        100000 non-null float64
V6        100000 non-null float64
V7        100000 non-null float64
V8        100000 non-null float64
V9        100000 non-null float64
V10       100000 non-null float64
V11       100000 non-null float64
V12       100000 non-null float64
V13       100000 non-null float64
V14       100000 non-null float64
V15       100000 non-null float64
V16       100000 non-null float64
V17       100000 non-null float64
V18       100000 non-null float64
V19       100000 non-null float64
V20       100000 non-null float64
V21       100000 non-null float64
V22       100000 non-null float64
V23       100000 non-null float64
V24       100000 non-null float64
V25       100000 non-null float64
V26     

In [16]:
df.Class.value_counts()

0    99777
1      223
Name: Class, dtype: int64

In [13]:
x = df.drop('Class', axis=1)
x.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
0,0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,0.251412,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62
1,0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.069083,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69
2,1,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.52498,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66
3,1,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.208038,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5
4,2,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,0.408542,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99


In [17]:
df_minority = df[(df['Class']==1)]

df_up = resample(df_minority,replace=True,  
                             n_samples=99000)

df_re = pd.concat([df,df_up])

df_re['Class'].value_counts()

0    99777
1    99223
Name: Class, dtype: int64

In [27]:
def random_classifier(df,
                     predictor, 
                     print_output=True,
                     min_sample_split=2):
    
    X = df.drop(labels=predictor,
                        axis=1)

    y = df[predictor]
    
    # Create test/train split with 30% in test set
    X_train, X_test, y_train, y_test = train_test_split(X, 
                                                        y, 
                                                        test_size=0.5)
    
    #Initialize model
    rand_forest = RandomForestClassifier(n_estimators=20, 
                                         min_samples_split=min_sample_split)
   
    # Fit training set
    rand_forest.fit(X_train, y_train)
    
    # Get prediciton on test set
    y_pred = rand_forest.predict(X_test)
    
    if print_output:
        print('Test Accuracy:',
              rand_forest.score(X_test, y_test),
              '\n',
              'Train Accuracy:',
              rand_forest.score(X_train, y_train),
              '\n\nClassification Report:\n\n',
              classification_report(y_test, y_pred),
              '\n\nConfusion Matrix:',
              '\n\n',
              pd.DataFrame(confusion_matrix(y_pred, y_test)).rename(columns={1: 'Actual True',
                                                               0: 'Actual False'}).T\
                                                      .rename(columns={1: 'Pred True',
                                                               0: 'Pred False'}))
    
    return rand_forest.score(X_test, y_test), rand_forest
    

random_classifier(df=df_re,
                  predictor='Class',
                  print_output=True,
                  min_sample_split=2)[0]

Test Accuracy: 0.9999396984924623 
 Train Accuracy: 1.0 

Classification Report:

               precision    recall  f1-score   support

           0       1.00      1.00      1.00     49998
           1       1.00      1.00      1.00     49502

   micro avg       1.00      1.00      1.00     99500
   macro avg       1.00      1.00      1.00     99500
weighted avg       1.00      1.00      1.00     99500
 

Confusion Matrix: 

               Pred False  Pred True
Actual False       49992          6
Actual True            0      49502


0.9999396984924623

In [39]:
rand_forest_model = random_classifier(df=df_re,
                                      predictor='Class',
                                      print_output=True,
                                      min_sample_split=2)[1]
X = df_re.drop('Class',
               axis=1)

sorted(zip(X.columns,rand_forest_model.feature_importances_),key = lambda x: x[1], reverse=True)[:10]

Test Accuracy: 0.999929648241206 
 Train Accuracy: 1.0 

Classification Report:

               precision    recall  f1-score   support

           0       1.00      1.00      1.00     50041
           1       1.00      1.00      1.00     49459

   micro avg       1.00      1.00      1.00     99500
   macro avg       1.00      1.00      1.00     99500
weighted avg       1.00      1.00      1.00     99500
 

Confusion Matrix: 

               Pred False  Pred True
Actual False       50034          7
Actual True            0      49459


[('V14', 0.25870202938971304),
 ('V10', 0.15598439646570447),
 ('V4', 0.11203556136485918),
 ('V3', 0.08520843335445973),
 ('V12', 0.06793714486177603),
 ('V17', 0.052508150330321544),
 ('V11', 0.04934706200566714),
 ('V16', 0.038823349552450745),
 ('V18', 0.02517239086928358),
 ('V6', 0.02158521219037096),
 ('V1', 0.020673409914517448),
 ('Amount', 0.012502665783144028),
 ('V20', 0.008613731229786334),
 ('V13', 0.007776683070874845),
 ('V15', 0.007367985447865812),
 ('V2', 0.006551123585673356),
 ('V19', 0.0063199222723994255),
 ('Time', 0.006269382836092416),
 ('V26', 0.006190339534307603),
 ('V21', 0.005779577419433253)]

In [45]:
random_classifier(df=df_re[['V14',
                           'Class']],
                          predictor='Class',
                          print_output=True,
                          min_sample_split=2)[0]

Test Accuracy: 0.9990753768844222 
 Train Accuracy: 0.9999798994974874 

Classification Report:

               precision    recall  f1-score   support

           0       1.00      1.00      1.00     49691
           1       1.00      1.00      1.00     49809

   micro avg       1.00      1.00      1.00     99500
   macro avg       1.00      1.00      1.00     99500
weighted avg       1.00      1.00      1.00     99500
 

Confusion Matrix: 

               Pred False  Pred True
Actual False       49599         92
Actual True            0      49809


0.9990753768844222

In [66]:
def Gaussian_NB(df,
                predictor):  
    
    X = df.drop(labels=predictor,
                        axis=1)

    y = df[predictor]
    
    # Create test/train split with 30% in test set
    X_train, X_test, y_train, y_test = train_test_split(X, 
                                                        y, 
                                                        test_size=0.3)
    nb = GaussianNB()
    nb.fit(X_train, y_train)
    nb_score = (accuracy_score(y_test, nb.predict(X_test)))
    # Get prediciton on test set
    y_pred = nb.predict(X_test)
    print('GaussianNB Accuracy: ' + str(nb_score),classification_report(y_test, nb.predict(X_test)))
    return  pd.DataFrame(confusion_matrix(y_pred, y_test)).rename(columns={1: 'Actual True',
                                                               0: 'Actual False'}).T\
                                                      .rename(columns={1: 'Pred True',
                                                               0: 'Pred False'}), nb
    
    
nb_model = Gaussian_NB(df=df_re[['V14','Amount','Time','Class']],
            predictor='Class')

GaussianNB Accuracy: 0.9327303182579565               precision    recall  f1-score   support

           0       0.89      0.98      0.94     29910
           1       0.98      0.88      0.93     29790

   micro avg       0.93      0.93      0.93     59700
   macro avg       0.94      0.93      0.93     59700
weighted avg       0.94      0.93      0.93     59700



In [67]:
nb_model[0]

Unnamed: 0,Pred False,Pred True
Actual False,29398,512
Actual True,3504,26286


In [77]:
nb_model[1].sigma_

array([[1.07278568e+00, 7.18346770e+04, 2.89581952e+08],
       [1.88797523e+01, 6.86743794e+04, 3.10211292e+08]])

In [103]:
def lr_classifier(df,
                 predictor, 
                 print_output=True):
    
    X = df.drop(labels=predictor,
                        axis=1)

    y = df[predictor]

    X_train, X_test, y_train, y_test = train_test_split(df_re.drop('Class', axis=1), df_re['Class'], test_size=0.3)

    Lr = LogisticRegression(penalty = 'l2',
                            dual=False, tol=0.0001,
                            fit_intercept=True, 
                            intercept_scaling=1,
                            C=20)
    # Fit training set
    Lr.fit(X_train, y_train)
    
    # Get prediciton on test set
    y_pred =  Lr.predict(X_test)
    
    if print_output:
        print('Test Accuracy:',
               Lr.score(X_test, y_test),
              '\n',
              'Train Accuracy:',
               Lr.score(X_train, y_train),
              '\n\nClassification Report:\n\n',
              classification_report(y_test, y_pred),
              '\n\nConfusion Matrix:',
              '\n\n',
              pd.DataFrame(confusion_matrix(y_pred, y_test)).rename(columns={1: 'Actual True',
                                                               0: 'Actual False'}).T\
                                                      .rename(columns={1: 'Pred True',
                                                               0: 'Pred False'}))
    
    return  Lr.score(X_test, y_test),  Lr

In [107]:
logistic_model = lr_classifier(df=df_re[['V4','V11','Class']],
                                      predictor='Class',
                                      print_output=True)[1]



Test Accuracy: 0.9561641541038526 
 Train Accuracy: 0.9556209619526203 

Classification Report:

               precision    recall  f1-score   support

           0       0.94      0.97      0.96     29906
           1       0.97      0.94      0.96     29794

   micro avg       0.96      0.96      0.96     59700
   macro avg       0.96      0.96      0.96     59700
weighted avg       0.96      0.96      0.96     59700
 

Confusion Matrix: 

               Pred False  Pred True
Actual False       29126        780
Actual True         1837      27957


In [105]:
X = df_re.drop('Class',
               axis=1)

sorted(zip(list(df_re.drop('Class', axis=1).columns), list(logistic_model.coef_[0])),key = lambda x: x[1], reverse=True)[:10]

[('V4', 0.4069363954671096),
 ('V11', 0.11691498441149403),
 ('V21', 0.07688276702908342),
 ('V15', 0.06656470802285723),
 ('V5', 0.05864548728097507),
 ('V22', 0.054557449323371164),
 ('V2', 0.04859998891136505),
 ('V18', 0.03581295072659923),
 ('V23', 0.03460472992541454),
 ('V7', 0.016187787896918236)]