In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')
%matplotlib inline

## Load In Datasets

In [2]:
y = np.load('Data\TARGET.npy')

In [3]:
X = np.load('Data\ALL_DATA_ML.npy')

## Train Test Split

In [4]:
from sklearn.model_selection import train_test_split

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=.9,
                                                    random_state=42)

In [6]:
del X
del y
#del y_test
#del X_test

In [9]:
counts = np.unique(y_train, return_counts=True)[1]
tot = counts[0] + counts[1]
prop0 = counts[0] / tot
prop1 = counts[1] / tot
prop0, prop1

(0.9299271322138011, 0.07007286778619896)

## Ensemble Models

### Random Forest

**Base Model**

In [45]:
from sklearn.ensemble import RandomForestClassifier

In [46]:
forest_clf = RandomForestClassifier(random_state=42, class_weight={0: prop0, 1: prop1}, n_jobs=-1)

In [47]:
X_train.shape, y_train.shape

((1320611, 16), (1320611,))

In [48]:
forest_clf.fit(X_train, y_train.ravel())

RandomForestClassifier(bootstrap=True,
            class_weight={0: 0.9299271322138011, 1: 0.07007286778619896},
            criterion='gini', max_depth=None, max_features='auto',
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=-1, oob_score=False, random_state=42,
            verbose=0, warm_start=False)

In [49]:
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

def print_score(clf, X_train, y_train, X_test, y_test, train=True):
    '''
    Print the accuracy score, classification report, and confusion matrix of classifier.
    '''
    if train:
        '''
        Training Performance
        '''
        print("Train Result:\n")
        print("Accuracy Score: {0:.4f}\n".format(accuracy_score(y_train, clf.predict(X_train))))
        print("Classification Report:\n {} \n".format(classification_report(y_train, clf.predict(X_train))))
        print("Confusion Matrix:  \n {} \n".format(confusion_matrix(y_train, clf.predict(X_train))))
        
        res = cross_val_score(clf, X_train, y_train, cv=5, scoring='accuracy', n_jobs=-1)
        print("Average Accuracy: \t {0:.4f}".format(np.mean(res)))
        print("Accuracy SD \t\t {0:.4f}".format(np.std(res)))
        
    elif train==False:
        '''
        Test Performance
        '''
        print("Test Result:\n")
        print("Accuracy Score: {0:.4f}\n".format(accuracy_score(y_test, clf.predict(X_test))))
        print("Classification Report:\n {} \n".format(classification_report(y_test, clf.predict(X_test))))
        print("Confusion Matrix:  \n {} \n".format(confusion_matrix(y_test, clf.predict(X_test))))

In [50]:
#print_score(forest_clf, X_train, y_train.ravel(), X_test, y_test.ravel(), train=True)

NameError: name 'X_test' is not defined

In [30]:
print_score(forest_clf, X_train, y_train.ravel(), X_test, y_test.ravel(), train=False)

Test Result:

Accuracy Score: 0.9264

Classification Report:
              precision    recall  f1-score   support

          0       0.93      1.00      0.96   2455953
          1       0.18      0.01      0.03    185270

avg / total       0.88      0.93      0.90   2641223
 

Confusion Matrix:  
 [[2444323   11630]
 [ 182665    2605]] 



**Grid Search**

Memory Issues

In [51]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

In [52]:
params = {'bootstrap': [True, False],
          'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
          'max_features': ['auto', 'sqrt'],
          'min_samples_leaf': [1, 2, 4],
          'min_samples_split': [2, 5, 10],
          'n_estimators': [10, 100, 250]}

In [53]:
fgrid_clf = RandomizedSearchCV(forest_clf, params, n_iter = 200, cv=3, verbose=2, random_state=42, n_jobs=-1)

#too slow on this computer
#fgrid_clf = GridSearchCV(forest_clf, params, cv=3, n_jobs=-1, verbose=1)

In [None]:
fgrid_clf.fit(X_train, y_train)

Fitting 3 folds for each of 200 candidates, totalling 600 fits


In [None]:
fgrid_pred = fgrid_clf.predict(X_test)

In [None]:
#print_score(forest_clf, X_train, y_train.ravel(), X_test, y_test.ravel(), train=True)

In [None]:
print_score(forest_clf, X_train, y_train.ravel(), X_test, y_test.ravel(), train=False)

**SVM**

Takes forever

In [7]:
from sklearn.svm import SVC

In [10]:
svc_clf = SVC(class_weight={0: prop0, 1: prop1}, random_state=42)

In [None]:
svc_clf.fit(X_train, y_train)

In [None]:
svc_pred = svc_clf.predict(X_test)

In [None]:
print_score(forest_clf, X_train, y_train.ravel(), X_test, y_test.ravel(), train=False)

Add svc grid search for kernel, gamma, C

**KNN**

Takes forever

In [8]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
knn_clf = KNeighborsClassifier(n_neighbors=5)
knn_clf.fit(X_train, y_train)

In [None]:
knn_pred = knn_clf.predict(X_test)

In [None]:
print_score(forest_clf, X_train, y_train.ravel(), X_test, y_test.ravel(), train=False)

Add knn gridsearch for neighbors, etc

**LOGIT**

All bad, don't use

In [8]:
from sklearn.linear_model import LogisticRegression

In [18]:
lr_clf = LogisticRegression(class_weight={0: prop0, 1: prop1}, random_state=42)

In [10]:
lr_clf.fit(X_train, y_train)

LogisticRegression(C=1.0,
          class_weight={0: 0.92996045770151, 1: 0.07003954229848994},
          dual=False, fit_intercept=True, intercept_scaling=1,
          max_iter=100, multi_class='ovr', n_jobs=1, penalty='l2',
          random_state=None, solver='liblinear', tol=0.0001, verbose=0,
          warm_start=False)

In [11]:
y_pred = lr_clf.predict(X_test)

In [15]:
print_score(lr_clf, X_train, y_train, X_test, y_test, train=True)

Train Result:

Accuracy Score: 0.9300



  'precision', 'predicted', average, warn_for)


Classification Report:
              precision    recall  f1-score   support

          0       0.93      1.00      0.96   8596813
          1       0.00      0.00      0.00    647465

avg / total       0.86      0.93      0.90   9244278
 

Confusion Matrix:  
 [[8596813       0]
 [ 647465       0]] 



MemoryError: 

Bad

In [17]:
from sklearn.model_selection import GridSearchCV
params = {'C': [.001, .01, .1, 10, 100],
          'max_iter': [50, 500, 1000]}

In [20]:
lr_grid = GridSearchCV(lr_clf, params, cv=3, verbose=3, n_jobs=-1)

In [21]:
lr_grid.fit(X_train, y_train)

Fitting 3 folds for each of 15 candidates, totalling 45 fits


[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed: 32.8min
[Parallel(n_jobs=-1)]: Done  45 out of  45 | elapsed: 82.0min finished


GridSearchCV(cv=3, error_score='raise',
       estimator=LogisticRegression(C=1.0,
          class_weight={0: 0.92996045770151, 1: 0.07003954229848994},
          dual=False, fit_intercept=True, intercept_scaling=1,
          max_iter=100, multi_class='ovr', n_jobs=1, penalty='l2',
          random_state=42, solver='liblinear', tol=0.0001, verbose=0,
          warm_start=False),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'C': [0.001, 0.01, 0.1, 10, 100], 'max_iter': [50, 500, 1000]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=3)

In [22]:
lr_grid.best_estimator_

LogisticRegression(C=0.001,
          class_weight={0: 0.92996045770151, 1: 0.07003954229848994},
          dual=False, fit_intercept=True, intercept_scaling=1,
          max_iter=500, multi_class='ovr', n_jobs=1, penalty='l2',
          random_state=42, solver='liblinear', tol=0.0001, verbose=0,
          warm_start=False)

In [23]:
lr_grid_pred = lr_grid.predict(X_test)

In [26]:
print_score(lr_grid, X_train, y_train, X_test, y_test, train=False)

Test Result:

Accuracy Score: 0.9298



  'precision', 'predicted', average, warn_for)


Classification Report:
              precision    recall  f1-score   support

          0       0.93      1.00      0.96   3683760
          1       0.00      0.00      0.00    278074

avg / total       0.86      0.93      0.90   3961834
 

Confusion Matrix:  
 [[3683760       0]
 [ 278074       0]] 



Still bad...

**Adaboost Models**

In [27]:
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier

In [29]:
ada_clf = AdaBoostClassifier(random_state=42)

In [30]:
ada_clf.fit(X_train, y_train)

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=50, random_state=42)

In [31]:
ada_predict = ada_clf.predict(X_test)

In [32]:
print_score(ada_clf, X_train, y_train, X_test, y_test, train=False)

Test Result:

Accuracy Score: 0.9298

Classification Report:
              precision    recall  f1-score   support

          0       0.93      1.00      0.96   3683760
          1       0.25      0.00      0.00    278074

avg / total       0.88      0.93      0.90   3961834
 

Confusion Matrix:  
 [[3683754       6]
 [ 278072       2]] 



Bad.. Probably needs up sampling... try later w/ upsampling w/ link in slack

**Adaboost with forest base estimator**

Takes forever

In [None]:
adarf_clf = AdaBoostClassifier(base_estimator=forest_clf, n_estimators=200, random_state=42)

In [37]:
adarf_clf.fit(X_train, y_train)

In [None]:
adarf_clf.predict(X_test)

In [None]:
print_score(X_train, X_test, y_train, y_test, train=False)

**Ensemble of ensembles**

Would need to set htis up, then translate predict_proba's to the 1's outputs... if prediction == 1, then add proba using average from only models that voted for 1