In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2
import os
import sys

import csv
import datetime
import itertools
import numpy as np
import pandas as pd
import random
import re
import sklearn
import time
from collections import defaultdict

import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
# Notebook-wide plot defaults: white-grid style plus rc overrides for figure
# size and axes title size.
# NOTE(review): recent seaborn releases appear to ignore rc keys that are not
# part of the plotting context (e.g. "figure.figsize") — confirm against the
# installed version.
sns.set_style("whitegrid")
_context_overrides = {"figure.figsize": (16, 10), "axes.titlesize": 14}
sns.set_context(rc=_context_overrides)

from IPython.display import Image, display
from IPython.core.display import HTML
# A bare expression is rendered only when it is the last statement of a cell.
# This one sits mid-cell, so it has to be displayed explicitly for the CSS
# override (full-width notebook container) to take effect.
display(HTML("<style>.container { width:100% !important; }</style>"))

from os.path import expanduser
sys.path.insert(1, '{}/datsci'.format(expanduser('~')))
from datsci import eda, munge, ml
from datsci import kaggle as kg

In [32]:
# Training data at each preprocessing stage:
# raw -> de-duplicated -> one-hot encoded -> one-hot with NA handling.
FILE_TRAIN = 'data/train.csv'
FILE_TRAIN_DEDUP = 'data/train.dedup.csv'
FILE_TRAIN_DEDUP_ONEHOT = 'data/train.dedup.onehot.csv'
FILE_TRAIN_DEDUP_ONEHOT_NA = 'data/train.dedup.onehot.na.csv'

# Matching test data for each stage.
FILE_TEST = 'data/test.csv'
FILE_TEST_DEDUP = 'data/test.dedup.csv'
FILE_TEST_DEDUP_ONEHOT = 'data/test.dedup.onehot.csv'
FILE_TEST_DEDUP_ONEHOT_NA = 'data/test.dedup.onehot.na.csv'

# Sample submission file shipped with the data.
FILE_SAMPLE_SUBMIT = 'data/sample_submission.csv'

# Name of the label column in the training tables.
TARGET_COL = 'TARGET'

## 1. Raw Data

In [3]:
# train_test_split moved from sklearn.cross_validation (removed in
# scikit-learn 0.20) to sklearn.model_selection. Prefer the new location and
# fall back only on installs that predate it.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# Read the raw train/test tables, using the ID column as the index.
df = pd.read_csv(FILE_TRAIN, index_col='ID')
df_test = pd.read_csv(FILE_TEST, index_col='ID')

# Separate the predictors from the label column.
feature_cols = [col for col in df.columns if col != TARGET_COL]
X_all = df[feature_cols]  # feature values for every training row
y_all = df[TARGET_COL]    # label for every training row

# Hold out 30% of the rows, stratified on the label, with a fixed seed so the
# split is reproducible.
test_size = 0.3
X_train, X_test, y_train, y_test = train_test_split(
    X_all, y_all, test_size=test_size, random_state=0, stratify=y_all)

In [17]:
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier as KNC
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.ensemble import RandomForestRegressor as RFR
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier as ABC
from sklearn.ensemble import GradientBoostingClassifier as GBC
from sklearn.linear_model import SGDClassifier as SGDClf

from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import make_scorer


# # SGD with linear svm
# sgdclf_svm = SGDClf(loss='hinge', penalty='l2', alpha=0.0001, l1_ratio=0.15,
#                     n_iter=5, shuffle=True, n_jobs=1, random_state=0, learning_rate='optimal',
#                     power_t=0.5, class_weight=None, warm_start=False, average=False)

# # SGD with logistic regression
# sgdclf_logistic = SGDClf(loss='log', penalty='l2', alpha=0.0001, l1_ratio=0.15,
#                          n_iter=5, shuffle=True, n_jobs=1, random_state=0, learning_rate='optimal',
#                          power_t=0.5, class_weight=None, warm_start=False, average=False)

# descriptions_clfs = [
#     ("SGD linear svm", sgdclf_svm),
#     ("SGD logistic", sgdclf_logistic),
#     ("SVC Linear kernel", SVC(C=1.0, kernel='linear', gamma='auto')),
#     ("SVC polynomial deg 2 kernel", SVC(C=1.0, kernel='poly', degree=2, gamma='auto')),
#     ("SVC polynomial deg 3 kernel", SVC(C=1.0, kernel='poly', degree=3, gamma='auto')),
#     ("SVC rbf kernel", SVC(C=1.0, kernel='rbf', gamma='auto')),
#     ("KNeighbors, 3 neighbors", KNC(n_neighbors=3, weights='uniform')),
#     ("RFC, 10 trees", RFC(n_estimators=10, max_depth=None, min_samples_split=2, n_jobs=4)),
#     ("RFR, 60 trees", RFR(n_estimators=60, max_depth=None, min_samples_split=2, n_jobs=8)),
#     ("LogisticRegression", LogisticRegression(C=1.0, penalty='l2', random_state=0, multi_class='ovr', n_jobs=4)),
#     ("GradientBoostingClassifier", GBC(loss='deviance', learning_rate=0.1, n_estimators=10, max_depth=None, min_samples_split=2)),
#     ("AdaBoostClassifier w SVC linear kernel", ABC(SVC(C=1.0, kernel='linear', gamma='auto'), n_estimators=10, learning_rate=1.0, algorithm='SAMME'))
# ]

In [22]:
# Baseline on the unprocessed features: a single gradient-boosting classifier,
# scored with ROC AUC through the project's train/predict helper.
# NOTE(review): if ml.train_predict scores hard class labels from .predict(),
# ROC AUC will be understated — confirm it passes probabilities/scores.
descriptions_clfs = [
    (
        "GradientBoostingClassifier",
        GBC(
            loss='deviance',
            learning_rate=0.1,
            n_estimators=10,
            max_depth=None,
            min_samples_split=2,
        ),
    ),
]
no_processing_prelim_results = ml.train_predict(
    descriptions_clfs,
    X_train, y_train,
    X_test, y_test,
    scorer=roc_auc_score,
)

eda.pprint(no_processing_prelim_results)

+---+----------------------------+----------------+----------------+---------------+--------------------+-------------------+
|   |        description         |  score_train   |   score_test   |   time_train  | time_predict_train | time_predict_test |
+---+----------------------------+----------------+----------------+---------------+--------------------+-------------------+
| 0 | GradientBoostingClassifier | 0.946571413377 | 0.535602898923 | 150.652132988 |   0.273437023163   |   0.110949993134  |
+---+----------------------------+----------------+----------------+---------------+--------------------+-------------------+


|    description     | score_train | score_test |   time_train  | time_predict_train | time_predict_test |
|--------------------|-------------|------------|---------------|--------------------|-------------------|
| LogisticRegression |     0.5     |  0.5       | 3.48322510719 |   0.116809129715   |  0.0402228832245  |
| SGD linear svm     |     0.0     |  0.002183  | 0.378594      |   0.103345         |  0.047913         |
| SGD logistic       |     0.0     |  0.002193  | 0.369095      |   0.108437         |  0.028139         |
| RFC, 10  trees     | 0.827815    |  0.083406  | 0.790865      |   0.24956          |  0.157566         |
| RFC, 10  trees     | 0.859718579 | 0.51823037 | 0.60007190704 |   0.27322602272    |  0.155475139618   |
| RFC, 50  trees     | 0.852189720 | 0.51933902 | 0.59852290153 |   0.24054980278    |  0.164301872253   |
| RFC, 100 trees     | 0.855910125 | 0.51557289 | 0.58913993835 |   0.247822999954   |  0.163942098618   |
| RFR, 10  trees     | 0.995971354 | 0.69631041 | 3.9711160659  |   0.294628143311   |  0.182307958603   |
| RFR, 30  trees     | 0.996592169 | 0.74093666 | 9.39642286301 |   0.290091991425   |  0.176054954529   |
| RFR, 40  trees     | 0.996521443 | 0.74854986 | 12.3622310162 |   0.289952039719   |  0.172476053238   |
| RFR, 50  trees     | 0.996076029 | 0.70951576 | 4.8906700611  |   0.255952119827   |  0.165596961975   |
| RFR, 60  trees     | 0.997072592 | 0.75873464 | 16.9496002197 |   0.258872032166   |  0.16659617424    |
| RFR, 100 trees     | 0.996139285 | 0.70160749 | 4.0920510292  |   0.255863189697   |  0.167984962463   |
| GradientBoostClf   | 0.946571413 | 0.53560289 | 150.652132988 |   0.273437023163   |  0.110949993134   |

In [66]:
%%time

# Grid search over forest size and tree depth for a random-forest regressor.
# Only the non-default settings are passed explicitly. Spelling out defaults
# such as criterion='mse' and max_features='auto' fails on newer scikit-learn
# releases, where those spellings were renamed or removed. The library
# defaults still mean squared-error splits over all features, so the model
# configuration is unchanged.
clf_rf = RFR(n_estimators=10, n_jobs=2, random_state=0)

param_grid = {
    'n_estimators': [670, 690],
    'max_depth': [9],
}
# This section hands plain numpy arrays (.values) to the tuner.
best_score, best_model = ml.fine_tune_params(clf_rf,
                                             X_train.values, y_train.values,
                                             X_test.values, y_test.values,
                                             param_grid,
                                             n_runs=1,
                                             n_cv=3,
                                             scorer=roc_auc_score,
                                             n_jobs=4)

iteration 0


descriptor assignment is deprecated. To maintain
the Fortran contiguity of a multidimensional Fortran
array, use 'a.T.view(...).T' instead
  obj_bytes_view = obj.view(self.np.uint8)
descriptor assignment is deprecated. To maintain
the Fortran contiguity of a multidimensional Fortran
array, use 'a.T.view(...).T' instead
  obj_bytes_view = obj.view(self.np.uint8)
descriptor assignment is deprecated. To maintain
the Fortran contiguity of a multidimensional Fortran
array, use 'a.T.view(...).T' instead
  obj_bytes_view = obj.view(self.np.uint8)
descriptor assignment is deprecated. To maintain
the Fortran contiguity of a multidimensional Fortran
array, use 'a.T.view(...).T' instead
  obj_bytes_view = obj.view(self.np.uint8)
descriptor assignment is deprecated. To maintain
the Fortran contiguity of a multidimensional Fortran
array, use 'a.T.view(...).T' instead
  obj_bytes_view = obj.view(self.np.uint8)
descriptor assignment is deprecated. To maintain
the Fortran contiguity of a multidimensio

Each iteration time(secs): 1104.896
CPU times: user 7min 22s, sys: 1.62 s, total: 7min 23s
Wall time: 18min 24s


In [67]:
# Summary of the tuned model, in this order:
# (ROC AUC on the training split, best score returned by the tuner,
#  selected n_estimators, selected max_depth).
(
    roc_auc_score(y_train, best_model.predict(X_train)),
    best_score,
    best_model.n_estimators,
    best_model.max_depth,
)

(0.89544301238220747, 0.8346291833431333, 670, 9)

n_estimators | max_depth | train score | test score |
-------------|-----------|-------------|------------|
500          | 8         | 0.88057875  | 0.83468680 |
600          | 9         | 0.89524552  | 0.83467362 |
650          | 9         | 0.89546629  | 0.83467145 |
670          | 9         | 0.89544301  | 0.83462918 |

In [68]:
# Predict on the competition test table with the tuned model and pass the
# predictions, plus the output path, to the project's submission helper.
kg.save_submission(
    best_model.predict(df_test),
    'submissions/unprocessed.rfr.csv',
)

## 2. Dedup data

In [111]:
# train_test_split moved from sklearn.cross_validation (removed in
# scikit-learn 0.20) to sklearn.model_selection. Prefer the new location and
# fall back only on installs that predate it.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# Read the de-duplicated train/test tables.
# NOTE(review): the training file is read without index_col='ID' while the
# test file keeps it — confirm the dedup training CSV has no ID column,
# otherwise ID would be treated as a feature that the test table lacks.
df = pd.read_csv(FILE_TRAIN_DEDUP)
df_test = pd.read_csv(FILE_TEST_DEDUP, index_col='ID')

# Separate the predictors from the label column.
feature_cols = [col for col in df.columns if col != TARGET_COL]
X_all = df[feature_cols]  # feature values for every training row
y_all = df[TARGET_COL]    # label for every training row

# Hold out 30% of the rows, stratified on the label, with a fixed seed so the
# split is reproducible.
test_size = 0.3
X_train, X_test, y_train, y_test = train_test_split(
    X_all, y_all, test_size=test_size, random_state=0, stratify=y_all)

In [112]:
%%time

# Grid search over forest size and tree depth for a random-forest regressor
# on the de-duplicated data.
# Only the non-default settings are passed explicitly. Spelling out defaults
# such as criterion='mse' and max_features='auto' fails on newer scikit-learn
# releases, where those spellings were renamed or removed. The library
# defaults still mean squared-error splits over all features, so the model
# configuration is unchanged.
clf_rf = RFR(n_estimators=10, n_jobs=2, random_state=0)

param_grid = {
    'n_estimators': [100, 300, 500, 700],
    'max_depth': [8, 9, 10, 11, 12],
}
best_score, best_model = ml.fine_tune_params(clf_rf,
                                             X_train, y_train,
                                             X_test, y_test,
                                             param_grid,
                                             n_runs=2,
                                             n_cv=3,
                                             scorer=roc_auc_score,
                                             n_jobs=4)

iteration 0
Each iteration time(secs): 2523.339
iteration 1
Each iteration time(secs): 2475.204
CPU times: user 13min 21s, sys: 2.18 s, total: 13min 23s
Wall time: 1h 23min 18s


In [113]:
# Summary of the tuned model, in this order:
# (selected n_estimators, selected max_depth,
#  ROC AUC on the training split, best score returned by the tuner).
(
    best_model.n_estimators,
    best_model.max_depth,
    roc_auc_score(y_train, best_model.predict(X_train)),
    best_score,
)

(700, 9, 0.89549650488440458, 0.83470427610605369)

n_estimators | max_depth | train score | test score | gridcv params
-------------|-----------|-------------|------------|-------------------------
700          | 9         | 0.895496504 | 0.83470427 | [100, 300, 500, 700] [8, 9, 10, 11, 12]

In [117]:
# Predict on the de-duplicated test table with the tuned model and pass the
# predictions, plus the output path, to the project's submission helper.
kg.save_submission(
    best_model.predict(df_test),
    'submissions/dedup.rfr.csv',
)

## 3. Binary One-hot data

### Random Forest

In [118]:
# train_test_split moved from sklearn.cross_validation (removed in
# scikit-learn 0.20) to sklearn.model_selection. Prefer the new location and
# fall back only on installs that predate it.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# Read the de-duplicated, one-hot encoded train/test tables.
# NOTE(review): the training file is read without index_col='ID' while the
# test file keeps it — confirm the one-hot training CSV has no ID column,
# otherwise ID would be treated as a feature that the test table lacks.
df = pd.read_csv(FILE_TRAIN_DEDUP_ONEHOT)
df_test = pd.read_csv(FILE_TEST_DEDUP_ONEHOT, index_col='ID')

# Separate the predictors from the label column.
feature_cols = [col for col in df.columns if col != TARGET_COL]
X_all = df[feature_cols]  # feature values for every training row
y_all = df[TARGET_COL]    # label for every training row

# Hold out 30% of the rows, stratified on the label, with a fixed seed so the
# split is reproducible.
test_size = 0.3
X_train, X_test, y_train, y_test = train_test_split(
    X_all, y_all, test_size=test_size, random_state=0, stratify=y_all)

In [122]:
%%time

# Grid search over forest size and tree depth for a random-forest regressor
# on the one-hot encoded data.
# Only the non-default settings are passed explicitly. Spelling out defaults
# such as criterion='mse' and max_features='auto' fails on newer scikit-learn
# releases, where those spellings were renamed or removed. The library
# defaults still mean squared-error splits over all features, so the model
# configuration is unchanged.
clf_rf = RFR(n_estimators=10, n_jobs=2, random_state=0)

param_grid = {
    'n_estimators': [198, 200, 202],
    'max_depth': [7, 8, 9],
}
best_score, best_model = ml.fine_tune_params(clf_rf,
                                             X_train, y_train,
                                             X_test, y_test,
                                             param_grid,
                                             n_runs=2,
                                             n_cv=3,
                                             scorer=roc_auc_score,
                                             n_jobs=4)

iteration 0
Each iteration time(secs): 565.568
iteration 1
Each iteration time(secs): 558.129
CPU times: user 4min 27s, sys: 1.27 s, total: 4min 28s
Wall time: 18min 43s


In [123]:
# Summary of the tuned model, in this order:
# (selected n_estimators, selected max_depth,
#  ROC AUC on the training split, best score returned by the tuner).
(
    best_model.n_estimators,
    best_model.max_depth,
    roc_auc_score(y_train, best_model.predict(X_train)),
    best_score,
)

(200, 9, 0.89489814266356504, 0.83431332657190871)

n_estimators | max_depth | train score | test score | gridcv params
-------------|-----------|-------------|------------|-------------------------
670          | 9         | 0.89544301  | 0.8346291  | 
600          | 10        | 0.90956541  | 0.8348116  | 
620          | 8         | 0.88074159  | 0.8347549  | 
640          | 8         | 0.88078282  | 0.8347805  | 
637          | 8         | 0.88076901  | 0.8347447  | 
630          | 8         | 0.88076376  | 0.8347627  | 
615          | 9         | 0.89537483  | 0.8346972  | 
200          | 10        | 0.90902188  | 0.8347227  | [100, 200, 400, 600, 800], [5, 10, 15]
200          | 9         | 0.89489814  | 0.8343133  | [190, 200, 210], [9, 10, 11]

In [124]:
# Predict on the one-hot test table with the tuned random forest and pass the
# predictions, plus the output path, to the project's submission helper.
kg.save_submission(
    best_model.predict(df_test),
    'submissions/onehot.rfr.csv',
)

### XGBoost

In [None]:
%%time

import xgboost as xgb

# Grid search for a gradient-boosted tree model on the one-hot data.
# The base estimator is an XGBRegressor configured with the 'binary:logistic'
# objective; n_estimators, max_depth and learning_rate are listed in the grid
# below.
gbm = xgb.XGBRegressor(
    max_depth=3,
    n_estimators=300,
    learning_rate=0.05,
    nthread=4,
    objective='binary:logistic',
)

param_grid = {
    'n_estimators': [90, 100, 110],
    'max_depth': [4, 5, 6],
    'learning_rate': [0.01, 0.05, 0.1],
}
best_score, best_model = ml.fine_tune_params(
    gbm,
    X_train, y_train,
    X_test, y_test,
    param_grid,
    n_runs=2,
    n_cv=3,
    scorer=roc_auc_score,
    n_jobs=1,
)

iteration 0


In [None]:
# Summary of the tuned model, in this order:
# (selected n_estimators, selected max_depth, selected learning_rate,
#  ROC AUC on the training split, best score returned by the tuner).
(
    best_model.n_estimators,
    best_model.max_depth,
    best_model.learning_rate,
    roc_auc_score(y_train, best_model.predict(X_train)),
    best_score,
)

n_estimators | max_depth | learning_rate | train score | test score | gridcv params
-------------|-----------|---------------|-------------|------------|-------------------------
100          | 5         | 0.05          | 0.868801790 | 0.83883443 | [5, 10, 20, 50, 100], [3, 5, 10, 20]
 90          | 5         | 0.05          | 0.868166755 | 0.83869731 | [80, 88, 90, 92, 100, 150, 200], [4, 5, 6, 7], [0.01, 0.05, 0.1]
 92          | 5         | 0.1           | 0.887339605 | 0.84173848 | [88, 90, 92], [5], [0.01, 0.05, 0.1]

In [None]:
# Predict on the one-hot test table with the tuned XGBoost model and pass the
# predictions, plus the output path, to the project's submission helper.
kg.save_submission(
    best_model.predict(df_test),
    'submissions/onehot.xgb.csv',
)