## Build Model Library

Compiles a library of 587 candidate models in total, varying both the classifier type and its hyperparameters.

In [5]:
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier

from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.model_selection import ParameterGrid
from sklearn.pipeline import Pipeline

In [75]:
def build_models(model, param_grid):
    """Instantiate one unfitted estimator per combination in a parameter grid.

    Parameters
    ----------
    model : class
        Estimator class (not an instance); it is called once per parameter
        combination with that combination as keyword arguments.
    param_grid : dict or list of dict
        Grid specification, passed unchanged to ``ParameterGrid``.

    Returns
    -------
    list
        One ``model(**params)`` instance for every combination, in the order
        ``ParameterGrid`` yields them.
    """
    # Read the class name directly instead of slicing it out of str(model),
    # which depended on the exact "<class 'pkg.Name'>" repr layout. Fall back
    # to str(model) for callables that carry no __name__.
    print('Building %s models' % getattr(model, '__name__', str(model)))

    return [model(**params) for params in ParameterGrid(param_grid)]

In [76]:
def xgb_models():
    """Build the XGBoost candidates.

    Crosses 2 tree counts, 4 depth limits, 3 column-sampling rates and
    3 row-sampling rates (72 combinations), all using the
    ``binary:logistic`` objective, and returns the resulting list from
    ``build_models``.
    """
    grid = dict(
        objective=['binary:logistic'],
        n_estimators=[100, 200],
        max_depth=[3, 5, 7, 10],
        subsample=[0.5, 0.8, 0.9],
        colsample_bytree=[0.5, 0.8, 0.9],
    )
    return build_models(XGBClassifier, grid)

In [77]:
def rf_models():
    """Build the random-forest candidates.

    Crosses 3 forest sizes, 2 split criteria, 4 ``max_features`` settings and
    5 depth limits (120 combinations) and returns the resulting list from
    ``build_models``.
    """
    # NOTE(review): in scikit-learn releases that accept 'auto', it appears to
    # behave like 'sqrt' for classifiers, so those two settings may yield
    # equivalent forests -- confirm against the installed version.
    grid = dict(
        max_depth=[3, 5, 10, 20, 25],
        max_features=[None, 'auto', 'sqrt', 'log2'],
        criterion=['gini', 'entropy'],
        n_estimators=[20, 50, 100],
    )
    return build_models(RandomForestClassifier, grid)

In [78]:
def linsvm_models():
    """Build the linear-SVM candidates.

    Crosses 10 log-spaced values of ``C`` (1e-7 through 1e2) with the two
    supported loss functions (20 combinations) and returns the resulting list
    from ``build_models``.
    """
    grid = {
        'loss': ['hinge', 'squared_hinge'],
        'C': np.logspace(-7, 2, 10),
    }
    return build_models(LinearSVC, grid)

In [79]:
def rbfsvm_models():
    """Build the RBF-kernel SVM candidates, each behind a feature filter.

    Every model is a two-step ``Pipeline``: a ``SelectKBest`` filter keeping
    the 50 highest-scoring features under ``f_classif``, followed by an
    ``SVC``. The SVC grid crosses 9 log-spaced values of ``C`` with 9 values
    of ``gamma`` spaced on a base-2 log scale (81 combinations).

    Returns
    -------
    list of Pipeline
        One unfitted pipeline per parameter combination, in ``ParameterGrid``
        order.
    """
    param_grid = {
        'kernel': ['rbf'],
        'C': np.logspace(-7, 0, 9),
        'gamma': np.logspace(-6, 2, 9, base=2),
    }

    # Build a fresh SelectKBest for every pipeline. Pipeline fits the step
    # objects it is given, so a single shared instance would be refit (and its
    # state overwritten) by each pipeline that is trained.
    return [
        Pipeline([
            ('filter', SelectKBest(f_classif, k=50)),
            ('svc', SVC(**params)),
        ])
        for params in ParameterGrid(param_grid)
    ]

In [80]:
def dt_models():
    """Build the decision-tree candidates.

    Crosses 2 split criteria, 4 ``max_features`` settings, 5 depth limits and
    3 random seeds (120 combinations) and returns the resulting list from
    ``build_models``.
    """
    param_grid = {
        'criterion': ['gini', 'entropy'],
        'max_features': [None, 'auto', 'sqrt', 'log2'],
        'max_depth': [None, 1, 2, 5, 10],
        # Draw the three seeds from [0, 100) without replacement. Sampling
        # with replacement could repeat a seed and put identical trees in the
        # library. The draw itself is still unseeded, so the seeds differ
        # from one kernel session to the next.
        'random_state': np.random.choice(100, size=3, replace=False),
    }

    return build_models(DecisionTreeClassifier, param_grid)

In [81]:
def log_models():
    """Build the logistic-regression candidates.

    Uses 5 log-spaced values of ``C`` between 1e-4 and 1e4 and returns the
    resulting list from ``build_models``.
    """
    grid = dict(C=np.logspace(-4, 4, 5))
    return build_models(LogisticRegression, grid)

In [82]:
def knn_models():
    """Build the k-nearest-neighbour candidates.

    Uses 25 integer neighbourhood sizes spread evenly between 1 and 300 and
    returns the resulting list from ``build_models``.
    """
    neighbour_counts = np.linspace(1, 300, num=25, dtype='int')
    return build_models(KNeighborsClassifier, {'n_neighbors': neighbour_counts})

In [83]:
def sgd_models():
    """Build the SGD linear-classifier candidates.

    Crosses 2 losses, 4 regularisation strengths, 2 learning-rate schedules,
    3 elastic-net mixing ratios and 3 initial learning rates (144
    combinations), all with the elastic-net penalty, and returns the resulting
    list from ``build_models``.
    """
    # NOTE(review): per the SGDClassifier docs, eta0 is not used by the
    # 'optimal' schedule, so the three eta0 values may only matter for
    # 'constant' -- confirm before pruning the grid.
    grid = dict(
        penalty=['elasticnet'],
        loss=['log', 'modified_huber'],
        learning_rate=['constant', 'optimal'],
        eta0=[0.001, 0.01, 0.1],
        alpha=[0.0001, 0.001, 0.01, 0.1],
        l1_ratio=np.linspace(0.0, 1.0, 3),
    )
    return build_models(SGDClassifier, grid)

In [84]:
# Model library keyed by a short family name; each value is the list of
# unfitted estimators returned by that family's builder. The keys become the
# filename prefixes of the saved models (e.g. 'xgb_0').
# (The former `models_dict = {}` initialisation was dead code: it was
# overwritten by this assignment immediately.)
models_dict = {
    'xgb': xgb_models(),
    'rf': rf_models(),
    'linsvm': linsvm_models(),
    'rbfsvm': rbfsvm_models(),
    'dt': dt_models(),
    'log': log_models(),
    'knn': knn_models(),
    'sgd': sgd_models()
}

Building XGBClassifier models
Building RandomForestClassifier models
Building LinearSVC models
Building DecisionTreeClassifier models
Building LogisticRegression models
Building KNeighborsClassifier models
Building SGDClassifier models


## Build Ensemble Classifier

In [6]:
from data_utils import load_train, load_test, write_test

In [7]:
# Load the 2008 training CSV into a feature matrix X and a label vector y.
# NOTE(review): the meaning of the positional `False` flag is defined in
# data_utils.load_train, which is not shown here -- confirm before changing it.
X, y = load_train('data/train_2008.csv', False)

### Train all models on a training set

All trained models are saved to a directory, so the training step only needs to be run once.

In [13]:
from sklearn.model_selection import train_test_split
from sklearn.externals import joblib

In [14]:
# Hold out 10% of the rows for validation. The seed is fixed so that every
# kernel session produces the same split; without it, a later session would
# draw different validation rows, some of which the saved models were trained
# on. Models pickled before the seed was fixed were fit on a different split
# and should be retrained before X_val is used to score them.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.1, random_state=0)

In [15]:
# Sanity check: dimensions of the training features and training labels.
X_train.shape, y_train.shape

((58200, 366), (58200,))

In [86]:
# Flatten the library into a single list of (name, estimator) pairs, where
# name is '<family>_<position within that family>', e.g. 'xgb_0'.
models = []
for family, estimators in models_dict.items():
    models.extend(
        ('%s_%d' % (family, position), estimator)
        for position, estimator in enumerate(estimators)
    )

In [93]:
# Total number of (name, estimator) pairs in the flattened library.
len(models)

587

In [91]:
# Name of the entry at index 371, the position from which training resumes.
models[371][0]

'xgb_0'

Originally attempted a parallelized approach (see ensemble.py), but my computer couldn't take it

In [72]:
import os

Training starts at index 371 because an earlier run had already trained and saved the models before that index when the process was interrupted midway.

In [92]:
SAVE_DIR = 'models/ensemble_models'

# Resume offset: entries before this index were already fitted and saved by an
# earlier, interrupted run. Set to 0 to train the whole library.
# NOTE(review): the offset is only valid for the list order that produced it;
# check models[START_INDEX][0] before resuming.
START_INDEX = 371

# Make sure the output directory exists before the first (slow) fit, so the
# run cannot fail at save time because of a missing folder.
if not os.path.isdir(SAVE_DIR):
    os.makedirs(SAVE_DIR)

for name, estimator in models[START_INDEX:]:
    estimator.fit(X_train, y_train)

    # Progress log: one entry per fitted model, matching the recorded output.
    print('%s - %s\n' % (name, estimator))
    joblib.dump(estimator, os.path.join(SAVE_DIR, '%s.pkl' % name))

xgb_0 - XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.5,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=0.5)

xgb_1 - XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.5,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=0.8)

xgb_2 - XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.5,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, se

### Train Ensemble Classifier
Used log probabilities as the metric for scoring the ensemble. Everything was later encapsulated in an `EnsembleClassifier` class (see ensemble.py).

In [10]:
from sklearn.metrics import accuracy_score

Load all the trained models from directory

In [11]:
import glob
# Collect the saved model files. glob returns paths in an arbitrary,
# filesystem-dependent order, so sort them to give the loaded model list a
# stable order across runs and machines.
list_m = sorted(glob.glob("models/ensemble_models/*.pkl"))

In [16]:
# Deserialize every saved estimator, preserving the order of list_m.
# These files are pickles: only load files produced by this notebook.
models = []
for pkl_path in list_m:
    models.append(joblib.load(pkl_path))

In [17]:
# Number of estimators actually loaded from disk.
# NOTE(review): the recorded count (578) is lower than the 587 models built
# earlier, so some pickles appear to be missing -- confirm which ones.
len(models)

578