## Build Model Library

Compiles a library of 587 models in total, varying the classifier type and its hyperparameters.

In [5]:
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier

from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.model_selection import ParameterGrid
from sklearn.pipeline import Pipeline

In [75]:
def build_models(model, param_grid):
    """Instantiate one unfitted estimator per combination in ``param_grid``.

    Parameters
    ----------
    model : callable
        Estimator class (not an instance); it is called once per parameter
        combination with that combination as keyword arguments.
    param_grid : dict
        Mapping of parameter name to the sequence of values to try, expanded
        with ``ParameterGrid``.

    Returns
    -------
    list
        ``model(**params)`` for every point of the expanded grid.
    """
    # Ask the class for its name instead of slicing ``str(model)``; the slice
    # only works when the repr looks like "<class 'pkg.module.Name'>".
    print('Building %s models' % getattr(model, '__name__', str(model)))

    return [model(**params) for params in ParameterGrid(param_grid)]

In [76]:
def xgb_models():
    """Build the XGBoost part of the model library.

    Returns whatever ``build_models`` produces for ``XGBClassifier`` over a
    grid of tree counts, tree depths and column/row subsampling rates.
    """
    sampling_rates = [0.5, 0.8, 0.9]
    grid = dict(
        n_estimators=[100, 200],
        objective=['binary:logistic'],
        max_depth=[3, 5, 7, 10],
        colsample_bytree=list(sampling_rates),
        subsample=list(sampling_rates),
    )
    return build_models(XGBClassifier, grid)

In [77]:
def rf_models():
    """Build the random-forest part of the model library.

    Returns whatever ``build_models`` produces for ``RandomForestClassifier``
    over forest size, split criterion, feature-subset rule and tree depth.
    """
    # NOTE(review): scikit-learn documents 'auto' as equivalent to 'sqrt' for
    # classifiers, so those grid points may configure the same model -- confirm
    # against the installed version before pruning.
    grid = dict(
        n_estimators=[20, 50, 100],
        criterion=['gini', 'entropy'],
        max_features=[None, 'auto', 'sqrt', 'log2'],
        max_depth=[3, 5, 10, 20, 25],
    )
    return build_models(RandomForestClassifier, grid)

In [78]:
def linsvm_models():
    """Build the linear-SVM part of the model library.

    Returns whatever ``build_models`` produces for ``LinearSVC`` over ten
    log-spaced values of ``C`` (1e-7 to 1e2) and both hinge-type losses.
    """
    grid = {
        'loss': ['hinge', 'squared_hinge'],
        'C': np.logspace(-7, 2, 10),
    }
    return build_models(LinearSVC, grid)

In [79]:
def rbfsvm_models():
    """Build the RBF-kernel SVM part of the model library.

    Every model is a two-step ``Pipeline``: a ``SelectKBest`` filter keeping
    the 50 best features by ``f_classif``, followed by an ``SVC`` configured
    from one point of the (C, gamma) grid.

    Returns
    -------
    list
        One unfitted ``Pipeline`` per grid point.
    """
    # NOTE(review): 9 points over exponents [-7, 0] are spaced 0.875 apart, so
    # C is not on exact decades; 8 points would be -- confirm the intent.
    param_grid = {
        'kernel': ['rbf'],
        'C': np.logspace(-7, 0, 9),
        'gamma': np.logspace(-6, 2, 9, base=2),
    }

    # Create a fresh selector for every pipeline. A single shared instance
    # would be refit whenever any pipeline is fit, silently changing the
    # filter step of all the others.
    return [
        Pipeline([
            ('filter', SelectKBest(f_classif, k=50)),
            ('svc', SVC(**params)),
        ])
        for params in ParameterGrid(param_grid)
    ]

In [80]:
def dt_models(random_states=(17, 31, 97)):
    """Build the decision-tree part of the model library.

    Parameters
    ----------
    random_states : sequence of int, optional
        Seeds tried for ``DecisionTreeClassifier.random_state``. The defaults
        are the three seeds that appear in this notebook's recorded outputs.
        Fixing them keeps the library identical from run to run; drawing them
        from an unseeded RNG could also yield repeated seeds and therefore
        duplicate models.

    Returns
    -------
    list
        Whatever ``build_models`` produces for ``DecisionTreeClassifier`` over
        criterion, feature-subset rule, depth and seed.
    """
    # NOTE(review): scikit-learn documents 'auto' as equivalent to 'sqrt' for
    # classifiers -- confirm against the installed version.
    param_grid = {
        'criterion': ['gini', 'entropy'],
        'max_features': [None, 'auto', 'sqrt', 'log2'],
        'max_depth': [None, 1, 2, 5, 10],
        'random_state': list(random_states),
    }

    return build_models(DecisionTreeClassifier, param_grid)

In [81]:
def log_models():
    """Build the logistic-regression part of the model library.

    Returns whatever ``build_models`` produces for ``LogisticRegression`` over
    five log-spaced values of ``C`` between 1e-4 and 1e4.
    """
    grid = {'C': np.logspace(-4, 4, 5)}
    return build_models(LogisticRegression, grid)

In [82]:
def knn_models():
    """Build the k-nearest-neighbours part of the model library.

    Returns whatever ``build_models`` produces for ``KNeighborsClassifier``
    over 25 integer neighbourhood sizes spread linearly from 1 to 300.
    """
    neighbor_counts = np.linspace(1, 300, num=25, dtype='int')
    return build_models(KNeighborsClassifier, {'n_neighbors': neighbor_counts})

In [83]:
def sgd_models():
    """Build the SGD linear-classifier part of the model library.

    Returns whatever ``build_models`` produces for ``SGDClassifier`` with an
    elastic-net penalty, over loss, regularisation strength, learning-rate
    schedule, L1/L2 mix and initial learning rate.
    """
    grid = dict(
        loss=['log', 'modified_huber'],
        penalty=['elasticnet'],
        alpha=[0.0001, 0.001, 0.01, 0.1],
        learning_rate=['constant', 'optimal'],
        l1_ratio=np.linspace(0.0, 1.0, 3),
        eta0=[0.001, 0.01, 0.1],
    )
    return build_models(SGDClassifier, grid)

In [84]:
# Model library keyed by a short family name. Each value is the list returned
# by the matching builder; the builders are called, in the order written here,
# while the dict literal is evaluated.
# NOTE(review): the empty-dict assignment on the next line is immediately
# overwritten by the literal that follows it.
models_dict = {}
models_dict = {
    'xgb': xgb_models(),
    'rf': rf_models(),
    'linsvm': linsvm_models(),
    'rbfsvm': rbfsvm_models(),
    'dt': dt_models(),
    'log': log_models(),
    'knn': knn_models(),
    'sgd': sgd_models()
}

Building XGBClassifier models
Building RandomForestClassifier models
Building LinearSVC models
Building DecisionTreeClassifier models
Building LogisticRegression models
Building KNeighborsClassifier models
Building SGDClassifier models


## Build Ensemble Classifier

In [6]:
from data_utils import load_train, load_test, write_test

In [7]:
# Load the training features (X) and labels (y) from the 2008 training CSV.
# NOTE(review): the meaning of the second positional argument (False) is
# defined in data_utils.load_train -- confirm there.
X, y = load_train('data/train_2008.csv', False)

### Train all models on a training set

We will store all the trained models in a directory so this only needs to be done once

In [13]:
from sklearn.model_selection import train_test_split
from sklearn.externals import joblib

In [14]:
# Hold out 10% of the data for validation. The seed is fixed so that every
# kernel session draws the same split: the fitted models are pickled to disk
# and reloaded in later sessions, and a re-drawn split would place rows they
# were trained on into X_val, inflating the validation scores.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.1, random_state=0)

In [15]:
# Shapes of the training split: features, then labels.
X_train.shape, y_train.shape

((58200, 366), (58200,))

In [86]:
# Flatten the library into (name, estimator) pairs; the name is the family key
# plus the estimator's position within that family, e.g. 'xgb_0'.
models = []
for k in models_dict:
    for i, m in enumerate(models_dict[k]):
        models.append(('%s_%d' % (k, i), m))

In [93]:
# Total number of (name, estimator) pairs in the flattened library.
len(models)

587

In [91]:
# Name of the library entry at index 371.
models[371][0]

'xgb_0'

Originally attempted a parallelized approach (see ensemble.py), but my computer couldn't take it

In [72]:
import os

The run recorded below started at index 371 (`xgb_0`) because the earlier models had already been trained before the process was interrupted midway.

In [92]:
SAVE_DIR = 'models/ensemble_models'

# Make sure the output directory exists before any (slow) fit, so a missing
# directory cannot waste a training run.
if not os.path.isdir(SAVE_DIR):
    os.makedirs(SAVE_DIR)

# Fit every library model on the training split and pickle it under its name.
for name, clf in models:
    pkl_path = os.path.join(SAVE_DIR, '%s.pkl' % name)

    # Resume support: a model whose pickle is already on disk is skipped, so
    # an interrupted run can be restarted without a hand-picked start index
    # (which would also depend on the iteration order of models_dict).
    if os.path.exists(pkl_path):
        continue

    clf.fit(X_train, y_train)

    print('%s - %s\n' % (name, clf))
    joblib.dump(clf, pkl_path)

xgb_0 - XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.5,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=0.5)

xgb_1 - XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.5,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=0.8)

xgb_2 - XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.5,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, se

### Train Ensemble Classifier
Used log probabilities as the metric for scoring the ensemble. Everything was later encapsulated in an `EnsembleClassifier` class (see ensemble.py).

In [10]:
from sklearn.metrics import accuracy_score

Load all the trained models from directory

In [11]:
import glob
# glob returns paths in arbitrary, filesystem-dependent order. Sort them so
# the loaded model list has the same order on every run; the selection loops
# break ties by position.
list_m = sorted(glob.glob("models/ensemble_models/*.pkl"))

In [16]:
# Deserialize every pickled estimator found on disk. joblib.load unpickles the
# file, so only files produced by this notebook should be loaded.
models = []
for pkl_path in list_m:
    models.append(joblib.load(pkl_path))

In [17]:
# Number of fitted models loaded from disk.
len(models)

578

Use a hill-climbing algorithm that adds a single model per iteration, chosen by the log loss of the resulting ensemble.

In [18]:
# Models picked so far by the selection loop; starts empty.
selected_models = []

In [267]:
# Greedy forward selection (with replacement) scored by a vote-based log loss.
# Each round tries adding every model to the current ensemble, keeps the one
# giving the lowest loss, then reports the accuracy of the majority vote.

# A fitted model's validation predictions do not change between rounds, so
# compute them once instead of calling predict() for every model every round.
val_predictions = [clf.predict(X_val) for clf in models]

# One row of predicted labels per selected model (empty before the first pick).
classes = np.array([])
while len(selected_models) < 100:
    best_loss = 1e6
    best_h = None
    best_pred = None

    for i in range(len(models)):
        if ((i + 1) % 25 == 0):
            print('Checking Model: %d/%d' % (i + 1, len(models)))
            print('Current Loss: %f' % (best_loss))

        # Predictions of the current ensemble plus candidate i.
        if len(classes) == 0:
            new_classes = np.asarray([val_predictions[i]])
        else:
            new_classes = np.vstack([classes, val_predictions[i]])

        # Fraction of ensemble members voting for the true label of each
        # sample, used as that sample's probability; clipped so that
        # log(0) cannot occur.
        n = new_classes.shape[0]
        conf = np.sum(y_val == new_classes, axis=0) / float(n)
        log_probs = -1 * np.log(conf.clip(min=1e-6))
        loss = np.mean(log_probs)

        # Keep the candidate only if it strictly improves on the best so far.
        if (loss < best_loss):
            best_loss = loss
            best_h = models[i]
            best_pred = val_predictions[i]

    # Nothing to add (e.g. no models were loaded): stop instead of looping.
    if best_h is None:
        break

    print('Adding to Ensemble: %s' % (best_h))
    print('Loss: %f' % (best_loss))

    selected_models.append(best_h)
    if len(classes) == 0:
        classes = np.asarray([best_pred])
    else:
        classes = np.vstack([classes, best_pred])

    # Majority vote per validation sample (labels must be non-negative ints
    # for np.bincount).
    y_pred = np.asarray([np.argmax(np.bincount(classes[:, c])) for c in range(classes.shape[1])])
    acc = accuracy_score(y_pred, y_val)
    print('Accuracy: %f' % (acc))

KeyboardInterrupt: 

Should have shuffled the models each time, since it looks like the same models are being chosen repeatedly. Changed the metric to plain accuracy.

In [22]:
# Greedy forward selection (with replacement) scored by the accuracy of the
# ensemble's majority vote on the validation split.
selected_models = []
# Row i holds the validation predictions of models[i]; this pairing must stay
# intact for the whole loop.
predictions = np.asarray([clf.predict(X_val) for clf in models])
# One row of predicted labels per selected model (empty before the first pick).
classes = np.array([])

while len(selected_models) < 50:
    best_acc = 0.0
    best_h = None
    best_pred = []

    # Visit the candidates in a fresh random order each round so ties are not
    # always won by the same model. Shuffle the indices, not `models` itself:
    # shuffling the list in place would break the models[i] <-> predictions[i]
    # pairing and record a different estimator than the one actually scored.
    order = np.random.permutation(len(models))
    for step, i in enumerate(order):
        if ((step + 1) % 25 == 0):
            print('Checking Model: %d/%d' % (step + 1, len(models)))
            print('Current Acc: %f' % (best_acc))

        # Predictions of the current ensemble plus candidate i.
        if len(classes) == 0:
            new_classes = np.asarray([predictions[i]])
        else:
            new_classes = np.vstack([classes, predictions[i]])

        # Majority vote per validation sample (labels must be non-negative
        # ints for np.bincount).
        y_pred = np.asarray([np.argmax(np.bincount(new_classes[:, c])) for c in range(new_classes.shape[1])])
        acc = accuracy_score(y_pred, y_val)

        # Keep the candidate only if it strictly improves on the best so far.
        if (acc > best_acc):
            best_acc = acc
            best_h = models[i]
            best_pred = predictions[i]

    # No candidate scored above zero (or no models were loaded): stop rather
    # than appending None to the ensemble.
    if best_h is None:
        break

    print('Adding to Ensemble: %s' % (best_h))
    print('Acc: %f' % (best_acc))

    selected_models.append(best_h)
    if len(classes) == 0:
        classes = np.asarray([best_pred])
    else:
        classes = np.vstack([classes, best_pred])

Checking Model: 25/578
Current Acc: 0.936137
Checking Model: 50/578
Current Acc: 0.971857
Checking Model: 75/578
Current Acc: 0.971857
Checking Model: 100/578
Current Acc: 0.971857
Checking Model: 125/578
Current Acc: 0.971857
Checking Model: 150/578
Current Acc: 0.971857
Checking Model: 175/578
Current Acc: 0.971857
Checking Model: 200/578
Current Acc: 0.971857
Checking Model: 225/578
Current Acc: 0.971857
Checking Model: 250/578
Current Acc: 0.971857
Checking Model: 275/578
Current Acc: 0.971857
Checking Model: 300/578
Current Acc: 0.971857
Checking Model: 325/578
Current Acc: 0.971857
Checking Model: 350/578
Current Acc: 0.971857
Checking Model: 375/578
Current Acc: 0.971857
Checking Model: 400/578
Current Acc: 0.971857
Checking Model: 425/578
Current Acc: 0.971857
Checking Model: 450/578
Current Acc: 0.971857
Checking Model: 475/578
Current Acc: 0.971857
Checking Model: 500/578
Current Acc: 0.971857
Checking Model: 525/578
Current Acc: 0.971857
Checking Model: 550/578
Current Acc: 

In [23]:
from collections import Counter

In [26]:
# Tally how many times each model object was picked by the selection loop.
Counter(selected_models)

Counter({DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=5,
                     max_features='sqrt', max_leaf_nodes=None,
                     min_impurity_split=1e-07, min_samples_leaf=1,
                     min_samples_split=2, min_weight_fraction_leaf=0.0,
                     presort=False, random_state=17, splitter='best'): 1,
         DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=10,
                     max_features='sqrt', max_leaf_nodes=None,
                     min_impurity_split=1e-07, min_samples_leaf=1,
                     min_samples_split=2, min_weight_fraction_leaf=0.0,
                     presort=False, random_state=97, splitter='best'): 1,
         DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=10,
                     max_features='auto', max_leaf_nodes=None,
                     min_impurity_split=1e-07, min_samples_leaf=1,
                     min_samples_split=2, min_weight_fraction_leaf=0.0

Models (50) selected using accuracy

In [29]:
# Frozen record of the 50-model selection: each key is an unfitted estimator
# built from an explicit parameter list, each value is that model's vote count
# in the ensemble (48 keys, counts summing to 50).
# NOTE(review): this appears to be pasted from the Counter(selected_models)
# output above -- confirm before regenerating it.
models_dict_1 = {DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=5,
                     max_features='sqrt', max_leaf_nodes=None,
                     min_impurity_split=1e-07, min_samples_leaf=1,
                     min_samples_split=2, min_weight_fraction_leaf=0.0,
                     presort=False, random_state=17, splitter='best'): 1,
         DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=10,
                     max_features='sqrt', max_leaf_nodes=None,
                     min_impurity_split=1e-07, min_samples_leaf=1,
                     min_samples_split=2, min_weight_fraction_leaf=0.0,
                     presort=False, random_state=97, splitter='best'): 1,
         DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=10,
                     max_features='auto', max_leaf_nodes=None,
                     min_impurity_split=1e-07, min_samples_leaf=1,
                     min_samples_split=2, min_weight_fraction_leaf=0.0,
                     presort=False, random_state=97, splitter='best'): 1,
         DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=5,
                     max_features='log2', max_leaf_nodes=None,
                     min_impurity_split=1e-07, min_samples_leaf=1,
                     min_samples_split=2, min_weight_fraction_leaf=0.0,
                     presort=False, random_state=17, splitter='best'): 1,
         DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=10,
                     max_features=None, max_leaf_nodes=None,
                     min_impurity_split=1e-07, min_samples_leaf=1,
                     min_samples_split=2, min_weight_fraction_leaf=0.0,
                     presort=False, random_state=97, splitter='best'): 1,
         DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=1,
                     max_features='log2', max_leaf_nodes=None,
                     min_impurity_split=1e-07, min_samples_leaf=1,
                     min_samples_split=2, min_weight_fraction_leaf=0.0,
                     presort=False, random_state=31, splitter='best'): 1,
         DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                     max_features=None, max_leaf_nodes=None,
                     min_impurity_split=1e-07, min_samples_leaf=1,
                     min_samples_split=2, min_weight_fraction_leaf=0.0,
                     presort=False, random_state=17, splitter='best'): 1,
         DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=2,
                     max_features='log2', max_leaf_nodes=None,
                     min_impurity_split=1e-07, min_samples_leaf=1,
                     min_samples_split=2, min_weight_fraction_leaf=0.0,
                     presort=False, random_state=17, splitter='best'): 1,
         DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=5,
                     max_features='log2', max_leaf_nodes=None,
                     min_impurity_split=1e-07, min_samples_leaf=1,
                     min_samples_split=2, min_weight_fraction_leaf=0.0,
                     presort=False, random_state=97, splitter='best'): 1,
         DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=10,
                     max_features='sqrt', max_leaf_nodes=None,
                     min_impurity_split=1e-07, min_samples_leaf=1,
                     min_samples_split=2, min_weight_fraction_leaf=0.0,
                     presort=False, random_state=31, splitter='best'): 1,
         DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=1,
                     max_features=None, max_leaf_nodes=None,
                     min_impurity_split=1e-07, min_samples_leaf=1,
                     min_samples_split=2, min_weight_fraction_leaf=0.0,
                     presort=False, random_state=97, splitter='best'): 1,
         DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                     max_features='sqrt', max_leaf_nodes=None,
                     min_impurity_split=1e-07, min_samples_leaf=1,
                     min_samples_split=2, min_weight_fraction_leaf=0.0,
                     presort=False, random_state=31, splitter='best'): 1,
         DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=1,
                     max_features=None, max_leaf_nodes=None,
                     min_impurity_split=1e-07, min_samples_leaf=1,
                     min_samples_split=2, min_weight_fraction_leaf=0.0,
                     presort=False, random_state=31, splitter='best'): 1,
         DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=1,
                     max_features='auto', max_leaf_nodes=None,
                     min_impurity_split=1e-07, min_samples_leaf=1,
                     min_samples_split=2, min_weight_fraction_leaf=0.0,
                     presort=False, random_state=17, splitter='best'): 1,
         KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                    metric_params=None, n_jobs=1, n_neighbors=162, p=2,
                    weights='uniform'): 1,
         KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                    metric_params=None, n_jobs=1, n_neighbors=212, p=2,
                    weights='uniform'): 1,
         KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                    metric_params=None, n_jobs=1, n_neighbors=38, p=2,
                    weights='uniform'): 1,
         KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                    metric_params=None, n_jobs=1, n_neighbors=63, p=2,
                    weights='uniform'): 1,
         KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                    metric_params=None, n_jobs=1, n_neighbors=113, p=2,
                    weights='uniform'): 1,
         LinearSVC(C=9.9999999999999995e-07, class_weight=None, dual=True,
              fit_intercept=True, intercept_scaling=1, loss='hinge', max_iter=1000,
              multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
              verbose=0): 1,
         Pipeline(steps=[('filter', SelectKBest(k=50, score_func=f_classif)), ('svc', SVC(C=9.9999999999999995e-07, cache_size=200, class_weight=None, coef0=0.0,
           decision_function_shape=None, degree=3, gamma=0.03125, kernel='rbf',
           max_iter=-1, probability=False, random_state=None, shrinking=True,
           tol=0.001, verbose=False))]): 1,
         Pipeline(steps=[('filter', SelectKBest(k=50, score_func=f_classif)), ('svc', SVC(C=1.0000000000000001e-05, cache_size=200, class_weight=None, coef0=0.0,
           decision_function_shape=None, degree=3, gamma=0.0625, kernel='rbf',
           max_iter=-1, probability=False, random_state=None, shrinking=True,
           tol=0.001, verbose=False))]): 1,
         Pipeline(steps=[('filter', SelectKBest(k=50, score_func=f_classif)), ('svc', SVC(C=0.0001, cache_size=200, class_weight=None, coef0=0.0,
           decision_function_shape=None, degree=3, gamma=0.015625, kernel='rbf',
           max_iter=-1, probability=False, random_state=None, shrinking=True,
           tol=0.001, verbose=False))]): 1,
         Pipeline(steps=[('filter', SelectKBest(k=50, score_func=f_classif)), ('svc', SVC(C=0.0001, cache_size=200, class_weight=None, coef0=0.0,
           decision_function_shape=None, degree=3, gamma=4.0, kernel='rbf',
           max_iter=-1, probability=False, random_state=None, shrinking=True,
           tol=0.001, verbose=False))]): 1,
         Pipeline(steps=[('filter', SelectKBest(k=50, score_func=f_classif)), ('svc', SVC(C=0.001, cache_size=200, class_weight=None, coef0=0.0,
           decision_function_shape=None, degree=3, gamma=0.03125, kernel='rbf',
           max_iter=-1, probability=False, random_state=None, shrinking=True,
           tol=0.001, verbose=False))]): 1,
         Pipeline(steps=[('filter', SelectKBest(k=50, score_func=f_classif)), ('svc', SVC(C=9.9999999999999995e-08, cache_size=200, class_weight=None, coef0=0.0,
           decision_function_shape=None, degree=3, gamma=0.5, kernel='rbf',
           max_iter=-1, probability=False, random_state=None, shrinking=True,
           tol=0.001, verbose=False))]): 1,
         Pipeline(steps=[('filter', SelectKBest(k=50, score_func=f_classif)), ('svc', SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
           decision_function_shape=None, degree=3, gamma=0.125, kernel='rbf',
           max_iter=-1, probability=False, random_state=None, shrinking=True,
           tol=0.001, verbose=False))]): 1,
         RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
                     max_depth=20, max_features='log2', max_leaf_nodes=None,
                     min_impurity_split=1e-07, min_samples_leaf=1,
                     min_samples_split=2, min_weight_fraction_leaf=0.0,
                     n_estimators=20, n_jobs=1, oob_score=False, random_state=None,
                     verbose=0, warm_start=False): 1,
         RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
                     max_depth=20, max_features='log2', max_leaf_nodes=None,
                     min_impurity_split=1e-07, min_samples_leaf=1,
                     min_samples_split=2, min_weight_fraction_leaf=0.0,
                     n_estimators=100, n_jobs=1, oob_score=False, random_state=None,
                     verbose=0, warm_start=False): 1,
         RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
                     max_depth=5, max_features=None, max_leaf_nodes=None,
                     min_impurity_split=1e-07, min_samples_leaf=1,
                     min_samples_split=2, min_weight_fraction_leaf=0.0,
                     n_estimators=100, n_jobs=1, oob_score=False, random_state=None,
                     verbose=0, warm_start=False): 1,
         SGDClassifier(alpha=0.01, average=False, class_weight=None, epsilon=0.1,
                eta0=0.1, fit_intercept=True, l1_ratio=0.5,
                learning_rate='constant', loss='modified_huber', n_iter=5, n_jobs=1,
                penalty='elasticnet', power_t=0.5, random_state=None, shuffle=True,
                verbose=0, warm_start=False): 1,
         SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
                eta0=0.01, fit_intercept=True, l1_ratio=0.0,
                learning_rate='optimal', loss='log', n_iter=5, n_jobs=1,
                penalty='elasticnet', power_t=0.5, random_state=None, shuffle=True,
                verbose=0, warm_start=False): 1,
         SGDClassifier(alpha=0.1, average=False, class_weight=None, epsilon=0.1,
                eta0=0.1, fit_intercept=True, l1_ratio=0.5,
                learning_rate='constant', loss='log', n_iter=5, n_jobs=1,
                penalty='elasticnet', power_t=0.5, random_state=None, shuffle=True,
                verbose=0, warm_start=False): 1,
         SGDClassifier(alpha=0.001, average=False, class_weight=None, epsilon=0.1,
                eta0=0.001, fit_intercept=True, l1_ratio=0.0,
                learning_rate='optimal', loss='modified_huber', n_iter=5, n_jobs=1,
                penalty='elasticnet', power_t=0.5, random_state=None, shuffle=True,
                verbose=0, warm_start=False): 1,
         SGDClassifier(alpha=0.001, average=False, class_weight=None, epsilon=0.1,
                eta0=0.001, fit_intercept=True, l1_ratio=0.5,
                learning_rate='optimal', loss='modified_huber', n_iter=5, n_jobs=1,
                penalty='elasticnet', power_t=0.5, random_state=None, shuffle=True,
                verbose=0, warm_start=False): 1,
         SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
                eta0=0.01, fit_intercept=True, l1_ratio=0.5,
                learning_rate='constant', loss='log', n_iter=5, n_jobs=1,
                penalty='elasticnet', power_t=0.5, random_state=None, shuffle=True,
                verbose=0, warm_start=False): 1,
         SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
                eta0=0.001, fit_intercept=True, l1_ratio=0.0,
                learning_rate='optimal', loss='log', n_iter=5, n_jobs=1,
                penalty='elasticnet', power_t=0.5, random_state=None, shuffle=True,
                verbose=0, warm_start=False): 1,
         SGDClassifier(alpha=0.1, average=False, class_weight=None, epsilon=0.1,
                eta0=0.001, fit_intercept=True, l1_ratio=0.5,
                learning_rate='constant', loss='modified_huber', n_iter=5, n_jobs=1,
                penalty='elasticnet', power_t=0.5, random_state=None, shuffle=True,
                verbose=0, warm_start=False): 2,
         SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
                eta0=0.001, fit_intercept=True, l1_ratio=1.0,
                learning_rate='optimal', loss='modified_huber', n_iter=5, n_jobs=1,
                penalty='elasticnet', power_t=0.5, random_state=None, shuffle=True,
                verbose=0, warm_start=False): 1,
         SGDClassifier(alpha=0.001, average=False, class_weight=None, epsilon=0.1,
                eta0=0.1, fit_intercept=True, l1_ratio=0.0,
                learning_rate='constant', loss='log', n_iter=5, n_jobs=1,
                penalty='elasticnet', power_t=0.5, random_state=None, shuffle=True,
                verbose=0, warm_start=False): 1,
         SGDClassifier(alpha=0.01, average=False, class_weight=None, epsilon=0.1,
                eta0=0.01, fit_intercept=True, l1_ratio=1.0,
                learning_rate='optimal', loss='log', n_iter=5, n_jobs=1,
                penalty='elasticnet', power_t=0.5, random_state=None, shuffle=True,
                verbose=0, warm_start=False): 1,
         XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.5,
                gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=5,
                min_child_weight=1, n_estimators=200, nthread=-1,
                objective='binary:logistic', reg_alpha=0, reg_lambda=1,
                scale_pos_weight=1, seed=0, silent=True, subsample=0.9): 1,
         XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.8,
                gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=3,
                min_child_weight=1, n_estimators=100, nthread=-1,
                objective='binary:logistic', reg_alpha=0, reg_lambda=1,
                scale_pos_weight=1, seed=0, silent=True, subsample=0.8): 2,
         XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.8,
                gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=10,
                min_child_weight=1, n_estimators=100, nthread=-1,
                objective='binary:logistic', reg_alpha=0, reg_lambda=1,
                scale_pos_weight=1, seed=0, silent=True, subsample=0.9): 1,
         XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.8,
                gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=10,
                min_child_weight=1, n_estimators=200, nthread=-1,
                objective='binary:logistic', reg_alpha=0, reg_lambda=1,
                scale_pos_weight=1, seed=0, silent=True, subsample=0.9): 1,
         XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.9,
                gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=7,
                min_child_weight=1, n_estimators=100, nthread=-1,
                objective='binary:logistic', reg_alpha=0, reg_lambda=1,
                scale_pos_weight=1, seed=0, silent=True, subsample=0.5): 1,
         XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.9,
                gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=7,
                min_child_weight=1, n_estimators=200, nthread=-1,
                objective='binary:logistic', reg_alpha=0, reg_lambda=1,
                scale_pos_weight=1, seed=0, silent=True, subsample=0.9): 1,
         XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.9,
                gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=10,
                min_child_weight=1, n_estimators=100, nthread=-1,
                objective='binary:logistic', reg_alpha=0, reg_lambda=1,
                scale_pos_weight=1, seed=0, silent=True, subsample=0.9): 1}

In [30]:
# Refit every selected configuration on the full training data and add it to
# the ensemble once per recorded vote.
ens = []
for clf, votes in models_dict_1.items():
    fitted = clf.fit(X, y)
    print('Adding Classifier: %s' % (fitted))

    ens.extend([fitted] * votes)

Adding Classifier: Pipeline(steps=[('filter', SelectKBest(k=50, score_func=<function f_classif at 0x109fe80c8>)), ('svc', SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma=0.125, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))])
Adding Classifier: RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=20, max_features='log2', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=20, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)
Adding Classifier: XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.8,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=10,
       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
       objective='b

In [31]:
# Load the test features from the 2008 test CSV.
X_test = load_test('data/test_2008.csv')

In [32]:
# Predict the test set with the selected ensemble (`ens`: the chosen models
# refit on all training data, repeated by vote count), not with every model
# in the library. Rows of `classes` are ensemble members, columns are samples.
classes = np.asarray([clf.predict(X_test) for clf in ens])
# Majority vote per test sample (labels must be non-negative ints for
# np.bincount).
y_pred = np.asarray([np.argmax(np.bincount(classes[:, c]))
                     for c in range(classes.shape[1])])

In [34]:
# Write the majority-vote predictions to the submission CSV.
write_test('predictions/ensemble.csv', y_pred)

Use models until accuracy stopped increasing (25)

In [35]:
# Tally only the first 25 selections.
Counter(selected_models[:25])

Counter({DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=10,
                     max_features='auto', max_leaf_nodes=None,
                     min_impurity_split=1e-07, min_samples_leaf=1,
                     min_samples_split=2, min_weight_fraction_leaf=0.0,
                     presort=False, random_state=97, splitter='best'): 1,
         DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=1,
                     max_features='log2', max_leaf_nodes=None,
                     min_impurity_split=1e-07, min_samples_leaf=1,
                     min_samples_split=2, min_weight_fraction_leaf=0.0,
                     presort=False, random_state=31, splitter='best'): 1,
         DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=5,
                     max_features='log2', max_leaf_nodes=None,
                     min_impurity_split=1e-07, min_samples_leaf=1,
                     min_samples_split=2, min_weight_fraction_lea

In [53]:
# Ensemble members, keyed by unfitted estimator instance. Every value is 1;
# only the keys are used when this dictionary is turned into a list of models.
# NOTE(review): the value is presumably a selection count -- confirm.
models_dict_1 = {
    # --- Decision trees ---
    DecisionTreeClassifier(
        class_weight=None, criterion='gini', max_depth=10,
        max_features='auto', max_leaf_nodes=None,
        min_impurity_split=1e-07, min_samples_leaf=1,
        min_samples_split=2, min_weight_fraction_leaf=0.0,
        presort=False, random_state=97, splitter='best'): 1,
    DecisionTreeClassifier(
        class_weight=None, criterion='entropy', max_depth=1,
        max_features='log2', max_leaf_nodes=None,
        min_impurity_split=1e-07, min_samples_leaf=1,
        min_samples_split=2, min_weight_fraction_leaf=0.0,
        presort=False, random_state=31, splitter='best'): 1,
    DecisionTreeClassifier(
        class_weight=None, criterion='entropy', max_depth=5,
        max_features='log2', max_leaf_nodes=None,
        min_impurity_split=1e-07, min_samples_leaf=1,
        min_samples_split=2, min_weight_fraction_leaf=0.0,
        presort=False, random_state=97, splitter='best'): 1,
    DecisionTreeClassifier(
        class_weight=None, criterion='gini', max_depth=None,
        max_features='sqrt', max_leaf_nodes=None,
        min_impurity_split=1e-07, min_samples_leaf=1,
        min_samples_split=2, min_weight_fraction_leaf=0.0,
        presort=False, random_state=31, splitter='best'): 1,
    DecisionTreeClassifier(
        class_weight=None, criterion='entropy', max_depth=1,
        max_features=None, max_leaf_nodes=None,
        min_impurity_split=1e-07, min_samples_leaf=1,
        min_samples_split=2, min_weight_fraction_leaf=0.0,
        presort=False, random_state=31, splitter='best'): 1,

    # --- k-nearest neighbours ---
    KNeighborsClassifier(
        algorithm='auto', leaf_size=30, metric='minkowski',
        metric_params=None, n_jobs=1, n_neighbors=63, p=2,
        weights='uniform'): 1,
    KNeighborsClassifier(
        algorithm='auto', leaf_size=30, metric='minkowski',
        metric_params=None, n_jobs=1, n_neighbors=113, p=2,
        weights='uniform'): 1,

    # --- RBF SVMs behind a SelectKBest(k=50, f_classif) feature filter ---
    Pipeline(steps=[
        ('filter', SelectKBest(k=50, score_func=f_classif)),
        ('svc', SVC(
            C=9.9999999999999995e-07, cache_size=200, class_weight=None,
            coef0=0.0, decision_function_shape=None, degree=3,
            gamma=0.03125, kernel='rbf', max_iter=-1, probability=True,
            random_state=None, shrinking=True, tol=0.001, verbose=False)),
    ]): 1,
    Pipeline(steps=[
        ('filter', SelectKBest(k=50, score_func=f_classif)),
        ('svc', SVC(
            C=1.0000000000000001e-05, cache_size=200, class_weight=None,
            coef0=0.0, decision_function_shape=None, degree=3,
            gamma=0.0625, kernel='rbf', max_iter=-1, probability=True,
            random_state=None, shrinking=True, tol=0.001, verbose=False)),
    ]): 1,
    Pipeline(steps=[
        ('filter', SelectKBest(k=50, score_func=f_classif)),
        ('svc', SVC(
            C=0.0001, cache_size=200, class_weight=None,
            coef0=0.0, decision_function_shape=None, degree=3,
            gamma=0.015625, kernel='rbf', max_iter=-1, probability=True,
            random_state=None, shrinking=True, tol=0.001, verbose=False)),
    ]): 1,
    Pipeline(steps=[
        ('filter', SelectKBest(k=50, score_func=f_classif)),
        ('svc', SVC(
            C=0.0001, cache_size=200, class_weight=None,
            coef0=0.0, decision_function_shape=None, degree=3,
            gamma=4.0, kernel='rbf', max_iter=-1, probability=True,
            random_state=None, shrinking=True, tol=0.001, verbose=False)),
    ]): 1,
    Pipeline(steps=[
        ('filter', SelectKBest(k=50, score_func=f_classif)),
        ('svc', SVC(
            C=9.9999999999999995e-08, cache_size=200, class_weight=None,
            coef0=0.0, decision_function_shape=None, degree=3,
            gamma=0.5, kernel='rbf', max_iter=-1, probability=True,
            random_state=None, shrinking=True, tol=0.001, verbose=False)),
    ]): 1,

    # --- Random forests ---
    RandomForestClassifier(
        bootstrap=True, class_weight=None, criterion='entropy',
        max_depth=20, max_features='log2', max_leaf_nodes=None,
        min_impurity_split=1e-07, min_samples_leaf=1,
        min_samples_split=2, min_weight_fraction_leaf=0.0,
        n_estimators=20, n_jobs=1, oob_score=False, random_state=None,
        verbose=0, warm_start=False): 1,
    RandomForestClassifier(
        bootstrap=True, class_weight=None, criterion='entropy',
        max_depth=20, max_features='log2', max_leaf_nodes=None,
        min_impurity_split=1e-07, min_samples_leaf=1,
        min_samples_split=2, min_weight_fraction_leaf=0.0,
        n_estimators=100, n_jobs=1, oob_score=False, random_state=None,
        verbose=0, warm_start=False): 1,
    RandomForestClassifier(
        bootstrap=True, class_weight=None, criterion='entropy',
        max_depth=5, max_features=None, max_leaf_nodes=None,
        min_impurity_split=1e-07, min_samples_leaf=1,
        min_samples_split=2, min_weight_fraction_leaf=0.0,
        n_estimators=100, n_jobs=1, oob_score=False, random_state=None,
        verbose=0, warm_start=False): 1,

    # --- SGD linear models (elastic-net penalty) ---
    SGDClassifier(
        alpha=0.01, average=False, class_weight=None, epsilon=0.1,
        eta0=0.1, fit_intercept=True, l1_ratio=0.5,
        learning_rate='constant', loss='modified_huber', n_iter=5, n_jobs=1,
        penalty='elasticnet', power_t=0.5, random_state=None, shuffle=True,
        verbose=0, warm_start=False): 1,
    SGDClassifier(
        alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
        eta0=0.01, fit_intercept=True, l1_ratio=0.0,
        learning_rate='optimal', loss='log', n_iter=5, n_jobs=1,
        penalty='elasticnet', power_t=0.5, random_state=None, shuffle=True,
        verbose=0, warm_start=False): 1,
    SGDClassifier(
        alpha=0.001, average=False, class_weight=None, epsilon=0.1,
        eta0=0.001, fit_intercept=True, l1_ratio=0.5,
        learning_rate='optimal', loss='modified_huber', n_iter=5, n_jobs=1,
        penalty='elasticnet', power_t=0.5, random_state=None, shuffle=True,
        verbose=0, warm_start=False): 1,
    SGDClassifier(
        alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
        eta0=0.001, fit_intercept=True, l1_ratio=0.0,
        learning_rate='optimal', loss='log', n_iter=5, n_jobs=1,
        penalty='elasticnet', power_t=0.5, random_state=None, shuffle=True,
        verbose=0, warm_start=False): 1,
    SGDClassifier(
        alpha=0.1, average=False, class_weight=None, epsilon=0.1,
        eta0=0.001, fit_intercept=True, l1_ratio=0.5,
        learning_rate='constant', loss='modified_huber', n_iter=5, n_jobs=1,
        penalty='elasticnet', power_t=0.5, random_state=None, shuffle=True,
        verbose=0, warm_start=False): 1,
    SGDClassifier(
        alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
        eta0=0.001, fit_intercept=True, l1_ratio=1.0,
        learning_rate='optimal', loss='modified_huber', n_iter=5, n_jobs=1,
        penalty='elasticnet', power_t=0.5, random_state=None, shuffle=True,
        verbose=0, warm_start=False): 1,
    SGDClassifier(
        alpha=0.001, average=False, class_weight=None, epsilon=0.1,
        eta0=0.1, fit_intercept=True, l1_ratio=0.0,
        learning_rate='constant', loss='log', n_iter=5, n_jobs=1,
        penalty='elasticnet', power_t=0.5, random_state=None, shuffle=True,
        verbose=0, warm_start=False): 1,

    # --- Gradient-boosted trees (XGBoost) ---
    XGBClassifier(
        base_score=0.5, colsample_bylevel=1, colsample_bytree=0.8,
        gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=10,
        min_child_weight=1, n_estimators=200, nthread=-1,
        objective='binary:logistic', reg_alpha=0, reg_lambda=1,
        scale_pos_weight=1, seed=0, silent=True, subsample=0.9): 1,
    XGBClassifier(
        base_score=0.5, colsample_bylevel=1, colsample_bytree=0.9,
        gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=7,
        min_child_weight=1, n_estimators=100, nthread=-1,
        objective='binary:logistic', reg_alpha=0, reg_lambda=1,
        scale_pos_weight=1, seed=0, silent=True, subsample=0.5): 1,
    XGBClassifier(
        base_score=0.5, colsample_bylevel=1, colsample_bytree=0.9,
        gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=10,
        min_child_weight=1, n_estimators=100, nthread=-1,
        objective='binary:logistic', reg_alpha=0, reg_lambda=1,
        scale_pos_weight=1, seed=0, silent=True, subsample=0.9): 1,
}

In [54]:
# Keep only the estimator instances (the dictionary keys); the values are dropped.
models_list = list(models_dict_1.keys())

In [55]:
# Pair every model with a unique name for VotingClassifier.
# - Names are the positional index rendered as a string ('0', '1', ...):
#   scikit-learn documents `estimators` as a list of (str, estimator) tuples,
#   and newer releases validate names with string operations that fail on ints.
# - The pairs are materialised as a list because zip() is a one-shot iterator
#   on Python 3, while the ensemble needs to read the pairs more than once.
m = list(zip(map(str, range(len(models_list))), models_list))

In [43]:
from sklearn.ensemble import VotingClassifier

In [56]:
# Soft-voting ensemble over the (name, estimator) pairs in `m`:
# 'soft' voting combines the members' predicted class probabilities.
vclf = VotingClassifier(
    estimators=m,
    voting='soft',
    n_jobs=4
)

In [57]:
# Fit the voting ensemble on X, y. The call is the cell's last expression,
# so its return value is what the notebook displays as the cell output.
vclf.fit(X, y)

VotingClassifier(estimators=[(0, Pipeline(steps=[('filter', SelectKBest(k=50, score_func=<function f_classif at 0x109fe80c8>)), ('svc', SVC(C=1e-07, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma=0.5, kernel='rbf',
  max_iter=-1, probability=True, random_state=None, shr...  max_iter=-1, probability=True, random_state=None, shrinking=True,
  tol=0.001, verbose=False))]))],
         n_jobs=4, voting='soft', weights=None)

In [58]:
# Predict with the fitted voting ensemble on X_test.
y_pred = vclf.predict(X_test)

In [59]:
# Pass the output path and the ensemble predictions to write_test.
# NOTE(review): write_test is defined elsewhere in the notebook; presumably it
# writes the predictions to the given CSV path -- confirm.
write_test('predictions/ensemble2.csv', y_pred)

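Models selected using log loss as the selection metric (50 selections in total). The value stored for each model in the dictionary below is the number of times that model was selected.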

In [9]:
# Estimator instance -> integer count; the counts add up to 50.
# NOTE(review): the SVCs below are built without probability=True, which
# soft voting would require -- confirm how this dictionary is consumed.
models_dict_2 = {
    # --- k-nearest neighbours ---
    KNeighborsClassifier(
        algorithm='auto', leaf_size=30, metric='minkowski',
        n_neighbors=1, p=2, weights='uniform'): 10,
    KNeighborsClassifier(
        algorithm='auto', leaf_size=30, metric='minkowski',
        n_neighbors=10, p=2, weights='uniform'): 3,

    # --- RBF SVMs behind a SelectKBest(k=50, f_classif) feature filter ---
    Pipeline(steps=[
        ('filter', SelectKBest(k=50, score_func=f_classif)),
        ('svc', SVC(C=1.0, cache_size=2000,
                    degree=3, gamma=2.0, kernel='rbf')),
    ]): 7,
    Pipeline(steps=[
        ('filter', SelectKBest(k=50, score_func=f_classif)),
        ('svc', SVC(C=1.0, cache_size=2000,
                    degree=3, gamma=4.0, kernel='rbf')),
    ]): 1,

    # --- Random forests ---
    RandomForestClassifier(
        bootstrap=True, class_weight=None, criterion='entropy',
        max_depth=25, max_features='auto',
        n_estimators=100, n_jobs=2, verbose=2): 2,
    RandomForestClassifier(
        bootstrap=True, class_weight=None, criterion='entropy',
        max_depth=25, max_features=None,
        n_estimators=100, n_jobs=2, verbose=2): 14,
    RandomForestClassifier(
        bootstrap=True, class_weight=None, criterion='entropy',
        max_depth=25, max_features=None,
        n_estimators=50, n_jobs=2, verbose=2): 6,
    RandomForestClassifier(
        bootstrap=True, class_weight=None, criterion='gini',
        max_depth=25, max_features=None,
        n_estimators=20, n_jobs=2, verbose=2): 1,
    RandomForestClassifier(
        bootstrap=True, class_weight=None, criterion='gini',
        max_depth=25, max_features='auto',
        n_estimators=100, n_jobs=2, verbose=2): 1,

    # --- SGD linear model ---
    SGDClassifier(
        alpha=0.001, average=False, class_weight=None, epsilon=0.1,
        eta0=0.01, fit_intercept=True, l1_ratio=0.0,
        learning_rate='constant', loss='log', n_iter=5, n_jobs=1,
        penalty='elasticnet'): 1,

    # --- Gradient-boosted trees (XGBoost) ---
    XGBClassifier(
        base_score=0.5, colsample_bytree=0.9,
        gamma=0, learning_rate=0.1, max_depth=10,
        n_estimators=300, subsample=0.5): 4,
}