In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss
%matplotlib inline



In [4]:
# Load the training data CSV; the path is relative to the notebook's working directory.
df = pd.read_csv('numerai_training_data.csv')

In [5]:
# Summary statistics (count/mean/std/quantiles) per column, as a quick sanity check of the load.
df.describe()

Unnamed: 0,feature1,feature2,feature3,feature4,feature5,feature6,feature7,feature8,feature9,feature10,...,feature42,feature43,feature44,feature45,feature46,feature47,feature48,feature49,feature50,target
count,535713.0,535713.0,535713.0,535713.0,535713.0,535713.0,535713.0,535713.0,535713.0,535713.0,...,535713.0,535713.0,535713.0,535713.0,535713.0,535713.0,535713.0,535713.0,535713.0,535713.0
mean,0.472921,0.482357,0.538887,0.489979,0.536681,0.531812,0.465629,0.486717,0.532687,0.49588,...,0.469072,0.506042,0.509686,0.491432,0.50567,0.525091,0.539248,0.482097,0.496385,0.49996
std,0.113607,0.117309,0.100929,0.129855,0.095137,0.111722,0.112765,0.114449,0.113772,0.115777,...,0.122965,0.116461,0.118462,0.125189,0.105082,0.134487,0.0967,0.128133,0.127438,0.5
min,0.0,0.0,0.0,0.0,0.03872,0.0,0.00641,0.0,0.0,0.0,...,0.0,0.0,0.06421,0.0,0.0,0.00412,0.02749,0.0,0.01803,0.0
25%,0.39263,0.4019,0.47058,0.39831,0.47713,0.45502,0.38807,0.40673,0.45645,0.41675,...,0.38276,0.42908,0.42767,0.40571,0.43463,0.43288,0.47491,0.39087,0.40731,0.0
50%,0.4679,0.48196,0.53791,0.48105,0.54293,0.53165,0.46717,0.48368,0.53504,0.49529,...,0.46646,0.50789,0.50699,0.49125,0.50469,0.52939,0.5402,0.47766,0.48877,0.0
75%,0.54884,0.56151,0.6062,0.57415,0.60204,0.60895,0.54342,0.56375,0.61121,0.57436,...,0.55235,0.58466,0.58897,0.57701,0.57601,0.62077,0.60472,0.56966,0.57985,1.0
max,0.98256,1.0,1.0,1.0,0.96961,0.98257,1.0,0.98217,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.9942,1.0,1.0


In [8]:
# The last column holds the label; everything between the first three columns
# and the label is treated as a model feature.
# NOTE(review): the three skipped leading columns are presumably identifiers /
# metadata (e.g. the 'era' column used for splitting) -- confirm against the CSV.
target_col = df.columns[-1]
feat_cols = df.columns[3:-1]
print(feat_cols)

Index([u'feature1', u'feature2', u'feature3', u'feature4', u'feature5',
       u'feature6', u'feature7', u'feature8', u'feature9', u'feature10',
       u'feature11', u'feature12', u'feature13', u'feature14', u'feature15',
       u'feature16', u'feature17', u'feature18', u'feature19', u'feature20',
       u'feature21', u'feature22', u'feature23', u'feature24', u'feature25',
       u'feature26', u'feature27', u'feature28', u'feature29', u'feature30',
       u'feature31', u'feature32', u'feature33', u'feature34', u'feature35',
       u'feature36', u'feature37', u'feature38', u'feature39', u'feature40',
       u'feature41', u'feature42', u'feature43', u'feature44', u'feature45',
       u'feature46', u'feature47', u'feature48', u'feature49', u'feature50'],
      dtype='object')


In [6]:
def era_split(df, col, test_size=0.1, seed=0):
    """Split ``df`` into train/test sets by holding out whole groups of ``col``.

    A random subset of the distinct values in ``df[col]`` is drawn; every row
    whose ``col`` value is in that subset goes to the test set and all other
    rows go to the training set, so no group is shared between the two.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to split.
    col : str
        Name of the grouping column (e.g. ``'era'``).
    test_size : float, optional
        Fraction of the distinct groups to hold out. The number of held-out
        groups is ``int(n_groups * test_size)``, i.e. rounded down.
    seed : int or None, optional
        Seed for the random draw. The same seed always yields the same split;
        pass ``None`` for a non-reproducible draw.

    Returns
    -------
    (train_set, test_set) : tuple of pandas.DataFrame
    """
    uniques = df[col].unique()
    test_count = int(len(uniques) * test_size)

    # Use a private generator so the split is reproducible from `seed` alone
    # and the global numpy random state is left untouched.
    rng = np.random.RandomState(seed)
    test_eras = rng.choice(uniques, size=test_count, replace=False)

    in_test = df[col].isin(test_eras)
    train_set = df[~in_test]
    test_set = df[in_test]
    return train_set, test_set

In [10]:
def get_feat_target(df, feat_col, target_col):
    """Select the feature columns and the target column from ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Source table.
    feat_col : list-like of column labels
        Columns to return as the feature selection.
    target_col : column label
        Column to return as the target.

    Returns
    -------
    (features, target) : tuple
        ``df[feat_col]`` and ``df[target_col]``.
    """
    features = df[feat_col]
    target = df[target_col]
    return features, target

In [11]:
# Hold out 20% of the eras as the test set, then separate features from labels.
train, test = era_split(df, 'era', test_size=0.2)
x_train, y_train = get_feat_target(train, feat_cols, target_col)
x_test, y_test = get_feat_target(test, feat_cols, target_col)

# Full-size split, in the layout cv_model() expects.
data_dict = dict(
    x_train=x_train,
    x_test=x_test,
    y_train=y_train,
    y_test=y_test,
)

# Small sample of the same split for quick experiments.
# NOTE(review): the test sample takes rows [smaller_size, 2*smaller_size)
# rather than the first smaller_size rows -- confirm this offset is intended.
smaller_size = 1000
small_data_dict = dict(
    x_train=x_train[:smaller_size],
    x_test=x_test[smaller_size:2 * smaller_size],
    y_train=y_train[:smaller_size],
    y_test=y_test[smaller_size:2 * smaller_size],
)

In [12]:
# cross validation params
def get_cv_params(model_func):
    """Return the hyper-parameter grid to search for an estimator class.

    Parameters
    ----------
    model_func : class
        Estimator class (not an instance), e.g. ``LogisticRegression``.

    Returns
    -------
    dict
        Mapping of parameter name to the list of values to try. An empty
        dict is returned for classes that have no grid defined here.
    """
    logistic_grid = dict(
        C=[10 ** exponent for exponent in range(-3, 3)],
    )

    forest_grid = dict(
        min_samples_split=[2, 5],
        min_samples_leaf=[1, 10, 100],
        max_depth=[5, 10, 15],
    )

    boosting_grid = dict(
        learning_rate=[0.1, 0.25, 0.5],
        max_depth=[5, 10, 15],
        min_samples_split=[2, 5],
        min_samples_leaf=[1, 10, 100],
    )

    grids_by_model = {
        LogisticRegression: logistic_grid,
        RandomForestClassifier: forest_grid,
        GradientBoostingClassifier: boosting_grid,
    }

    if model_func in grids_by_model:
        return grids_by_model[model_func]
    return {}

In [14]:
def cv_model(model, param_dict, scoring, data_dict, folds=10, n_jobs=4):
    """Grid-search ``model`` on the training split and score the winner on the test split.

    Parameters
    ----------
    model : estimator
        Unfitted classifier; the selected estimator must expose
        ``predict_proba`` and ``classes_``.
    param_dict : dict
        Parameter grid handed to ``GridSearchCV``.
    scoring : str or callable
        Metric ``GridSearchCV`` uses to rank parameter combinations.
    data_dict : dict
        Must contain the keys ``'x_train'``, ``'y_train'``, ``'x_test'``
        and ``'y_test'``.
    folds : int, optional
        Passed to ``GridSearchCV`` as ``cv``.
    n_jobs : int, optional
        Passed to ``GridSearchCV`` as ``n_jobs``.

    Returns
    -------
    (loss, best_model, gridcv) : tuple
        ``loss`` is the log-loss of ``best_model`` on the test split,
        ``best_model`` is ``gridcv.best_estimator_`` and ``gridcv`` is the
        fitted ``GridSearchCV`` object.
    """
    x_train = data_dict['x_train']
    y_train = data_dict['y_train']
    x_test = data_dict['x_test']
    y_test = data_dict['y_test']

    gridcv = GridSearchCV(model, param_dict, scoring=scoring, cv=folds, n_jobs=n_jobs)
    gridcv.fit(x_train, y_train)

    best_model = gridcv.best_estimator_
    y_pred = best_model.predict_proba(x_test)
    # Pass the classifier's own label order explicitly: predict_proba columns
    # follow best_model.classes_, and without `labels` log_loss infers the
    # label set from y_test, which fails when a (small) test sample happens
    # to contain only one class.
    loss = log_loss(y_test, y_pred, labels=best_model.classes_)
    return loss, best_model, gridcv

In [15]:
def run_all_cv(data_dict, folds=5, n_jobs=4, scoring='accuracy'):
    """Run a grid search for each candidate model and print the results.

    For each of ``LogisticRegression``, ``RandomForestClassifier`` and
    ``GradientBoostingClassifier`` this builds a default instance, looks up
    its grid with ``get_cv_params``, runs ``cv_model`` and prints the selected
    estimator, its test log-loss and the winning parameters.

    Parameters
    ----------
    data_dict : dict
        Train/test data in the layout ``cv_model`` expects
        (``'x_train'``, ``'y_train'``, ``'x_test'``, ``'y_test'``).
    folds : int, optional
        Number of cross-validation folds.
    n_jobs : int, optional
        Number of parallel jobs for the grid search.
    scoring : str or callable, optional
        Metric used to pick the best parameters. Defaults to ``'accuracy'``.
        Note that the printed loss is log-loss, so pass a log-loss scorer
        here to select on the same metric that is reported.

    Returns
    -------
    None
    """
    # Run cross validation for each model.
    model_funcs = [LogisticRegression, RandomForestClassifier, GradientBoostingClassifier]
    for m in model_funcs:
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print('=' * 80)
        print(m.__name__)
        model = m()
        cv_params = get_cv_params(m)
        loss, best_model, gridcv = cv_model(
            model, cv_params, scoring, data_dict, folds=folds, n_jobs=n_jobs
        )
        print('{}\nLoss: {}'.format(best_model, loss))
        print(gridcv.best_params_)

In [None]:
# Grid-search every model on the full-size split using 4 parallel jobs.
run_all_cv(data_dict, n_jobs=4)

LogisticRegression
