# numer.ai

Numerai is using breakthroughs in encryption to allow data scientists to participate in solving a stock market machine learning
problem.

https://numer.ai/about

In [1]:
import pandas as pd
import numpy as np
# Directory holding the Numerai CSV files. File paths in this notebook are
# built by plain string concatenation, so the trailing slash is required.
workdir = "/home/ubuntu/data/"

In [2]:
# Load the Numerai training set from the working directory.
train_csv_path = workdir + 'numerai_training_data.csv'
traindata = pd.read_csv(train_csv_path)

In [3]:
# Show the first two rows as a quick look at the loaded columns.
traindata.head(2)

Unnamed: 0,feature1,feature2,feature3,feature4,feature5,feature6,feature7,feature8,feature9,feature10,...,feature42,feature43,feature44,feature45,feature46,feature47,feature48,feature49,feature50,target
0,0.523989,0.450736,0.642132,0.28339,0.489685,0.365986,0.750027,0.441617,0.705939,0.298719,...,0.560302,0.568412,0.796156,0.298466,0.537179,0.394989,0.278652,0.366975,0.624429,1
1,0.361765,0.356233,0.695955,0.52563,0.710569,0.256446,0.597307,0.346839,0.460739,0.64252,...,0.351684,0.418981,0.469897,0.477661,0.392151,0.323126,0.695662,0.483652,0.472957,1


In [4]:
# Split the label off the feature frame: pop() returns the "target" column
# and removes it from traindata in place, leaving only the features.
target = traindata.pop("target")

In [5]:
# Per-column summary statistics of the remaining columns in traindata.
traindata.describe()

Unnamed: 0,feature1,feature2,feature3,feature4,feature5,feature6,feature7,feature8,feature9,feature10,...,feature41,feature42,feature43,feature44,feature45,feature46,feature47,feature48,feature49,feature50
count,136573.0,136573.0,136573.0,136573.0,136573.0,136573.0,136573.0,136573.0,136573.0,136573.0,...,136573.0,136573.0,136573.0,136573.0,136573.0,136573.0,136573.0,136573.0,136573.0,136573.0
mean,0.503315,0.528337,0.523637,0.507853,0.459675,0.523603,0.464908,0.503458,0.505179,0.507997,...,0.493254,0.52487,0.489271,0.542465,0.492873,0.48093,0.495088,0.515655,0.526513,0.517014
std,0.119358,0.13599,0.140591,0.111798,0.144374,0.143353,0.136017,0.126,0.156709,0.158817,...,0.156236,0.129495,0.131664,0.13619,0.16832,0.115552,0.134214,0.140444,0.135894,0.119244
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.423868,0.436758,0.422698,0.432625,0.353949,0.416925,0.370033,0.415512,0.390768,0.392041,...,0.377072,0.432102,0.400755,0.447635,0.361996,0.398145,0.396579,0.420975,0.434276,0.439332
50%,0.504471,0.532099,0.523808,0.511219,0.455057,0.524664,0.458604,0.504341,0.502569,0.510177,...,0.490121,0.526524,0.489265,0.545842,0.492573,0.478766,0.495653,0.514638,0.529256,0.519898
75%,0.582147,0.623133,0.62387,0.585665,0.563213,0.630734,0.552903,0.592677,0.617613,0.628496,...,0.609254,0.619213,0.576567,0.640741,0.62224,0.561584,0.594166,0.609921,0.622923,0.598703
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,0.963337,1.0,1.0,1.0,1.0


In [6]:
# Summary statistics of the label column.
target.describe()

count    136573.000000
mean          0.503343
std           0.499991
min           0.000000
25%           0.000000
50%           1.000000
75%           1.000000
max           1.000000
Name: target, dtype: float64

In [7]:
# Convert to plain NumPy arrays for the estimators. `.values` is used instead
# of `.as_matrix()`, which was deprecated in pandas 0.23 and removed in 1.0;
# `.values` is available on both old and new pandas releases.
X_train = traindata.values
y_train = target.values.ravel()

### Creating a model with eXtreme Gradient Boosting

XGBoost is an optimized distributed gradient boosting library designed to be highly efficient, flexible and portable. It implements machine learning algorithms under the Gradient Boosting framework. XGBoost provides parallel tree boosting (also known as GBDT or GBM) that solves many data science problems in a fast and accurate way.

In [9]:
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score

In [10]:
# Baseline: an XGBClassifier constructed with no arguments, i.e. the
# library's default hyperparameters.
model = XGBClassifier()

In [14]:
# 3-fold cross-validated scores for the default model. The "neg_log_loss"
# scorer yields the negated loss, so the sign is flipped back before printing.
default_logloss = -cross_val_score(model, X_train, y_train, scoring="neg_log_loss", cv=3).mean()
print("XGBoost default model expected logloss = %.5f" % default_logloss)
default_accuracy = cross_val_score(model, X_train, y_train, scoring="accuracy", cv=3).mean()
print("XGBoost default model expected accuracy = %.4f" % default_accuracy)

XGBoost default model expected logloss = 0.69204
XGBoost default model expected accuracy = 0.5204


### Hyperparameter Optimization

Hyperopt is a Python library for optimizing over awkward search spaces with real-valued, discrete, and conditional dimensions, using the Tree of Parzen Estimators (TPE) algorithm.

In [19]:
from hyperopt import hp
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from sklearn.model_selection import StratifiedKFold, train_test_split

In [21]:
def cross_validated_scorer(X_train, y_train, model_class, params, loss, kfolds=3):
    """Return the sign-flipped mean cross-validation score for one configuration.

    Args:
        X_train: Feature matrix forwarded to ``cross_val_score``.
        y_train: Labels forwarded to ``cross_val_score``.
        model_class: Estimator class; instantiated as ``model_class(**params)``.
        params: Keyword arguments for ``model_class``. ``"n_estimators"`` and
            ``"max_depth"`` are cast to ``int`` in place, so the caller's
            dict is modified.
        loss: Scoring name passed to ``cross_val_score`` as ``scoring``.
        kfolds: Value passed to ``cross_val_score`` as ``cv``.

    Returns:
        ``-1`` times the mean of the per-fold scores. With a negated-loss
        scorer such as ``"neg_log_loss"`` this is a positive loss value.
    """
    # These two hyperparameters must be integers but may arrive as floats.
    for int_key in ("n_estimators", "max_depth"):
        params[int_key] = int(params[int_key])
    print("Training with params : %s" % (params))

    estimator = model_class(**params)
    fold_scores = cross_val_score(estimator, X_train, y=y_train, scoring=loss, cv=kfolds, n_jobs=1)
    cv_score = -1 * fold_scores.mean()
    print(cv_score)
    return cv_score

def optimize(trials, max_evals=25):
    """Search XGBClassifier hyperparameters with hyperopt's TPE algorithm.

    Every candidate configuration is scored with ``cross_validated_scorer``
    using the ``"neg_log_loss"`` scorer on the notebook-level ``X_train`` and
    ``y_train`` arrays, which must already be defined when this is called.

    Args:
        trials: Object passed to ``fmin`` as its ``trials`` argument.
        max_evals: Number of configurations ``fmin`` evaluates. Defaults to
            25, the value that used to be hard-coded.

    Returns:
        The value returned by ``fmin``. The notebook casts its
        ``"n_estimators"`` and ``"max_depth"`` entries to ``int`` afterwards,
        so do not assume they are integers here.
    """
    # Search space: every dimension is a quantised uniform (low, high, step).
    hyperopt_grid = {
        'max_depth': hp.quniform('max_depth', 1, 10, 1),
        'learning_rate': hp.quniform('learning_rate', 0.01, 0.5, 0.01),
        'n_estimators': hp.quniform('n_estimators', 25, 525, 25),
        'gamma': hp.quniform('gamma', 0.0, 1.0, 0.05),
        'min_child_weight': hp.quniform('min_child_weight', 1, 4, 1),
        'subsample': hp.quniform('subsample', 0.2, 1, 0.1),
        'colsample_bytree': hp.quniform('colsample_bytree', 0.2, 1.0, 0.1),
    }

    def objective(params):
        """Score one sampled configuration and wrap it in hyperopt's result dict."""
        err = cross_validated_scorer(X_train, y_train, XGBClassifier, params, loss="neg_log_loss")
        return {'loss': err, 'params': params, 'status': STATUS_OK}

    best = fmin(objective, hyperopt_grid, algo=tpe.suggest, trials=trials, max_evals=max_evals)
    return best

In [22]:
%%time
# Run the hyperparameter search. `trials` is handed to fmin inside optimize();
# `best` receives whatever optimize() returns.
trials = Trials()
best = optimize(trials)

Training with params : {'n_estimators': 325, 'subsample': 0.5, 'colsample_bytree': 0.9, 'gamma': 0.45, 'learning_rate': 0.22, 'max_depth': 10, 'min_child_weight': 3.0}
0.842002856784
Training with params : {'n_estimators': 375, 'subsample': 0.5, 'colsample_bytree': 0.6000000000000001, 'gamma': 0.4, 'learning_rate': 0.3, 'max_depth': 8, 'min_child_weight': 4.0}
0.85304831221
Training with params : {'n_estimators': 150, 'subsample': 0.9, 'colsample_bytree': 0.8, 'gamma': 0.65, 'learning_rate': 0.37, 'max_depth': 5, 'min_child_weight': 3.0}
0.711107875332
Training with params : {'n_estimators': 350, 'subsample': 0.9, 'colsample_bytree': 0.5, 'gamma': 0.30000000000000004, 'learning_rate': 0.23, 'max_depth': 10, 'min_child_weight': 3.0}
0.768937417351
Training with params : {'n_estimators': 400, 'subsample': 0.4, 'colsample_bytree': 0.2, 'gamma': 0.05, 'learning_rate': 0.15, 'max_depth': 2, 'min_child_weight': 1.0}
0.694207613611
Training with params : {'n_estimators': 400, 'subsample': 0.7

In [23]:
# Cast the integer-valued hyperparameters back to int before reusing them.
for int_key in ("n_estimators", "max_depth"):
    best[int_key] = int(best[int_key])

In [24]:
# Show the hyperparameters selected by the search.
print(best)

{'colsample_bytree': 0.30000000000000004, 'learning_rate': 0.01, 'min_child_weight': 3.0, 'n_estimators': 400, 'subsample': 0.2, 'max_depth': 5, 'gamma': 0.9500000000000001}


In [11]:
# Rebuild the classifier from the `best` hyperparameter dict; this rebinds
# `model`, replacing the default-parameter instance created earlier.
model = XGBClassifier(**best)

In [26]:
# 3-fold cross-validated scores for the tuned model. The "neg_log_loss"
# scorer yields the negated loss, so the sign is flipped back before printing.
tuned_logloss = -cross_val_score(model, X_train, y_train, scoring="neg_log_loss", cv=3).mean()
print("XBBoost optimized model expected logloss = %.5f" % tuned_logloss)
tuned_accuracy = cross_val_score(model, X_train, y_train, scoring="accuracy", cv=3).mean()
print("XBBoost optimized model expected accuracy = %.4f" % tuned_accuracy)

XBBoost optimized model expected logloss = 0.69175
XBBoost optimized model expected accuracy = 0.5208


### Bagging

A Bagging classifier is an ensemble meta-estimator that fits base classifiers, each on a random subset of the original dataset, and then aggregates their individual predictions to form a final prediction. Such a meta-estimator can typically be used as a way to reduce the variance of a black-box estimator, by introducing randomization into its construction procedure and then making an ensemble out of it.

When base estimators are built on subsets of both samples and features, the method is known as Random Patches.

In [13]:
from sklearn.ensemble import BaggingClassifier
# Ensemble of 10 copies of `model`, each trained on 90% of the rows and 90%
# of the features, sampled without replacement (bootstrap=False).
# random_state is fixed so the random subsets, and therefore the reported
# scores, are repeatable from run to run.
bagged_model = BaggingClassifier(model, n_estimators=10, max_samples=0.9, max_features=0.9,
                                 bootstrap=False, random_state=0)

In [14]:
# 3-fold cross-validated scores for the bagged model. The "neg_log_loss"
# scorer yields the negated loss, so the sign is flipped back before printing.
bagged_logloss = -cross_val_score(bagged_model, X_train, y_train, scoring="neg_log_loss", cv=3).mean()
print("XBBoost bagged model expected logloss = %.5f" % bagged_logloss)
bagged_accuracy = cross_val_score(bagged_model, X_train, y_train, scoring="accuracy", cv=3).mean()
print("XBBoost bagged model expected accuracy = %.4f" % bagged_accuracy)

XBBoost bagged model expected logloss = 0.69160
XBBoost bagged model expected accuracy = 0.5214


### Probability Calibration 

The calibration module allows you to better calibrate the probabilities of a given model, or to add support for probability prediction.
Well calibrated classifiers are probabilistic classifiers for which the output of the predict_proba method can be directly interpreted as a confidence level.

We are using a parametric approach based on Platt’s sigmoid model.

In [15]:
from sklearn.calibration import CalibratedClassifierCV
# Sigmoid (Platt) calibration of the bagged ensemble with 5 internal folds.
# The estimator is passed positionally because its keyword name changed from
# `base_estimator` to `estimator` in scikit-learn 1.2; the positional form
# works on both older and newer releases.
best_model = CalibratedClassifierCV(bagged_model, method='sigmoid', cv=5)

In [16]:
# 3-fold cross-validated scores for the calibrated model. The "neg_log_loss"
# scorer yields the negated loss, so the sign is flipped back before printing.
calibrated_logloss = -cross_val_score(best_model, X_train, y_train, scoring="neg_log_loss", cv=3).mean()
print("XGBoost calibrated model expected logloss = %.5f" % calibrated_logloss)
calibrated_accuracy = cross_val_score(best_model, X_train, y_train, scoring="accuracy", cv=3).mean()
print("XGBoost calibrated model expected accuracy = %.4f" % calibrated_accuracy)

XGBoost calibrated model expected logloss = 0.69160
XGBoost calibrated model expected accuracy = 0.5222


### Obtaining final predictions

In [17]:
%%time
# Fit the calibrated ensemble on the full training arrays; the value returned
# by fit() is bound back to best_model.
best_model = best_model.fit(X_train, y_train)

CPU times: user 1h 12min 45s, sys: 4.25 s, total: 1h 12min 50s
Wall time: 9min 28s


In [18]:
# Load the tournament rows to be predicted.
testdata = pd.read_csv(workdir+'numerai_tournament_data.csv')
# Set the row identifiers aside; pop() also removes the column, so only
# feature columns remain in testdata.
ids = testdata.pop('t_id')
# `.values` replaces `.as_matrix()`, which was deprecated in pandas 0.23 and
# removed in 1.0.
X_test = testdata.values

In [21]:
# Column 1 of predict_proba is kept as the submitted probability.
predictions = best_model.predict_proba(X_test)[:, 1]

# The example submission file is used as the output template.
# NOTE(review): this assumes its rows are in the same order as the tournament
# file; the `ids` column extracted earlier is never compared against it. Confirm.
results = pd.read_csv(workdir+"example_predictions.csv")
results["probability"] = predictions

model_name = "XGB_opt_bag_calib"
submission_path = "%ssubmission_%s.csv" % (workdir, model_name)
results.to_csv(submission_path, index=False)

*submission_XGB_opt_bag_calib.csv has logloss of 0.690*