# numer.ai

Numerai is using breakthroughs in encryption to allow data scientists to participate in solving a stock market machine learning
problem.

https://numer.ai/about

In [1]:
import pandas as pd
import numpy as np
# Data directory used by every read/write below. It must end with a slash because
# file names are appended to it with `+`.
# NOTE(review): machine-specific absolute path — adjust before running elsewhere.
workdir = "G:/numer.ai/"

In [2]:
# Load the training set from the data directory.
train_csv_path = workdir + 'numerai_training_data.csv'
traindata = pd.read_csv(train_csv_path)

In [4]:
# Show the first two rows to check the column layout.
traindata.head(2)

Unnamed: 0,feature1,feature2,feature3,feature4,feature5,feature6,feature7,feature8,feature9,feature10,...,feature13,feature14,feature15,feature16,feature17,feature18,feature19,feature20,feature21,target
0,0.017687,0.843548,0.814934,0.276967,0.912423,0.988795,0.858665,0.179334,0.80715,0.073117,...,0.902173,0.995616,0.688415,0.555081,0.875725,0.879434,0.095635,0.070577,0.003269,0
1,0.355228,0.565411,0.75971,0.907185,0.717064,0.806277,0.888097,0.963211,0.821191,0.261696,...,0.856744,0.651405,0.836491,0.617654,0.832884,0.341864,0.348834,0.406405,0.427224,1


In [5]:
# Split the label column off the feature frame.
# The membership guard keeps the cell safe to re-run: once "target" has been
# removed from `traindata`, a second execution is a no-op instead of a KeyError.
if "target" in traindata.columns:
    target = traindata.pop("target")

In [6]:
# Summary statistics of the feature columns (the label was removed above).
traindata.describe()

Unnamed: 0,feature1,feature2,feature3,feature4,feature5,feature6,feature7,feature8,feature9,feature10,...,feature12,feature13,feature14,feature15,feature16,feature17,feature18,feature19,feature20,feature21
count,136573.0,136573.0,136573.0,136573.0,136573.0,136573.0,136573.0,136573.0,136573.0,136573.0,...,136573.0,136573.0,136573.0,136573.0,136573.0,136573.0,136573.0,136573.0,136573.0,136573.0
mean,0.473665,0.512631,0.511716,0.502332,0.498231,0.489776,0.493125,0.50237,0.501107,0.487738,...,0.503058,0.495264,0.508009,0.498999,0.489074,0.516172,0.510164,0.483868,0.503173,0.4904
std,0.291512,0.290201,0.288814,0.293186,0.293537,0.291347,0.288602,0.290478,0.278691,0.290059,...,0.288714,0.287911,0.286618,0.28814,0.287188,0.292215,0.292922,0.288383,0.287108,0.285497
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.4e-05,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.204125,0.266122,0.255894,0.241572,0.228471,0.22894,0.253284,0.252685,0.269538,0.229783,...,0.257962,0.238967,0.267147,0.247278,0.239038,0.262311,0.260789,0.236508,0.250369,0.247879
50%,0.456418,0.529997,0.527758,0.516066,0.504222,0.489689,0.497935,0.509452,0.491312,0.48875,...,0.501828,0.490198,0.505247,0.508547,0.478298,0.533366,0.500247,0.474127,0.512241,0.484387
75%,0.727077,0.761698,0.763739,0.752599,0.74593,0.740191,0.741812,0.745669,0.738147,0.746375,...,0.763938,0.746661,0.752855,0.741561,0.723346,0.766626,0.762757,0.727005,0.743788,0.732487
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [7]:
# Summary statistics of the label; the mean gives the share of positive rows.
target.describe()

count    136573.000000
mean          0.503343
std           0.499991
min           0.000000
25%           0.000000
50%           1.000000
75%           1.000000
max           1.000000
Name: target, dtype: float64

In [8]:
# Convert to plain NumPy arrays for scikit-learn / XGBoost.
# `.values` replaces `as_matrix()`, which was deprecated in pandas 0.23 and
# removed in pandas 1.0; `.values` is available on old and new versions alike.
X_train = traindata.values
y_train = target.values.ravel()

### Creating a model with eXtreme Gradient Boosting

XGBoost is an optimized distributed gradient boosting library designed to be highly efficient, flexible and portable. It implements machine learning algorithms under the Gradient Boosting framework. XGBoost provides parallel tree boosting (also known as GBDT, GBM) that solves many data science problems in a fast and accurate way.

In [16]:
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score

In [10]:
# Baseline classifier built with no arguments, i.e. the library's default hyperparameters.
model = XGBClassifier()

In [19]:
# 3-fold CV of the default model. The "neg_log_loss" scorer is negated, so the
# sign is flipped back before printing.
logloss_folds = cross_val_score(model, X_train, y_train, scoring="neg_log_loss", cv=3)
print("XGBoost default model expected logloss = %.4f" % (-logloss_folds.mean()))

accuracy_folds = cross_val_score(model, X_train, y_train, scoring="accuracy", cv=3)
print("XGBoost default model expected accuracy = %.3f" % (accuracy_folds.mean()))

XGBoost default model expected logloss = 0.6921
XGBoost default model expected accuracy = 0.520


### Hyperparameter Optimization

hyperopt is a Python library for optimizing over awkward search spaces with real-valued, discrete, and conditional dimensions, using the Tree of Parzen Estimators (TPE) algorithm.

In [20]:
from hyperopt import hp
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from sklearn.model_selection import StratifiedKFold, train_test_split

In [21]:
# We'll use a small random subset of the data to speed up the optimization.
# The split is taken from the full frame rather than from X_train, so re-running
# this cell does not keep shrinking the sample, and the seed is fixed so the
# hyperparameter search sees the same subset on every run.
X_train, X_val, y_train, y_val = train_test_split(
    traindata.values, target.values.ravel(), train_size=0.333, random_state=0)

In [22]:
def cross_validated_scorer(X_train, y_train, model_class, params, loss, kfolds=3):
    """Cross-validate one parameter set and return the sign-flipped mean score.

    Args:
        X_train: feature matrix handed to ``cross_val_score``.
        y_train: labels handed to ``cross_val_score``.
        model_class: callable invoked as ``model_class(**params)`` to build the estimator.
        params: keyword arguments for ``model_class``. The ``"n_estimators"`` and
            ``"max_depth"`` entries are cast to ``int`` IN PLACE, so the caller's
            dict is modified.
        loss: scoring name passed through as ``scoring=``.
        kfolds: value passed through as ``cv=``.

    Returns:
        ``-1 *`` the mean of the per-fold scores (so a negated scorer such as
        ``"neg_log_loss"`` comes back as a positive loss).
    """
    # These two may arrive as floats (e.g. from a quantised search space); cast them.
    for int_key in ("n_estimators", "max_depth"):
        params[int_key] = int(params[int_key])

    print("Training with params : %s" % (params))

    estimator = model_class(**params)
    fold_scores = cross_val_score(estimator, X_train, y=y_train, scoring=loss,
                                  cv=kfolds, n_jobs=1)
    cv_score = -1 * fold_scores.mean()
    print(cv_score)
    return cv_score

def optimize(trials, max_evals=25):
    """Search XGBClassifier hyperparameters with hyperopt's TPE algorithm.

    Each candidate is scored by ``cross_validated_scorer`` with
    ``loss="neg_log_loss"`` on the module-level ``X_train`` / ``y_train``.

    Args:
        trials: object passed to ``fmin`` as ``trials=``; it accumulates the
            search history.
        max_evals: number of candidate parameter sets to evaluate. Defaults to
            25, the value that used to be hard-coded.

    Returns:
        Whatever ``fmin`` returns for the best point found.
    """
    # Every dimension is quantised (quniform), so sampled values land on a grid.
    hyperopt_grid = {
            'max_depth' : hp.quniform('max_depth', 1, 10, 1),
            'learning_rate' : hp.quniform('learning_rate', 0.01, 0.5, 0.01),
            'n_estimators' : hp.quniform('n_estimators', 25, 525, 25),
            'gamma' : hp.quniform('gamma', 0.0, 1.0, 0.05),
            'min_child_weight' : hp.quniform('min_child_weight', 1, 4, 1),
            'subsample' : hp.quniform('subsample', 0.2, 1, 0.1),
            'colsample_bytree' : hp.quniform('colsample_bytree', 0.2, 1.0, 0.1)
    }

    def objective(params):
        # The scorer returns a sign-flipped score, which fmin then minimises.
        err = cross_validated_scorer(X_train, y_train, XGBClassifier, params, loss="neg_log_loss")
        return {'loss': err, 'params': params, 'status': STATUS_OK}

    best = fmin(objective, hyperopt_grid, algo=tpe.suggest, trials=trials, max_evals=max_evals)
    return best

In [23]:
%%time
# Start from an empty Trials store so the search history can be inspected afterwards.
trials = Trials()
best = optimize(trials)

Training with params : {'min_child_weight': 3.0, 'max_depth': 9, 'subsample': 0.9, 'learning_rate': 0.22, 'n_estimators': 75, 'gamma': 0.8500000000000001, 'colsample_bytree': 0.8}
0.729789670704
Training with params : {'min_child_weight': 3.0, 'max_depth': 8, 'subsample': 0.7000000000000001, 'learning_rate': 0.26, 'n_estimators': 225, 'gamma': 0.8, 'colsample_bytree': 0.6000000000000001}
0.808850793342
Training with params : {'min_child_weight': 2.0, 'max_depth': 8, 'subsample': 0.9, 'learning_rate': 0.42, 'n_estimators': 250, 'gamma': 0.5, 'colsample_bytree': 0.4}
0.880625596133
Training with params : {'min_child_weight': 2.0, 'max_depth': 3, 'subsample': 0.30000000000000004, 'learning_rate': 0.48, 'n_estimators': 275, 'gamma': 0.8, 'colsample_bytree': 0.2}
0.772507310438
Training with params : {'min_child_weight': 3.0, 'max_depth': 10, 'subsample': 0.5, 'learning_rate': 0.5, 'n_estimators': 500, 'gamma': 0.30000000000000004, 'colsample_bytree': 0.30000000000000004}
1.38009342922
Trai

In [24]:
# The search space is sampled with quniform, so these two entries are cast back
# to integers before being passed to XGBClassifier.
for _int_key in ("n_estimators", "max_depth"):
    best[_int_key] = int(best[_int_key])

In [25]:
# Show the winning hyperparameters after the integer casts.
print(best)

{'min_child_weight': 4.0, 'max_depth': 1, 'subsample': 1.0, 'learning_rate': 0.04, 'n_estimators': 125, 'gamma': 0.05, 'colsample_bytree': 0.4}


In [26]:
# Rebuild the classifier with the tuned hyperparameters (replaces the default model).
model = XGBClassifier(**best)

In [27]:
# Restore the full training arrays: the optimisation step rebound X_train/y_train
# to a one-third subsample.
# `.values` replaces `as_matrix()`, which was deprecated in pandas 0.23 and
# removed in pandas 1.0.
X_train = traindata.values
y_train = target.values.ravel()

In [30]:
# 3-fold CV of the tuned model on the full data. The "neg_log_loss" scorer is
# negated, so the sign is flipped back before printing.
logloss_folds = cross_val_score(model, X_train, y_train, scoring="neg_log_loss", cv=3)
print("XBBoost optimized model expected logloss = %.4f" % (-logloss_folds.mean()))

accuracy_folds = cross_val_score(model, X_train, y_train, scoring="accuracy", cv=3)
print("XBBoost optimized model expected accuracy = %.3f" % (accuracy_folds.mean()))

XBBoost optimized model expected logloss = 0.6918
XBBoost optimized model expected accuracy = 0.521


### Bagging

A Bagging classifier is an ensemble meta-estimator that fits base classifiers each on random subsets of the original dataset and then aggregates their individual predictions to form a final prediction. Such a meta-estimator can typically be used as a way to reduce the variance of a black-box estimator, by introducing randomization into its construction procedure and then making an ensemble out of it.

When base estimators are built on subsets of both samples and features, then the method is known as Random Patches.

In [31]:
from sklearn.ensemble import BaggingClassifier
# Random Patches configuration: 10 copies of the tuned model, each fit on 90% of
# the rows (bootstrap=False) and 90% of the features, trained with 4 parallel jobs.
bagged_model = BaggingClassifier(
    model,
    n_estimators=10,
    max_samples=0.9,
    max_features=0.9,
    bootstrap=False,
    n_jobs=4,
)

In [33]:
# 3-fold CV of the bagged ensemble. The "neg_log_loss" scorer is negated, so the
# sign is flipped back before printing.
logloss_folds = cross_val_score(bagged_model, X_train, y_train, scoring="neg_log_loss", cv=3)
print("XBBoost bagged model expected logloss = %.4f" % (-logloss_folds.mean()))

accuracy_folds = cross_val_score(bagged_model, X_train, y_train, scoring="accuracy", cv=3)
print("XBBoost bagged model expected accuracy = %.3f" % (accuracy_folds.mean()))

XBBoost bagged model expected logloss = 0.6918
XBBoost bagged model expected accuracy = 0.521


### Probability Calibration 

The calibration module allows you to better calibrate the probabilities of a given model, or to add support for probability prediction.
Well-calibrated classifiers are probabilistic classifiers for which the output of the predict_proba method can be directly interpreted as a confidence level.

We are using a parametric approach based on Platt’s sigmoid model 

In [34]:
from sklearn.calibration import CalibratedClassifierCV
# Platt (sigmoid) calibration with 5 internal folds on top of the bagged ensemble.
# The estimator is passed positionally, as BaggingClassifier is above: the keyword
# was renamed from `base_estimator` to `estimator` in scikit-learn 1.2 and the old
# name was later removed, whereas the first positional slot works on every version.
best_model = CalibratedClassifierCV(bagged_model, method='sigmoid', cv=5)

In [36]:
# 3-fold CV of the calibrated ensemble. The "neg_log_loss" scorer is negated, so
# the sign is flipped back before printing.
logloss_folds = cross_val_score(best_model, X_train, y_train, scoring="neg_log_loss", cv=3)
print("XGBoost calibrated model expected logloss = %.4f" % (-logloss_folds.mean()))

accuracy_folds = cross_val_score(best_model, X_train, y_train, scoring="accuracy", cv=3)
print("XGBoost calibrated model expected accuracy = %.3f" % (accuracy_folds.mean()))

XGBoost calibrated model expected logloss = 0.6916
XGBoost calibrated model expected accuracy = 0.522


### Obtaining final predictions

In [37]:
%%time
# Fit the calibrated ensemble on the full training arrays before predicting.
best_model = best_model.fit(X_train, y_train)

Wall time: 2min 11s


In [38]:
# Load the tournament (unlabelled) set from the data directory.
tournament_csv_path = workdir + 'numerai_tournament_data.csv'
testdata = pd.read_csv(tournament_csv_path)

In [39]:
# Separate the row identifiers from the features.
# The membership guard keeps the cell safe to re-run: once 't_id' has been removed,
# a second execution keeps the existing `ids` instead of raising KeyError.
if 't_id' in testdata.columns:
    ids = testdata.pop('t_id')
# `.values` replaces `as_matrix()`, which was deprecated in pandas 0.23 and
# removed in pandas 1.0.
X_test = testdata.values

In [40]:
# Keep only the second column (index 1) of predict_proba's output, i.e. the
# probability assigned to the positive class of the 0/1 target.
predictions = best_model.predict_proba(X_test)[:,1]

In [41]:
# Use the example submission as a template: set its "probability" column to our
# predictions and write the file next to the input data (no index column).
submission_template = pd.read_csv(workdir+"example_predictions.csv")
results = submission_template.assign(probability=predictions)
results.to_csv(workdir+"submission_xgb_calib.csv", index=False)

*submission_xgb_calib.csv has logloss of 0.68955*