Stacking (also called meta ensembling) is a model ensembling technique used to combine information from multiple predictive models to generate a new model. Oftentimes the stacked model (also called the 2nd-level model) will outperform each of the individual models due to its smoothing nature and its ability to highlight each base model where it performs best and discredit each base model where it performs poorly. For this reason, stacking is most effective when the base models are significantly different.

http://blog.kaggle.com/2016/12/27/a-kagglers-guide-to-model-stacking-in-practice/

In [1]:
import os

import numpy as np
import pandas as pd
# Data directory. Set the NUMERAI_DATA_DIR environment variable to override it;
# otherwise the original location is used. os.path.join(..., "") guarantees a
# trailing separator, because later cells build paths as workdir + filename.
workdir = os.path.join(os.environ.get("NUMERAI_DATA_DIR", "/home/ubuntu/data/"), "")

In [2]:
train = pd.read_csv(workdir+'numerai_training_data.csv')

In [3]:
train.columns

Index([u'feature1', u'feature2', u'feature3', u'feature4', u'feature5',
       u'feature6', u'feature7', u'feature8', u'feature9', u'feature10',
       u'feature11', u'feature12', u'feature13', u'feature14', u'feature15',
       u'feature16', u'feature17', u'feature18', u'feature19', u'feature20',
       u'feature21', u'feature22', u'feature23', u'feature24', u'feature25',
       u'feature26', u'feature27', u'feature28', u'feature29', u'feature30',
       u'feature31', u'feature32', u'feature33', u'feature34', u'feature35',
       u'feature36', u'feature37', u'feature38', u'feature39', u'feature40',
       u'feature41', u'feature42', u'feature43', u'feature44', u'feature45',
       u'feature46', u'feature47', u'feature48', u'feature49', u'feature50',
       u'target'],
      dtype='object')

In [4]:
n_folds = 3

In [5]:
# Assign every training row to one of n_folds folds, uniformly at random.
# A dedicated, seeded generator keeps the split (and therefore every
# out-of-fold prediction derived from it) reproducible across kernel restarts.
fold_rng = np.random.RandomState(42)
train["Fold"] = fold_rng.choice(range(1, n_folds + 1), train.shape[0])

In [6]:
# Column roles shared by the modelling cells below.
folds_col = "Fold"      # column holding the fold label of each training row
target_col = "target"   # column holding the label to predict
# The 50 predictor columns are named feature1 .. feature50.
features = [u'feature{}'.format(number) for number in range(1, 51)]

In [7]:
def predict_by_folds(model, model_name, train_df1, folds_col, target_col, features_col):
    """Add out-of-fold predictions of `model` to a copy of `train_df1`.

    For every distinct value in `folds_col`, the model is fitted on the rows
    of all other folds and then scores the rows of the held-out fold, so each
    row's prediction comes from a fit that did not include that row.

    Parameters
    ----------
    model : object exposing ``fit(X, y)`` and ``predict_proba(X)``
        Refitted once per fold.
    model_name : str
        Name of the column that receives the predictions.
    train_df1 : pandas.DataFrame
        Training data; it is copied and never modified.
    folds_col : str
        Column holding the fold label of each row.
    target_col : str
        Column holding the target.
    features_col : list of str
        Columns used as model inputs.

    Returns
    -------
    pandas.DataFrame
        Copy of `train_df1` with an extra `model_name` column filled with
        column 1 of ``predict_proba`` for the held-out rows.
    """
    train_df = train_df1.copy()

    folds = train_df[folds_col]
    target = train_df[target_col]
    data = train_df[features_col]

    for f in folds.unique():
        train_i = folds != f
        # `~` is the boolean complement; unary minus on a boolean mask is not
        # portable across numpy/pandas versions.
        test_i = ~train_i
        # `.values` replaces DataFrame.as_matrix(), which was removed in pandas 1.0.
        model.fit(data[train_i].values, target[train_i].values.ravel())
        y1_pred = model.predict_proba(data[test_i].values)
        train_df.loc[test_i, model_name] = y1_pred[:, 1]

    return train_df

In [8]:
def make_predictions(model, train_df, test_df, target_col, features_col):
    """Fit `model` on the whole of `train_df` and score `test_df`.

    Parameters
    ----------
    model : object exposing ``fit(X, y)`` and ``predict_proba(X)``
        Refitted by this call.
    train_df : pandas.DataFrame
        Training rows; must contain `target_col` and `features_col`.
    test_df : pandas.DataFrame
        Rows to score; must contain `features_col`.
    target_col : str
        Column holding the target.
    features_col : list of str
        Columns used as model inputs.

    Returns
    -------
    numpy.ndarray
        Column 1 of ``predict_proba`` for every row of `test_df`.
    """
    # `.values` replaces DataFrame.as_matrix(), which was removed in pandas 1.0.
    X_train = train_df[features_col].values
    y_train = train_df[target_col].values.ravel()
    X_test = test_df[features_col].values
    model.fit(X_train, y_train)
    return model.predict_proba(X_test)[:, 1]

In [9]:
testdata = pd.read_csv(workdir+'numerai_tournament_data.csv')

In [10]:
# Take an explicit copy: one column per level-1 model is written into
# `predictions` later, and assigning into a slice of `testdata` would raise
# SettingWithCopyWarning.
predictions = testdata[["t_id"]].copy()

In [11]:
l1models = []

In [12]:
train.to_csv(workdir+'numerai_training_data_folds.csv', index=False)

### Level 1 Models

Model 1: XGBoost

In [21]:
from xgboost import XGBClassifier
# Keyword arguments later unpacked into XGBClassifier.
best_params = dict(
    colsample_bytree=0.3,
    learning_rate=0.01,
    min_child_weight=3.0,
    n_estimators=400,
    subsample=0.2,
    max_depth=5,
    gamma=0.95
)

In [14]:
# Base learner: XGBoost classifier configured with best_params.
model = XGBClassifier(**best_params)

# Bag 10 copies of the base learner, each fitted on 90% of the rows
# (bootstrap=False: sampled without replacement) and 90% of the features.
from sklearn.ensemble import BaggingClassifier
bagged_model = BaggingClassifier(model, n_estimators=10, max_samples=0.9, max_features=0.9, bootstrap=False, n_jobs=1)

# Calibrate the bagged model with the sigmoid method using 5-fold CV.
# The wrapped estimator is passed positionally because its keyword was renamed
# from `base_estimator` to `estimator` in scikit-learn 1.2; the positional form
# works with both spellings.
from sklearn.calibration import CalibratedClassifierCV
best_model = CalibratedClassifierCV(bagged_model, method='sigmoid', cv=5)

In [15]:
%%time
# Out-of-fold predictions of the calibrated XGBoost model, stored in a new
# "XGBClassifier" column of a copy of `train`.
train1 = predict_by_folds(
    model=best_model,
    model_name="XGBClassifier",
    train_df1=train,
    folds_col=folds_col,
    target_col=target_col,
    features_col=features,
)

CPU times: user 2h 56min 2s, sys: 6.8 s, total: 2h 56min 9s
Wall time: 11min 25s


In [16]:
%%time
# Refit on all training rows and score the tournament data.
prediction = make_predictions(
    model=best_model,
    train_df=train,
    test_df=testdata,
    target_col=target_col,
    features_col=features,
)

CPU times: user 1h 25min 49s, sys: 18.8 s, total: 1h 26min 7s
Wall time: 5min 48s


In [19]:
predictions.loc[:, "XGBClassifier"] = prediction

In [20]:
l1models.append("XGBClassifier")

In [22]:
#train1.to_csv(workdir+'numerai_training_data_folds.csv', index=False)
#predictions.to_csv(workdir+'numerai_predictions.csv', index=False)

Model 2: RF

In [23]:
from sklearn.ensemble import RandomForestClassifier
# Keyword arguments later unpacked into RandomForestClassifier;
# n_jobs=-1 asks scikit-learn to use every available core.
best_params = {
    'min_samples_split': 4,
    'n_estimators': 1500,
    'n_jobs': -1,
}

In [24]:
# Random forest configured with best_params, calibrated with the sigmoid
# method using 3-fold CV.
model = RandomForestClassifier(**best_params)
# The wrapped estimator is passed positionally because its keyword was renamed
# from `base_estimator` to `estimator` in scikit-learn 1.2.
best_model = CalibratedClassifierCV(model, method='sigmoid', cv=3)

In [25]:
%%time
# Out-of-fold predictions of the calibrated random forest, added to train1 as
# a "RandomForestClassifier" column next to the earlier model columns.
train1 = predict_by_folds(
    model=best_model,
    model_name="RandomForestClassifier",
    train_df1=train1,
    folds_col=folds_col,
    target_col=target_col,
    features_col=features,
)

CPU times: user 3h 52min 26s, sys: 25.4 s, total: 3h 52min 51s
Wall time: 15min 20s


In [26]:
%%time
# Refit on all training rows and score the tournament data.
prediction = make_predictions(
    model=best_model,
    train_df=train,
    test_df=testdata,
    target_col=target_col,
    features_col=features,
)

CPU times: user 2h 5min 56s, sys: 12.4 s, total: 2h 6min 8s
Wall time: 8min 9s


In [27]:
predictions.loc[:, "RandomForestClassifier"] = prediction
l1models.append("RandomForestClassifier")

In [28]:
train1.to_csv(workdir+'numerai_training_data_folds.csv', index=False)
predictions.to_csv(workdir+'numerai_predictions.csv', index=False)

Model 3: k-NN

In [29]:
from sklearn.neighbors import KNeighborsClassifier
# Keyword arguments later unpacked into KNeighborsClassifier;
# n_jobs=-1 asks scikit-learn to use every available core.
best_params = {
    'n_neighbors': 900,
    'n_jobs': -1,
}

In [30]:
# k-NN classifier configured with best_params, calibrated with the sigmoid
# method using 3-fold CV.
model = KNeighborsClassifier(**best_params)
# The wrapped estimator is passed positionally because its keyword was renamed
# from `base_estimator` to `estimator` in scikit-learn 1.2.
best_model = CalibratedClassifierCV(model, method='sigmoid', cv=3)

In [31]:
%%time
# Out-of-fold predictions of the calibrated k-NN model, added to train1 as a
# "KNeighborsClassifier" column.
train1 = predict_by_folds(
    model=best_model,
    model_name="KNeighborsClassifier",
    train_df1=train1,
    folds_col=folds_col,
    target_col=target_col,
    features_col=features,
)

CPU times: user 4h 18min 21s, sys: 2.92 s, total: 4h 18min 24s
Wall time: 17min 7s


In [34]:
%%time
# Refit on all training rows, score the tournament data and store the result
# as the "KNeighborsClassifier" column of `predictions`.
prediction = make_predictions(
    model=best_model,
    train_df=train,
    test_df=testdata,
    target_col=target_col,
    features_col=features,
)
predictions.loc[:, "KNeighborsClassifier"] = prediction

CPU times: user 5h 11min 1s, sys: 2.35 s, total: 5h 11min 4s
Wall time: 20min 16s


In [35]:
l1models.append("KNeighborsClassifier")

In [36]:
train1.to_csv(workdir+'numerai_training_data_folds.csv', index=False)
predictions.to_csv(workdir+'numerai_predictions.csv', index=False)

Model 4: Naive Bayes

In [37]:
from sklearn.naive_bayes import GaussianNB
model = GaussianNB()

In [38]:
# Bag 10 copies of the Gaussian naive Bayes model, each fitted on 90% of the
# rows (bootstrap=False: sampled without replacement) and 90% of the features.
bagged_model = BaggingClassifier(model, n_estimators=10, max_samples=0.9, max_features=0.9, bootstrap=False, n_jobs=-1)
# Calibrate with the sigmoid method using 5-fold CV. The wrapped estimator is
# passed positionally because its keyword was renamed from `base_estimator`
# to `estimator` in scikit-learn 1.2.
best_model = CalibratedClassifierCV(bagged_model, method='sigmoid', cv=5)

In [39]:
%%time
# Out-of-fold predictions of the calibrated naive Bayes model, added to train1
# as a "NaiveBayes" column.
train1 = predict_by_folds(
    model=best_model,
    model_name="NaiveBayes",
    train_df1=train1,
    folds_col=folds_col,
    target_col=target_col,
    features_col=features,
)

CPU times: user 17.8 s, sys: 10.2 s, total: 28 s
Wall time: 33.3 s


In [40]:
%%time
# Refit on all training rows and score the tournament data.
prediction = make_predictions(
    model=best_model,
    train_df=train,
    test_df=testdata,
    target_col=target_col,
    features_col=features,
)

CPU times: user 9.4 s, sys: 3.44 s, total: 12.8 s
Wall time: 15.5 s


In [41]:
predictions.loc[:, "NaiveBayes"] = prediction
l1models.append("NaiveBayes")

In [42]:
train1.to_csv(workdir+'numerai_training_data_folds.csv', index=False)
predictions.to_csv(workdir+'numerai_predictions.csv', index=False)

Model 5: LogisticRegression

In [43]:
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression
model = LogisticRegressionCV(max_iter=500, n_jobs=-1)

In [44]:
model.fit(train[features], train[target_col])

LogisticRegressionCV(Cs=10, class_weight=None, cv=None, dual=False,
           fit_intercept=True, intercept_scaling=1.0, max_iter=500,
           multi_class='ovr', n_jobs=-1, penalty='l2', random_state=None,
           refit=True, scoring=None, solver='lbfgs', tol=0.0001, verbose=0)

In [45]:
C = model.C_[0]
C

0.0059948425031894088

In [46]:
model = LogisticRegression(max_iter=500, C=C) 

In [47]:
# Bag 10 copies of the logistic regression, each fitted on 90% of the rows
# (bootstrap=False: sampled without replacement) and 90% of the features.
bagged_model = BaggingClassifier(model, n_estimators=10, max_samples=0.9, max_features=0.9, bootstrap=False, n_jobs=-1)
# Calibrate with the sigmoid method using 5-fold CV. The wrapped estimator is
# passed positionally because its keyword was renamed from `base_estimator`
# to `estimator` in scikit-learn 1.2.
best_model = CalibratedClassifierCV(bagged_model, method='sigmoid', cv=5)

In [48]:
%%time
# Out-of-fold predictions of the calibrated logistic regression, added to
# train1 as a "LogisticRegression" column.
train1 = predict_by_folds(
    model=best_model,
    model_name="LogisticRegression",
    train_df1=train1,
    folds_col=folds_col,
    target_col=target_col,
    features_col=features,
)

CPU times: user 18.2 s, sys: 10.2 s, total: 28.4 s
Wall time: 53.6 s


In [49]:
%%time
# Refit on all training rows, score the tournament data and store the result
# as the "LogisticRegression" column of `predictions`.
prediction = make_predictions(
    model=best_model,
    train_df=train,
    test_df=testdata,
    target_col=target_col,
    features_col=features,
)
predictions.loc[:, "LogisticRegression"] = prediction

CPU times: user 9.38 s, sys: 3.34 s, total: 12.7 s
Wall time: 25.8 s


In [50]:
l1models.append("LogisticRegression")

In [51]:
train1.to_csv(workdir+'numerai_training_data_folds.csv', index=False)
predictions.to_csv(workdir+'numerai_predictions.csv', index=False)

### Level 2 Model

In [52]:
# Level-2 inputs: one column of level-1 predictions per model in l1models.
train2 = train1.loc[:, l1models]        # out-of-fold predictions on the training rows
target = train.loc[:, target_col]       # labels for those rows
test2 = predictions.loc[:, l1models]    # level-1 predictions on the tournament rows

In [57]:
# Level-2 model: cross-validated logistic regression fitted on the level-1
# predictions. fit() returns the estimator itself, so the chained form leaves
# the same fitted object in model2; the bare name keeps its repr as cell output.
model2 = LogisticRegressionCV(max_iter=500, n_jobs=-1).fit(train2, target)
model2

LogisticRegressionCV(Cs=10, class_weight=None, cv=None, dual=False,
           fit_intercept=True, intercept_scaling=1.0, max_iter=500,
           multi_class='ovr', n_jobs=-1, penalty='l2', random_state=None,
           refit=True, scoring=None, solver='lbfgs', tol=0.0001, verbose=0)

In [58]:
# Pair each level-1 model name with its level-2 coefficient. list()
# materialises the pairs so they are displayed on Python 3 as well, where
# zip() returns a lazy iterator.
list(zip(l1models, model2.coef_[0]))

[('XGBClassifier', 1.4024466949162022),
 ('RandomForestClassifier', 0.8456356192421759),
 ('KNeighborsClassifier', 0.56326595638255461),
 ('NaiveBayes', 0.31091337980485001),
 ('LogisticRegression', 1.7201707132887152)]

In [59]:
prediction2 = model2.predict_proba(test2)

In [60]:
# Build the submission from the example file, setting its "probability"
# column to column 1 of the level-2 predict_proba output.
# NOTE(review): assumes example_predictions.csv lists rows in the same order
# as the tournament data — confirm.
results = pd.read_csv(workdir+"example_predictions.csv").assign(probability=prediction2[:, 1])
results.to_csv(workdir+"submission_stacked_lr_1.csv", index=False)

*submission_stacked_lr_1.csv has logloss of 0.689.*