In [29]:
import pandas as pd
from sklearn import preprocessing
from mlxtend.classifier import StackingCVClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB 
from sklearn import model_selection
import numpy as np
from xgboost import XGBClassifier

In [30]:
#data for training 
train = pd.read_csv('train_s.csv', sep=',')
# Select only 5000 obs. to show demo
train = train.head(5000)
train = train.drop('ID_code', 1)

In [31]:
# prediction data
preddata = pd.read_csv('test_s.csv', sep=',')
predids = preddata[['ID_code']] 
preddata = preddata.drop('ID_code', 1)

In [32]:
# Format train data
y_train = train['target']
x_train = train.drop('target', 1)

In [33]:
# Scaling the data
scaler = preprocessing.StandardScaler()
scaled_df = scaler.fit_transform(x_train)
x_train = pd.DataFrame(scaled_df)
scaled_df = scaler.fit_transform(preddata)
preddata = pd.DataFrame(scaled_df)

In [2]:
# x,y to np 
x_train = x_train.values
y_train = y_train.values

The next step is to train and stack some models. Here I use Logistic Regression, RF. The tree models will be stacked using xgboost. In the code below, the models and the stacking classifier are defined first. Then each model is trained using CV.

Finally,the stacking classifier is fitted and predictions are obtained.

In [35]:
# 1st set of models
clf1 = LogisticRegression()
clf2 = RandomForestClassifier(random_state=1, n_estimators=10) # just for demo have taken 10 trees 
xgb = XGBClassifier()

stacking_demo = StackingCVClassifier(classifiers=[clf1, clf2], meta_classifier=xgb, use_probas=True, cv=3)

In [36]:
# Do CV
for clf, label in zip([clf1, clf2, stacking_demo], 
                      ['lr', 
                       'Random Forest', 
                       'StackingClassifier']):

    scores = model_selection.cross_val_score(clf, x_train, y_train, cv=3, scoring='roc_auc')
    print("Accuracy: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), label))


Accuracy: 0.79 (+/- 0.01) [lr]
Accuracy: 0.64 (+/- 0.01) [Random Forest]
Accuracy: 0.76 (+/- 0.00) [StackingClassifier]


In [37]:
# Fit on train data / predict on test data
sclf_fit = stacking_demo.fit(x_train, y_train)
mypreds = sclf_fit.predict_proba(preddata)
# "predict" give us classes, "predict_proba" give us probabilities

# Probabilities for classes (1,0)
zeros = [i[0] for i in mypreds]
ones  = [i[1] for i in mypreds]

# Get IDs and predictions
y_id = predids.values.tolist()
preddf = pd.DataFrame({'ID_code': y_id,'target': ones})
preddf['ID_code'] = preddf['ID_code'].map(lambda x: str(x)[:-2])
preddf['ID_code'] = preddf['ID_code'].map(lambda x: str(x)[2:])

# Look at predictions
print(preddf.head())

  ID_code    target
0  test_0  0.117983
1  test_1  0.071644
2  test_2  0.011566
3  test_3  0.083270
4  test_4  0.017479
