# Stack and ensemble various models

In [6]:
import random
import os
import pickle
import datetime


import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from scipy.sparse import csr_matrix, hstack

from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import log_loss
from sklearn.cross_validation import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.grid_search import GridSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler

from keras.models import load_model, Sequential

import xgboost as xgb

from ml_toolbox.kaggle import KaggleResult


Using Theano backend.


In [7]:
def ensemble_preds(preds, scores, w=None):
    """Blend the predictions of several models into one probability matrix.

    Parameters
    ----------
    preds : numpy array of shape (n, m, k)
        n samples, m classes, k models.
    scores : array-like of length k
        One loss value per model (lower is better). Not used for the plain mean.
    w : {None, 0, 1, 2}
        None or 0 -> unweighted mean over the models
        1         -> every model weighted by 1 / score
        2         -> every model weighted by its rank, where the largest
                     score gets rank 1 and the smallest score the largest rank

    Returns
    -------
    numpy array of shape (n, m)

    Raises
    ------
    ValueError
        If ``w`` is not a supported mode, or if the number of scores does not
        match the number of models (raised by ``np.tensordot``).
    """
    if not w:
        return preds.mean(axis=2)

    # Force float: integer scores would be floor-divided under Python 2.
    scores = np.asarray(scores, dtype=float)

    if w == 1:
        weights = 1.0 / scores
    elif w == 2:
        weights = pd.Series(scores).rank(ascending=False).values
    else:
        raise ValueError('w must be None, 0, 1 or 2, got {!r}'.format(w))

    # Weighted sum over the model axis, normalised by the total weight.
    return np.tensordot(preds, weights, axes=([2], [0])) / weights.sum()

In [8]:
# Directory the raw input csv files (including the CV split files) are read from.
data_dir = './data_ori/'
# Directory open_feature_file() reads feature csv / pickle files from.
feat_dir = './data/'
# Passed to KaggleResult as `subdir`.
out_dir = './model_stack/'

# Free-text label passed to KaggleResult as `description`.
description = 'ensemble models level 0 V0'

In [9]:
def open_feature_file(fname, samples='train'):
    """Load one feature block, aligned to the train or test devices.

    Parameters
    ----------
    fname : str
        Either the name of a csv file inside ``feat_dir`` (it must contain a
        ``device_id`` column), or the prefix of a pickle stored in
        ``feat_dir`` as ``<fname>_<samples>.pickle``.
    samples : str
        'train' aligns csv rows to ``gatrain``; any other value aligns them to
        ``gatest``. For pickles the value is inserted into the file name.

    Returns
    -------
    scipy.sparse.csr_matrix for csv input; for pickles, whatever object the
    file contains.
    """
    if fname.endswith('csv'):
        devices = gatrain if samples == 'train' else gatest
        features = pd.read_csv(os.path.join(feat_dir, fname))

        # Left join keeps the device order of gatrain / gatest; devices that
        # are missing from the csv end up with all-zero features.
        X = (devices[['device_id']]
             .merge(features, on='device_id', how='left')
             .drop('device_id', axis=1)
             .fillna(0))

        for c in X.columns:
            if X[c].max() > 1:
                # Scale this column only. The previous code fitted the scaler
                # on the whole frame and tried to assign the 2-D result to a
                # single column.
                X[c] = MinMaxScaler().fit_transform(X[[c]]).ravel()

        return csr_matrix(X.values)

    # Assume it is a pickle file.
    # NOTE: pickle.load can execute arbitrary code - only open trusted files.
    with open(os.path.join(feat_dir, '{}_{}.pickle'.format(fname, samples)), 'rb') as f:
        return pickle.load(f)

In [10]:
# Load CV sets
train_cv = pd.read_csv(os.path.join(data_dir, 'gender_age_train_cv.csv'))
test_cv = pd.read_csv(os.path.join(data_dir, 'gender_age_test_cv.csv'))

In [11]:
# Sanity check: (rows, columns) of the hold-out split definition.
test_cv.shape

(7416, 5)

In [12]:
# Device lists for train and test. The paths are built from data_dir (like the
# CV files) instead of repeating the directory as a literal.
gatrain = pd.read_csv(os.path.join(data_dir, 'gender_age_train.csv'))
gatest = pd.read_csv(os.path.join(data_dir, 'gender_age_test.csv'))

In [13]:
# Integer-encode the target column; letarget keeps the label <-> code mapping.
letarget = LabelEncoder()
y = letarget.fit_transform(gatrain['group'])
n_classes = len(letarget.classes_)

## Level 0 models
Models trained only on phone brand and device model information

In [14]:
# Feature blocks that are stacked column-wise, in this order.
# Names ending in 'csv' are read as csv by open_feature_file(); every other
# name is treated as the prefix of a '<name>_<samples>.pickle' file.
feature_files = ['features_brand_model_bag',
                 'features_brand_bag',
                 'features_brand_model.csv']

### Load data for level 0 models

In [15]:
# Load every feature block for train and for test, then glue the blocks
# together column-wise into one CSR matrix per sample set.
train_blocks = [open_feature_file(f) for f in feature_files]
test_blocks = [open_feature_file(f, 'test') for f in feature_files]

Xtrain = hstack(train_blocks, format='csr')
Xtest = hstack(test_blocks, format='csr')

In [16]:
# Row numbers of the training data that form the fit part and the hold-out part.
train_rows = train_cv.sample_nr.values
val_rows = test_cv.sample_nr.values

X_train, y_train = Xtrain[train_rows, :], y[train_rows]
X_val, y_val = Xtrain[val_rows, :], y[val_rows]

### Load NN model

In [12]:
# Pickle describing the stored NN models; its 'models' entry is iterated in the next cell.
modelsfile = './model_0_nn/models_nn_0_V1_2016-08-19-17-01_2.3878_2.3870.pickle'
# Column indices of the feature matrix that are fed to the NN models.
nfeatures = range(1800)

# NOTE: pickle.load can execute arbitrary code - only load trusted files.
with open(modelsfile, 'rb') as f:
    nn_models = pickle.load(f)

In [13]:
# Score every stored NN model on the hold-out split and predict the test set.
# The array sizes follow the data (n_classes, number of stored models) rather
# than hard-coded 12 / 5.
n_models_nn = len(nn_models['models'])

preds_val_0_nn = np.zeros((X_val.shape[0], n_classes, n_models_nn))
preds_test_0_nn = np.zeros((Xtest.shape[0], n_classes, n_models_nn))

scores = np.zeros(n_models_nn)

for i, m in enumerate(nn_models['models']):
    # m['model'] is handed to keras' load_model to restore the network.
    model = load_model(m['model'])

    # The networks are fed dense input, restricted to the nfeatures columns.
    pred_val = model.predict_proba(X_val[:, nfeatures].todense(), batch_size = 128, verbose = 0)

    pred_test = model.predict_proba(Xtest[:, nfeatures].todense(), batch_size = 128, verbose = 0)

    score = log_loss(y_val, pred_val)

    print('CV Score: {:.4f}'.format(score))
    print('Other score: {:.4f}'.format(m['score']))

    preds_val_0_nn[:, :, i] = pred_val
    preds_test_0_nn[:, :, i] = pred_test
    scores[i] = score

CV Score: 2.4344
Other score: 2.3885
CV Score: 2.4398
Other score: 2.3866
CV Score: 2.4412
Other score: 2.3924
CV Score: 2.4479
Other score: 2.3842
CV Score: 2.4472
Other score: 2.3871


In [None]:
# First line: mean of the individual model losses.
# Following lines: loss of the blended prediction for w = 0 (plain mean),
# w = 1 (weighted by 1/score) and w = 2 (weighted by rank).
print('Average score: {:.4f}'.format(np.mean(scores)))
for blend_mode in (0, 1, 2):
    blended = ensemble_preds(preds_val_0_nn, scores, w=blend_mode)
    print('Average score: {:.4f}'.format(log_loss(y_val, blended)))


In [None]:
# Collapse the per-model NN predictions into one rank-weighted prediction.
# The ndim guard makes the cell safe to re-run: once the arrays are 2-D they
# have already been blended and must not go through ensemble_preds again.
if preds_val_0_nn.ndim == 3:
    preds_val_0_nn = ensemble_preds(preds_val_0_nn, scores, w=2)
    preds_test_0_nn = ensemble_preds(preds_test_0_nn, scores, w=2)

cv_score_0_nn = log_loss(y_val, preds_val_0_nn)

### Load logistic model

In [None]:
# Pickle describing the stored logistic models; its 'models' entry is iterated in the next cell.
modelsfile = './model_0_logistic/models_logistic_0_V3_2016-08-18-16-17_2.3903_-1.0000.pickle'

# NOTE: pickle.load can execute arbitrary code - only load trusted files.
with open(modelsfile, 'rb') as f:
    log_models = pickle.load(f)
    
# Column indices of the feature matrix that are fed to the logistic models.
nfeatures = range(1803)

In [None]:
# Score every stored logistic model on the hold-out split and predict the test
# set. The array sizes follow the data (n_classes, number of stored models)
# rather than hard-coded 12 / 5.
n_models_log = len(log_models['models'])

preds_val_0_log = np.zeros((X_val.shape[0], n_classes, n_models_log))
preds_test_0_log = np.zeros((Xtest.shape[0], n_classes, n_models_log))

scores = np.zeros(n_models_log)

for i, m in enumerate(log_models['models']):
    # The classifier object itself is stored in the pickle.
    clf = m['model']

    pred_val = clf.predict_proba(X_val[:, nfeatures])

    pred_test = clf.predict_proba(Xtest[:, nfeatures])

    score = log_loss(y_val, pred_val)

    print('CV Score: {:.4f}'.format(score))
    print('Other score: {:.4f}'.format(m['score']))

    preds_val_0_log[:, :, i] = pred_val
    preds_test_0_log[:, :, i] = pred_test
    scores[i] = score

In [None]:
# First line: mean of the individual model losses.
# Following lines: loss of the blended prediction for w = 0 (plain mean),
# w = 1 (weighted by 1/score) and w = 2 (weighted by rank).
print('Average score: {:.4f}'.format(np.mean(scores)))
for blend_mode in (0, 1, 2):
    blended = ensemble_preds(preds_val_0_log, scores, w=blend_mode)
    print('Average score: {:.4f}'.format(log_loss(y_val, blended)))

In [None]:
# Collapse the per-model logistic predictions into one rank-weighted prediction.
# The ndim guard makes the cell safe to re-run: once the arrays are 2-D they
# have already been blended and must not go through ensemble_preds again.
if preds_val_0_log.ndim == 3:
    preds_val_0_log = ensemble_preds(preds_val_0_log, scores, w=2)
    preds_test_0_log = ensemble_preds(preds_test_0_log, scores, w=2)

cv_score_0_log = log_loss(y_val, preds_val_0_log)

### Load XGBoost model

In [None]:
# Pickle describing the stored XGBoost models; its 'models' entry is iterated in the next cell.
modelsfile = './model_0_xgboost/models_xgboost_0_V3_2016-08-19-16-36_2.3905_2.3902.pickle'

# NOTE: pickle.load can execute arbitrary code - only load trusted files.
with open(modelsfile, 'rb') as f:
    xgb_models = pickle.load(f)
    
# Column selection stored alongside the models; used to index the feature matrices.
nfeatures = xgb_models['features']

In [None]:
# Score every stored XGBoost model on the hold-out split and predict the test
# set. The array sizes follow the data (n_classes, number of stored models)
# rather than hard-coded 12 / 5.
n_models_xgb = len(xgb_models['models'])

preds_val_0_xgb = np.zeros((X_val.shape[0], n_classes, n_models_xgb))
preds_test_0_xgb = np.zeros((Xtest.shape[0], n_classes, n_models_xgb))

scores = np.zeros(n_models_xgb)

for i, m in enumerate(xgb_models['models']):
    # The booster object itself is stored in the pickle.
    clf = m['model']

    # Boosters predict on DMatrix input.
    pred_val = clf.predict(xgb.DMatrix(X_val[:, nfeatures]))

    pred_test = clf.predict(xgb.DMatrix(Xtest[:, nfeatures]))

    score = log_loss(y_val, pred_val)

    print('CV Score: {:.4f}'.format(score))
    print('Other score: {:.4f}'.format(m['score']))

    preds_val_0_xgb[:, :, i] = pred_val
    preds_test_0_xgb[:, :, i] = pred_test
    scores[i] = score

In [None]:
# First line: mean of the individual model losses.
# Following lines: loss of the blended prediction for w = 0 (plain mean),
# w = 1 (weighted by 1/score) and w = 2 (weighted by rank).
print('Average score: {:.4f}'.format(np.mean(scores)))
for blend_mode in (0, 1, 2):
    blended = ensemble_preds(preds_val_0_xgb, scores, w=blend_mode)
    print('Average score: {:.4f}'.format(log_loss(y_val, blended)))

In [None]:
# Collapse the per-model XGBoost predictions into one rank-weighted prediction.
# The ndim guard makes the cell safe to re-run: once the arrays are 2-D they
# have already been blended and must not go through ensemble_preds again.
if preds_val_0_xgb.ndim == 3:
    preds_val_0_xgb = ensemble_preds(preds_val_0_xgb, scores, w=2)
    preds_test_0_xgb = ensemble_preds(preds_test_0_xgb, scores, w=2)

cv_score_0_xgb = log_loss(y_val, preds_val_0_xgb)

### Load Bayesian model

In [28]:
class GenderAgeGroupProbCombined(object):
    """Group probabilities from smoothed per-brand and per-model frequencies.

    For each value of the brand column and of the model column the class keeps
    the observed group counts, shrunk towards the overall group distribution
    (the prior). A prediction is the weighted mean of the brand-based and the
    model-based probabilities.

    Parameters
    ----------
    prior_weight : pair of numbers
        Pseudo-count given to the prior for (brand, model). Larger values pull
        rarely seen brands / models more strongly towards the prior.
    w : pair of numbers
        Blend weights for the (brand, model) probabilities.
    by : pair of str
        Column names holding the (brand, model) keys.

    Attribute names are kept unchanged on purpose.
    NOTE(review): stored bayes models are presumably pickled instances of this
    class, which would depend on those attribute names - confirm before renaming.
    """

    def __init__(self, prior_weight=(30,20), w=(1,1.3), by=('brand', 'model')):
        self.prior_weight_brand = prior_weight[0]
        self.prior_weight_model = prior_weight[1]

        self.w_brand = w[0]
        self.w_model = w[1]

        self.by_brand = by[0]
        self.by_model = by[1]

    @staticmethod
    def _smoothed_probs(df, by, prior, prior_weight):
        """Per-key group probabilities: (counts + weight*prior) / (n + weight)."""
        counts = df.groupby([by, 'group']).size().unstack().fillna(0)
        shrunk = counts.add(prior_weight * prior)
        return shrunk.div(counts.sum(axis=1) + prior_weight, axis=0)

    @staticmethod
    def _lookup(df, by, probs, prior):
        """Fetch the probability row of every sample; unseen keys get the prior."""
        merged = df[[by]].merge(probs, how='left', left_on=by, right_index=True)
        # fillna with a Series fills column-wise, so every group column
        # receives its own prior value.
        return merged.fillna(prior)[probs.columns]

    def fit(self, df):
        """Estimate the prior and the smoothed probabilities.

        ``df`` must contain a 'group' column and the two key columns named in
        ``by``. Returns ``self``.
        """
        prior = df['group'].value_counts().sort_index() / df.shape[0]
        self.prior_brand = prior
        self.prior_model = prior.copy()

        self.prob_brand = self._smoothed_probs(
            df, self.by_brand, self.prior_brand, self.prior_weight_brand)
        self.prob_model = self._smoothed_probs(
            df, self.by_model, self.prior_model, self.prior_weight_model)

        return self

    def predict_proba(self, df):
        """Return an array of shape (len(df), n_groups) with blended probabilities.

        Columns follow the (sorted) group labels seen in ``fit``.
        """
        pred_brand = self._lookup(df, self.by_brand, self.prob_brand, self.prior_brand)
        pred_model = self._lookup(df, self.by_model, self.prob_model, self.prior_model)

        blended = pred_brand * self.w_brand + pred_model * self.w_model
        return (blended / (self.w_brand + self.w_model)).values

In [29]:
# Brand / device-model table. The path is built from data_dir like the other
# raw inputs instead of repeating the directory as a literal.
phone = pd.read_csv(os.path.join(data_dir, 'phone_brand_device_model.csv'), encoding='utf-8')
# One row per device, so the merges below cannot multiply rows.
phone = phone.drop_duplicates('device_id', keep='first')

# Integer codes for the brand and for the brand+model concatenation.
lebrand = LabelEncoder().fit(phone.phone_brand)
phone['brand'] = lebrand.transform(phone.phone_brand)
m = phone.phone_brand.str.cat(phone.device_model)
lemodel = LabelEncoder().fit(m)
phone['model'] = lemodel.transform(m)

Xtrain_bay = gatrain.merge(phone[['device_id','brand','model']], how='left',on='device_id')
Xtest_bay = gatest.merge(phone[['device_id','brand','model']], how='left',on='device_id')

# Same fit / hold-out row selection as for the sparse feature matrix.
X_train_bay = Xtrain_bay.loc[train_cv.sample_nr.values, :]
X_val_bay = Xtrain_bay.loc[test_cv.sample_nr.values, :]

X_train_bay.head(3)

Unnamed: 0,device_id,gender,age,group,brand,model
68487,-1000369272589010951,F,26,F24-26,13,254
1280,-1000572055892391496,F,27,F27-28,7,141
56524,-1000643208750517791,M,29,M29-31,120,1581


In [30]:
# Pickle describing the stored bayes model(s); its 'models' entry is iterated in the next cell.
modelsfile = './model_0_bayes/models_bayes_0_V1_2016-08-19-22-13_2.3892_-1.0000.pickle'

# NOTE: pickle.load can execute arbitrary code - only load trusted files.
with open(modelsfile, 'rb') as f:
    bay_models = pickle.load(f)
    

In [31]:
# Score every stored bayes model on the hold-out split and predict the test
# set. The array sizes follow the data (n_classes, number of stored models)
# rather than hard-coded 12 / 1.
n_models_bay = len(bay_models['models'])

preds_val_0_bay = np.zeros((X_val.shape[0], n_classes, n_models_bay))
preds_test_0_bay = np.zeros((Xtest.shape[0], n_classes, n_models_bay))

scores = np.zeros(n_models_bay)

for i, m in enumerate(bay_models['models']):
    # The model object itself is stored in the pickle.
    clf = m['model']

    # These models take the brand/model data frames, not the sparse matrices.
    pred_val = clf.predict_proba(X_val_bay)

    pred_test = clf.predict_proba(Xtest_bay)

    score = log_loss(y_val, pred_val)

    print('CV Score: {:.4f}'.format(score))
    print('Other score: {:.4f}'.format(m['score']))

    preds_val_0_bay[:, :, i] = pred_val
    preds_test_0_bay[:, :, i] = pred_test
    scores[i] = score

CV Score: 2.3892
Other score: 2.3892


In [32]:
# First line: mean of the individual model losses.
# Following lines: loss of the blended prediction for w = 0 (plain mean),
# w = 1 (weighted by 1/score) and w = 2 (weighted by rank).
print('Average score: {:.4f}'.format(np.mean(scores)))
for blend_mode in (0, 1, 2):
    blended = ensemble_preds(preds_val_0_bay, scores, w=blend_mode)
    print('Average score: {:.4f}'.format(log_loss(y_val, blended)))

Average score: 2.3892
Average score: 2.3892
Average score: 2.3892
Average score: 2.3892


In [None]:
# Collapse the per-model bayes predictions into one rank-weighted prediction.
# The ndim guard makes the cell safe to re-run: once the arrays are 2-D they
# have already been blended and must not go through ensemble_preds again.
if preds_val_0_bay.ndim == 3:
    preds_val_0_bay = ensemble_preds(preds_val_0_bay, scores, w=2)
    preds_test_0_bay = ensemble_preds(preds_test_0_bay, scores, w=2)

# Score the blended prediction, as for the other model families, instead of
# reading the `score` variable left over from the prediction loop.
cv_score_0_bay = log_loss(y_val, preds_val_0_bay)

## Merge level 0 models

In [None]:
# Stack the four blended hold-out predictions along a third (model) axis.
# Model order: nn, logistic, xgboost, bayes.
preds_all = np.dstack([
    preds_val_0_nn,
    preds_val_0_log,
    preds_val_0_xgb,
    preds_val_0_bay,
])

In [None]:
# Hand-picked pseudo-scores, one per model in the order nn, logistic, xgboost,
# bayes. They are only used with w=2, where ensemble_preds ranks them in
# descending order: the smallest value gets the largest weight
# (nn 4, bayes 3, xgboost 2, logistic 1).
# NOTE(review): the cv_score_0_* values computed above are not used here - confirm this is intended.
scores = [1,4,3,2]
scores

In [None]:
# Rank-weighted blend of the four model families on the hold-out split,
# and the log loss of that blend.
preds_all_val = ensemble_preds(preds_all, scores, w=2)
cv_score_all = log_loss(y_val, preds_all_val)
print(cv_score_all)

In [None]:
# Stack the four blended test predictions along a third (model) axis
# (order: nn, logistic, xgboost, bayes) and blend them with the same rank
# weighting as on the hold-out split.
preds_all_test = ensemble_preds(
    np.dstack([
        preds_test_0_nn,
        preds_test_0_log,
        preds_test_0_xgb,
        preds_test_0_bay,
    ]),
    scores,
    w=2,
)

### Upload to Kaggle

In [None]:
# Wrap the blended test predictions for submission.
# KaggleResult comes from the project-local ml_toolbox package; what it does
# with these arguments is not visible in this notebook.
kag = KaggleResult(preds_all_test, 
                   gatest.device_id.values, 
                   cv_score=cv_score_all, 
                   description=description, 
                   subdir=out_dir)

In [None]:
# Upload only when the first element returned by validate() is truthy.
if kag.validate()[0]:
    kag.upload()
# Function-call form of print, as used everywhere else in this notebook;
# the statement form is a syntax error on Python 3.
print(kag.lb_score)

## Store predictions using all models on train set for features

In [None]:
# Exploratory check: display the imported keras Sequential class.
Sequential

In [None]:
# Exploratory check: restore the first stored NN model via keras' load_model.
clf = load_model(nn_models['models'][0]['model']) 

In [None]:
# Exploratory check: is the restored model a keras Sequential network?
isinstance(clf, Sequential)

In [None]:
# Exploratory check: is the stored NN 'model' entry a string (as opposed to a
# model object)? isinstance needs the type `str`, not the string 'str', and the
# value to test is the 'model' entry - the same test predict_using_stored_models uses.
isinstance(nn_models['models'][0]['model'], str)

In [45]:
def predict_using_stored_models(models_file, X, y=None, scorer=log_loss):
    """Predict with every model stored in a models pickle.

    Parameters
    ----------
    models_file : str
        Path of a pickle holding a dict with a 'models' list. Every entry is a
        dict with a 'model' (a model object, or a string that is handed to
        keras' load_model) and a 'score'.
    X : feature container
        Passed to the models. Sequential networks get ``X.todense()``,
        xgboost boosters get ``xgb.DMatrix(X)``, everything else gets ``X``
        as-is via ``predict_proba``.
    y : array-like or None
        True labels. When given (and non-empty) every model is scored and the
        score is printed next to the stored one.
    scorer : callable
        ``scorer(y, pred)`` -> float; log loss by default.

    Returns
    -------
    preds : numpy array of shape (n_samples, n_classes, n_models)
    scores : numpy array of length n_models; stays 0 when ``y`` is not given.
    """
    # NOTE: pickle.load can execute arbitrary code - only load trusted files.
    with open(models_file, 'rb') as f:
        models = pickle.load(f)

    n_models = len(models['models'])
    # y=None is the documented default, so it must not be dereferenced blindly.
    has_labels = y is not None and np.size(y) > 0

    preds = None
    scores = np.zeros(n_models)

    for i, m in enumerate(models['models']):
        if isinstance(m['model'], str):
            # Assume link to nn model file
            clf = load_model(m['model'])
        else:
            clf = m['model']

        if isinstance(clf, Sequential):
            ## NN network
            pred = clf.predict_proba(X.todense(), batch_size = 128, verbose = 0)
        elif isinstance(clf, xgb.Booster):
            ## XGBoost
            pred = clf.predict(xgb.DMatrix(X))
        else:
            pred = clf.predict_proba(X)

        pred = np.asarray(pred)

        if preds is None:
            # Take the class count from the model output: it is available
            # without labels and does not shrink when y lacks some classes.
            preds = np.zeros((X.shape[0], pred.shape[1], n_models))

        if has_labels:
            score = scorer(y, pred)
            # NOTE(review): '{:4f}' prints six decimals; '{:.4f}' was probably intended.
            print('CV score: {:.4f}, calculated score: {:4f} '.format(m['score'], score))

            scores[i] = score

        preds[:, :, i] = pred

    if preds is None:
        # No stored models: return an empty array with a consistent rank.
        n_classes = len(np.unique(y)) if has_labels else 0
        preds = np.zeros((X.shape[0], n_classes, 0))

    return preds, scores

In [None]:
# Try the helper on the logistic models, using the first 1803 feature columns
# of the hold-out split. a: predictions, b: per-model scores.
f = './model_0_logistic/models_logistic_0_V3_2016-08-18-16-17_2.3903_-1.0000.pickle'
a,b = predict_using_stored_models(f, X_val[:, 0:1803], y_val)

In [35]:
# Model pickles to evaluate (nn, logistic, xgboost) ...
modelfiles = ['./model_0_nn/models_nn_0_V1_2016-08-19-17-01_2.3878_2.3870.pickle',
              './model_0_logistic/models_logistic_0_V3_2016-08-18-16-17_2.3903_-1.0000.pickle',
              './model_0_xgboost/models_xgboost_0_V3_2016-08-19-16-36_2.3905_2.3902.pickle']
# ... and, in the same order, how many leading feature columns each one is given.
nf = [1800, 1803, 1800]

In [36]:
# Evaluate every model file on the hold-out split, each with its first n columns.
# NOTE: a and b are overwritten on every iteration, so only the results of the
# last file remain after the loop.
for f,n in zip(modelfiles, nf):
    a,b = predict_using_stored_models(f, X_val[:, range(n)], y_val)

CV score: 2.3885, calculated score: 2.434427 
CV score: 2.3866, calculated score: 2.439837 
CV score: 2.3924, calculated score: 2.441217 
CV score: 2.3842, calculated score: 2.447946 
CV score: 2.3871, calculated score: 2.447213 
CV score: 2.3904, calculated score: 2.439151 
CV score: 2.3903, calculated score: 2.441327 
CV score: 2.3902, calculated score: 2.442071 
CV score: 2.3904, calculated score: 2.443019 
CV score: 2.3906, calculated score: 2.446342 
CV score: 2.3870, calculated score: 2.450949 
CV score: 2.3912, calculated score: 2.449825 
CV score: 2.3957, calculated score: 2.449556 
CV score: 2.3888, calculated score: 2.447271 
CV score: 2.3898, calculated score: 2.450122 


In [41]:
# Same evaluation for the bayes model, which takes the brand/model data frame
# instead of the sparse feature matrix.
a,b = predict_using_stored_models('./model_0_bayes/models_bayes_0_V1_2016-08-19-22-13_2.3892_-1.0000.pickle', X_val_bay, y_val)

CV score: 2.3892, calculated score: 2.389166 


In [46]:
# Evaluate every model file on the full training data, each with its first n columns.
# NOTE: a and b are overwritten on every iteration, so only the results of the
# last file remain after the loop.
for f,n in zip(modelfiles, nf):
    a,b = predict_using_stored_models(f, Xtrain[:, range(n)], y)

CV score: 2.3885, calculated score: 2.436351 
CV score: 2.3866, calculated score: 2.441388 
CV score: 2.3924, calculated score: 2.440594 
CV score: 2.3842, calculated score: 2.447777 
CV score: 2.3871, calculated score: 2.444650 
CV score: 2.3904, calculated score: 2.439844 
CV score: 2.3903, calculated score: 2.441916 
CV score: 2.3902, calculated score: 2.442943 
CV score: 2.3904, calculated score: 2.443930 
CV score: 2.3906, calculated score: 2.447259 
CV score: 2.3870, calculated score: 2.451598 
CV score: 2.3912, calculated score: 2.451175 
CV score: 2.3957, calculated score: 2.450838 
CV score: 2.3888, calculated score: 2.448749 
CV score: 2.3898, calculated score: 2.451016 


In [37]:
# Blend the per-model predictions in `a`, weighting each model by 1/score.
# NOTE: this rebinds `a` from a 3-D to a 2-D array, so the cell cannot be run
# twice in a row without recomputing `a` first.
a = ensemble_preds(a, b, 1)

In [39]:
# Sanity check: the blended prediction should be (n_samples, n_classes).
a.shape

(7416, 12)