# Stack and ensemble various models

In [17]:
import random
import os
import pickle
import datetime


import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from scipy.sparse import csr_matrix, hstack

from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import log_loss
from sklearn.cross_validation import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.grid_search import GridSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.feature_selection import VarianceThreshold


from keras.models import load_model

import xgboost as xgb

from ml_toolbox.kaggle import KaggleResult


In [18]:

def ensemble_preds(preds, scores, w=None):
    """Blend the predictions of several models into one probability matrix.

    Parameters
    ----------
    preds : array-like, shape (n, m, k)
        n samples, m classes, k models.
    scores : array-like, length k
        One score per model, ordered like the last axis of `preds`.
        Lower scores receive larger weights (both weighting modes favour
        the smallest value), which fits loss-type metrics such as log-loss.
    w : {None, 0, 1, 2}
        None / 0 -> plain mean over the models (scores are ignored).
        1        -> weights proportional to 1 / score.
        2        -> weights equal to the descending rank of the score
                    (the lowest score gets the largest rank/weight).

    Returns
    -------
    numpy.ndarray of shape (n, m)
        The (weighted) average over the model axis.

    Raises
    ------
    ValueError
        If `w` is not one of the supported modes, or if the number of
        scores differs from the number of models in `preds`.
    """
    preds = np.asarray(preds, dtype=float)

    if not w:
        return preds.mean(axis=2)

    # Force float so integer scores never trigger integer division.
    scores = np.asarray(scores, dtype=float).ravel()
    if scores.shape[0] != preds.shape[2]:
        raise ValueError('expected {} scores (one per model), got {}'.format(
            preds.shape[2], scores.shape[0]))

    if w == 1:
        weights = 1.0 / scores
    elif w == 2:
        weights = pd.Series(scores).rank(ascending=False).values
    else:
        # Previously an unknown mode silently returned None.
        raise ValueError('unsupported weighting mode w={!r}; use None, 0, 1 or 2'.format(w))

    # Broadcasting multiplies each model slice by its weight.
    return (preds * weights).sum(axis=2) / weights.sum()

In [19]:
# Free-text label attached to the submission built at the end of the notebook.
description = 'ensemble models level 1 V0'

# Directory layout used by the cells below.
data_dir = './data_ori/'    # original CSVs and the CV split files
feat_dir = './data/'        # feature files read by open_feature_file
out_dir = './model_stack/'  # submissions produced / read by this notebook

In [20]:
def open_feature_file(fname, samples='train'):
    """Load one feature block from `feat_dir`.

    Parameters
    ----------
    fname : str
        Either a file name ending in 'csv' (must contain a `device_id`
        column) or the base name of a pickle stored as
        '<fname>_<samples>.pickle'.
    samples : str
        'train' aligns CSV rows with `gatrain`; any other value aligns them
        with `gatest`. For pickles the value is inserted into the file name.

    Returns
    -------
    For CSV input, a scipy CSR matrix with one row per device in the chosen
    device list (devices missing from the CSV get zeros). For pickles,
    whatever object the file contains (presumably a sparse matrix, since
    callers pass it to `hstack` - confirm).

    Notes
    -----
    Relies on the notebook globals `gatrain`, `gatest` and `feat_dir`.
    """
    if fname[-3:] == 'csv':
        devices = gatrain if samples == 'train' else gatest
        features = pd.read_csv(os.path.join(feat_dir, fname))

        # Left join keeps the row order of the device list.
        X = (devices[['device_id']]
             .merge(features, on='device_id', how='left')
             .drop('device_id', axis=1)
             .fillna(0))

        for c in X.columns:
            if X[c].max() > 1:
                # Fix: scale this column only. The previous code fitted the
                # scaler on the whole frame and tried to assign the 2-D
                # result to a single column.
                # NOTE(review): the scaler is fitted separately for train and
                # test, so the two sets may be scaled differently - confirm
                # this is intended.
                X[c] = MinMaxScaler().fit_transform(X[[c]]).ravel()

        return csr_matrix(X.values)
    else:
        # Assume it is a pickle file.
        # NOTE: pickle.load executes arbitrary code; only use trusted files.
        with open(os.path.join(feat_dir, '{}_{}.pickle'.format(fname, samples)), 'rb') as f:
            return pickle.load(f)

In [21]:
# Load CV sets: each file provides a `sample_nr` column that the split cell
# uses to pick rows of the training matrix.
train_cv, test_cv = [
    pd.read_csv(os.path.join(data_dir, cv_file))
    for cv_file in ('gender_age_train_cv.csv', 'gender_age_test_cv.csv')
]

In [22]:
# Device lists for the train and test sets. Built from `data_dir` (same
# location as before) so the path is configured in one place only.
gatrain = pd.read_csv(os.path.join(data_dir, 'gender_age_train.csv'))
gatest = pd.read_csv(os.path.join(data_dir, 'gender_age_test.csv'))

In [23]:
# Encode the `group` labels of the training devices as integer class ids.
letarget = LabelEncoder()
letarget.fit(gatrain['group'])
y = letarget.transform(gatrain['group'])

# Number of distinct target groups seen in the training labels.
n_classes = len(letarget.classes_)

## Level 1 models
Models trained on brand and device-model features plus installed-app and app-label features

In [24]:
# Base names of the feature blocks to stack. None ends in 'csv', so
# open_feature_file reads each one as '<name>_<samples>.pickle'.
feature_files = [
    'features_brand_model_bag',
    'features_brand_bag',
    'features_appid_installed',
    'features_label_app_installed',
]

### Load data for level 1 models

In [25]:
# Stack the feature blocks column-wise into one CSR matrix per sample set.
train_blocks = [open_feature_file(name) for name in feature_files]
Xtrain = hstack(train_blocks, format='csr')

test_blocks = [open_feature_file(name, 'test') for name in feature_files]
Xtest = hstack(test_blocks, format='csr')

In [26]:
# With selection (15xxx features): 2.27427
# Without selection (21527 features): 2.27427
# Drop the columns that the default VarianceThreshold rejects; the selector
# is fitted on the training matrix and applied to both matrices.
selector = VarianceThreshold()
selector.fit(Xtrain)
Xtrain, Xtest = selector.transform(Xtrain), selector.transform(Xtest)

In [27]:
# Row positions of the training / validation samples inside Xtrain and y.
train_rows = train_cv.sample_nr.values
val_rows = test_cv.sample_nr.values

X_train, y_train = Xtrain[train_rows, :], y[train_rows]
X_val, y_val = Xtrain[val_rows, :], y[val_rows]

In [28]:
# Sanity check: (rows, columns) of the training split after feature selection.
X_train.shape

(67229, 15853)

### Load NN model

In [29]:
# Pickled description of the saved NN models; the prediction cell reads its
# 'models' list. NOTE: pickle.load runs arbitrary code - trusted files only.
modelsfile = './model_1_nn/models_nn_1_V2_2016-08-20-00-47_2.2521_-1.0000.pickle'

with open(modelsfile, 'rb') as model_fh:
    nn_models = pickle.load(model_fh)

In [30]:
# Predict the validation and test sets with every saved NN model.
# The stacks are sized from the data (n_classes, number of models) instead of
# the previously hard-coded 12 classes / 5 models.
nn_model_entries = nn_models['models']
n_nn_models = len(nn_model_entries)

preds_val_0_nn = np.zeros((X_val.shape[0], n_classes, n_nn_models))
preds_test_0_nn = np.zeros((Xtest.shape[0], n_classes, n_nn_models))

scores = np.zeros(n_nn_models)

# Densify once: the conversion does not depend on the model, so repeating it
# inside the loop only wasted time.
X_val_dense = X_val.todense()
Xtest_dense = Xtest.todense()

for i, m in enumerate(nn_model_entries):
    # m['model'] is handed to keras' load_model (presumably a file path - confirm).
    model = load_model(m['model'])

    pred_val = model.predict_proba(X_val_dense, batch_size=128, verbose=0)
    pred_test = model.predict_proba(Xtest_dense, batch_size=128, verbose=0)

    score = log_loss(y_val, pred_val)

    # Compare the freshly computed validation loss with the score stored
    # next to the model in the pickle.
    print('CV Score: {:.4f}'.format(score))
    print('Other score: {:.4f}'.format(m['score']))

    preds_val_0_nn[:, :, i] = pred_val
    preds_test_0_nn[:, :, i] = pred_test
    scores[i] = score

# Release the dense copies; only the prediction stacks are needed afterwards.
del X_val_dense, Xtest_dense

CV Score: 2.2529
Other score: 2.2529
CV Score: 2.2508
Other score: 2.2508
CV Score: 2.2507
Other score: 2.2507
CV Score: 2.2546
Other score: 2.2546
CV Score: 2.2521
Other score: 2.2521


In [31]:
# First line: mean of the individual model losses.
print('Average score: {:.4f}'.format(np.mean(scores)))

# Next three lines: validation loss of the blend for w=0 (mean),
# w=1 (inverse-score weights) and w=2 (rank weights).
for mode in (0, 1, 2):
    blended = ensemble_preds(preds_val_0_nn, scores, w=mode)
    print('Average score: {:.4f}'.format(log_loss(y_val, blended)))


Average score: 2.2522
Average score: 2.2491
Average score: 2.2491
Average score: 2.2488


In [32]:
# Collapse the per-model stacks into one rank-weighted blend (w=2).
# This rebinds the names from 3-D stacks to 2-D matrices, so re-running this
# cell without first re-running the prediction cell fails inside ensemble_preds.
preds_val_0_nn = ensemble_preds(preds_val_0_nn, scores, w=2)
preds_test_0_nn = ensemble_preds(preds_test_0_nn, scores, w=2)

# Validation loss of the blended NN predictions.
cv_score_0_nn = log_loss(y_val, preds_val_0_nn)

### Load logistic model

In [None]:
# Pickled description of the saved logistic models; the prediction cell reads
# its 'models' list. NOTE: pickle.load runs arbitrary code - trusted files only.
modelsfile = './model_0_logistic/models_logistic_0_V3_2016-08-18-16-17_2.3903_-1.0000.pickle'

with open(modelsfile, 'rb') as model_fh:
    log_models = pickle.load(model_fh)

# Column subset fed to the logistic models (the XGBoost cell reuses it too).
# NOTE(review): 1803 is presumably the feature count these models were
# trained on - confirm it still matches the columns after VarianceThreshold.
nfeatures = range(1803)

In [None]:
# Predict the validation and test sets with every saved logistic model.
# The stacks are sized from the data (n_classes, number of models) instead of
# the previously hard-coded 12 classes / 5 models.
log_model_entries = log_models['models']
n_log_models = len(log_model_entries)

preds_val_0_log = np.zeros((X_val.shape[0], n_classes, n_log_models))
preds_test_0_log = np.zeros((Xtest.shape[0], n_classes, n_log_models))

scores = np.zeros(n_log_models)

# The column subset is the same for every model, so slice once.
X_val_log = X_val[:, nfeatures]
Xtest_log = Xtest[:, nfeatures]

for i, m in enumerate(log_model_entries):
    clf = m['model']

    pred_val = clf.predict_proba(X_val_log)
    pred_test = clf.predict_proba(Xtest_log)

    score = log_loss(y_val, pred_val)

    # Compare the freshly computed validation loss with the score stored
    # next to the model in the pickle.
    print('CV Score: {:.4f}'.format(score))
    print('Other score: {:.4f}'.format(m['score']))

    preds_val_0_log[:, :, i] = pred_val
    preds_test_0_log[:, :, i] = pred_test
    scores[i] = score

In [None]:
# First line: mean of the individual model losses.
print('Average score: {:.4f}'.format(np.mean(scores)))

# Next three lines: validation loss of the blend for w=0 (mean),
# w=1 (inverse-score weights) and w=2 (rank weights).
for mode in (0, 1, 2):
    blended = ensemble_preds(preds_val_0_log, scores, w=mode)
    print('Average score: {:.4f}'.format(log_loss(y_val, blended)))

In [None]:
# Collapse the per-model stacks into one rank-weighted blend (w=2).
# This rebinds the names from 3-D stacks to 2-D matrices, so re-running this
# cell without first re-running the prediction cell fails inside ensemble_preds.
preds_val_0_log = ensemble_preds(preds_val_0_log, scores, w=2)
preds_test_0_log = ensemble_preds(preds_test_0_log, scores, w=2)

# Validation loss of the blended logistic predictions.
cv_score_0_log = log_loss(y_val, preds_val_0_log)

### Load XGBoost model

In [34]:
# Pickled description of the saved XGBoost models; the prediction cell reads
# its 'models' list. NOTE: pickle.load runs arbitrary code - trusted files only.
modelsfile = './model_1_xgboost/models_xgboost_0_V2_2016-08-18-21-24_2.2740_2.26536.pickle'

with open(modelsfile, 'rb') as model_fh:
    xgb_models = pickle.load(model_fh)
    

In [35]:
# Predict the validation and test sets with every saved XGBoost model.
# The stacks are sized from the data (n_classes, number of models) instead of
# the previously hard-coded 12 classes / 5 models.
xgb_model_entries = xgb_models['models']
n_xgb_models = len(xgb_model_entries)

preds_val_0_xgb = np.zeros((X_val.shape[0], n_classes, n_xgb_models))
preds_test_0_xgb = np.zeros((Xtest.shape[0], n_classes, n_xgb_models))

scores = np.zeros(n_xgb_models)

# `nfeatures` is defined in the logistic-model loading cell, so that cell
# must have been run first.
# NOTE(review): the saved output of this cell shows CV scores around 2.448
# against stored scores around 2.27; the gap suggests this column subset may
# not match the features the boosters were trained on - verify.
# The DMatrix inputs are identical for every model, so build them once.
dval = xgb.DMatrix(X_val[:, nfeatures])
dtest = xgb.DMatrix(Xtest[:, nfeatures])

for i, m in enumerate(xgb_model_entries):
    clf = m['model']

    pred_val = clf.predict(dval)
    pred_test = clf.predict(dtest)

    score = log_loss(y_val, pred_val)

    # Compare the freshly computed validation loss with the score stored
    # next to the model in the pickle.
    print('CV Score: {:.4f}'.format(score))
    print('Other score: {:.4f}'.format(m['score']))

    preds_val_0_xgb[:, :, i] = pred_val
    preds_test_0_xgb[:, :, i] = pred_test
    scores[i] = score

CV Score: 2.4481
Other score: 2.2759
CV Score: 2.4483
Other score: 2.2735
CV Score: 2.4481
Other score: 2.2692
CV Score: 2.4489
Other score: 2.2782
CV Score: 2.4482
Other score: 2.2733


In [36]:
# First line: mean of the individual model losses.
print('Average score: {:.4f}'.format(np.mean(scores)))

# Next three lines: validation loss of the blend for w=0 (mean),
# w=1 (inverse-score weights) and w=2 (rank weights).
for mode in (0, 1, 2):
    blended = ensemble_preds(preds_val_0_xgb, scores, w=mode)
    print('Average score: {:.4f}'.format(log_loss(y_val, blended)))

Average score: 2.4483
Average score: 2.4480
Average score: 2.4480
Average score: 2.4479


In [37]:
# Collapse the per-model stacks into one rank-weighted blend (w=2).
# This rebinds the names from 3-D stacks to 2-D matrices, so re-running this
# cell without first re-running the prediction cell fails inside ensemble_preds.
preds_val_0_xgb = ensemble_preds(preds_val_0_xgb, scores, w=2)
preds_test_0_xgb = ensemble_preds(preds_test_0_xgb, scores, w=2)

# Validation loss of the blended XGBoost predictions.
cv_score_0_xgb = log_loss(y_val, preds_val_0_xgb)

## Merge level 1 models

In [38]:
# Stack of level-1 validation predictions, one slice per merged model.
# Only the NN blend is merged here. The shape is derived from the blend
# itself instead of a hard-coded class count of 12.
preds_all = np.zeros(preds_val_0_nn.shape + (1,))
preds_all[:, :, 0] = preds_val_0_nn

In [39]:
# One score per merged model. Only a single model is stacked in preds_all,
# so its weight normalises to 1 whatever value is used here.
scores = [1]
scores

[1]

In [40]:
# Blend the merged validation predictions (rank weights, w=2) and report
# the resulting validation log-loss.
preds_all_val = ensemble_preds(preds_all, scores, w=2)
cv_score_all = log_loss(y_val, preds_all_val)
print(cv_score_all)

2.248813447


In [42]:
# Same merge for the test set: stack the NN blend as the only model slice and
# blend with the same weights. The shape is derived from the blend itself
# instead of a hard-coded class count of 12.
preds_all_test = np.zeros(preds_test_0_nn.shape + (1,))
preds_all_test[:, :, 0] = preds_test_0_nn
preds_all_test = ensemble_preds(preds_all_test, scores, w=2)

### Upload to Kaggle

In [43]:
# Wrap the blended test predictions, the test device ids and the CV score in
# a KaggleResult (from ml_toolbox.kaggle).
# NOTE(review): KaggleResult's behaviour is not visible in this notebook;
# presumably it writes a submission file under `subdir` - confirm.
kag = KaggleResult(preds_all_test, 
                   gatest.device_id.values, 
                   cv_score=cv_score_all, 
                   description=description, 
                   subdir=out_dir)

In [44]:
# Upload only when the first element returned by validate() is truthy, then
# show the leaderboard score held by the result object.
# print() with parentheses matches the other cells and also runs on Python 3.
if kag.validate()[0]:
    kag.upload()
print(kag.lb_score)

2.24184


## Merge with level 0 models

In [69]:
# Earlier level-0 submission, indexed by device_id; some of its rows are
# overwritten with level-1 predictions further down.
lvl0_submission_file = 'submission_2.3713_2016-08-19-22-36_2.3859.csv'
preds_lvl_0 = pd.read_csv(os.path.join(out_dir, lvl0_submission_file),
                          index_col='device_id')

In [70]:
# Load CV sets with events (each file carries device_id and sample_nr columns)
train_cv_w, test_cv_w = [
    pd.read_csv(os.path.join(data_dir, cv_file))
    for cv_file in ('gender_age_train_cv_w.csv', 'gender_age_val_cv_w.csv')
]

In [71]:
# Peek at the first rows of the validation split (device_id and sample_nr).
test_cv_w.head()

Unnamed: 0,device_id,sample_nr
0,1002079943728939269,0
1,-1547860181818787117,1
2,7374582448058474277,2
3,-6220210354783429585,3
4,6873889408535437611,10


In [73]:
# Overwrite the selected rows of the level-0 submission with the level-1
# blended test predictions.
# NOTE(review): the row positions come from test_cv_w.sample_nr, which is
# read from 'gender_age_val_cv_w.csv', yet they index test-set predictions.
# Confirm that these numbers really refer to test rows and that preds_lvl_0
# and preds_all_test share the same row order.
lvl1_rows = test_cv_w.sample_nr.values
preds_lvl_0.iloc[lvl1_rows, :] = preds_all_test[lvl1_rows, :]

In [74]:
# Build a second submission from the merged level-0 / level-1 predictions.
# NOTE(review): this call passes a DataFrame (preds_lvl_0) whereas the earlier
# call passed a numpy array; confirm KaggleResult accepts both. The reported
# cv_score is still the level-1 NN blend score, not a score of the merge.
kag = KaggleResult(preds_lvl_0, 
                   gatest.device_id.values, 
                   cv_score=cv_score_all, 
                   description='models lvl 0 all - lvl 1 nn', 
                   subdir=out_dir)

In [75]:
# Upload only when the first element returned by validate() is truthy, then
# show the leaderboard score held by the result object.
# print() with parentheses matches the other cells and also runs on Python 3.
if kag.validate()[0]:
    kag.upload()
print(kag.lb_score)

2.23768
