# Train level 0: XGBoost model

- V1:Optimize xgboost for training on brand / device
    - Alpha: increases for alpha value 1 to 5 and from 0.1 to 0.9, 0 is best value
    - Lambda: decreases for [-0.05, 0,  0.05, 0.1, 0.2, 0.5, 1, 2, 5], it decreases after five: [6, 7, 8, 10, 12, 15, 20]. 6.2 optimal 2.39044903984
    - Number rounds: 25, 50, 75, 100: increases with number of rounds, 75 seems optimal
    - Select best data for model (onehotencoded both, separate, or just one)
        - Brands onehot, device label: 2.4304884100135387
        - Seperate onehot encoded: 2.3946668507651609 (requires regularization)
        - Combined onehot encoded: 2.402189399699036
- V2: Used couple of creative features based on brand and device model, also included brand when encoded device
    - Without scaler: 2.38983383101
    - With scaler: 2.3901336994178406
    - Alpha: score decreases, but train score decreases further, so less overfitting
    - Lambda: score decreases, but train score decreases further, so less overfitting (less dramatic than alpha)
    - Weet niet waar ik ga aan doen: ga maar voor gemiddeld: alpha: 1, lambda: 3
- V3: Generated five different models with different seeds without scaler and first 1800 features
    - CV Score: 2.3905, LB Score: 2.39022

In [1]:
import os
import pickle
import datetime

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from scipy.sparse import csr_matrix, hstack

from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import log_loss
from sklearn.cross_validation import StratifiedKFold
from sklearn.preprocessing import StandardScaler

from xgboost.sklearn import XGBClassifier
import xgboost as xgb

from ml_toolbox.kaggle import KaggleResult


%matplotlib inline


### Loading and preparing data

In [4]:
data_dir = './data_ori/'
feat_dir = './data/'
sub_dir = './model_0_xgboost'

description = 'models_xgboost_0_V3'

use_scaler = False

In [5]:
def open_feature_file(fname, samples='train'):
    if fname[-3:] == 'csv':
        if samples=='train':
            X = gatrain[['device_id']].merge( pd.read_csv(os.path.join(feat_dir, fname)), on='device_id', how='left')
        else:
            X = gatest[['device_id']].merge( pd.read_csv(os.path.join(feat_dir, fname)), on='device_id', how='left')
            
        X.drop('device_id', axis=1, inplace=True)
        X.fillna(0, inplace=True)
        
        if use_scaler:
            for c in X.columns:
                if X[c].max()>1:
                    X[c] = StandardScaler().fit_transform(X)
            
        #print X.shape
        return csr_matrix(X.values)
    else:
        # Assume it is a pickle file
        with open(os.path.join(feat_dir, '{}_{}.pickle'.format(fname,samples)), 'rb') as f:
            return pickle.load(f)
            
feature_files = ['features_brand_bag',
                 'features_brand_model_bag',
                 'features_brand_model.csv']

# Which features
n_features = range(1800)

In [6]:
gatrain = pd.read_csv('./data_ori/gender_age_train.csv')
gatest = pd.read_csv('./data_ori/gender_age_test.csv')

In [7]:
Xtrain = hstack([open_feature_file(f) for f in feature_files], format='csr')
Xtest = hstack([open_feature_file(f,'test') for f in feature_files], format='csr')
y = gatrain['group']

In [8]:
Xtrain = Xtrain[:, n_features]
Xtest = Xtest[:, n_features]

In [9]:
letarget = LabelEncoder().fit(gatrain.group.values)
y = letarget.transform(gatrain.group.values)
n_classes = len(letarget.classes_)

In [10]:
# Load CV sets
train_cv = pd.read_csv(os.path.join(data_dir, 'gender_age_train_cv.csv'))
test_cv = pd.read_csv(os.path.join(data_dir, 'gender_age_test_cv.csv'))

X_train, X_val = Xtrain[train_cv.sample_nr.values, :], Xtrain[test_cv.sample_nr.values, :]
y_train, y_val = y[train_cv.sample_nr], y[test_cv.sample_nr]

In [11]:
print 'X_train', X_train.shape, 'X_val:', X_val.shape
print 'y_train', y_train.shape, 'y_val', y_val.shape

X_train (67229, 1800) X_val: (7416, 1800)
y_train (67229,) y_val (7416,)


## Using XGBoost

In [12]:
n_models = 5

scores = []
models_out = []

for s in np.random.randint(99999,size=n_models):
    
    kf = list(StratifiedKFold(y, n_folds=10, shuffle=True, random_state=s))[0]

    Xtr, Xte = Xtrain[kf[0], :], Xtrain[kf[1], :]
    ytr, yte = y[kf[0]], y[kf[1]]
    
    params = {
            "objective": "multi:softprob",
            'booster': 'gblinear',
            'num_class': 12,
            "eta": 0.01,
            "silent": 1,
            'alpha':1,
            'lambda': 3,
            'n_estimators': 250,
            'seed': s,
            'eval_metric': 'mlogloss'
        }
    
    xg_train = xgb.DMatrix(Xtr, label=ytr)
    xg_val = xgb.DMatrix(Xte, label=yte)
    
    watchlist = [ (xg_train,'train'), (xg_val, 'val') ]
    
    clf = xgb.train(params, xg_train, params['n_estimators'], watchlist, verbose_eval=50 )
    
    pred_val = clf.predict(xg_val)
    
    score = log_loss(yte, pred_val)
    
    print('Validation score on {}: {:.4f}'.format(s, score))

    model_out = {'model': clf,
                 'score': score,
                 'params': params}
    
    models_out.append(model_out)
    scores.append(score)

[0]	train-mlogloss:2.481582	val-mlogloss:2.481968
[50]	train-mlogloss:2.402277	val-mlogloss:2.411722
[100]	train-mlogloss:2.379335	val-mlogloss:2.395078
[150]	train-mlogloss:2.369483	val-mlogloss:2.389797
[200]	train-mlogloss:2.364329	val-mlogloss:2.387778
[249]	train-mlogloss:2.361276	val-mlogloss:2.386955


Validation score on 6598: 2.3870


[0]	train-mlogloss:2.481571	val-mlogloss:2.481990
[50]	train-mlogloss:2.401988	val-mlogloss:2.413305
[100]	train-mlogloss:2.378865	val-mlogloss:2.397761
[150]	train-mlogloss:2.369007	val-mlogloss:2.393200
[200]	train-mlogloss:2.363744	val-mlogloss:2.391690
[249]	train-mlogloss:2.360715	val-mlogloss:2.391236


Validation score on 8999: 2.3912


[0]	train-mlogloss:2.481572	val-mlogloss:2.482058
[50]	train-mlogloss:2.401570	val-mlogloss:2.415254
[100]	train-mlogloss:2.378293	val-mlogloss:2.400645
[150]	train-mlogloss:2.368340	val-mlogloss:2.396770
[200]	train-mlogloss:2.363139	val-mlogloss:2.395773
[249]	train-mlogloss:2.360070	val-mlogloss:2.395693


Validation score on 20347: 2.3957


[0]	train-mlogloss:2.481569	val-mlogloss:2.481998
[50]	train-mlogloss:2.401962	val-mlogloss:2.412853
[100]	train-mlogloss:2.379029	val-mlogloss:2.396592
[150]	train-mlogloss:2.369150	val-mlogloss:2.391467
[200]	train-mlogloss:2.363961	val-mlogloss:2.389560
[249]	train-mlogloss:2.360815	val-mlogloss:2.388834


Validation score on 12690: 2.3888


[0]	train-mlogloss:2.481554	val-mlogloss:2.481987
[50]	train-mlogloss:2.402125	val-mlogloss:2.412875
[100]	train-mlogloss:2.379049	val-mlogloss:2.396916
[150]	train-mlogloss:2.369217	val-mlogloss:2.392071
[200]	train-mlogloss:2.364056	val-mlogloss:2.390374


Validation score on 73471: 2.3898


[249]	train-mlogloss:2.360983	val-mlogloss:2.389785


In [13]:
cv_score = np.mean(scores)
print('CV Score: {:.4f}'.format(cv_score))

CV Score: 2.3905


## Upload result to Kaggle

In [14]:
preds = []
scores = []
for m in models_out:
    clf = m['model']
    
    pred = clf.predict(xgb.DMatrix(Xtest))
    
    preds.append(pred)
    
pred_test = sum(preds)/len(models_out)

pred = pd.DataFrame(pred, index = gatest.device_id, columns=letarget.classes_)

In [15]:
kag = KaggleResult(pred.values, 
                   pred.index.values, 
                   cv_score=cv_score, 
                   description=description, 
                   subdir=sub_dir)

In [16]:
if kag.validate():
    kag.upload()
print kag.lb_score

2.39022


## Store models...

In [17]:
outputfile = '{}_{}_{:.4f}_{:.4f}.pickle'.format(description, 
                                                 datetime.datetime.now().strftime("%Y-%m-%d-%H-%M"),
                                                 cv_score,
                                                 kag.lb_score)

output = {'script': 'train_model_0_xgboost',
          'features': n_features,
          'feature_sets': feature_files,
          'model_params': params,
          'no_models': 5,
          'cross_validation': {'type': 'randomsplit_0.1'},
          'models': models_out}


with open(os.path.join(sub_dir, outputfile), 'wb') as f:
    pickle.dump(output,f)

## Below is WIP....

In [None]:
for a in np.linspace(3,6,3):
    params = {
        "objective": "multi:softprob",
        'booster': 'gblinear',
        'num_class': 12,
        "eta": 0.01,
        "silent": 1,
        'alpha':1,
        'lambda': a,
        'n_estimators': 250,
        'eval_metric': 'mlogloss'
    }
    kf = StratifiedKFold(y, n_folds=10, shuffle=True, random_state=0)
    pred = np.zeros((Xtrain.shape[0],n_classes))
    for itrain, itest in kf:
        ytrain, ytest = y[itrain], y[itest]
        xg_train = xgb.DMatrix( Xtrain[itrain, :], label=ytrain)
        xg_test = xgb.DMatrix(Xtrain[itest, :], label=ytest)
        watchlist = [ (xg_train,'train'), (xg_test, 'test') ]
        bst = xgb.train(params, xg_train, params['n_estimators'], watchlist, verbose_eval=50 )
        pred[itest,:] = bst.predict(xg_test)
    print a,':',log_loss(y, pred)

In [None]:
log_loss(y, pred)