## Train and Submit

In [1]:
import numpy as np
import pandas as pd
import scipy as sp
import xgboost as xgb
from sklearn.metrics import log_loss
from datetime import datetime

### Load Data

In [2]:
# Directory holding the preprocessed arrays used by this notebook.
path = './droped/'

# Sparse feature matrices for the training and test sets.
train_X, test_X = (
    sp.sparse.load_npz(path + fname) for fname in ('train_X.npz', 'test_X.npz')
)

# Training labels.
train_y = np.load(path + 'train_y.npy')

# Row identifiers; test_VN becomes the 'VisitNumber' index of each submission.
# NOTE(review): train_VN is presumably the matching ids for the training rows -- confirm.
train_VN, test_VN = (
    np.load(path + fname) for fname in ('train_VN.npy', 'test_VN.npy')
)

### Single Model

In [3]:
# Encode every label as the position of its one-hot column (0-based class index).
y = pd.get_dummies(train_y).values.argmax(axis=1)
N = train_X.shape[0]

# Boosting configuration for the single reference model.
num_round = 550
xgb_params = {
    'objective': 'multi:softprob',  # per-class probabilities
    'num_class': 38,
    'eta': 0.2,
    'max_depth': 5,
    'colsample_bytree': 0.4,
    'subsample': 0.7,
    'lambda': 5,
    'silent': 1,
}

dtrain = xgb.DMatrix(train_X, label=y)
dtest = xgb.DMatrix(test_X)

# Fit once, then score both matrices with the fitted booster.
t0 = datetime.now()
bst = xgb.train(xgb_params, dtrain, num_round)
test_pr = bst.predict(dtest)
train_pr = bst.predict(dtrain)

# Report wall time and the log loss on the training data itself
# (this is not a held-out estimate).
elapsed = datetime.now() - t0
train_loss = log_loss(y, train_pr)
print(elapsed, '\t', train_loss)

# One probability column per distinct label, one row per test id.
col_list = ['TripType_' + str(label) for label in np.unique(train_y)]
result = pd.DataFrame(
    test_pr,
    columns=col_list,
    index=pd.Index(test_VN, name='VisitNumber'),
)
result.to_csv('submission_single.csv', columns=result.columns)

0:16:59.916187 	 0.398145999052


### Preparation for Ensembling

In [3]:
# Shared setup for the seed-averaged ensemble.

# 0-based class index for each training row, taken from its one-hot column.
y = pd.get_dummies(train_y).values.argmax(axis=1)
N = train_X.shape[0]

# Same boosting configuration for every ensemble member; the training loop
# adds a per-model 'seed' entry to this dict.
num_round = 550
xgb_params = {
    'objective': 'multi:softprob',
    'num_class': 38,
    'eta': 0.2,
    'max_depth': 5,
    'colsample_bytree': 0.4,
    'subsample': 0.7,
    'lambda': 5,
    'silent': 1,
}

# Build the DMatrix objects once so every model reuses them.
dtrain = xgb.DMatrix(train_X, label=y)
dtest = xgb.DMatrix(test_X)

# Number of differently seeded models to average.
nModels = 20

In [5]:
# np.save('D:/id' + str(0) + '.npy', np.array([0,0,0]))

### Training and Ensembling

In [4]:
# Train `nModels` boosters that differ only in their random seed and average
# their predicted class probabilities.

# Size each accumulator from the matrix whose predictions it collects.
# The test accumulator used to be allocated with N (the *training* row count),
# which only works when train and test happen to have the same number of rows;
# otherwise `sum_test_pr += test_pr` fails after the first model is trained.
# The class count is read from the params so it cannot drift from 'num_class'.
n_class = xgb_params['num_class']
sum_test_pr = np.zeros((test_X.shape[0], n_class))
sum_train_pr = np.zeros((train_X.shape[0], n_class))

print('Training Time\tTrLoss\tTrLoss_Avg')
for j in range(nModels):
    t0 = datetime.now()
    # Distinct, reproducible seed for each ensemble member.
    # NOTE(review): the +20 offset presumably keeps these seeds disjoint from
    # the 20 models trained on the other PC -- confirm.
    xgb_params['seed'] = 10202*(j+20) + 50604
    bst = xgb.train(xgb_params, dtrain, num_round)
    test_pr = bst.predict(dtest)
    train_pr = bst.predict(dtrain)
    sum_test_pr += test_pr
    sum_train_pr += train_pr
    # Columns: wall time for this model, this model's log loss on the training
    # data, and the training log loss of the running average over j+1 models.
    print(datetime.now() - t0, '\t', log_loss(y, train_pr), '\t', log_loss(y, sum_train_pr / (j+1)))
    # Checkpoint the running *sum* (not the average) of test predictions so a
    # partial run can be recovered; divide by (j+1) to get the average.
    np.save('D:/id' + str(j) + '.npy', sum_test_pr)

# Final ensemble prediction: mean probability over all models.
avg_test_pr = sum_test_pr / nModels
col_list = ['TripType_' + str(lb) for lb in np.unique(train_y)]
result = pd.DataFrame(avg_test_pr, columns=col_list, index=test_VN)
result.index.name = 'VisitNumber'
result.to_csv('submission_ensembled.csv', columns=result.columns)

Training Time	TrLoss	TrLoss_Avg
0:17:02.695142 	 0.397155757107 	 0.397155764435
0:16:52.571981 	 0.396869206683 	 0.394937178513
0:16:42.079456 	 0.397561608956 	 0.394430622093
0:16:49.643178 	 0.397116021226 	 0.394044632143
0:16:20.010673 	 0.397409789408 	 0.393858244713
0:16:14.431630 	 0.398201574036 	 0.393894542623
0:16:16.080893 	 0.398036888655 	 0.393887390626
0:16:40.176910 	 0.396813525298 	 0.393746483766
0:16:15.534205 	 0.397062320532 	 0.393650676189
0:16:16.912702 	 0.397948578945 	 0.393629908058
0:16:16.741121 	 0.396708322783 	 0.393533656977
0:16:13.625159 	 0.398331009799 	 0.393588509271
0:16:15.300443 	 0.398784390808 	 0.3936738411
0:16:14.394209 	 0.396831317169 	 0.393610557325
0:16:14.568555 	 0.397745801634 	 0.393605931857
0:16:14.731823 	 0.397490621855 	 0.393593726708
0:16:14.472152 	 0.397065674535 	 0.393556259505
0:16:14.026529 	 0.397628028085 	 0.393541025436
0:16:17.101162 	 0.398347886939 	 0.393553138011
0:16:19.838642 	 0.39864769092 	 0.3935

### Make 40-ensemble Submission
**Another 20 XGBoost models were trained on a second PC, using different random seeds.
Here the two 20-model averages are averaged with equal weight, giving a 40-model ensemble that is written out as the final submission.**

In [7]:
# Averaged predictions of the 20 models trained on the other machine.
other_pc_csv = 'D:/Grain_JGY/ML_Lab/submission_ensembled.csv'
another_20_ensemble = pd.read_csv(other_pc_csv).set_index('VisitNumber')

# Equal-weight mean of the two 20-model averages.
# NOTE(review): the arrays are combined positionally -- this assumes the CSV
# rows and columns are in the same order as avg_test_pr; verify.
ensemble_40 = np.add(another_20_ensemble.values, avg_test_pr) / 2

In [9]:
# Write the 40-model ensemble: one probability column per distinct label,
# one row per test id.
col_list = ['TripType_' + str(label) for label in np.unique(train_y)]
result = pd.DataFrame(
    ensemble_40,
    columns=col_list,
    index=pd.Index(test_VN, name='VisitNumber'),
)
result.to_csv('submission_ensembled_40.csv', columns=result.columns)