![train](http://cliparting.com/wp-content/uploads/2016/06/Train-clipart-for-kids-free-free-clipart-images.gif)

In [1]:
import numpy as np
np.random.seed(1019)

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

import xgboost

import sklearn
from sklearn.model_selection import train_test_split 

import sys, os, gc, types
import time
from subprocess import check_output

In [2]:
sys.path.append('./utils')

from training import cv, train
from plotting import plot_importance
from data import Data

In [3]:
# Candidate locations of the Instacart data directory, in priority order.
# The literal entries are machine-specific; on any other machine, set the
# INSTACART_ROOT environment variable and it will be tried first.
root_paths = [
    "/data/kaggle-instacart",
    "/Users/jiayou/Dropbox/珺珺的程序/Kaggle/Instacart",
    "/Users/jiayou/Dropbox/Documents/珺珺的程序/Kaggle/Instacart"
]
_env_root = os.environ.get("INSTACART_ROOT")
_root_candidates = ([_env_root] if _env_root else []) + root_paths

# `root` is the first candidate that exists on this machine, or None.
root = next((p for p in _root_candidates if os.path.exists(p)), None)
if root is None:
    # Every later os.path.join(root, ...) would otherwise fail with an
    # unhelpful TypeError, so name the real problem here.
    print(
        "WARNING: no data root found; tried: {}".format(_root_candidates),
        file=sys.stderr)

In [4]:
# --- Run settings -----------------------------------------------------------
name = 'v9-r0'     # tag embedded in the file name of every CSV this notebook writes
down_sample = 999  # keep only orders with order_id % down_sample == 0 (None disables)
folds = 5          # number of cross-validation folds

# --- Boosting schedule ------------------------------------------------------
boosting_rounds = 100
stopping_rounds = 10  # NOTE(review): not referenced by any visible cell

# --- XGBoost parameters -----------------------------------------------------
xgb_params = {
    # tree construction
    "booster": "gbtree",
    "tree_method": "hist",
    "learning_rate": 0.1,
    "max_depth": 6,
    "min_child_weight": 10,  # hessian weight
    "subsample": 0.7,
    "colsample_bytree": 0.9,

    # objective and the metric reported per round
    "objective": "reg:logistic",
    "eval_metric": "logloss",

    # regularisation
    "min_split_loss": 0.7,  # ? (rationale for this value is undocumented)
    "reg_alpha": 2e-05,
    "reg_lambda": 10,
    # "grow_policy": ["lossguide"]   (disabled)
}

# Column name -> dtype mapping handed to pd.read_csv when loading abt_train.csv.
# Key order is kept exactly as in the original definition.
schema = dict(
    product_id=np.int32,

    # user x product ("up_") columns
    up_order_count=np.int16,
    up_first_order_number=np.int16,
    up_last_order_number=np.int16,
    up_average_cart_position=np.float32,
    up_days_since_last_order=np.float32,

    # product ("prod_") columns
    prod_total_cnt=np.int32,
    prod_reorder_total_cnt=np.float32,
    prod_user_cnt=np.int32,
    prod_return_user_cnt=np.int32,
    prod_user_reorder_ratio=np.float32,
    prod_product_reorder_ratio=np.float32,

    # user ("user_") columns
    user_total_orders=np.int16,
    user_sum_days_since_prior_order=np.float32,
    user_mean_days_since_prior_order=np.float32,
    user_reorder_ratio=np.float32,
    user_total_products=np.int16,
    user_distinct_products=np.int16,
    user_average_basket=np.float32,

    # order columns
    order_id=np.int32,
    eval_set=str,
    days_since_prior_order=np.float32,

    # "cat_" columns
    cat_total_bought_cnt=np.int32,
    cat_reorder_total_cnt=np.float32,
    cat_user_cnt=np.int32,
    cat_return_user_cnt=np.int32,
    cat_user_reorder_ratio=np.float32,
    cat_product_reorder_ratio=np.float32,
    cat_num_of_prods_a_user_buys_in_this_cat_mean=np.float32,
    cat_num_of_prods_a_user_buys_in_this_cat_std=np.float32,
    cat_num_of_prods_a_user_buys_in_this_cat_max=np.int16,

    # further "up_" columns, the label, and market-share columns
    up_order_rate=np.float32,
    up_order_since_last_order=np.int16,
    up_order_rate_since_first_order=np.float32,
    reordered=np.float32,
    prod_market_share_hod=np.float32,
    prod_market_share_dow=np.float32,
    up_days_since_last_not_order=np.float32,
    up_order_since_last_not_order=np.float16,
)

# Fold split, training on folds, and merging reorder probabilities into the abt

In [11]:
# Predict cur_fold as validation, using the remaining folds as training.
def split_test_train(cur_fold):
    """Build one cross-validation split from the per-fold training data.

    Fold ``cur_fold`` is held out as the validation set and all other folds
    are concatenated into the training set. Reads the module-level
    ``X_train_df`` and ``y_train_df`` (indexed by fold number) and ``folds``.

    Args:
        cur_fold: index of the fold to hold out; must satisfy
            ``0 <= cur_fold < folds``.

    Returns:
        Tuple ``(X_train, X_val, y_train, y_val)``.

    Raises:
        ValueError: if ``cur_fold`` is outside ``range(folds)``.
    """
    # On a list-like container a negative index selects a fold from the end
    # as validation, while the `i != cur_fold` filter below would still keep
    # that same fold in the training data -- a silent train/validation leak.
    if not 0 <= cur_fold < folds:
        raise ValueError(
            'cur_fold must be in range({}), got {!r}'.format(folds, cur_fold))

    training_folds = [i for i in range(folds) if i != cur_fold]

    X_val = X_train_df[cur_fold]
    y_val = y_train_df[cur_fold]
    X_train = pd.concat([X_train_df[i] for i in training_folds])
    y_train = pd.concat([y_train_df[i] for i in training_folds])
    return (X_train, X_val, y_train, y_val)

def train_predict_one_fold(cur_fold):
    """Train on every fold except ``cur_fold`` and save predictions for it.

    Splits the data with ``split_test_train``, trains a booster with the
    module-level ``xgb_params`` / ``boosting_rounds``, predicts the held-out
    fold and writes ``val_prediction_cv{cur_fold}_{name}.csv`` under ``root``
    with the columns ``order_id``, ``product_id`` and ``reorder_prob``.

    Args:
        cur_fold: index of the fold used as validation data.

    Returns:
        None. The predictions are only written to disk.
    """
    X_train, X_val, y_train, y_val = split_test_train(cur_fold)

    # Identifier columns are excluded from the model inputs; they are still
    # read from X_val below to key the saved predictions.
    id_columns = ['product_id', 'order_id']

    # The drops stay inline so the temporary frames are not kept alive past
    # DMatrix construction.
    dtrain = xgboost.DMatrix(X_train.drop(id_columns, axis=1), y_train)
    dval = xgboost.DMatrix(X_val.drop(id_columns, axis=1), y_val)

    gc.collect()
    for matrix in (dtrain, dval):
        print(matrix.num_row(), matrix.num_col())

    watchlist = [(dtrain, 'train'), (dval, 'val')]
    bst = train(
        xgb_params,
        dtrain,
        num_boost_round=boosting_rounds,
        evals=watchlist)

    print('Training cv fold{} is done'.format(cur_fold))

    fold_predictions = pd.DataFrame({
        'order_id': X_val['order_id'],
        'product_id': X_val['product_id'],
        'reorder_prob': bst.predict(dval),
    })
    output_path = os.path.join(
        root, 'val_prediction_cv{}_{}.csv'.format(cur_fold, name))
    fold_predictions.to_csv(output_path, index=False)

In [10]:
# Load abt_train.csv from the `abt` folder under `root`, using the explicit
# column dtypes declared in `schema`.
abt_train_path = os.path.join(root, 'abt', 'abt_train.csv')
train_df = pd.read_csv(abt_train_path, dtype=schema)

# Optional down-sampling: keep only the rows whose order_id is a multiple of
# `down_sample`.
if down_sample is not None:
    keep_rows = (train_df['order_id'] % down_sample) == 0
    train_df = train_df[keep_rows]

In [12]:
# Fetch the per-fold features and labels (indexed by fold number in
# split_test_train), then train and save predictions for each fold in turn.
X_train_df, y_train_df = Data.train_for_stacking(
    down_sample=down_sample,
    folds=folds,
    aug=False)

for fold_idx in range(folds):
    train_predict_one_fold(fold_idx)

print('All cv folds are done')

824 33
272 33
Round 0 update starts...
Round 0 update: 0.00s
Round 0 eval starts...
Round 0 eval: 0.00s
[0]	train-logloss:0.63539	val-logloss:0.633741
Round 1 update starts...
Round 1 update: 0.00s
Round 1 eval starts...
Round 1 eval: 0.00s
[1]	train-logloss:0.586065	val-logloss:0.582934
Round 2 update starts...
Round 2 update: 0.00s
Round 2 eval starts...
Round 2 eval: 0.00s
[2]	train-logloss:0.545635	val-logloss:0.54204
Round 3 update starts...
Round 3 update: 0.00s
Round 3 eval starts...
Round 3 eval: 0.00s
[3]	train-logloss:0.511021	val-logloss:0.507268
Round 4 update starts...
Round 4 update: 0.00s
Round 4 eval starts...
Round 4 eval: 0.00s
[4]	train-logloss:0.481888	val-logloss:0.478829
Round 5 update starts...
Round 5 update: 0.00s
Round 5 eval starts...
Round 5 eval: 0.00s
[5]	train-logloss:0.456088	val-logloss:0.452645
Round 6 update starts...
Round 6 update: 0.00s
Round 6 eval starts...
Round 6 eval: 0.00s
[6]	train-logloss:0.433804	val-logloss:0.428306
Round 7 update starts.

-----------

# Train on the full abt and predict on the test abt

In [13]:
# Request the training data with test_size=0, so the validation part is
# expected to be empty and every row is used for fitting.
X_train, X_val, y_train, y_val = Data.train(
    down_sample=down_sample, test_size=0, aug=False)

# Per the original note, X_train / X_val carry no order_id / product_id
# columns here; only these helper columns are removed before training.
drop_list = ['rand_uniform', 'rand_normal', 'aug']

dtrain = xgboost.DMatrix(X_train.drop(drop_list, axis=1), y_train)
dval = xgboost.DMatrix(X_val.drop(drop_list, axis=1), y_val)

gc.collect()
print(dtrain.num_row(), dtrain.num_col())
print(dval.num_row(), dval.num_col())

# Only the training matrix is monitored; dval is built but not evaluated.
bst = train(
    xgb_params,
    dtrain,
    num_boost_round=boosting_rounds,
    evals=[(dtrain, 'train')])

# Score the test set. `test` is mutated in place and reused by later cells,
# which expect its new `reorder_prob` column.
test = Data.test(down_sample=down_sample)
test.drop(drop_list, axis=1, inplace=True)

non_feature_columns = ['eval_set', 'order_id', 'reordered', 'product_id']
dtest = xgboost.DMatrix(test.drop(non_feature_columns, axis=1))
test['reorder_prob'] = bst.predict(dtest)

full_prediction_path = os.path.join(
    root, 'test_prediction_full_{}.csv'.format(name))
test[['order_id', 'product_id', 'reorder_prob']].to_csv(
    full_prediction_path, index=False)


1096 33
0 33
Round 0 update starts...
Round 0 update: 0.00s
Round 0 eval starts...
Round 0 eval: 0.00s
[0]	train-logloss:0.632937
Round 1 update starts...
Round 1 update: 0.00s
Round 1 eval starts...
Round 1 eval: 0.00s
[1]	train-logloss:0.583439
Round 2 update starts...
Round 2 update: 0.00s
Round 2 eval starts...
Round 2 eval: 0.00s
[2]	train-logloss:0.542105
Round 3 update starts...
Round 3 update: 0.00s
Round 3 eval starts...
Round 3 eval: 0.00s
[3]	train-logloss:0.50532
Round 4 update starts...
Round 4 update: 0.00s
Round 4 eval starts...
Round 4 eval: 0.00s
[4]	train-logloss:0.47415
Round 5 update starts...
Round 5 update: 0.00s
Round 5 eval starts...
Round 5 eval: 0.00s
[5]	train-logloss:0.446978
Round 6 update starts...
Round 6 update: 0.00s
Round 6 eval starts...
Round 6 eval: 0.00s
[6]	train-logloss:0.42494
Round 7 update starts...
Round 7 update: 0.00s
Round 7 eval starts...
Round 7 eval: 0.00s
[7]	train-logloss:0.40432
Round 8 update starts...
Round 8 update: 0.00s
Round 8 

----

# XGBoost top (stacking) model: training and prediction

In [14]:
# Merge the per-fold validation predictions (reorder_prob) into train_df.
# Each fold's CSV was written by train_predict_one_fold for its held-out fold.
pred = [
    pd.read_csv(os.path.join(root, 'val_prediction_cv{}_{}.csv'.format(i, name)))
    for i in range(folds)
]
pred_df = pd.concat(pred)

# Keep this cell idempotent: if it is re-run, train_df already has a
# 'reorder_prob' column, and merging again would create 'reorder_prob_x' /
# 'reorder_prob_y' instead of replacing it. Drop any earlier copy first.
train_df = (
    train_df
    .drop('reorder_prob', axis=1, errors='ignore')
    .merge(pred_df, on=['order_id', 'product_id'], how='left')
)

In [27]:
# Rows without a label are filled with 0.
train_df.loc[:, 'reordered'] = train_df['reordered'].fillna(0)

# Everything except identifiers, the eval_set marker and the label is a feature.
non_feature_columns = ['eval_set', 'product_id', 'order_id', 'reordered']
X_train, X_val, y_train, y_val = train_test_split(
    train_df.drop(non_feature_columns, axis=1),
    train_df['reordered'],
    test_size=0.2,
    random_state=1019)

# Sort the feature columns by name so that the train, validation and test
# matrices present their columns in the same order.
X_train = X_train.sort_index(axis=1)
X_val = X_val.sort_index(axis=1)

dtrain = xgboost.DMatrix(X_train, y_train)
dval = xgboost.DMatrix(X_val, y_val)

gc.collect()
print(dtrain.num_row(), dtrain.num_col())
print(dval.num_row(), dval.num_col())

results = []

bst = train(
    xgb_params,
    dtrain,
    num_boost_round=boosting_rounds,
    evals=[(dtrain, 'train'), (dval, 'val')])

# `test` comes from the earlier full-training cell and already carries the
# reorder_prob column; it is sorted in place to match the training columns.
test.sort_index(axis=1, inplace=True)

dtest = xgboost.DMatrix(test.drop(non_feature_columns, axis=1))
pred_t = bst.predict(dtest)

876 34
220 34
Round 0 update starts...
Round 0 update: 0.00s
Round 0 eval starts...
Round 0 eval: 0.00s
[0]	train-logloss:0.634388	val-logloss:0.632855
Round 1 update starts...
Round 1 update: 0.00s
Round 1 eval starts...
Round 1 eval: 0.00s
[1]	train-logloss:0.585663	val-logloss:0.583604
Round 2 update starts...
Round 2 update: 0.00s
Round 2 eval starts...
Round 2 eval: 0.00s
[2]	train-logloss:0.54466	val-logloss:0.542088
Round 3 update starts...
Round 3 update: 0.00s
Round 3 eval starts...
Round 3 eval: 0.00s
[3]	train-logloss:0.509428	val-logloss:0.504836
Round 4 update starts...
Round 4 update: 0.00s
Round 4 eval starts...
Round 4 eval: 0.00s
[4]	train-logloss:0.479083	val-logloss:0.473608
Round 5 update starts...
Round 5 update: 0.00s
Round 5 eval starts...
Round 5 eval: 0.00s
[5]	train-logloss:0.453872	val-logloss:0.447388
Round 6 update starts...
Round 6 update: 0.00s
Round 6 eval starts...
Round 6 eval: 0.00s
[6]	train-logloss:0.43208	val-logloss:0.424904
Round 7 update starts.

In [29]:
# Store the top model's predictions in the 'reordered' column (replacing
# whatever it held) and export them together with the identifier columns.
test['reordered'] = pred_t

output_columns = ['product_id', 'order_id', 'reordered']
test_result = test[output_columns]

test_result_path = os.path.join(
    root, 'test_prob_prediction_{}.csv'.format(name))
test_result.to_csv(test_result_path, index=False)