![train](http://cliparting.com/wp-content/uploads/2016/06/Train-clipart-for-kids-free-free-clipart-images.gif)

In [54]:
import numpy as np
np.random.seed(1019)

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

import xgboost

import sklearn
from sklearn.model_selection import train_test_split 

import sys, os, gc, types
import time
from subprocess import check_output

In [73]:
sys.path.append('./utils')
from training import cv

In [7]:
# Candidate locations of the competition data, tried in order.
root_paths = [
    "/data/kaggle-instacart",
    "/Users/jiayou/Dropbox/珺珺的程序/Kaggle/Instacart",
    "/Users/jiayou/Dropbox/Documents/珺珺的程序/Kaggle/Instacart"
]


def find_root(candidates, env_var="INSTACART_ROOT"):
    """Return the first existing data directory, or None when there is none.

    The path stored in the environment variable `env_var` (when set and
    non-empty) is tried first, then `candidates` in the order given. This lets
    the notebook run on another machine without editing the hard-coded list.
    """
    override = os.environ.get(env_var)
    search_order = ([override] if override else []) + list(candidates)
    return next((path for path in search_order if os.path.exists(path)), None)


root = find_root(root_paths)
if root is None:
    # Later cells join file names onto `root`; report the problem here rather
    # than letting it surface as an unrelated error further down.
    print("WARNING: no data directory found; set INSTACART_ROOT or edit root_paths")

# Hyper-Parameter Search

In [65]:
# --- Run configuration -------------------------------------------------
name = 'r1'             # tag embedded in the saved search-result file names
down_sample = 10        # keep only rows whose user_id is divisible by this
num_searches = 2        # how many parameter draws the random search evaluates
boosting_rounds = 100   # num_boost_round handed to cross validation
stopping_rounds = 5     # patience handed to the early-stopping callback

# Search space: parameter name -> list of choices or zero-argument sampler.
# Every entry is currently commented out, so each draw equals the defaults.
xgb_params_search = {
#     "learning_rate"    : lambda: int(10**np.random.uniform(-2, -1) * 1e4) / 1e4,
#     "max_depth"        : lambda: np.random.randint(3, 7),
#     "min_split_loss"   : [0, 0.70],
#     "min_child_weight" : [1, 10],
}

-----------

In [11]:
def get_data(down_sample=None, path=None):
    """Load the analytical base table and split it into train and test frames.

    Two noise columns, `rand_uniform` and `rand_normal`, are appended using the
    global NumPy random state.
    NOTE(review): presumably these act as a noise baseline in the
    feature-importance plots - confirm.
    All `float64` / `int64` columns except the three id columns are downcast
    to their 32-bit counterparts.

    Parameters
    ----------
    down_sample : int, optional
        When given, only rows whose `user_id` is divisible by this value are
        kept (in both frames).
    path : str, optional
        CSV file to read. Defaults to `abt.csv` inside the module-level `root`.

    Returns
    -------
    (train, test) : tuple of pandas.DataFrame
        Rows with `eval_set == "train"` and `eval_set == "test"`. Both are
        independent copies, so callers can add or overwrite columns without
        triggering SettingWithCopyWarning.
    """
    if path is None:
        path = os.path.join(root, 'abt.csv')
    data = pd.read_csv(path)

    n_rows = data.shape[0]
    data['rand_uniform'] = np.random.uniform(0, 1, n_rows)
    data['rand_normal'] = np.random.normal(0, 1, n_rows)

    # Identifier columns keep their original width; everything else that is
    # 64-bit is narrowed to 32 bits.
    id_cols = ('user_id', 'product_id', 'order_id')
    narrower = {'float64': 'float32', 'int64': 'int32'}
    for col in data.columns:
        if col in id_cols:
            continue
        target = narrower.get(str(data[col].dtype))
        if target is not None:
            data[col] = data[col].astype(target)

    train = data.loc[data.eval_set == "train", :]
    test = data.loc[data.eval_set == "test", :]

    if down_sample is not None:
        train = train[train.user_id % down_sample == 0]
        test = test[test.user_id % down_sample == 0]

    # Hand back real copies: the frames above are still flagged by pandas as
    # slices of `data`, and callers assign new columns to them.
    return (train.copy(), test.copy())

In [None]:
# Load the (optionally down-sampled) train/test frames, then reclaim memory.
train, test = get_data(down_sample)
gc.collect()
# Rows without a label count as "not reordered". `assign` builds a new frame
# instead of writing a column into the slice returned by get_data, which is
# what raised SettingWithCopyWarning.
train = train.assign(reordered=train.reordered.fillna(0))

In [14]:
# Columns that are not model features.
non_feature_cols = ['eval_set', 'user_id', 'product_id', 'order_id', 'aisle_id', 'reordered']

# Every training row goes to the model; validation happens in the
# cross-validation cell. Current scikit-learn rejects
# `train_test_split(test_size=0)`, so shuffle the rows explicitly with the same
# fixed seed and keep an empty hold-out so X_val / y_val remain defined.
shuffled_positions = np.random.RandomState(1019).permutation(len(train))
X_train = train.drop(non_feature_cols, axis=1).iloc[shuffled_positions]
y_train = train.reordered.iloc[shuffled_positions]
X_val = X_train.iloc[:0]
y_val = y_train.iloc[:0]

d_train = xgboost.DMatrix(X_train, y_train)

In [None]:
# Shapes of the raw splits followed by the model input matrices.
for frame in (train, test, X_train, X_val):
    print(frame.shape)

In [55]:
def get_params(default, search, seed=None):
    """Draw one parameter set: `default` overridden by samples from `search`.

    Parameters
    ----------
    default : dict
        Baseline parameters. Copied, never modified.
    search : dict
        Maps a parameter name to a sampling spec:
        - list or tuple: one element is picked uniformly at random;
        - callable: called with no arguments, its return value is used;
        - anything else: used as a fixed value.
    seed : int, optional
        Seed for the global NumPy generator. With the default `None` the
        generator is re-seeded from fresh entropy, so consecutive calls give
        independent draws; pass an integer for a reproducible draw.

    Returns
    -------
    dict
        A new dict holding the merged parameters.
    """
    # The *global* generator is re-seeded because sampler callables in `search`
    # may draw from np.random themselves. The earlier `int(time.time())` seed
    # produced identical draws for every call made within the same second.
    np.random.seed(seed)
    drawn = dict(default)
    for key, spec in search.items():
        if isinstance(spec, (list, tuple)):
            drawn[key] = spec[np.random.randint(0, len(spec))]
        elif callable(spec):
            drawn[key] = spec()
        else:
            # Previously such specs silently became None.
            drawn[key] = spec
    return drawn

def print_params(params, keys):
    """Print the entries of `params` named by `keys`.

    Output is one list of "key = value" strings, with a blank line before and
    after it.
    """
    print()
    formatted = []
    for key in keys:
        formatted.append("{} = {}".format(key, params[key]))
    print(formatted)
    print()

In [69]:
# Baseline booster settings; the random search overrides individual entries.
xgb_params_default = dict(
    booster="gbtree",
    tree_method="auto",
    learning_rate=0.1,
    min_split_loss=0.7,    # ?
    max_depth=6,
    min_child_weight=10,   # hessian weight
    subsample=0.7,
    colsample_bytree=0.9,
    reg_alpha=2e-05,
    reg_lambda=10,

    objective="reg:logistic",
    eval_metric="logloss",
)

# Draw one parameter set and show the values chosen for the searched keys.
xgb_params = get_params(default=xgb_params_default, search=xgb_params_search)
print_params(xgb_params, keys=xgb_params_search.keys())


['learning_rate = 0.0389', 'max_depth = 3', 'min_split_loss = 0', 'min_child_weight = 1']



In [10]:
print('Running random param search with cross validation...')
# One [params, history] pair per evaluated parameter draw.
results = []
for i in range(num_searches):
    xgb_params = get_params(default=xgb_params_default, search=xgb_params_search)
    print_params(xgb_params, keys=xgb_params_search.keys())
    # `cv` is the project helper imported from ./utils/training.
    # NOTE(review): it appears to return the evaluation history plus the
    # per-fold objects (each exposing `.bst`) - confirm against its source.
    h, cvfolds = cv(
        xgb_params, d_train, num_boost_round=boosting_rounds, nfold=4,
        metrics={'logloss'}, seed = 1019,
        callbacks=[
            xgboost.callback.print_evaluation(show_stdv=True),
            xgboost.callback.early_stop(stopping_rounds=stopping_rounds)
        ])
    results.append([xgb_params, h])

    for f in cvfolds:
        # Pass the axes explicitly: without `ax`, plot_importance opens its own
        # figure, so a preceding plt.figure() only produced an empty one.
        fig, ax = plt.subplots()
        xgboost.plot_importance(f.bst, ax=ax)
        plt.show()
        # Release the figure; one is created per fold per search.
        plt.close(fig)

In [64]:
# Save search results
params = []
histories = []
# Iterate over what was actually collected. Indexing `results` with
# range(num_searches) raised IndexError, or silently dropped rows, whenever
# num_searches no longer matched the number of completed searches.
for search_id, (searched_params, history) in enumerate(results):
    param_row = dict(searched_params)
    param_row['search_id'] = search_id
    params.append(param_row)

    # Work on a copy so the history kept in `results` stays untouched.
    history_rows = history.copy()
    history_rows['search_id'] = search_id
    history_rows['boost_round'] = range(history_rows.shape[0])
    histories.append(history_rows)

# One row per search: the parameters that were evaluated.
p = pd.DataFrame(params)
p.to_csv(os.path.join(root, 'search-{}-params.csv'.format(name)), index=False)

# One row per (search, boosting round): the cross-validation history.
h = pd.concat(histories)
h.to_csv(os.path.join(root, 'search-{}-histories.csv'.format(name)), index=False)

In [None]:
# Fit the final booster on the full training matrix. The prediction cell below
# calls `bst.predict`, and this was the only place `bst` is created; while it
# was commented out, a fresh kernel failed there with NameError.
# `xgb_params` holds the most recent draw from get_params at this point.
watchlist = [(d_train, "train")]
bst = xgboost.train(params=xgb_params, dtrain=d_train, num_boost_round=80, evals=watchlist, verbose_eval=10)
xgboost.plot_importance(bst)

# Prediction and Submission

In [12]:
# Prediction
# Remove the non-feature columns, wrap what is left for xgboost, and score it.
non_features = ['eval_set', 'user_id', 'order_id', 'aisle_id', 'reordered', 'product_id']
test_features = test.drop(non_features, axis=1)
d_test = xgboost.DMatrix(test_features)
pred = bst.predict(d_test)

In [None]:
# Thresholding
# A product is predicted as reordered when its score exceeds this cut-off.
threshold = 0.21
# `assign` returns a new frame; writing the column straight into `test`
# (a slice produced by get_data) raised SettingWithCopyWarning.
test = test.assign(reordered=(pred > threshold).astype(int))

In [13]:
# Submission file
# Product ids are joined into one space-separated string per order, so they
# must be text. `assign` avoids writing into a slice of the source table.
test = test.assign(product_id=test.product_id.astype(str))
# The chain has to sit inside parentheses: continuation lines starting with
# `.` are a syntax error otherwise.
submit = (
    test[test.reordered == 1]
    .groupby('order_id')['product_id']
    # sorted() makes the product order, and thus the file, reproducible;
    # plain set iteration order for strings varies between interpreter runs.
    .agg([lambda x: ' '.join(sorted(set(x)))])
    .reset_index()
)
sample_submission = pd.read_csv(os.path.join(root, 'sample_submission.csv'))
# Adopt the official column names.
# NOTE(review): assumes the sample file has exactly two columns, the first
# being `order_id` - confirm.
submit.columns = sample_submission.columns.tolist()
# Left-join onto the full order list; orders with no predicted product get the
# literal string 'None'.
submit_final = sample_submission[['order_id']].merge(submit, how='left', on='order_id').fillna('None')
submit_final.to_csv("submission.csv", index=False)

In [100]:
# Stats
# Orders in the final file versus orders that received at least one product.
n_orders = submit_final.shape[0]
n_non_empty = submit.shape[0]
print('{} pred orders; {} of them non-empty'.format(n_orders, n_non_empty))
empty_order_ratio = (n_orders - n_non_empty) * 100. / n_orders
# The closing parenthesis of this call was missing, which made the cell a
# syntax error.
print('Empty order ratio is {:.2f}%'.format(empty_order_ratio))