![train](http://cliparting.com/wp-content/uploads/2016/06/Train-clipart-for-kids-free-free-clipart-images.gif)

In [54]:
import numpy as np
np.random.seed(1019)

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

import xgboost

import sklearn
from sklearn.model_selection import train_test_split 

import sys, os, gc, types
import time
from subprocess import check_output

In [73]:
sys.path.append('./utils')

from training import cv
from plotting import plot_importance

In [7]:
# Candidate locations of the competition data, checked in order.
# NOTE(review): these are machine-specific absolute paths; consider making the
# data directory configurable.
root_paths = [
    "/data/kaggle-instacart",
    "/Users/jiayou/Dropbox/珺珺的程序/Kaggle/Instacart",
    "/Users/jiayou/Dropbox/Documents/珺珺的程序/Kaggle/Instacart"
]
# First candidate that exists on this machine, or None when none of them does.
root = next(
    (candidate for candidate in root_paths if os.path.exists(candidate)),
    None)

In [11]:
def get_data(down_sample=None):
    """Load ``abt.csv`` and build the training DMatrix plus the test frame.

    The table is read from the module-level ``root`` directory with explicit
    narrow dtypes. Two random columns (``rand_uniform`` and ``rand_normal``)
    are appended to every row; they are kept among the model features
    (presumably as a noise baseline for the importance plots -- confirm).
    Rows are then separated by ``eval_set`` into train and test.

    Parameters
    ----------
    down_sample : int or None
        When not None, only rows whose ``user_id`` is divisible by this value
        are kept, in both the train and the test part.

    Returns
    -------
    tuple
        ``(d_train, test)``: an ``xgboost.DMatrix`` built from the shuffled
        training rows (missing ``reordered`` labels filled with 0), and the
        test rows as an independent DataFrame that callers may modify.

    Raises
    ------
    FileNotFoundError
        If no data directory was found (``root`` is None).
    """
    if root is None:
        raise FileNotFoundError(
            "No data directory found: none of the candidate root paths exists")

    data = pd.read_csv(
        os.path.join(root, 'abt.csv'),
        dtype={
            'product_id': np.int32,
            'up_order_count': np.int16,
            'up_first_order_number': np.int16,
            'up_last_order_number': np.int16,
            'up_average_cart_position': np.float32,
            'up_days_since_last_order': np.float32,
            'prod_total_cnt': np.int32,
            'prod_reorder_total_cnt': np.float32,
            'prod_user_cnt': np.int32,
            'prod_return_user_cnt': np.int32,
            'prod_user_reorder_ratio': np.float32,
            'prod_product_reorder_ratio': np.float32,
            'user_total_orders': np.int16,
            'user_sum_days_since_prior_order': np.float32,
            'user_mean_days_since_prior_order': np.float32,
            'user_reorder_ratio': np.float32,
            'user_total_products': np.int16,
            'user_distinct_products': np.int16,
            'user_average_basket': np.float32,
            'order_id': np.int32,
            'eval_set': str,
            'time_since_last_order': np.float32,
            'cat_total_bought_cnt': np.int32,
            'cat_reorder_total_cnt': np.float32,
            'cat_user_cnt': np.int32,
            'cat_return_user_cnt': np.int32,
            'cat_user_reorder_ratio': np.float32,
            'cat_product_reorder_ratio': np.float32,
            'cat_num_of_prods_a_user_buys_in_this_cat_mean': np.float32,
            'cat_num_of_prods_a_user_buys_in_this_cat_std': np.float32,
            'cat_num_of_prods_a_user_buys_in_this_cat_max': np.int16,
            'cat_num_of_prods_a_user_buys_in_this_cat_median': np.int16,
            'up_order_rate': np.float32,
            'up_order_since_last_order': np.int16,
            'up_order_rate_since_first_order': np.float32,
            'reordered': np.float32,
            'prod_market_share_hod': np.float32,
            'prod_market_share_dow': np.float32
        })

    # Random columns drawn from NumPy's global RNG, one value per row.
    n = data.shape[0]
    data['rand_uniform'] = np.random.uniform(0, 1, n)
    data['rand_normal'] = np.random.normal(0, 1, n)

    train = data.loc[data.eval_set == "train", :]
    test = data.loc[data.eval_set == "test", :]

    if down_sample is not None:
        train = train[train.user_id % down_sample == 0]
        test = test[test.user_id % down_sample == 0]

    # Work on independent frames: assigning into a slice of `data` raises
    # SettingWithCopyWarning, and callers later add/overwrite columns on `test`.
    train = train.assign(reordered=train.reordered.fillna(0))
    test = test.copy()

    # The original code shuffled the rows via train_test_split(test_size=0),
    # which recent scikit-learn releases reject. A seeded permutation keeps the
    # deterministic shuffle without depending on that behaviour.
    # NOTE(review): confirm whether the exact row order matters downstream.
    order = np.random.RandomState(1019).permutation(train.shape[0])
    train = train.iloc[order]

    # NOTE(review): user_id is not dropped, so it is fed to the model as a
    # feature (as in the original) -- confirm this is intended.
    features = train.drop(['eval_set', 'product_id', 'order_id', 'reordered'], axis=1)
    d_train = xgboost.DMatrix(features, train.reordered)

    return (d_train, test)

# Hyper-Parameter Search

In [65]:
# --- Run configuration -------------------------------------------------------
name = 'v2-r0'          # tag embedded in the names of the files written below
down_sample = None      # forwarded to get_data(); None keeps every user

# Search / boosting budget.
num_searches = 1        # number of parameter draws to evaluate
boosting_rounds = 150   # passed as num_boost_round to cv()
stopping_rounds = 5     # patience handed to the early-stopping callback

# Per-parameter search space consumed by get_params(): a list means "pick one
# entry at random", a function means "call it to draw a value". It is empty
# here, so every search runs with xgb_params_default unchanged.
xgb_params_search = {
#     "learning_rate"    : lambda: int(10**np.random.uniform(-2, -1) * 1e4) / 1e4,
#     "max_depth"        : lambda: np.random.randint(3, 7),
#     "min_split_loss"   : [0, 0.70],
#     "min_child_weight" : [1, 10],
}

-----------

In [None]:
# Build the training DMatrix and the test frame for the configured sample.
d_train, test = get_data(down_sample)

# Release whatever temporaries get_data left behind before training starts.
gc.collect()

# Sanity check: training matrix dimensions, then the test frame's shape.
n_train_rows, n_train_cols = d_train.num_row(), d_train.num_col()
print(n_train_rows, n_train_cols)
print(test.shape)

In [55]:
def get_params(default, search):
    """Draw one parameter set: the defaults overridden by the search space.

    Parameters
    ----------
    default : dict
        Baseline parameters. It is copied, never modified.
    search : dict
        Maps a parameter name to the way its value is chosen:
        * list or tuple -- one element is picked uniformly at random;
        * callable      -- called with no arguments, its result is used;
        * anything else -- used as a fixed value.

    Returns
    -------
    dict
        A new dict containing ``default`` updated with the drawn values.

    Raises
    ------
    ValueError
        If a list/tuple of candidates is empty.
    """
    # Re-seeds NumPy's *global* RNG from the wall clock on every call, which
    # overrides any seed set earlier. Kept as in the original: generator
    # functions in `search` may draw from the global RNG, and this makes
    # separate runs explore different values.
    np.random.seed(int(time.time()))
    p = dict(default)
    for k, gen in search.items():
        if isinstance(gen, (list, tuple)):
            if len(gen) == 0:
                raise ValueError(
                    "empty candidate list for search parameter {!r}".format(k))
            v = gen[np.random.randint(0, len(gen))]
        elif callable(gen):
            v = gen()
        else:
            # Previously any other spec silently became None; treat it as a
            # fixed value so the parameter is not dropped.
            v = gen
        p[k] = v
    return p

def print_params(params, keys):
    """Print the selected entries of ``params`` as a list of "key = value" strings.

    The list is printed on a single line, with one blank line before and one
    after it. Every key in ``keys`` must be present in ``params``.
    """
    print()
    formatted = []
    for key in keys:
        formatted.append("{} = {}".format(key, params[key]))
    print(formatted)
    print()

In [74]:
# Baseline XGBoost parameters; get_params() copies these and overrides any
# entry that appears in xgb_params_search.
xgb_params_default = dict(
    # Booster and tree construction
    booster="gbtree",
    tree_method="auto",
    learning_rate=0.1,
    min_split_loss=0.7,     # value not yet justified (marked "?" by the author)
    max_depth=6,
    min_child_weight=10,    # author's note: hessian weight
    # Row / column sampling
    subsample=0.7,
    colsample_bytree=0.9,
    # Regularisation
    reg_alpha=2e-05,
    reg_lambda=10,
    # Learning task
    objective="reg:logistic",
    eval_metric="logloss",
)

In [10]:
# Run `num_searches` rounds of 5-fold CV, one parameter draw per round, and
# plot the feature importances of every fold's booster.
results = []
cvfolds = None
measures = ['weight', 'gain', 'cover']  # importance types, one subplot each

for search_idx in range(num_searches):
    xgb_params = get_params(default=xgb_params_default, search=xgb_params_search)
    print_params(xgb_params, keys=xgb_params_search.keys())

    # NOTE(review): print_evaluation / early_stop are function-style callbacks;
    # check that the installed xgboost release still provides them.
    cv_callbacks = [
        xgboost.callback.print_evaluation(show_stdv=True),
        xgboost.callback.early_stop(stopping_rounds=stopping_rounds)
    ]
    history, cvfolds = cv(
        xgb_params, d_train,
        num_boost_round=boosting_rounds, nfold=5,
        metrics={'logloss'}, seed=1019,
        callbacks=cv_callbacks)
    # Keep the drawn parameters together with their evaluation history.
    results.append([xgb_params, history])

    for fold in cvfolds:
        _, axes = plt.subplots(nrows=1, ncols=3, figsize=(18,18))
        for ax, measure in zip(axes, measures):
            plot_importance(fold.bst, height=1, ax=ax, importance_type=measure, title=measure)
        plt.show()

----

In [64]:
# Save search results: one CSV with the parameters of every search and one
# with the concatenated CV histories, both tagged with a search_id.
params = []
histories = []
for search_id in range(num_searches):
    search_params, history = results[search_id]

    # Copy before tagging so the entries stored in `results` stay untouched.
    tagged_params = dict(search_params)
    tagged_history = history.copy()

    tagged_params['search_id'] = search_id
    params.append(tagged_params)

    tagged_history['search_id'] = search_id
    tagged_history['boost_round'] = range(tagged_history.shape[0])
    histories.append(tagged_history)

params_path = os.path.join(root, 'train-{}-params.csv'.format(name))
p = pd.DataFrame(params)
p.to_csv(params_path, index=False)

h = pd.concat(histories)
histories_path = os.path.join(root, 'train-{}-histories.csv'.format(name))
h.to_csv(histories_path, index=False)

# Prediction and Submission

In [75]:
def predict(bst):
    """Return ``bst``'s predictions for the module-level ``test`` frame.

    The bookkeeping columns (``eval_set``, ``order_id``, ``reordered``,
    ``product_id``) are removed before the remaining columns are wrapped in a
    DMatrix; ``test`` itself is not modified.
    """
    non_feature_cols = ['eval_set', 'order_id', 'reordered', 'product_id']
    features = test.drop(non_feature_cols, axis=1)
    return bst.predict(xgboost.DMatrix(features))

def ensemble(preds):
    """Average a sequence of prediction arrays element-wise.

    Parameters
    ----------
    preds : sequence
        Prediction arrays (or scalars) of identical shape. None of them is
        modified.

    Returns
    -------
    The element-wise mean of ``preds``.

    Raises
    ------
    ValueError
        If ``preds`` is empty.
    """
    if len(preds) == 0:
        raise ValueError("ensemble() needs at least one set of predictions")

    total = None
    for pred in preds:
        # Use `total + pred`, not `total += pred`: the in-place form would
        # write the running sum into the caller's first array.
        total = pred if total is None else total + pred
    return total / len(preds)

def ensemble_predict(bsts):
    """Predict with every booster in ``bsts`` and return the ensembled result.

    Each booster is scored via ``predict`` and the per-booster predictions are
    combined by ``ensemble``.
    """
    return ensemble([predict(booster) for booster in bsts])

def thresholding(pred, threshold=0.21):
    """Turn scores into 0/1 labels.

    Parameters
    ----------
    pred : array-like supporting ``>`` and ``.astype``
        Predicted scores.
    threshold : float, optional
        Scores strictly greater than this become 1, all others 0. Defaults to
        0.21, the cutoff previously hard-coded here.

    Returns
    -------
    Integer array of the same shape as ``pred``.
    """
    return (pred > threshold).astype(int)

In [None]:
# Prediction: average the boosters of all CV folds, then binarise the scores
# and store them as the `reordered` column of the test frame.
fold_boosters = [f.bst for f in cvfolds]
test['reordered'] = thresholding(ensemble_predict(fold_boosters))

In [13]:
# Submission file
# product_id is converted to strings in place, as before, so later cells keep
# seeing the same `test` frame.
test['product_id'] = test.product_id.astype(str)

# One row per order with its predicted products joined by spaces. The ids are
# de-duplicated and sorted numerically: joining a bare set() of strings yields
# a hash-dependent order, so the written file used to differ between runs.
submit = (
    test[test.reordered == 1]
    .groupby('order_id')['product_id']
    .agg([lambda x: ' '.join(sorted(set(x), key=int))])
    .reset_index()
)

# Reuse the sample file's column names; the left merge keeps every order from
# the sample, and orders without any predicted product get the string 'None'.
sample_submission = pd.read_csv(os.path.join(root, 'sample_submission.csv'))
submit.columns = sample_submission.columns.tolist()
submit_final = sample_submission[['order_id']].merge(submit, how='left').fillna('None')
submit_final.to_csv("submission-{}.csv".format(name), index=False)

In [100]:
# Stats: number of orders in the submission and how many of them have at
# least one predicted product.
n_orders = submit_final.shape[0]
n_non_empty = submit.shape[0]
print('{} pred orders; {} of them non-empty'.format(n_orders, n_non_empty))

# Share of orders submitted without any product, in percent.
empty_order_ratio = (n_orders - n_non_empty) * 100. / n_orders
print('Empty order ratio is {:.2f}%'.format(empty_order_ratio))

In [77]:
# Scratch check: write a one-row frame (an int column and a string column)
# to test.csv without the index.
t = pd.DataFrame({'a': [1], 'b': ['bbb']})
t.to_csv('test.csv', index=False)

In [86]:
# Scratch check: read test.csv back with explicit dtypes. The builtin `object`
# replaces `np.object`, a deprecated alias that recent NumPy releases removed.
t = pd.read_csv('test.csv', dtype={'a': np.int64, 'b': object})
t

Unnamed: 0,a,b
0,1,bbb


In [87]:
# Show the column dtypes of the frame read back from test.csv.
t.dtypes

a     int64
b    object
dtype: object