In [2]:
import numpy as np
np.random.seed(1019)

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

import lightgbm as lgb

import sklearn
from sklearn.model_selection import train_test_split 

import sys, os, gc, types
import time
from subprocess import check_output

In [3]:
sys.path.append('./utils')

from training import cv, train
from plotting import plot_importance
from data import Data

ModuleNotFoundError: No module named 'pandas.core.indexes'

In [3]:
# Candidate locations of the project data directory, checked in this order.
root_paths = [
    "/data/kaggle-instacart",
    "/Users/jiayou/Dropbox/珺珺的程序/Kaggle/Instacart",
    "/Users/jiayou/Dropbox/Documents/珺珺的程序/Kaggle/Instacart"
]

# First candidate that exists on this machine; remains None when none of them does.
root = next(
    (candidate for candidate in root_paths if os.path.exists(candidate)),
    None,
)

In [47]:
# Experiment tag; it is embedded in the file names of the written predictions.
name = 'v13-b1'

# Data-loading / cross-validation settings (forwarded to the Data helpers below).
down_sample, folds, aug = 999, 5, False

# Boosting schedule: maximum number of rounds and early-stopping patience.
boosting_rounds, stopping_rounds = 100, 10

# LightGBM training parameters.
params = dict(
    task='train',
    boosting_type='gbdt',
    objective='binary',
    metric={'binary_logloss', 'auc'},
    num_leaves=256,
    min_sum_hessian_in_leaf=20,
    max_depth=12,
    learning_rate=0.05,
    feature_fraction=0.6,
    # bagging_fraction=0.9,
    # bagging_freq=3,
    verbose=1,
)



# Bottom-level (base) models: training and prediction

In [48]:
# Predict cur_fold as the validation fold, using the remaining folds for training.
# The helpers below build the per-fold training and validation data.
def train_for_stacking(down_sample=down_sample, folds=folds, aug=aug):
    """Load the training table and partition it into per-fold features and labels.

    A row belongs to fold ``order_id % folds``, so all rows of one order end up
    in the same fold.

    Args:
        down_sample: forwarded unchanged to ``Data.train``.
        folds: number of folds to partition the rows into.
        aug: forwarded unchanged to ``Data.train``.

    Returns:
        Tuple ``(X_train_df, y_train_df)`` of two lists with ``folds`` entries
        each. ``X_train_df[i]`` holds the rows of fold ``i`` without the
        ``eval_set`` and ``reordered`` columns and with the remaining columns
        sorted by name; ``y_train_df[i]`` holds the matching ``reordered``
        values.
    """
    # Named train_df (not `train`) so the imported `training.train` is not shadowed.
    train_df = Data.train(down_sample=down_sample, aug=aug)

    # Loop-invariant work is done once: drop the non-feature columns, put the
    # columns in a deterministic (sorted) order, and compute each row's fold id.
    # sort_index returns a new frame, so no filtered slice is mutated in place.
    features = train_df.drop(['eval_set', 'reordered'], axis=1).sort_index(axis=1)
    fold_ids = train_df.order_id % folds

    X_train_df = []
    y_train_df = []
    for i in range(folds):
        in_fold = fold_ids == i
        X_train_df.append(features[in_fold])
        y_train_df.append(train_df.reordered[in_fold])

    return (X_train_df, y_train_df)
    
def split_test_train(cur_fold):
    """Return the train/validation split that holds out fold ``cur_fold``.

    Reads the module-level lists ``X_train_df`` and ``y_train_df`` (as produced
    by ``train_for_stacking``).

    Args:
        cur_fold: zero-based index of the fold to use for validation.

    Returns:
        Tuple ``(X_train, X_val, y_train, y_val)`` where the ``*_val`` items are
        the held-out fold and the ``*_train`` items are the concatenation of all
        other folds.

    Raises:
        IndexError: if ``cur_fold`` is not in ``range(len(X_train_df))``.
    """
    # Use the number of folds that were actually built rather than the global
    # `folds` setting, which may have been edited since X_train_df was created.
    n_folds = len(X_train_df)

    # A negative index would select a fold from the end for validation while the
    # `i != cur_fold` filter excludes nothing, leaking validation rows into the
    # training set; reject it explicitly.
    if not 0 <= cur_fold < n_folds:
        raise IndexError(
            'cur_fold must be in [0, {}), got {}'.format(n_folds, cur_fold))

    X_val = X_train_df[cur_fold]
    y_val = y_train_df[cur_fold]

    train_folds = [i for i in range(n_folds) if i != cur_fold]
    X_train = pd.concat([X_train_df[i] for i in train_folds])
    y_train = pd.concat([y_train_df[i] for i in train_folds])
    return (X_train, X_val, y_train, y_val)

def train_predict_one_fold(cur_fold):
    """Train on every fold except ``cur_fold`` and write predictions for ``cur_fold``.

    Uses the module-level settings ``params``, ``boosting_rounds``,
    ``stopping_rounds``, ``name`` and ``root``. The held-out fold serves as the
    early-stopping validation set, and its predictions are written to
    ``<root>/stacking/val_prediction.<name>.cv<cur_fold>.csv`` with the columns
    ``order_id``, ``product_id`` and ``reorder_prob``.

    Args:
        cur_fold: zero-based index of the fold to hold out.

    Raises:
        TypeError: if ``root`` is ``None`` (raised by ``os.path.join`` before
            any training starts).
    """
    # Resolve and create the output location first, so an unusable `root` fails
    # here instead of after the model has been trained.
    out_dir = os.path.join(root, 'stacking')
    os.makedirs(out_dir, exist_ok=True)
    out_path = os.path.join(
        out_dir, 'val_prediction.{}.cv{}.csv'.format(name, cur_fold))

    X_train, X_val, y_train, y_val = split_test_train(cur_fold)

    # Identifier columns are kept in X_val for the output file but must not be
    # used as model features (the full-data model is trained without them too).
    drop_list = [
        'product_id',
        'order_id'
    ]
    X_train_feat = X_train.drop(drop_list, axis=1)
    X_val_feat = X_val.drop(drop_list, axis=1)

    lgb_train = lgb.Dataset(X_train_feat, y_train)
    lgb_eval = lgb.Dataset(X_val_feat, y_val, reference=lgb_train)

    gc.collect()

    print('Start training...')
    # Early stopping is passed as a callback: the `early_stopping_rounds`
    # keyword of lgb.train was removed in LightGBM 4, the callback works in both.
    gbm = lgb.train(params,
                    lgb_train,
                    num_boost_round=boosting_rounds,
                    valid_sets=[lgb_eval],
                    callbacks=[lgb.early_stopping(stopping_rounds)])

    print('Training cv fold{} is done'.format(cur_fold))

    # Booster.predict needs the raw feature data; it does not accept lgb.Dataset.
    pred = gbm.predict(X_val_feat, num_iteration=gbm.best_iteration)
    pred_df = pd.DataFrame({
        'order_id': X_val.order_id,
        'product_id': X_val.product_id,
        'reorder_prob': pred,
    })
    pred_df.to_csv(out_path, index=False)

In [49]:
# Build the per-fold frames once; split_test_train reads these two module-level lists.
X_train_df, y_train_df = train_for_stacking(down_sample=down_sample,folds=folds, aug=aug)

# Hold out each fold in turn and write its validation predictions.
for cur_fold in range(folds):
    train_predict_one_fold(cur_fold)

print('All cv folds are done')

824 33
272 33
{'booster': 'gbtree', 'tree_method': 'hist', 'learning_rate': 0.1, 'max_depth': 2, 'min_child_weight': 10, 'subsample': 0.7, 'colsample_bytree': 0.9, 'objective': 'reg:logistic', 'eval_metric': 'logloss', 'min_split_loss': 0.7, 'reg_alpha': 2e-05, 'reg_lambda': 10}
Round 0 update starts...
Round 0 update: 0.00s
Round 0 eval starts...
Round 0 eval: 0.00s
[0]	train-logloss:0.63539	val-logloss:0.633741
Round 1 update starts...
Round 1 update: 0.00s
Round 1 eval starts...
Round 1 eval: 0.00s
[1]	train-logloss:0.586065	val-logloss:0.582934
Round 2 update starts...
Round 2 update: 0.00s
Round 2 eval starts...
Round 2 eval: 0.00s
[2]	train-logloss:0.545635	val-logloss:0.54204
Round 3 update starts...
Round 3 update: 0.00s
Round 3 eval starts...
Round 3 eval: 0.00s
[3]	train-logloss:0.511021	val-logloss:0.507268
Round 4 update starts...
Round 4 update: 0.00s
Round 4 eval starts...
Round 4 eval: 0.00s
[4]	train-logloss:0.481888	val-logloss:0.478829
Round 5 update starts...
Round 5

In [50]:
# Train on the full ABT and predict the test ABT.

# Resolve and create the output location first, so an unusable `root` fails
# here instead of after the model has been trained.
test_pred_dir = os.path.join(root, 'stacking')
os.makedirs(test_pred_dir, exist_ok=True)

X_train, X_val, y_train, y_val = Data.train(down_sample=down_sample, test_size=0, aug=aug)
# X_train, X_val no order_id, product_id

lgb_train = lgb.Dataset(X_train, y_train)

# With test_size=0 the validation split is expected to be empty. An empty set
# cannot drive early stopping, so validation is attached only when it has rows;
# otherwise the model is trained for the full `boosting_rounds`.
train_kwargs = {}
if X_val is not None and len(X_val) > 0:
    lgb_eval = lgb.Dataset(X_val, y_val, reference=lgb_train)
    train_kwargs['valid_sets'] = [lgb_eval]
    # Callback form of early stopping: the `early_stopping_rounds` keyword of
    # lgb.train was removed in LightGBM 4, the callback works in both.
    train_kwargs['callbacks'] = [lgb.early_stopping(stopping_rounds)]

gc.collect()

print('Start training...')
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=boosting_rounds,
                **train_kwargs)

test = Data.test(down_sample=down_sample)

dtest = test.drop(['eval_set', 'order_id', 'reordered', 'product_id'], axis=1)
# LightGBM matches DataFrame columns by position, so put the test features in
# the training column order (a KeyError here means a training feature is missing).
# NOTE(review): assumes X_train is a DataFrame; skipped when it has no `.columns`.
if hasattr(X_train, 'columns'):
    dtest = dtest[list(X_train.columns)]

pred_t = gbm.predict(dtest, num_iteration=gbm.best_iteration)
pred_t_df = pd.DataFrame({'order_id':test.order_id,'product_id':test.product_id, 'reordered':pred_t})
pred_t_df.to_csv(
    os.path.join(test_pred_dir, 'test_prediction.{}.csv'.format(name)), index=False)


1096 33
0 33
Round 0 update starts...
Round 0 update: 0.00s
Round 0 eval starts...
Round 0 eval: 0.00s
[0]	train-logloss:0.632937
Round 1 update starts...
Round 1 update: 0.00s
Round 1 eval starts...
Round 1 eval: 0.00s
[1]	train-logloss:0.583439
Round 2 update starts...
Round 2 update: 0.00s
Round 2 eval starts...
Round 2 eval: 0.00s
[2]	train-logloss:0.542215
Round 3 update starts...
Round 3 update: 0.00s
Round 3 eval starts...
Round 3 eval: 0.00s
[3]	train-logloss:0.505497
Round 4 update starts...
Round 4 update: 0.00s
Round 4 eval starts...
Round 4 eval: 0.00s
[4]	train-logloss:0.474375
Round 5 update starts...
Round 5 update: 0.00s
Round 5 eval starts...
Round 5 eval: 0.00s
[5]	train-logloss:0.447397
Round 6 update starts...
Round 6 update: 0.00s
Round 6 eval starts...
Round 6 eval: 0.00s
[6]	train-logloss:0.425368
Round 7 update starts...
Round 7 update: 0.00s
Round 7 eval starts...
Round 7 eval: 0.00s
[7]	train-logloss:0.40485
Round 8 update starts...
Round 8 update: 0.00s
Round

----