# sample train

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
color = sns.color_palette()
%matplotlib inline

import xgboost

import sklearn
from sklearn.model_selection import train_test_split 

In [2]:
import gc
import time
from subprocess import check_output
print(check_output(["ls", "/Users/jiayou/Dropbox/珺珺的程序/Kaggle/Instacart"]).decode("utf8"))

Instacart Data Exploration.ipynb
Sample Train.ipynb
Toy DF.ipynb
aisles.csv
departments.csv
order_products__prior.csv
order_products__train.csv
orders.csv
products.csv
sample_submission.csv
sample_train_submission.csv



In [3]:
def load_data(path_data):
    priors = pd.read_csv(path_data + 'order_products__prior.csv', 
                     dtype={
                            'order_id': np.int32,
                            'product_id': np.uint16,
                            'add_to_cart_order': np.int16,
                            'reordered': np.int8})
    train = pd.read_csv(path_data + 'order_products__train.csv', 
                    dtype={
                            'order_id': np.int32,
                            'product_id': np.uint16,
                            'add_to_cart_order': np.int16,
                            'reordered': np.int8})
    orders = pd.read_csv(path_data + 'orders.csv', 
                         dtype={
                                'order_id': np.int32,
                                'user_id': np.int64,
                                'eval_set': 'category',
                                'order_number': np.int16,
                                'order_dow': np.int8,
                                'order_hour_of_day': np.int8,
                                'days_since_prior_order': np.float32})
    
    products = pd.read_csv(path_data + 'products.csv')
    aisles = pd.read_csv(path_data + "aisles.csv")
    departments = pd.read_csv(path_data + "departments.csv")
    sample_submission = pd.read_csv(path_data + "sample_submission.csv")
    
    return priors, train, orders, products, aisles, departments, sample_submission

# orders_df and order_products__prior/train_df shares the 'order_id'
# order_products__prior/train_df and products_df shares the 'product_id'
# products_df and aisles_df shares the 'aisles_id'
# products_df and departments_df shares the 'department id'

In [4]:
class tick_tock:
    def __init__(self, process_name, verbose=1):
        self.process_name = process_name
        self.verbose = verbose
    def __enter__(self):
        if self.verbose:
            print(self.process_name + " begin ......")
            self.begin_time = time.time()
    def __exit__(self, type, value, traceback):
        if self.verbose:
            end_time = time.time()
            print(self.process_name + " end ......")
            print('time lapsing {0} s \n'.format(end_time - self.begin_time))
            
def ka_add_groupby_features_1_vs_n(df, group_columns_list, agg_dict, only_new_feature=True):
    with tick_tock("add stats features"):
        try:
            if type(group_columns_list) == list:
                pass
            else:
                raise TypeError(k + "should be a list")
        except TypeError as e:
            print(e)
            raise

        df_new = df.copy()
        grouped = df_new.groupby(group_columns_list)

        the_stats = grouped.agg(agg_dict)
        the_stats.columns = the_stats.columns.droplevel(0)
        the_stats.reset_index(inplace=True)
        if only_new_feature:
            df_new = the_stats
        else:
            df_new = pd.merge(left=df_new, right=the_stats, on=group_columns_list, how='left')

    return df_new

def ka_add_groupby_features_n_vs_1(df, group_columns_list, target_columns_list, methods_list, keep_only_stats=True, verbose=1):
    with tick_tock("add stats features", verbose):
        dicts = {"group_columns_list": group_columns_list , "target_columns_list": target_columns_list, "methods_list" :methods_list}

        for k, v in dicts.items():
            try:
                if type(v) == list:
                    pass
                else:
                    raise TypeError(k + "should be a list")
            except TypeError as e:
                print(e)
                raise

        grouped_name = ''.join(group_columns_list)
        target_name = ''.join(target_columns_list)
        combine_name = [[grouped_name] + [method_name] + [target_name] for method_name in methods_list]

        df_new = df.copy()
        grouped = df_new.groupby(group_columns_list)

        the_stats = grouped[target_name].agg(methods_list).reset_index()
        the_stats.columns = [grouped_name] + \
                            ['_%s_%s_by_%s' % (grouped_name, method_name, target_name) \
                             for (grouped_name, method_name, target_name) in combine_name]
        if keep_only_stats:
            return the_stats
        else:
            df_new = pd.merge(left=df_new, right=the_stats, on=group_columns_list, how='left')
        return df_new


In [5]:
path_data = "/Users/jiayou/Dropbox/珺珺的程序/Kaggle/Instacart/"
priors, train, orders, products, aisles, departments, sample_submission = load_data(path_data)

In [6]:
priors.head()

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered
0,2,33120,1,1
1,2,28985,2,1
2,2,9327,3,0
3,2,45918,4,1
4,2,30035,5,0


# Product feature engineering

In [6]:
priors_orders_detail = orders.merge(right=priors, how='inner', on='order_id')
priors_orders_detail.loc[:,'_user_buy_product_times'] = priors_orders_detail.groupby(['user_id', 'product_id']).cumcount() + 1
agg_dict = {'user_id':{'_prod_tot_cnts':'count'}, 
            'reordered':{'_prod_reorder_tot_cnts':'sum'}, 
            '_user_buy_product_times': {'_prod_buy_first_time_total_cnt':lambda x: sum(x==1),
                                        '_prod_buy_second_time_total_cnt':lambda x: sum(x==2)}}
prd = ka_add_groupby_features_1_vs_n(priors_orders_detail, ['product_id'], agg_dict)

prd['_prod_reorder_prob'] = prd._prod_buy_second_time_total_cnt / prd._prod_buy_first_time_total_cnt
prd['_prod_reorder_ratio'] = prd._prod_reorder_tot_cnts / prd._prod_tot_cnts
prd['_prod_reorder_times'] = 1 + prd._prod_reorder_tot_cnts / prd._prod_buy_first_time_total_cnt

add stats features begin ......
add stats features end ......
time lapsing 313.7621648311615 s 



In [8]:
prd.head()

Unnamed: 0,product_id,_prod_tot_cnts,_prod_reorder_tot_cnts,_prod_buy_first_time_total_cnt,_prod_buy_second_time_total_cnt,_prod_reorder_prob,_prod_reorder_ratio,_prod_reorder_times
0,1,1852,1136.0,716,276,0.385475,0.613391,2.586592
1,2,90,12.0,78,8,0.102564,0.133333,1.153846
2,3,277,203.0,74,36,0.486486,0.732852,3.743243
3,4,329,147.0,182,64,0.351648,0.446809,1.807692
4,5,15,9.0,6,4,0.666667,0.6,2.5


In [10]:
priors_orders_detail[priors_orders_detail.product_id == 90]

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,_user_buy_product_times
991264,3018392,6417,prior,8,3,9,20.0,90,9,0,1
991389,2812235,6417,prior,14,0,19,4.0,90,20,1,2
991492,2653101,6417,prior,22,3,11,3.0,90,9,1,3
991547,2394699,6417,prior,27,6,12,3.0,90,4,1,4
5049943,1990274,32115,prior,1,3,13,,90,6,0,1
7649148,2538143,48545,prior,4,1,23,30.0,90,32,0,1
8433949,1250350,53477,prior,2,5,11,18.0,90,11,0,1
10951794,1473132,69404,prior,3,5,14,29.0,90,11,0,1
10951817,1989750,69404,prior,4,0,17,23.0,90,15,1,2
11893313,2242019,75319,prior,6,5,12,11.0,90,20,0,1


# User feature engineering

In [7]:
agg_dict_2 = {'order_number':{'_user_total_orders':'max'},
              'days_since_prior_order':{'_user_sum_days_since_prior_order':'sum', 
                                        '_user_mean_days_since_prior_order': 'mean'}}
users = ka_add_groupby_features_1_vs_n(orders[orders.eval_set == 'prior'], ['user_id'], agg_dict_2)

agg_dict_3 = {'reordered':
              {'_user_reorder_ratio': 
               lambda x: sum(priors_orders_detail.ix[x.index,'reordered']==1)/
                         sum(priors_orders_detail.ix[x.index,'order_number'] > 1)},
              'product_id':{'_user_total_products':'count', 
                            '_user_distinct_products': lambda x: x.nunique()}}
us = ka_add_groupby_features_1_vs_n(priors_orders_detail, ['user_id'], agg_dict_3)
users = users.merge(us, how='inner')

users['_user_average_basket'] = users._user_total_products / users._user_total_orders

us = orders[orders.eval_set != "prior"][['user_id', 'order_id', 'eval_set', 'days_since_prior_order']]
us.rename(index=str, columns={'days_since_prior_order': 'time_since_last_order'}, inplace=True)

users = users.merge(us, how='inner')

add stats features begin ......
add stats features end ......
time lapsing 0.34899091720581055 s 

add stats features begin ......
add stats features end ......
time lapsing 929.014484167099 s 



In [54]:
priors_orders_detail[priors_orders_detail.order_number == 1].reordered.sum()
# each user's 1st order doesn't have any reordered item!

0

In [56]:
us.head()

Unnamed: 0,user_id,order_id,eval_set,time_since_last_order
10,1,1187899,train,14.0
25,2,1492625,train,30.0
38,3,2774568,test,11.0
44,4,329954,test,30.0
49,5,2196797,train,6.0


In [57]:
users.head()

Unnamed: 0,user_id,_user_total_orders,_user_sum_days_since_prior_order,_user_mean_days_since_prior_order,_user_reorder_ratio,_user_total_products,_user_distinct_products,_user_average_basket,order_id,eval_set,time_since_last_order
0,1,10,176.0,19.555555,0.759259,59,18,5.9,1187899,train,14.0
1,2,14,198.0,15.230769,0.510989,195,102,13.928571,1492625,train,30.0
2,3,12,133.0,12.090909,0.705128,88,33,7.333333,2774568,test,11.0
3,4,5,55.0,13.75,0.071429,18,17,3.6,329954,test,30.0
4,5,4,40.0,13.333333,0.538462,37,23,9.25,2196797,train,6.0


# Aggregate product and user info to final dataframe

In [8]:
agg_dict_4 = {'order_number':{'_up_order_count': 'count', 
                              '_up_first_order_number': 'min', 
                              '_up_last_order_number':'max'}, 
              'add_to_cart_order':{'_up_average_cart_position': 'mean'}}

data = ka_add_groupby_features_1_vs_n(df=priors_orders_detail, 
                                                      group_columns_list=['user_id', 'product_id'], 
                                                      agg_dict=agg_dict_4)

data = data.merge(prd, how='inner', on='product_id').merge(users, how='inner', on='user_id')

data['_up_order_rate'] = data._up_order_count / data._user_total_orders
data['_up_order_since_last_order'] = data._user_total_orders - data._up_last_order_number
data['_up_order_rate_since_first_order'] = data._up_order_count / (data._user_total_orders - data._up_first_order_number + 1)

train = train.merge(right=orders[['order_id', 'user_id']], how='left', on='order_id')
data = data.merge(train[['user_id', 'product_id', 'reordered']], on=['user_id', 'product_id'], how='left')

del priors_orders_detail, orders
gc.collect()

add stats features begin ......
add stats features end ......
time lapsing 19.708887100219727 s 



209

In [13]:
# data.to_csv('analytical_base_table.csv', index = None)
data.shape
# NaN in reordered column of data represent that the product is not purchased by user at the train_set (the last order)

(13307953, 27)

In [74]:
train[(train.user_id == 1)]
# if reordered == 0 in train_set, it won't show up in the data_set 

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,user_id
484420,1187899,196,1,1,1
484421,1187899,25133,2,1,1
484422,1187899,38928,3,1,1
484423,1187899,26405,4,1,1
484424,1187899,39657,5,1,1
484425,1187899,10258,6,1,1
484426,1187899,13032,7,1,1
484427,1187899,26088,8,1,1
484428,1187899,27845,9,0,1
484429,1187899,49235,10,1,1


In [103]:
data.dtypes

user_id                                int64
product_id                             int64
_up_order_count                        int64
_up_first_order_number                 int16
_up_last_order_number                  int16
_up_average_cart_position            float64
_prod_tot_cnts                         int64
_prod_reorder_tot_cnts               float64
_prod_buy_first_time_total_cnt         int64
_prod_buy_second_time_total_cnt        int64
_prod_reorder_prob                   float64
_prod_reorder_ratio                  float64
_prod_reorder_times                  float64
_user_total_orders                     int16
_user_sum_days_since_prior_order     float32
_user_mean_days_since_prior_order    float32
_user_reorder_ratio                  float64
_user_total_products                   int64
_user_distinct_products               uint16
_user_average_basket                 float64
order_id                               int32
eval_set                              object
time_since

# Data split and Training

In [9]:
train = data.loc[data.eval_set == "train",:]
train.drop(['eval_set', 'user_id', 'product_id', 'order_id'], axis=1, inplace=True)
train.loc[:, 'reordered'] = train.reordered.fillna(0)

X_test = data.loc[data.eval_set == "test",:]

X_train, X_val, y_train, y_val = train_test_split(train.drop('reordered', axis=1), train.reordered,
                                                    test_size=0.2, random_state=1019)
# read in data
d_train = xgboost.DMatrix(X_train, y_train)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [10]:
#specify parameters via map
xgb_params = {
    "objective"         : "reg:logistic"
    ,"eval_metric"      : "logloss"
    ,"eta"              : 0.1
    ,"max_depth"        : 6
    ,"min_child_weight" :10
    ,"gamma"            :0.70
    ,"subsample"        :0.76
    ,"colsample_bytree" :0.95
    ,"alpha"            :2e-05
    ,"lambda"           :10
}


# watchlist= [(d_train, "train")]
# bst = xgboost.train(params=xgb_params, dtrain=d_train, num_boost_round=80, evals=watchlist, verbose_eval=10)

print ('running cross validation')
# do cross validation, this will print result out as
# [iteration]  metric_name:mean_value+std_value
# std_value is standard deviation of the metric
bst = xgboost.cv(xgb_params, d_train, num_boost_round=80, nfold=5,
       metrics={'logloss'}, seed = 1019,
       callbacks=[xgboost.callback.print_evaluation(show_stdv=True)])

print ('running cross validation, disable standard deviation display')
xgboost.plot_importance(bst)


running cross validation


KeyboardInterrupt: 

In [85]:
X_test.head()

Unnamed: 0,user_id,product_id,_up_order_count,_up_first_order_number,_up_last_order_number,_up_average_cart_position,_prod_tot_cnts,_prod_reorder_tot_cnts,_prod_buy_first_time_total_cnt,_prod_buy_second_time_total_cnt,...,_user_total_products,_user_distinct_products,_user_average_basket,order_id,eval_set,time_since_last_order,_up_order_rate,_up_order_since_last_order,_up_order_rate_since_first_order,reordered
18,15,196,5,15,22,2.2,35791,27791.0,8000,4660,...,72,13,3.272727,2161313,test,7.0,0.227273,0,0.625,
19,15,12427,10,1,20,2.1,6476,4797.0,1679,889,...,72,13,3.272727,2161313,test,7.0,0.454545,2,0.454545,
20,15,1747,4,8,19,3.5,1448,886.0,562,221,...,72,13,3.272727,2161313,test,7.0,0.181818,3,0.266667,
21,15,10441,8,1,22,2.375,2909,2042.0,867,465,...,72,13,3.272727,2161313,test,7.0,0.363636,0,0.363636,
22,15,11266,10,1,19,1.6,4081,3000.0,1081,645,...,72,13,3.272727,2161313,test,7.0,0.454545,3,0.454545,


In [98]:
d_test = xgboost.DMatrix(X_test.drop(['eval_set', 'user_id', 'order_id', 'reordered', 'product_id'], axis=1))
X_test.loc[:,'reordered'] = (bst.predict(d_test) > 0.21).astype(int)
X_test.loc[:, 'product_id'] = X_test.product_id.astype(str)
submit = ka_add_groupby_features_n_vs_1(X_test[X_test.reordered == 1], 
                                               group_columns_list=['order_id'],
                                               target_columns_list= ['product_id'],
                                               methods_list=[lambda x: ' '.join(set(x))], keep_only_stats=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


add stats features begin ......
add stats features end ......
time lapsing 5.928583860397339 s 



In [99]:
submit.columns = sample_submission.columns.tolist()
submit.shape

(71172, 2)

In [100]:
# there are some orders that does not have any predicted items, so the submit df does not contain them
submit_final = sample_submission[['order_id']].merge(submit, how='left').fillna('None')
submit_final.to_csv("sample_train_submission.csv", index=False)

In [101]:
submit_final.shape

(75000, 2)