# Feature engineering

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
color = sns.color_palette()
%matplotlib inline

import xgboost

import sklearn
from sklearn.model_selection import train_test_split

import sys, os, gc, types
import time
from subprocess import check_output

In [39]:
# Candidate locations of the Instacart data directory, in priority order.
root_paths = [
    "/data/kaggle-instacart/",
    "/Users/jiayou/Dropbox/珺珺的程序/Kaggle/Instacart/",
    "/Users/jiayou/Dropbox/Documents/珺珺的程序/Kaggle/Instacart/"
]
# Use the first candidate that exists on this machine; None when none does.
root = next(
    (candidate for candidate in root_paths if os.path.exists(candidate)),
    None,
)
# print(check_output(["ls", root]).decode("utf8"))

In [3]:
def load_data(path_data):
    """Read the seven competition CSV files found in ``path_data``.

    Parameters
    ----------
    path_data : str or os.PathLike
        Directory holding the CSV files. A trailing path separator is
        optional because file paths are built with ``os.path.join``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(priors, train, orders, products, aisles, departments,
        sample_submission)``, in that order.
    """
    def _read(file_name, **read_csv_kwargs):
        # os.path.join keeps "dir/" + name working and also accepts "dir".
        return pd.read_csv(os.path.join(path_data, file_name), **read_csv_kwargs)

    # Both order_products files are read with the same compact dtypes.
    order_products_dtypes = {
        'order_id': np.int32,
        'product_id': np.uint16,
        'add_to_cart_order': np.int16,
        'reordered': np.int8,
    }
    orders_dtypes = {
        'order_id': np.int32,
        'user_id': np.int64,
        'eval_set': 'category',
        'order_number': np.int16,
        'order_dow': np.int8,
        'order_hour_of_day': np.int8,
        'days_since_prior_order': np.float32,
    }

    priors = _read('order_products__prior.csv', dtype=order_products_dtypes)
    train = _read('order_products__train.csv', dtype=order_products_dtypes)
    orders = _read('orders.csv', dtype=orders_dtypes)

    products = _read('products.csv')
    aisles = _read('aisles.csv')
    departments = _read('departments.csv')
    sample_submission = _read('sample_submission.csv')

    return priors, train, orders, products, aisles, departments, sample_submission


In [4]:
class tick_tock:
    """Context manager that times a block and optionally prints progress.

    Parameters
    ----------
    process_name : str
        Label printed in the begin/end banners.
    verbose : int or bool, default 1
        When truthy, print the banners and the elapsed time; when falsy,
        stay silent (the block is still timed).

    Attributes
    ----------
    begin_time : float or None
        ``time.time()`` value taken on entry; None before entry.
    elapsed : float or None
        Seconds spent inside the block; None until the block exits.
    """
    def __init__(self, process_name, verbose=1):
        self.process_name = process_name
        self.verbose = verbose
        self.begin_time = None
        self.elapsed = None

    def __enter__(self):
        if self.verbose:
            print(self.process_name + " begin ......")
        # Always record the start so __exit__ never hits a missing attribute.
        self.begin_time = time.time()
        # Return the instance so ``with tick_tock(...) as t`` is usable.
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.elapsed = time.time() - self.begin_time
        if self.verbose:
            print(self.process_name + " end ......")
            print('time lapsing {0} s \n'.format(self.elapsed))
        # Never suppress an exception raised inside the block.
        return False
            
def ka_add_groupby_features_1_vs_n(df, group_columns_list, agg_dict, only_new_feature=True):
    """Aggregate ``df`` per group and return the statistics as flat columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    group_columns_list : list
        Column names to group by.
    agg_dict : dict
        Aggregation spec keyed by source column. The usual form is nested,
        ``{source_column: {new_column_name: aggfunc}}``; the result then has
        one column per ``new_column_name``. New names should be unique across
        all source columns, since a repeated name overrides the earlier one.
        Any other dict form is passed to ``GroupBy.agg`` unchanged and the
        top level of the resulting column index is dropped.
    only_new_feature : bool, default True
        If True, return one row per group holding the group keys and the
        statistics. If False, left-merge the statistics back onto ``df``.

    Returns
    -------
    pandas.DataFrame

    Raises
    ------
    TypeError
        If ``group_columns_list`` is not a list.
    """
    with tick_tock("add stats features"):
        if not isinstance(group_columns_list, list):
            # The original built this message from an undefined name and so
            # raised NameError; name the offending argument explicitly.
            error = TypeError("group_columns_list should be a list")
            print(error)
            raise error

        # groupby does not mutate its input, so no defensive copy is needed.
        grouped = df.groupby(group_columns_list)

        is_nested_spec = bool(agg_dict) and all(
            isinstance(renames, dict) for renames in agg_dict.values()
        )
        if is_nested_spec and hasattr(pd, "NamedAgg"):
            # Nested renaming dicts are rejected by pandas >= 1.0; named
            # aggregation expresses the same request and yields flat columns
            # in the order the names appear in agg_dict.
            named_aggregations = {
                new_name: pd.NamedAgg(column=source_column, aggfunc=func)
                for source_column, renames in agg_dict.items()
                for new_name, func in renames.items()
            }
            the_stats = grouped.agg(**named_aggregations)
        else:
            # Legacy path: older pandas, or a non-nested spec such as
            # {column: [func, ...]} that produces two-level columns.
            the_stats = grouped.agg(agg_dict)
            the_stats.columns = the_stats.columns.droplevel(0)
        the_stats = the_stats.reset_index()

        if only_new_feature:
            df_new = the_stats
        else:
            df_new = pd.merge(left=df, right=the_stats, on=group_columns_list, how='left')

    return df_new

def ka_add_groupby_features_n_vs_1(df, group_columns_list, target_columns_list, methods_list, keep_only_stats=True, verbose=1):
    """Compute several statistics of target column(s) per group.

    Each statistic column is named ``_<group>_<method>_by_<target>``, where
    ``<group>`` is the group column names joined without a separator.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    group_columns_list : list
        Column names to group by. More than one is supported; the group key
        columns keep their own names in the result.
    target_columns_list : list
        Column name(s) to aggregate. With several targets the statistic
        columns are ordered target by target, each in ``methods_list`` order.
    methods_list : list
        Aggregation functions accepted by ``GroupBy.agg``.
    keep_only_stats : bool, default True
        If True, return one row per group (keys plus statistics). If False,
        left-merge the statistics back onto ``df``.
    verbose : int or bool, default 1
        Forwarded to ``tick_tock`` to toggle progress printing.

    Returns
    -------
    pandas.DataFrame

    Raises
    ------
    TypeError
        If any of the three ``*_list`` arguments is not a list.
    """
    with tick_tock("add stats features", verbose):
        list_arguments = {
            "group_columns_list": group_columns_list,
            "target_columns_list": target_columns_list,
            "methods_list": methods_list,
        }
        for argument_name, argument_value in list_arguments.items():
            if not isinstance(argument_value, list):
                error = TypeError(argument_name + " should be a list")
                print(error)
                raise error

        grouped_name = ''.join(group_columns_list)
        stat_column_names = [
            '_%s_%s_by_%s' % (grouped_name, method_name, target_name)
            for target_name in target_columns_list
            for method_name in methods_list
        ]

        # Selecting with a list yields (target, method) columns in the same
        # target-major order as stat_column_names.
        the_stats = (
            df.groupby(group_columns_list)[target_columns_list]
              .agg(methods_list)
              .reset_index()
        )
        # Keep every group key under its own name so that the merge below,
        # and multi-column grouping in general, works.
        the_stats.columns = list(group_columns_list) + stat_column_names

        if keep_only_stats:
            return the_stats
        return pd.merge(left=df, right=the_stats, on=group_columns_list, how='left')


In [5]:
# Read every raw table from the data directory located earlier.
(
    priors,
    train,
    orders,
    products,
    aisles,
    departments,
    sample_submission,
) = load_data(root)

# Product feature engineering

In [9]:
# build days_to_last_order
# Running total of days_since_prior_order within each user. A missing gap
# stays NaN through cumsum and is then filled with 0.
dsfo = orders.groupby('user_id').days_since_prior_order.cumsum().fillna(0)
orders['days_since_first_order'] = dsfo
# Per-user maximum broadcast back onto every row. transform('max') replaces
# the dict-renaming agg + merge, which newer pandas rejects, and keeps this
# cell safe to re-run.
orders['max_days'] = orders.groupby('user_id').days_since_first_order.transform('max')
orders['days_to_last_order'] = orders.max_days - orders.days_since_first_order
# Bucket the hour of day into 4-hour groups.
orders['hod_group'] = (orders.order_hour_of_day / 4).astype('int')

In [10]:
# The two intermediate columns were only needed to derive days_to_last_order.
orders = orders.drop(columns=['days_since_first_order', 'max_days'])
orders.head(5)

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,days_to_last_order,hod_group
0,2539329,1,prior,1,2,8,,190.0,2
1,2398795,1,prior,2,3,7,15.0,175.0,1
2,473747,1,prior,3,3,12,21.0,154.0,3
3,2254736,1,prior,4,4,7,29.0,125.0,1
4,431534,1,prior,5,4,15,28.0,97.0,3


In [11]:
# One row per product line of every prior order, with order context and the
# product's aisle attached.
priors_orders_detail = (
    orders
    .merge(right=priors, how='inner', on='order_id')
    .merge(products[['product_id', 'aisle_id']], how='left', on='product_id')
)
# Running purchase counters in row order: the n-th time this user bought this
# product / bought anything from this aisle.
# NOTE(review): assumes rows are in chronological order per user -- confirm.
priors_orders_detail.loc[:, 'user_buy_product_times'] = (
    priors_orders_detail.groupby(['user_id', 'product_id']).cumcount() + 1
)
priors_orders_detail.loc[:, 'user_buy_category_times'] = (
    priors_orders_detail.groupby(['user_id', 'aisle_id']).cumcount() + 1
)
# Vectorised .sum() replaces the builtin sum, which iterates each group's
# values one by one in Python and dominated this cell's run time.
agg_dict = {'user_id': {'prod_total_cnt': 'count'},
            'reordered': {'prod_reorder_total_cnt': 'sum'},
            'user_buy_product_times': {'prod_user_cnt': lambda x: (x == 1).sum(),
                                       'prod_return_user_cnt': lambda x: (x == 2).sum()}}
prd = ka_add_groupby_features_1_vs_n(priors_orders_detail, ['product_id'], agg_dict)

# Share of first-time buyers who bought the product a second time.
prd['prod_user_reorder_ratio'] = prd.prod_return_user_cnt / prd.prod_user_cnt
# Share of all purchases of the product that were reorders.
prd['prod_product_reorder_ratio'] = prd.prod_reorder_total_cnt / prd.prod_total_cnt

add stats features begin ......
add stats features end ......
time lapsing 318.8975019454956 s 



In [12]:
# Preview the per-product features.
prd.head(5)

Unnamed: 0,product_id,prod_total_cnt,prod_reorder_total_cnt,prod_user_cnt,prod_return_user_cnt,prod_user_reorder_ratio,prod_product_reorder_ratio
0,1,1852,1136.0,716,276,0.385475,0.613391
1,2,90,12.0,78,8,0.102564,0.133333
2,3,277,203.0,74,36,0.486486,0.732852
3,4,329,147.0,182,64,0.351648,0.446809
4,5,15,9.0,6,4,0.666667,0.6


In [13]:
# Preview the merged order/product detail table.
priors_orders_detail.head(5)

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,days_to_last_order,hod_group,product_id,add_to_cart_order,reordered,aisle_id,user_buy_product_times,user_buy_category_times
0,2539329,1,prior,1,2,8,,190.0,2,196,1,0,77,1,1
1,2539329,1,prior,1,2,8,,190.0,2,14084,2,0,91,1,1
2,2539329,1,prior,1,2,8,,190.0,2,12427,3,0,23,1,1
3,2539329,1,prior,1,2,8,,190.0,2,26088,4,0,23,1,2
4,2539329,1,prior,1,2,8,,190.0,2,26405,5,0,54,1,1


# Product-Timeline feature engineering

In [24]:
# Share of each product's purchases that fall in each 4-hour bucket.
agg_dict_8 = {'order_id': {'prod_cnt_by_hod': 'count'}}
prod_hod = (
    ka_add_groupby_features_1_vs_n(priors_orders_detail, ['product_id', 'hod_group'], agg_dict_8)
    .merge(prd[['product_id', 'prod_total_cnt']], on='product_id', how='left')
    .assign(prod_market_share_hod=lambda counts: counts['prod_cnt_by_hod'] / counts['prod_total_cnt'])
    .drop(['prod_cnt_by_hod', 'prod_total_cnt'], axis=1)
)

# Same ratio, split by day of week instead of hour bucket.
agg_dict_9 = {'order_id': {'prod_cnt_by_dow': 'count'}}
prod_dow = (
    ka_add_groupby_features_1_vs_n(priors_orders_detail, ['product_id', 'order_dow'], agg_dict_9)
    .merge(prd[['product_id', 'prod_total_cnt']], on='product_id', how='left')
    .assign(prod_market_share_dow=lambda counts: counts['prod_cnt_by_dow'] / counts['prod_total_cnt'])
    .drop(['prod_cnt_by_dow', 'prod_total_cnt'], axis=1)
)

add stats features begin ......
add stats features end ......
time lapsing 11.197552919387817 s 

add stats features begin ......
add stats features end ......
time lapsing 11.516964197158813 s 



In [34]:
# Preview the product-by-hour-bucket market shares.
prod_hod.head(5)

Unnamed: 0,product_id,hod_group,prod_market_share_hod
0,1,0,0.019978
1,1,1,0.032937
2,1,2,0.315335
3,1,3,0.336393
4,1,4,0.212203


# Category feature engineering

In [16]:
# Aisle-level ("category") statistics, mirroring the per-product ones.
# Vectorised .sum() replaces the builtin sum, which iterates each group's
# values one by one in Python and dominated this cell's run time.
agg_dict_5 = {'user_id': {'cat_total_bought_cnt': 'count'},
              'reordered': {'cat_reorder_total_cnt': 'sum'},
              'user_buy_category_times': {'cat_user_cnt': lambda x: (x == 1).sum(),
                                          'cat_return_user_cnt': lambda x: (x == 2).sum()}}
cat = ka_add_groupby_features_1_vs_n(priors_orders_detail, ['aisle_id'], agg_dict_5)

# Share of first-time aisle buyers who bought from the aisle a second time.
cat['cat_user_reorder_ratio'] = cat.cat_return_user_cnt / cat.cat_user_cnt
# Share of all purchases in the aisle that were reorders.
cat['cat_product_reorder_ratio'] = cat.cat_reorder_total_cnt / cat.cat_total_bought_cnt

# The largest running counter per (aisle, user) is that user's total number
# of purchases in the aisle.
agg_dict_6 = {'user_buy_category_times': {'cat_user_bought_cnts': 'max'}}
cat_agg = ka_add_groupby_features_1_vs_n(priors_orders_detail, ['aisle_id', 'user_id'], agg_dict_6)

# Distribution of those per-user totals within each aisle.
agg_dict_7 = {'cat_user_bought_cnts': {'cat_num_of_prods_a_user_buys_in_this_cat_mean': 'mean',
                                       'cat_num_of_prods_a_user_buys_in_this_cat_std': 'std',
                                       'cat_num_of_prods_a_user_buys_in_this_cat_max': 'max',
                                       'cat_num_of_prods_a_user_buys_in_this_cat_median': 'median'}}
category_agg = ka_add_groupby_features_1_vs_n(cat_agg, ['aisle_id'], agg_dict_7)
category = cat.merge(category_agg, on='aisle_id', how='left')
category.head()

add stats features begin ......
add stats features end ......
time lapsing 295.7515540122986 s 

add stats features begin ......
add stats features end ......
time lapsing 13.978679895401001 s 

add stats features begin ......
add stats features end ......
time lapsing 0.8720319271087646 s 



Unnamed: 0,aisle_id,cat_total_bought_cnt,cat_reorder_total_cnt,cat_user_cnt,cat_return_user_cnt,cat_user_reorder_ratio,cat_product_reorder_ratio,cat_num_of_prods_a_user_buys_in_this_cat_mean,cat_num_of_prods_a_user_buys_in_this_cat_std,cat_num_of_prods_a_user_buys_in_this_cat_max,cat_num_of_prods_a_user_buys_in_this_cat_median
0,1,71928,42912.0,20711,10885,0.525566,0.596597,3.472937,6.222165,254,2
1,2,82491,40365.0,31222,14935,0.478349,0.489326,2.642079,3.614434,111,1
2,3,456386,272922.0,63592,46666,0.733834,0.598007,7.176783,14.744941,585,3
3,4,200687,98243.0,53892,33716,0.625622,0.489533,3.723874,4.740843,86,2
4,5,62510,17542.0,32312,12655,0.39165,0.280627,1.934575,2.003322,60,1


# Product, product-timeline, category feature output

In [36]:
def output_feature(data, name):
    """Downcast 64-bit columns and write ``data`` to ``feature_<name>.csv``.

    The file is written into the module-level ``root`` directory without the
    index column.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to export. It is modified in place: float64 columns become
        float32, and int64 columns become int32 when every value fits.
    name : str
        Suffix used in the output file name.
    """
    int32_limits = np.iinfo(np.int32)
    for col in data.columns:
        column = data[col]
        if column.dtype == 'float64':
            data[col] = column.astype('float32')
        elif column.dtype == 'int64':
            # astype wraps around silently on overflow, so only narrow a
            # column whose values all fit in int32.
            fits_int32 = column.empty or (
                column.min() >= int32_limits.min and column.max() <= int32_limits.max
            )
            if fits_int32:
                data[col] = column.astype('int32')
    data.to_csv(os.path.join(root, 'feature_{}.csv'.format(name)), index=False)

In [37]:
# Export every feature table built above, one CSV per table.
for feature_frame, feature_name in [
    (prd, 'prod'),
    (prod_dow, 'prod_dow'),
    (prod_hod, 'prod_hod'),
    (category, 'category'),
]:
    output_feature(feature_frame, feature_name)

In [38]:
# Release the two names bound to the large intermediate tables, then run a
# garbage collection pass.
del priors_orders_detail
del orders
gc.collect()

885