# Feature engineering

In [18]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
color = sns.color_palette()
%matplotlib inline

import xgboost

import sklearn
from sklearn.model_selection import train_test_split

import sys, os, gc, types
import time
from subprocess import check_output

In [19]:
# Candidate locations of the Instacart data directory, in priority order.
root_paths = [
    "/data/kaggle-instacart/",
    "/Users/jiayou/Dropbox/珺珺的程序/Kaggle/Instacart/",
    "/Users/jiayou/Dropbox/Documents/珺珺的程序/Kaggle/Instacart/"
]
# Pick the first candidate that exists on this machine; `root` stays None
# when none of them do.
root = next(
    (candidate for candidate in root_paths if os.path.exists(candidate)),
    None,
)

In [20]:
class tick_tock:
    """Context manager that prints the wall-clock duration of the wrapped block.

    Parameters
    ----------
    process_name : str
        Label used in the "starts..." and "done" messages.
    verbose : int or bool, default 1
        When falsy, nothing is printed.
    """

    def __init__(self, process_name, verbose=1):
        self.process_name = process_name
        self.verbose = verbose
        # Set on entry; None until the context has been entered.
        self.begin_time = None

    def __enter__(self):
        if self.verbose:
            print(self.process_name + " starts...")
        # Record the start unconditionally so __exit__ never reads a missing
        # attribute, even if `verbose` is switched on while the block runs.
        self.begin_time = time.time()
        # Return the timer so `with tick_tock(...) as t:` binds the instance.
        return self

    def __exit__(self, type, value, traceback):
        if self.verbose:
            end_time = time.time()
            print('{} done: {:.2f}s'.format(self.process_name, end_time - self.begin_time))
        # Implicitly returns None, so an exception raised in the block propagates.
            
def ka_add_groupby_features_1_vs_n(df, group_columns_list, agg_dict, only_new_feature=True):
    """Aggregate ``df`` per group and return the renamed aggregate columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to aggregate. It is not modified.
    group_columns_list : list
        Column names to group by.
    agg_dict : dict
        Nested spec ``{source_column: {new_column_name: aggregation}}``.
        Each ``aggregation`` is whatever ``GroupBy.agg`` accepts for one
        column (a name such as ``'max'`` or a callable).
    only_new_feature : bool, default True
        When True, return one row per group holding the group columns and the
        new columns. When False, left-merge the new columns back onto ``df``.

    Returns
    -------
    pandas.DataFrame

    Raises
    ------
    TypeError
        If ``group_columns_list`` is not a list.
    """
    if not isinstance(group_columns_list, list):
        # The previous message concatenated an undefined name, so callers got a
        # NameError instead of the intended TypeError.
        raise TypeError("group_columns_list should be a list")

    # groupby/merge never mutate their input, so no defensive copy is needed.
    grouped = df.groupby(group_columns_list)

    named_spec = _as_named_aggregation(agg_dict)
    if named_spec is not None:
        # Named aggregation yields flat columns already carrying the new names.
        the_stats = grouped.agg(**named_spec)
    else:
        # Legacy path: nested renaming produces (source, new_name) column
        # pairs; keep only the new names.
        the_stats = grouped.agg(agg_dict)
        the_stats.columns = the_stats.columns.droplevel(0)
    the_stats = the_stats.reset_index()

    if only_new_feature:
        return the_stats
    return pd.merge(left=df, right=the_stats, on=group_columns_list, how='left')


def _as_named_aggregation(agg_dict):
    """Translate ``{column: {new_name: func}}`` into named-aggregation kwargs.

    Returns None when the spec cannot be expressed that way (pandas without
    named aggregation, non-dict entries, non-string / repeated / reserved
    output names, or an empty spec); the caller then passes ``agg_dict``
    through to ``agg`` unchanged.
    """
    if not hasattr(pd, 'NamedAgg') or not isinstance(agg_dict, dict):
        return None
    # Keyword names that GroupBy.agg reserves for its own parameters.
    reserved = ('func', 'engine', 'engine_kwargs')
    named = {}
    for column, renames in agg_dict.items():
        if not isinstance(renames, dict):
            return None
        for new_name, func in renames.items():
            if not isinstance(new_name, str) or new_name in named or new_name in reserved:
                return None
            named[new_name] = (column, func)
    return named or None

In [21]:
def aug_name(s, ms):
    """Return the shard tag ``aug<s>-<ms>`` built from the two given values."""
    pieces = ('aug', format(s), '-', format(ms))
    return ''.join(pieces)

def load_data(root, aug = None, down_sample = None):
    """Read the prior / train order-product tables and the orders table.

    Parameters
    ----------
    root : str
        Directory that holds the CSV files.
    aug : str, optional
        When given, the files are read from ``<root>/aug/`` and carry a
        ``.<aug>.csv`` suffix; otherwise the base files in ``root`` are read.
    down_sample : int, optional
        When given, only users whose ``user_id`` is divisible by this value
        are kept, and the order-product tables are restricted to those
        users' orders.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(priors, train, orders)``.
    """
    if aug is None:
        pf = os.path.join(root, 'order_products__prior.csv')
        tf = os.path.join(root, 'order_products__train.csv')
        of = os.path.join(root, 'orders.csv')
    else:
        aug_dir = os.path.join(root, 'aug')
        pf = os.path.join(aug_dir, 'order_products__prior.{}.csv'.format(aug))
        tf = os.path.join(aug_dir, 'order_products__train.{}.csv'.format(aug))
        of = os.path.join(aug_dir, 'orders.{}.csv'.format(aug))

    # Both order-product files share one schema, so they share one dtype map.
    order_product_dtypes = {
        'order_id': np.int32,
        'product_id': np.uint16,
        'add_to_cart_order': np.int16,
        'reordered': np.int8}
    priors = pd.read_csv(pf, dtype=order_product_dtypes)
    train = pd.read_csv(tf, dtype=order_product_dtypes)
    orders = pd.read_csv(of,
                         dtype={
                                'order_id': np.int32,
                                'user_id': np.int64,
                                'eval_set': 'category',
                                'order_number': np.int16,
                                'order_dow': np.int8,
                                'order_hour_of_day': np.int8,
                                'days_since_prior_order': np.float32})

    if down_sample is not None:
        orders = orders[orders.user_id % down_sample == 0]
        # Filter the order-product rows by the surviving order ids instead of
        # merging user_id onto every row and dropping it again in place.
        kept_order_ids = orders.order_id
        priors = priors[priors.order_id.isin(kept_order_ids)]
        train = train[train.order_id.isin(kept_order_ids)]

    return priors, train, orders
    
    
def load_features(root):
    """Read the product table and the pre-built feature tables from ``root``.

    Parameters
    ----------
    root : str
        Directory that holds ``products.csv``, ``feature_prod.csv``,
        ``feature_prod_dow.csv``, ``feature_prod_hod.csv`` and
        ``feature_category.csv``. A trailing path separator is optional.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(products, prod_feature, prod_dow_feature, prod_hod_feature,
        category_feature)``, one frame per file in the order listed above.
    """
    def read(file_name):
        # os.path.join works whether or not `root` ends with a separator;
        # plain string concatenation did not.
        return pd.read_csv(os.path.join(root, file_name))

    products = read('products.csv')
    prod_feature = read('feature_prod.csv')
    prod_dow_feature = read('feature_prod_dow.csv')
    prod_hod_feature = read('feature_prod_hod.csv')
    category_feature = read('feature_category.csv')

    return products, prod_feature, prod_dow_feature, prod_hod_feature, category_feature

In [22]:
# Load the product table and pre-built feature tables once; process_shard
# reads these names as module-level globals.
(products,
 prod_feature,
 prod_dow_feature,
 prod_hod_feature,
 category_feature) = load_features(root)

# User feature engineering

In [23]:
def process_shard(shard=None, down_sample=None):
    """Build the per-(user, product) feature table for one shard and write it to CSV.

    The prior / train / orders tables come from ``load_data`` and are combined
    with the module-level frames ``products``, ``prod_feature``,
    ``prod_dow_feature``, ``prod_hod_feature`` and ``category_feature``.

    Parameters
    ----------
    shard : str, optional
        Passed to ``load_data`` as ``aug``. When None, both
        ``abt_train.csv`` and ``abt_test.csv`` are written under
        ``<root>/abt``; otherwise only ``abt_train.<shard>.csv`` is written.
    down_sample : int, optional
        Passed through to ``load_data``.

    Returns
    -------
    None
        The result is written to disk and the train/test shapes are printed.
    """
    priors, train, orders = load_data(root, down_sample=down_sample, aug=shard)
    global products, prod_feature, prod_dow_feature, prod_hod_feature, category_feature

    # Running total of days_since_prior_order per user; the first order of a
    # user has no prior gap (NaN), which is treated as day 0.
    orders['days_since_first_order'] = \
        orders.groupby('user_id').days_since_prior_order.cumsum().fillna(0)
    # Per-user maximum via max() + rename: the dict-renaming form of
    # SeriesGroupBy.agg is no longer accepted by pandas >= 1.0.
    max_days = (orders.groupby('user_id').days_since_first_order
                .max().rename('max_days').reset_index())
    orders = orders.merge(max_days, on = 'user_id', how = 'left')
    orders['days_to_last_order'] = orders.max_days - orders.days_since_first_order
    # Bucket the hour of day into 4-hour groups (0..5).
    orders['hod_group'] = (orders.order_hour_of_day / 4).astype('int')
    orders.drop(['days_since_first_order', 'max_days'], axis=1, inplace=True)

    # One row per product line of every prior order, with its aisle attached.
    priors_orders_detail = orders.merge(
        right=priors, how='inner', on='order_id'
    ).merge(
        products[['product_id','aisle_id']], how = 'left', on = 'product_id'
    )

    # user features

    agg_dict_2 = {'order_number':{'user_total_orders':'max'},
                  'days_since_prior_order':{'user_sum_days_since_prior_order':'sum',
                                            'user_mean_days_since_prior_order': 'mean'}}
    users = ka_add_groupby_features_1_vs_n(orders[orders.eval_set == 'prior'], ['user_id'], agg_dict_2)

    # user_reorder_ratio: reordered product lines divided by the product lines
    # that belong to any order after the user's first one.
    agg_dict_3 = {'reordered':
                  {'user_reorder_ratio':
                   lambda x: sum(priors_orders_detail.loc[x.index,'reordered']==1)/
                             sum(priors_orders_detail.loc[x.index,'order_number'] > 1)},
                  'product_id':{'user_total_products':'count',
                                'user_distinct_products':'nunique'}}
    us = ka_add_groupby_features_1_vs_n(priors_orders_detail, ['user_id'], agg_dict_3)
    users = users.merge(us, how='inner', on = 'user_id')

    users['user_average_basket'] = users.user_total_products / users.user_total_orders

    # Attach each user's non-prior (train/test) order row.
    us = orders[orders.eval_set != "prior"]
    users = users.merge(us, how='inner', on = 'user_id')

    # user-prod features

    agg_dict_4 = {'order_number':{'up_order_count': 'count',
                                  'up_first_order_number': 'min',
                                  'up_last_order_number':'max'},
                  'add_to_cart_order':{'up_average_cart_position': 'mean'},
                  'days_to_last_order':{'up_days_since_last_order':'min'}}

    data = ka_add_groupby_features_1_vs_n(
        df=priors_orders_detail,
        group_columns_list=['user_id', 'product_id', 'aisle_id'],
        agg_dict=agg_dict_4)

    # orders/days since last not order feature

    # Plain lookup table instead of re-indexing `users` in place: having
    # user_id as both index and column makes the later merge on 'user_id'
    # ambiguous for current pandas versions.
    total_orders_by_user = dict(zip(users.user_id, users.user_total_orders))
    up_since_last_not_order = []

    for key, group in priors_orders_detail.groupby(['user_id', 'product_id']):
        user_id = key[0]
        current_total_order = total_orders_by_user[user_id]

        # Order numbers in 1..total in which this product was NOT bought.
        skipped = set(range(1, current_total_order+1)) - set(group.order_number)
        if not skipped:
            # Bought in every prior order: there is no "last skipped" order.
            up_since_last_not_order.append(
                {'user_id': user_id,
                 'product_id': key[1],
                 'up_order_since_last_not_order': None,
                 'last_not_order_number': None})
        else:
            v = max(skipped)
            up_since_last_not_order.append(
                {'user_id': user_id,
                 'product_id': key[1],
                 'up_order_since_last_not_order': current_total_order - v + 1,
                 'last_not_order_number': v})

    up_since_last_not_order_df = pd.DataFrame(up_since_last_not_order)
    data = data.merge(up_since_last_not_order_df, how='left', on=['user_id', 'product_id'])
    # Expose order_number / days_to_last_order under the join-key names so the
    # merge below picks up the days value of the last skipped order.
    orders['last_not_order_number'] = orders.order_number
    orders['up_days_since_last_not_order'] = orders.days_to_last_order
    data = data.merge(
        orders[['user_id', 'last_not_order_number', 'up_days_since_last_not_order']],
        how='left', on=['user_id', 'last_not_order_number'])


    # other pre-built features

    data = data.merge(
        prod_feature, how='inner', on='product_id'
    ).merge(
        users, how='inner', on='user_id'
    ).merge(
        category_feature, how = 'inner', on='aisle_id')

    data['up_order_rate'] = data.up_order_count / data.user_total_orders
    data['up_order_since_last_order'] = data.user_total_orders - data.up_last_order_number
    data['up_order_rate_since_first_order'] = \
        data.up_order_count / (data.user_total_orders - data.up_first_order_number + 1)

    # training labels

    train = train.merge(right=orders[['order_id', 'user_id']], how='left', on='order_id')
    data = data.merge(train[['user_id', 'product_id', 'reordered']], on=['user_id', 'product_id'], how='left')

    data = data.merge(
        prod_hod_feature,
        on = ['product_id', 'hod_group'],
        how = 'left')
    # fillna returns a new Series; assign it back, otherwise the fill is lost.
    data['prod_market_share_hod'] = data['prod_market_share_hod'].fillna(0)
    data = data.merge(
        prod_dow_feature,
        on = ['product_id', 'order_dow'], how = 'left')
    data['prod_market_share_dow'] = data['prod_market_share_dow'].fillna(0)

    # abt

    # Join keys and intermediate columns that are not written to the output.
    drop_list = [
        'user_id', 'aisle_id', 'order_number', 'order_dow',
        'order_hour_of_day', 'days_to_last_order', 'hod_group',
        'cat_num_of_prods_a_user_buys_in_this_cat_median', 'last_not_order_number']
    data.drop(drop_list, inplace = True, axis=1)

    # Halve the width of 64-bit numeric columns.
    for col in data.columns:
        if data[col].dtypes == 'float64':
            data[col] = data[col].astype('float32')
        if data[col].dtypes == 'int64':
            data[col] = data[col].astype('int32')

    data_train = data[data.eval_set == 'train']
    data_test = data[data.eval_set == 'test']

    print('Shard {} train'.format(shard), data_train.shape)
    print('Shard {} test'.format(shard), data_test.shape)

    if shard is None:
        data_train.to_csv(os.path.join(root, 'abt', 'abt_train.csv'), index = False)
        data_test.to_csv(os.path.join(root, 'abt', 'abt_test.csv'), index = False)
    else:
        # For a named shard only the train part is written.
        data_train.to_csv(os.path.join(root, 'abt', 'abt_train.{}.csv'.format(shard)), index = False)

In [36]:
def process_shards(shards, down_sample):
    """Run ``process_shard`` on each entry of ``shards``, timing every one.

    Garbage collection is forced after each shard.
    """
    for shard_tag in shards:
        timer = tick_tock("Process shard {}".format(shard_tag))
        with timer:
            process_shard(shard=shard_tag, down_sample=down_sample)
        gc.collect()

In [37]:
from multiprocessing import Process

down_sample = None
# Number of worker processes started in parallel.
n_shards = 32


# For a single-process run, use:
# n_shards = 1


# One tag per shard: aug<s>-<ms> for s in 0..3 and ms in 0..51.
shards = [aug_name(s, ms) for ms in range(52) for s in range(4)]

# Worker s takes every n_shards-th tag starting at position s.
jobs = []
for s in range(n_shards):
    cur_shards = shards[s::n_shards]
    p = Process(target=process_shards, args=(cur_shards, down_sample))
    p.start()
    jobs.append(p)

for p in jobs:
    p.join()

# A worker that crashed exits with a non-zero code; report it instead of
# announcing success over partial output.
failed_workers = [idx for idx, job in enumerate(jobs) if job.exitcode != 0]
if failed_workers:
    print("\n\nWorkers exited with an error: {}".format(failed_workers))
else:
    print("\n\nAll done.")

Process shard aug0-0 starts...
Process shard aug0-0: 8.19s
Process shard aug0-1 starts...
Process shard aug0-1: 9.50s
Process shard aug0-2 starts...
Process shard aug0-2: 11.26s
Process shard aug0-3 starts...
Process shard aug0-3: 11.26s


All done.


In [None]:
# Build the table from the base (non-augmented) files: a shard of None makes
# process_shard read the files directly under `root` and write both the
# train and the test CSV.
process_shards(shards=[None], down_sample=down_sample)