# None as a product

## Feature engineering

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
color = sns.color_palette()
%matplotlib inline

import xgboost

import sklearn
from sklearn.model_selection import train_test_split

import sys, os, gc, types
import time
from subprocess import check_output

In [2]:
# Candidate data directories, checked in order; the first existing one wins.
# Set the INSTACART_DATA_DIR environment variable to use another location
# without editing this cell.
root_paths = [
    "/data/kaggle-instacart/",
    "/Users/jiayou/Dropbox/珺珺的程序/Kaggle/Instacart/",
    "/Users/jiayou/Dropbox/Documents/珺珺的程序/Kaggle/Instacart/"
]
env_root = os.environ.get("INSTACART_DATA_DIR")
if env_root:
    root_paths.insert(0, env_root)

root = None
for p in root_paths:
    if os.path.isdir(p):
        # Keep a trailing separator so that `root + file_name` is a valid path.
        root = os.path.join(p, "")
        break

# Fail with a readable message instead of passing None to a later call.
if root is None:
    raise FileNotFoundError(
        "Instacart data directory not found; tried: " + ", ".join(root_paths))

# List the directory content (hidden files skipped, like plain `ls`).
print("\n".join(sorted(name for name in os.listdir(root) if not name.startswith("."))))

Build Feature.ipynb
CHANGELOG.md
F1 score optimization.ipynb
Instacart Data Exploration.ipynb
None handling.ipynb
Param Search Results.ipynb
README.md
Tasks.ipynb
Toy DF.ipynb
Training.ipynb
abt_test.csv
abt_train.csv
aisles.csv
departments.csv
order_products__prior.csv
order_products__train.csv
orders.csv
products.csv
sample_submission.csv
submission-v0.csv
submission-v1-r0.csv
submission-v1-r1.csv
training-logs
utils



In [3]:
def load_data(path_data):
    """Read the Instacart CSV tables found in the directory ``path_data``.

    Parameters
    ----------
    path_data : str
        Directory holding the CSV files. A trailing path separator is
        optional; file names are appended with ``os.path.join``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(priors, train, orders, products, aisles, departments,
        sample_submission)`` in that order.
    """
    def read(file_name, dtype=None):
        # Single place where a file name is turned into a full path.
        return pd.read_csv(os.path.join(path_data, file_name), dtype=dtype)

    # Both order/product files share the same columns, hence the same
    # compact dtypes.
    order_products_dtype = {
        'order_id': np.int32,
        'product_id': np.uint16,
        'add_to_cart_order': np.int16,
        'reordered': np.int8}
    orders_dtype = {
        'order_id': np.int32,
        'user_id': np.int64,
        'eval_set': 'category',
        'order_number': np.int16,
        'order_dow': np.int8,
        'order_hour_of_day': np.int8,
        'days_since_prior_order': np.float32}

    priors = read('order_products__prior.csv', order_products_dtype)
    train = read('order_products__train.csv', order_products_dtype)
    orders = read('orders.csv', orders_dtype)

    products = read('products.csv')
    aisles = read('aisles.csv')
    departments = read('departments.csv')
    sample_submission = read('sample_submission.csv')

    return priors, train, orders, products, aisles, departments, sample_submission

In [4]:
class tick_tock:
    """Context manager that reports the wall-clock duration of a code block.

    When ``verbose`` is truthy, a "begin" banner is printed on entry and an
    "end" banner plus the elapsed seconds on exit. When it is falsy nothing
    is printed. Exceptions raised inside the block are not suppressed.

    Parameters
    ----------
    process_name : str
        Label used in the printed banners.
    verbose : int or bool, default 1
        Whether to print anything.
    """

    def __init__(self, process_name, verbose=1):
        self.process_name = process_name
        self.verbose = verbose
        self.begin_time = None

    def __enter__(self):
        if self.verbose:
            print(self.process_name + " begin ......")
        # Always record the start so __exit__ never meets a missing attribute,
        # even if `verbose` is changed while the block is running.
        self.begin_time = time.time()
        # Returning self makes `with tick_tock(...) as t:` usable.
        return self

    def __exit__(self, type, value, traceback):
        if self.verbose:
            end_time = time.time()
            print(self.process_name + " end ......")
            print('time lapsing {0} s \n'.format(end_time - self.begin_time))
            
def ka_add_groupby_features_1_vs_n(df, group_columns_list, agg_dict, only_new_feature=True):
    """Aggregate several columns of ``df`` per group and name the results.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    group_columns_list : list
        Columns to group by.
    agg_dict : dict
        Maps a source column to its aggregation spec. The spec is normally
        ``{new_column_name: aggfunc}``; a list of aggfuncs (or a single
        aggfunc) is also accepted, in which case the output column is named
        after the function.
    only_new_feature : bool, default True
        If True return only the group keys and the new statistics; otherwise
        return ``df`` left-joined with the statistics on the group keys.

    Returns
    -------
    pandas.DataFrame

    Raises
    ------
    TypeError
        If ``group_columns_list`` is not a list.
    """
    with tick_tock("add stats features"):
        if not isinstance(group_columns_list, list):
            raise TypeError("group_columns_list should be a list")

        grouped = df.groupby(group_columns_list)

        # Aggregate one (column, function) pair at a time. This avoids the
        # nested-dict renaming form of `agg`, which recent pandas rejects.
        stat_columns = []
        for column, spec in agg_dict.items():
            if isinstance(spec, dict):
                named_funcs = list(spec.items())
            else:
                funcs = spec if isinstance(spec, (list, tuple)) else [spec]
                named_funcs = [
                    (func if isinstance(func, str) else getattr(func, "__name__", str(func)), func)
                    for func in funcs
                ]
            for new_name, func in named_funcs:
                stat_columns.append(grouped[column].agg(func).rename(new_name))

        # All pieces share the group-key index, so they line up column-wise.
        the_stats = pd.concat(stat_columns, axis=1).reset_index()

        if only_new_feature:
            df_new = the_stats
        else:
            df_new = pd.merge(left=df, right=the_stats, on=group_columns_list, how='left')

    return df_new

def ka_add_groupby_features_n_vs_1(df, group_columns_list, target_columns_list, methods_list, keep_only_stats=True, verbose=1):
    """Apply several aggregation methods to target column(s) per group.

    Each statistic column is named
    ``'_<joined group columns>_<method>_by_<target column>'``.
    The group key columns keep their own names.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    group_columns_list : list
        Columns to group by (one or more).
    target_columns_list : list
        Columns to aggregate (one or more).
    methods_list : list
        Aggregation methods passed to ``agg`` for every target column.
    keep_only_stats : bool, default True
        If True return only the group keys and the statistics; otherwise
        return ``df`` left-joined with the statistics on the group keys.
    verbose : int or bool, default 1
        Forwarded to the timing context manager.

    Returns
    -------
    pandas.DataFrame

    Raises
    ------
    TypeError
        If any of the three ``*_list`` arguments is not a list.
    """
    with tick_tock("add stats features", verbose):
        list_arguments = {
            "group_columns_list": group_columns_list,
            "target_columns_list": target_columns_list,
            "methods_list": methods_list,
        }
        for arg_name, arg_value in list_arguments.items():
            if not isinstance(arg_value, list):
                raise TypeError(arg_name + " should be a list")

        grouped_name = ''.join(group_columns_list)
        grouped = df.groupby(group_columns_list)

        # One statistics frame per target column, all indexed by the group
        # keys, so any number of group and target columns is supported.
        per_target_stats = []
        for target in target_columns_list:
            stats = grouped[target].agg(methods_list)
            stats.columns = ['_%s_%s_by_%s' % (grouped_name, method_name, target)
                             for method_name in methods_list]
            per_target_stats.append(stats)
        the_stats = pd.concat(per_target_stats, axis=1).reset_index()

        if keep_only_stats:
            return the_stats
        return pd.merge(left=df, right=the_stats, on=group_columns_list, how='left')


In [5]:
# Read every CSV table from the data directory detected above.
priors, train, orders, products, aisles, departments, sample_submission = load_data(root)

In [6]:
# Optional down-sampling for faster iteration: keep only the users whose id
# is a multiple of `down_sample`. Set it to None to work on the full data.
# down_sample = None
down_sample = 10
if down_sample is not None:
    orders = orders[orders.user_id % down_sample == 0]

    # An order line belongs to a sampled user exactly when its order is one of
    # the sampled orders, so filter on order_id directly. This avoids merging
    # user_id onto the full order-line tables and dropping it again in place.
    sampled_order_ids = orders.order_id
    priors = priors[priors.order_id.isin(sampled_order_ids)]
    train = train[train.order_id.isin(sampled_order_ids)]

## Calculate None for priors/train orders

In [7]:
# One row per prior order: contains_reorder is the maximum of the `reordered`
# flag over the order's lines.
# The dict-renaming form of SeriesGroupBy.agg is rejected by recent pandas,
# so aggregate first and rename afterwards.
none_priors = (
    priors.groupby('order_id')['reordered']
    .max()
    .rename('contains_reorder')
    .reset_index()
)
none_priors.head(10)

Unnamed: 0,order_id,contains_reorder
0,3,1
1,4,1
2,12,1
3,16,1
4,18,1
5,43,1
6,50,1
7,64,1
8,73,1
9,88,1


## User feature engineering

In [8]:
# Order header joined with its prior order lines, plus each product's aisle.
priors_orders_detail = (
    orders
    .merge(priors, on='order_id', how='inner')
    .merge(products[['product_id', 'aisle_id']], on='product_id', how='left')
)

# Running 1-based count of the rows seen so far for each (user, product)
# pair, in the current row order of the frame.
priors_orders_detail['user_buy_product_times'] = (
    priors_orders_detail.groupby(['user_id', 'product_id']).cumcount() + 1
)

In [9]:
# Per-user statistics over the prior orders: number of orders and the
# sum / mean of the gaps between consecutive orders.
agg_dict_2 = {'order_number':{'user_total_orders':'max'},
              'days_since_prior_order':{'user_sum_days_since_prior_order':'sum',
                                        'user_mean_days_since_prior_order': 'mean'}}
users = ka_add_groupby_features_1_vs_n(orders[orders.eval_set == 'prior'], ['user_id'], agg_dict_2)

# Per-user statistics over the prior order lines.
# user_reorder_ratio = (# lines flagged reordered) / (# lines in orders after
# the user's first one). `x` already is the group's `reordered` values, so
# only order_number needs a label lookup; `.loc` replaces the removed `.ix`,
# and the vectorised `.sum()` replaces the slow Python-level `sum`.
agg_dict_3 = {'reordered':
              {'user_reorder_ratio':
               lambda x: (x == 1).sum() /
                         (priors_orders_detail.loc[x.index, 'order_number'] > 1).sum()},
              'product_id':{'user_total_products':'count',
                            'user_distinct_products':'nunique'}}
us = ka_add_groupby_features_1_vs_n(priors_orders_detail, ['user_id'], agg_dict_3)

# Both frames have one row per user; join them on the explicit key.
users = users.merge(us, on='user_id', how='inner')

add stats features begin ......
add stats features end ......
time lapsing 0.06378293037414551 s 

add stats features begin ......
add stats features end ......
time lapsing 84.28732705116272 s 



In [10]:
# Average number of products per prior order for each user.
users['user_average_basket'] = users['user_total_products'].div(users['user_total_orders'])

# Attach each user's non-prior (train/test) order; its columns, including
# days_since_prior_order, are carried over under their original names.
us = orders[orders.eval_set != "prior"]
users = pd.merge(users, us, how='inner')

In [11]:
# Preview the first rows of the per-user feature table.
users.head()

Unnamed: 0,user_id,user_total_orders,user_sum_days_since_prior_order,user_mean_days_since_prior_order,user_reorder_ratio,user_total_products,user_distinct_products,user_average_basket,order_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,10,5,79.0,19.75,0.355072,143,94,28.6,1822501,train,6,0,19,30.0
1,20,4,15.0,5.0,0.833333,22,7,5.5,1980631,test,5,1,11,30.0
2,30,8,151.0,21.571428,0.625,11,6,1.375,62370,train,9,2,13,22.0
3,40,9,105.0,13.125,0.73913,104,36,11.555556,2431024,test,10,0,8,7.0
4,50,67,357.0,5.409091,0.814318,453,89,6.761194,1750084,train,68,3,9,7.0


In [12]:
# Attach the order header columns from `orders` to each prior order's
# contains_reorder flag (left join on order_id keeps every row of none_priors).
prior_orders = none_priors.merge(orders, on='order_id', how='left')

In [13]:
# Preview the first rows of the prior orders with their contains_reorder flag.
prior_orders.head()

Unnamed: 0,order_id,contains_reorder,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,3,1,205970,prior,16,5,17,12.0
1,4,1,178520,prior,36,1,9,7.0
2,12,1,152610,prior,22,6,8,10.0
3,16,1,174840,prior,18,3,12,13.0
4,18,1,118860,prior,3,4,20,6.0


Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
83,1224907,10,prior,1,2,14,
84,68288,10,prior,2,5,15,30.0
85,2115522,10,prior,3,3,19,12.0
86,83395,10,prior,4,3,15,14.0
87,1353310,10,prior,5,5,20,23.0
