# Instacart - Shortest Path to Submission
## 1. 問題定義
- userごとにorderの記録が時系列順に並んでいる
- あるuserが過去注文したアイテムを再注文した場合、"reorder"フラグが立てられている
- 各userの最新orderについて、reorderしたアイテムを複数予測する
    - 実際にreorderしたアイテムを多く含み、かつreorderしたアイテムをなるべく含まないようにする
    - ひとつもreorderが無いと予想した場合は"None"を出力する

(参考)リレーション

https://www.kaggle.com/c/instacart-market-basket-analysis/discussion/33205

## 2. データを眺める
略。公式のDataページかKernelを見てザックリ雰囲気をつかむ
- https://www.kaggle.com/c/instacart-market-basket-analysis/data

## 3. First Submission
とりあえず、ユーザーごとに「過去買ったものは全部reorderする」という予測でsubmissionを作ってみる

In [1]:
import pandas as pd
import numpy as np
import os
import time
from contextlib import contextmanager

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

@contextmanager
def timer(title):
    t0 = time.time()
    yield
    print("{} - done in {:.0f}s".format(title, time.time() - t0))

# まずfeatherに変換しておく
def load(path):
    if not os.path.exists(path+'.f'):
        pd.read_csv(path).to_feather(path+'.f')
    return pd.read_feather(path+'.f')

with timer('load data'):
    aisles      = load('../input/aisles.csv')
    departments = load('../input/departments.csv')
    prior       = load('../input/order_products__prior.csv')
    train       = load('../input/order_products__train.csv')
    orders      = load('../input/orders.csv')
    products    = load('../input/products.csv')

load data - done in 1s


In [2]:
# order-id/user-id/product-idを一つにまとめる
with timer('merge & drop duplicates'):
    prior_orders = pd.merge(prior, orders[['order_id','user_id']], on='order_id', how='left')
    print(prior_orders.shape)
    prior_orders.drop_duplicates(subset=['user_id','product_id'], inplace=True)
    print(prior_orders.shape)   
    
prior_orders.head()

(32434489, 5)
(13307953, 5)
merge & drop duplicates - done in 11s


Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,user_id
0,2,33120,1,1,202279
1,2,28985,2,1,202279
2,2,9327,3,0,202279
3,2,45918,4,1,202279
4,2,30035,5,0,202279


In [3]:
# userごとに、過去買ったアイテムをまとめる
with timer('aggregate prior products'):
    prior_orders['product_id_str'] = prior_orders['product_id'].astype(str)
    prior_products = prior_orders.groupby('user_id')['product_id_str'].apply(lambda x: ' '.join(x)).reset_index()
    
prior_products.head()

aggregate prior products - done in 16s


Unnamed: 0,user_id,product_id_str
0,1,196 12427 10258 25133 10326 17122 41787 13176 ...
1,2,49451 32792 32139 34688 36735 37646 22829 2485...
2,3,38596 21903 248 40604 8021 17668 21137 23650 3...
3,4,22199 25146 1200 17769 43704 37646 11865 35469...
4,5,27344 24535 43693 40706 16168 21413 13988 3376...


In [4]:
with timer('make 1st submission'):
    orders_in_test = orders[orders['eval_set'] == 'test']

    submission = pd.merge(orders_in_test[['user_id','order_id']], prior_products, on='user_id', how='left')
    submission.drop('user_id', axis=1, inplace=True)
    submission.columns = ['order_id', 'products']
    submission.to_csv('../output/submission_baseline.csv', index=False)

make 1st submission - done in 1s


- Pandasは内部で列指向にデータを持っており、なるべく列をまたがない＆一度にたくさんの行を処理させた方が高速
    - product_idの型変換はgroupbyの中でやらず、先にastype(str)で追加の列を作っておく

## 4. モデル学習 (Private LB:0.3266084)

- user_id x item_idを1つの行として学習させる
    - priorで買った＆trainで買わなかった…0
    - priorで買った＆trainで買った…1
    
|eval_set|user|reordered|意味|
|--|--|--|--|
|prior|A|0/1|trainデータ(label==0)|
|prior|B|0/1|testデータ|
|train|A|1|trainデータ(label==1)|
|train|A|0|学習対象外|   
|test|-|-|testデータ|

In [5]:
with timer('extract last order for each user-x-item'):
    # 全データをまとめる
    all = pd.merge(orders, pd.concat([train,prior]), on='order_id', how='left')
    all.head()

    # user x itemで最後のデータだけを残す
    last_order_by_user_x_item = all.drop_duplicates(subset=['user_id','product_id'], keep='last')
    last_order_by_user_x_item.head()

extract last order for each user-x-item - done in 14s


In [6]:
last_order_by_user_x_item.head()

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered
1,2539329,1,prior,1,2,8,,14084.0,2.0,0.0
15,473747,1,prior,3,3,12,21.0,30450.0,5.0,0.0
25,431534,1,prior,5,4,15,28.0,10326.0,5.0,0.0
26,431534,1,prior,5,4,15,28.0,17122.0,6.0,0.0
27,431534,1,prior,5,4,15,28.0,41787.0,7.0,0.0


In [7]:
# とりあえずuser x productを1行として、Train/Testの区分と目的変数の値を入れておく
# 特徴量エンジニアリングはこのテーブルを基準にすることとする
with timer('make user_id x product_id x target x is_train'):
    # train/testでユーザーが振り分けられているので、trainに属するuserかどうかを列に追加
    train_users = orders[['eval_set','user_id']][orders['eval_set'] =='train']

    X = last_order_by_user_x_item[['user_id','product_id','eval_set']].dropna()
    X['is_train'] = X['user_id'].isin(train_users['user_id'])
    X['target'] = (X['eval_set'] == 'train').astype(np.int32)
    X.drop('eval_set', axis=1, inplace=True)

X.head()

make user_id x product_id x target x is_train - done in 4s


Unnamed: 0,user_id,product_id,is_train,target
1,1,14084.0,True,0
15,1,30450.0,True,0
25,1,10326.0,True,0
26,1,17122.0,True,0
27,1,41787.0,True,0


In [8]:
# 特徴量エンジニアリング
with timer('feature engineering'):
    # とりあえず簡単なものだけやる
    
    # 1. user x item での group-by
    # 1-1. 過去そのアイテムを何回注文しているか
    n_bought = all.groupby(['user_id','product_id'])['order_number'].count().reset_index().rename(columns={'order_number':'n_bought'})
    X = pd.merge(X, n_bought, on=['user_id','product_id'], how='left')
    
    # 2. user での group-by
    # 2-1. reorderの割合
    reorder_rate = all.groupby(['user_id'])['reordered'].mean().reset_index().rename(columns={'reordered':'mean_reorder'})
    X = pd.merge(X, reorder_rate, on=['user_id'], how='left')

    # 3. product情報
    X = pd.merge(X, products.drop('product_name',axis=1), on='product_id', how='left')
X.head()

feature engineering - done in 24s


Unnamed: 0,user_id,product_id,is_train,target,n_bought,mean_reorder,aisle_id,department_id
0,1,14084,True,0,1,0.728571,91,16
1,1,30450,True,0,1,0.728571,88,13
2,1,10326,True,0,1,0.728571,24,4
3,1,17122,True,0,1,0.728571,24,4
4,1,41787,True,0,1,0.728571,24,4


In [9]:
import lightgbm as lgb

with timer('split data'):
    X.set_index('user_id', inplace=True)
    #X.drop('product_id', axis=1, inplace=True)
    X['product_id'] = X['product_id'].astype(np.int32)
  
    X_train = X[X['is_train']].drop('is_train', axis=1)
    X_test  = X[~X['is_train']].drop('is_train', axis=1)

    dtrain = lgb.Dataset(X_train.drop(['target'], axis=1), X_train['target'])
    
    print('train: {}, test: {}'.format(X_train.shape, X_test.shape))

open C:\Users\noumi\AppData\Local\conda\conda\envs\tensorflow\lib\site-packages\lightgbm\lib_lightgbm.dll
train: (9030454, 6), test: (4833292, 6)
split data - done in 2s


In [10]:
X_train.head()

Unnamed: 0_level_0,product_id,target,n_bought,mean_reorder,aisle_id,department_id
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,14084,0,1,0.728571,91,16
1,30450,0,1,0.728571,88,13
1,10326,0,1,0.728571,24,4
1,17122,0,1,0.728571,24,4
1,41787,0,1,0.728571,24,4


In [11]:
with timer('lgbm training'):
    lgb_param = {
        'objective' : 'binary',
        'boosting_type': 'gbdt',
        'metric' : 'binary_logloss',
        'num_leaves' : 15,
        'seed' : 0,
        'learning_rate' : 0.1
    }

    #result = lgb.cv(lgb_param, dtrain, num_boost_round=50, nfold=3, early_stopping_rounds=100, verbose_eval=50)
    #print(result)
    
    # TODO CVのearly-stoppingからイテレーション数を決める
    best_round = 300
 
    booster = lgb.train(lgb_param, dtrain, num_boost_round=int(best_round*1.1))

lgbm training - done in 46s


In [27]:
predicted = X_test.reset_index()[['user_id','product_id']]

test_users = orders[['eval_set','user_id','order_id']][orders['eval_set'] =='test']
predicted = pd.merge(predicted, test_users, on='user_id', how='left')

predicted['y'] = booster.predict(X_test.drop('target',axis=1))

In [24]:
predicted.head()

Unnamed: 0,user_id,product_id,eval_set,order_id,y
0,3,15143,test,2774568,0.065849
1,3,39922,test,2774568,0.070221
2,3,38596,test,2774568,0.051518
3,3,248,test,2774568,0.054971
4,3,40604,test,2774568,0.05793


In [32]:
th = 0.2 # TODO

predicted['product_id'] = predicted['product_id'].astype(str)

# 閾値を超えた行だけを抜き出して、submit用に整形
sub = predicted[predicted['y'] > th].groupby(['order_id'])['product_id'].apply(lambda x: ' '.join(x)).reset_index().rename(columns={'product_id':'products'})

# 閾値をひとつも超えなかったオーダーが抜けてしまうので、Join->fillnaでNoneを埋める
sub = pd.merge(test_users[['order_id']], sub, on='order_id', how='left')
sub.fillna('None', inplace=True)

sub.sort_values(by='order_id', inplace=True)
sub.head()

Unnamed: 0,order_id,products
13407,17,21903 7035 11494 21709 1283 47766 44056 38777 ...
12775,34,10132 7131 47029 7035 39275 39180 9597 48523 1...
68056,137,43352 35694 44142 38689 28934 44422 29594 2589...
42239,182,30007 30391 21903 24009 2078 34243 35951 3397 ...
12915,257,47766 36929 30391 38693 28476 45013 4605 24838...


In [33]:
sub.to_csv('../output/submission_lgbm_baseline.csv', index=False)

## TODO:
- Cross-Validationをちゃんとやる
- F1-scoreを最大化するように閾値を決める
- 特徴量エンジニアリング
- LightGBMハイパーパラメータのチューニング
- prior/trainを1つずつずらしていくとデータの水増しができるはず