# Instacart Modeling

1. Merge all prior work - data cleaning, feature engineering, word2vec

2. Modeling - LightGBM model

### data cleaning work

In [None]:
import gc
import numpy as np 
import pandas as pd 


# Data cleaning: load the raw order tables and build, for every train and
# test order, the list of products that were actually reordered.
train_orders = pd.read_csv(
    'data/order_products__train.csv',
    dtype={'order_id': np.uint32, 'product_id': np.uint16,
           'reordered': np.int8, 'add_to_cart_order': np.uint8})
orders = pd.read_csv(
    'data/orders.csv',
    dtype={'order_hour_of_day': np.uint8, 'order_number': np.uint8,
           'order_id': np.uint32, 'user_id': np.uint32,
           'order_dow': np.uint8, 'days_since_prior_order': np.float16})

# Encode the split label as a small integer: prior=0, train=1, test=2.
# `map` yields integers directly instead of relying on `replace` to downcast
# an object column.
orders['eval_set'] = orders['eval_set'].map({'prior': 0, 'train': 1, 'test': 2}).astype(np.uint8)
# Missing gaps are filled with 30 before the integer cast.
orders['days_since_prior_order'] = orders['days_since_prior_order'].fillna(30).astype(np.uint8)

# Attach user_id to every train line item, then keep only reordered products.
train_orders = train_orders.merge(orders[['user_id', 'order_id']], on='order_id', how='inner')
train_orders = train_orders[train_orders['reordered'] == 1].drop('reordered', axis=1)

# Index by order_id (keeping the column) so the per-order product lists
# below align on the index when they are assigned.
orders.set_index('order_id', drop=False, inplace=True)

# .copy() makes train1/test1 independent frames, so adding the 'actual'
# column is not a write to a slice of `orders`.
train1 = orders.loc[orders['eval_set'] == 1, ['order_id', 'eval_set']].copy()
# One list of reordered product ids per order; aligned to train1 by order_id.
train1['actual'] = train_orders.groupby('order_id')['product_id'].agg(list)
# Train orders without any reordered product get an empty string.
train1['actual'] = train1['actual'].fillna('')
n_actual = train1['actual'].apply(len).mean()   # this is the average cart size

# Test orders have no ground truth; a single space is used as a placeholder.
test1 = orders.loc[orders['eval_set'] == 2, ['order_id', 'eval_set']].copy()
test1['actual'] = ' '
traintest1 = pd.concat([train1, test1])
# Re-apply order_id as the index while keeping it as a column.
traintest1.set_index('order_id', drop=False, inplace=True)

del orders, train1, test1
gc.collect()

### Import feature engineering file - instacart_data.csv

In [None]:
# Load the feature-engineering output (instacart_data.csv) and cast every
# listed column to a compact dtype in a single pass.
# NOTE(review): the casts run after parsing and are not range-checked here;
# confirm that count-like columns (e.g. prod_orders, user_total_products,
# user_distinct_products) stay within uint8 (255) / uint16 (65535).
data = pd.read_csv("data/instacart_data.csv").astype({
    # 32-bit identifiers
    'user_id': np.uint32,
    'order_id': np.uint32,
    # 16-bit unsigned columns
    'product_id': np.uint16,
    'prod_orders': np.uint16,
    # 8-bit unsigned columns
    'up_orders': np.uint8,
    'up_first_order': np.uint8,
    'up_last_order': np.uint8,
    'up_average_cart_position': np.uint8,
    'user_orders': np.uint8,
    'user_period': np.uint8,
    'user_mean_days_since_prior': np.uint8,
    'user_total_products': np.uint8,
    'user_distinct_products': np.uint8,
    'user_average_basket': np.uint8,
    'eval_set': np.uint8,
    'days_since_prior_order': np.uint8,
    'up_orders_since_last_order': np.uint8,
    'aisle_id': np.uint8,
    'department_id': np.uint8,
    # half-precision ratios
    'prod_reorder_probability': np.float16,
    'prod_reorder_ratio': np.float16,
    'user_reorder_ratio': np.float16,
    'up_order_rate': np.float16,
})

### Import word2vec file - pca_w2v.csv

In [None]:
# Load the word2vec file (pca_w2v.csv): product_id plus three numeric
# columns named "0", "1" and "2", stored as half-precision floats.
word2vec = pd.read_csv("data/pca_w2v.csv").astype(
    {'product_id': np.uint16, "0": np.float16, "1": np.float16, "2": np.float16}
)

In [None]:
# Left-join the word2vec columns onto the feature table by product_id, so
# every feature row is kept even when a product has no embedding.
data = pd.merge(data, word2vec, how="left", on="product_id")
# Store the label column as an unsigned 8-bit integer.
data['reordered'] = data.reordered.astype(np.uint8)

### modeling

In [None]:
# Build the train and test frames from the rows with eval_set 1 and 2.
# `train` drops the identifier columns but keeps the `reordered` label;
# `test` keeps order_id/product_id and drops the label.
train = data.loc[data['eval_set'] == 1].drop(
    columns=['eval_set', 'user_id', 'product_id', 'order_id'])
test = data.loc[data['eval_set'] == 2].drop(
    columns=['eval_set', 'user_id', 'reordered'])

In [None]:
# Submissions are scored by F1, and the F1 score can be optimized; this
# frame (all rows, label removed) is prepared up front for that step.
check = data.drop(columns=['eval_set', 'user_id', 'reordered'])

LGBM 사용하는 또 다른 방식: 학습할때 train함수를 이용해서 학습시키는 방법

In [None]:
# Train the LightGBM model through the `lgb.train` API.
# The full feature table is dropped and garbage-collected before the
# LightGBM dataset is built.
del data
import gc
gc.collect()
import lightgbm as lgb
print('formatting and training LightGBM ...')


# Features are every column of `train` except the label column `reordered`.
lgb_train = lgb.Dataset( train[train.columns.difference(['reordered'])], label=train['reordered'])

# Author's notes on the LightGBM parameters used below:
# 'feature_fraction' ~ colsample_bytree
# 'bagging_fraction' ~ subsample
# 'num_iterations' ~ n_estimators
# 'min_data_in_leaf' ~ min_child_samples : minimum number of samples that go into one leaf
# 'max_bin' ~ bins the numeric features; used to guard against overfitting
# 'binary_logloss' ~ a probability value for each product; the class is 0 or 1
params = {'task': 'train', 'boosting_type': 'gbdt',   'objective': 'binary', 'metric': {'binary_logloss', 'auc'},
    'num_iterations' : 130, 'max_bin' : 100, 'num_leaves': 512, 'feature_fraction': 0.8,  'bagging_fraction': 0.95,
    'bagging_freq': 5, 'min_data_in_leaf' : 200, 'learning_rate' : 0.05}

# set lower num_boost_round (I used 300 instead of 25 at home) to avoid time-out on Kaggle

# num_boost_round ~ how many boosting rounds to run
# NOTE(review): the round count is given twice (params['num_iterations'] and
# num_boost_round, both 130) - confirm which one LightGBM honors before
# changing only one of them.
lgb_model = lgb.train(params, lgb_train, num_boost_round = 130, 
                      #valid_sets = lgb_eval, early_stopping_rounds=15
                     )

# Code for maximizing the F1 score follows.
# The LightGBM dataset is released now that the model is fitted.
del lgb_train# X_train, y_train
gc.collect()
def combi(z,df):
    """Group the products predicted above threshold ``z`` by order.

    Parameters
    ----------
    z : float
        Probability threshold; a row is kept only when ``reordered > z``
        (strictly greater).
    df : pandas.DataFrame
        Must provide ``order_id``, ``product_id`` and ``reordered`` columns,
        where ``reordered`` holds the predicted probability.

    Returns
    -------
    (dict, dict)
        ``prd_bag`` maps each order id to a space-separated string of the
        kept product ids; ``z_bag`` maps each order id to the matching
        space-separated probabilities as truncated integer percentages.
        Orders with no kept product map to a single space in both dicts.
    """
    products_by_order = {}
    scores_by_order = {}
    # Iterate only the three columns that are needed. An explicit
    # setdefault replaces the former bare ``except:``, which was used to
    # detect the first product of an order and would also have hidden any
    # unrelated error raised while formatting a row.
    for order_id, product_id, prob in zip(df['order_id'], df['product_id'], df['reordered']):
        if prob > z:
            products_by_order.setdefault(order_id, []).append(str(product_id))
            scores_by_order.setdefault(order_id, []).append(str(int(100 * prob)))

    prd_bag = {order_id: ' '.join(items) for order_id, items in products_by_order.items()}
    z_bag = {order_id: ' '.join(items) for order_id, items in scores_by_order.items()}

    # Every order must appear in the output, even with nothing predicted.
    for order_id in df['order_id']:
        if order_id not in prd_bag:
            prd_bag[order_id] = ' '
            z_bag[order_id] = ' '

    return prd_bag, z_bag

# F1 function uses the actual products as a list in the train set and the list of predicted products

def f1_score_single(x):
    """Return the F1 score of one order's predicted basket.

    ``x`` must expose ``actual`` (the true products, or ``''`` when there
    are none) and ``list_prod`` (the list of predicted products). An order
    with no true products and an empty prediction scores 1.0; any order
    with no overlap between the two sets scores 0.0.
    """
    actual, predicted = x.actual, x.list_prod
    # Both sides empty counts as a perfect prediction.
    if actual == '' and predicted == []:
        return 1.

    truth = set(actual)
    guess = set(predicted)
    hits = len(truth.intersection(guess))
    if not hits:
        return 0.

    precision = hits / len(guess)
    recall = hits / len(truth)
    return 2 * precision * recall / (precision + recall)
# check feature importance
#lgb.plot_importance(lgb_model, figsize=(7,9))
#plt.show()
print(' Applying model to all data - both train and test ')

# Score every row of `check`; the feature set is every column except the
# two identifier columns. The predicted probability is stored in 'reordered'.
check['reordered'] = lgb_model.predict(check[check.columns.difference(
    ['order_id', 'product_id'])], num_iteration = lgb_model.best_iteration)

gc.collect()
print(' summarizing products and probabilities ...')

# get the prediction for a range of thresholds

# traintest1 carries order_id both as its index name and as a column; pandas
# refuses to merge on a key that is ambiguous in that way, so work on a copy
# with a plain positional index.
tt = traintest1.reset_index(drop=True)

for i, z in enumerate([0.17, 0.21, 0.25]):

    # Per-order strings of predicted product ids and integer percentages.
    prd_bag, z_bag = combi(z, check)
    ptemp = pd.DataFrame.from_dict(prd_bag, orient='index')
    ptemp.reset_index(inplace=True)
    ztemp = pd.DataFrame.from_dict(z_bag, orient='index')
    ztemp.reset_index(inplace=True)
    ptemp.columns = ['order_id', 'products']
    ztemp.columns = ['order_id', 'zs']
    # Parse the space-separated strings back into lists of ints.
    ptemp['list_prod'] = ptemp['products'].apply(lambda x: list(map(int, x.split())))
    ztemp['list_z'] = ztemp['zs'].apply(lambda x: list(map(int, x.split())))
    n_cart = ptemp['products'].apply(lambda x: len(x.split())).mean()
    tt = tt.merge(ptemp, on='order_id', how='inner')
    tt = tt.merge(ztemp, on='order_id', how='inner')
    tt.drop(['products', 'zs'], axis=1, inplace=True)
    # Summary statistics of the kept probabilities (percent -> fraction);
    # orders with an empty basket get 0.
    tt['zavg'] = tt['list_z'].apply(lambda x: 0.01*np.mean(x) if x!=[] else 0.).astype(np.float16)
    tt['zmax'] = tt['list_z'].apply(lambda x: 0.01*np.max(x) if x!=[] else 0.).astype(np.float16)
    tt['zmin'] = tt['list_z'].apply(lambda x: 0.01*np.min(x) if x!=[] else 0.).astype(np.float16)
    tt['f1'] = tt.apply(f1_score_single, axis=1).astype(np.float16)
    # Mean F1 is reported on the train orders (eval_set == 1) only.
    F1 = tt['f1'].loc[tt['eval_set']==1].mean()
    # Suffix this threshold's columns with its position so the next pass
    # can add its own unsuffixed columns.
    tt = tt.rename(columns={'list_prod': 'prod'+str(i), 'f1': 'f1'+str(i), 'list_z': 'z'+str(i),
                'zavg': 'zavg'+str(i), 'zmax': 'zmax'+str(i),  'zmin': 'zmin'+str(i)})
    print(' z,F1,n_actual,n_cart :  ', z,F1,n_actual,n_cart)

# For every order, record which threshold gave the best F1 ('fm' = 0, 1 or 2)
# and that best F1 value.
tt['fm'] = tt[['f10', 'f11', 'f12']].idxmax(axis=1)
tt['f1'] = tt[['f10', 'f11', 'f12']].max(axis=1)
tt['fm'] = tt.fm.replace({'f10': 0,'f11': 1, 'f12':2}).astype(np.uint8)
print(' f1 maximized ', tt['f1'].loc[tt['eval_set']==1].mean())

#del prd_bag, z_bag, ptemp, ztemp
gc.collect()

from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import metrics

print('Fitting the second classifier for F1 ...')

# Features of the second-stage model: mean / max / min kept probability for
# each of the three thresholds. Listed once so the train and test selections
# cannot drift apart.
threshold_features = ['zavg0', 'zmax0', 'zmin0', 'zavg1', 'zmax1', 'zmin1',
                      'zavg2', 'zmax2', 'zmin2']

# Train orders (eval_set == 1): learn which threshold index ('fm') gave the
# best F1 from the probability summaries.
X = tt.loc[tt['eval_set'] == 1, threshold_features]
y = tt.loc[tt['eval_set'] == 1, 'fm']
# A fixed seed makes the hold-out split, and therefore the submission,
# reproducible from run to run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

clf = GradientBoostingClassifier(random_state=42).fit(X_train, y_train)
print('GB Accuracy on training set: {:.2f}' .format(clf.score(X_train, y_train)))
print('Accuracy on test set: {:.2f}' .format(clf.score(X_test, y_test)))
#pd.DataFrame(clf.feature_importances_, index=X_train.columns, columns=["Importance"]).plot(kind='bar')
#plt.show()

# Test orders (eval_set == 2): .copy() makes `final` an independent frame so
# the column assignments below are not writes to a slice of `tt`.
final = tt.loc[tt['eval_set'] == 2, ['order_id', 'prod0', 'prod1', 'prod2', 'zavg0']].copy()
df_test = tt.loc[tt['eval_set'] == 2, threshold_features]
final['fit'] = clf.predict(df_test)
# Keep the basket produced by the threshold the classifier selected.
final['best'] = final.apply(
    lambda row: row['prod0'] if row['fit'] == 0
    else (row['prod1'] if row['fit'] == 1 else row['prod2']),
    axis=1)



def mylist(x):
    """Format one order's chosen basket as a submission string.

    ``x`` must expose ``best`` (list of product ids) and ``zavg0`` (a
    numeric score). An empty basket yields ``'None'``. A basket of one or
    two products whose ``zavg0`` is below 0.5 gets ``' None'`` appended.
    Otherwise the ids are joined with single spaces.
    """
    basket = x.best
    mean_score = x.zavg0
    if basket == []:
        return 'None'

    joined = ' '.join(map(str, basket))
    # Small baskets with a low score also include the explicit 'None' item.
    if mean_score < 0.5 and len(basket) in (1, 2):
        return joined + ' None'
    return joined

# Turn every test order's chosen basket into its submission string.
final['products'] = final.apply(mylist, axis='columns')

# Write the two-column submission file without the index.
final.loc[:, ['order_id', 'products']].to_csv('final_submission1.csv', index=False)

gc.collect()