In [1]:
import numpy as np
import pandas as pd
import scipy as sp
import lightgbm as lgb
import gc

In [2]:
import gensim
import pickle

In [3]:
from sklearn.model_selection import train_test_split

### Load Data

In [4]:
IDIR = '../data/raw/'
print('loading prior')
priors = pd.read_csv(IDIR + 'order_products__prior.csv', dtype={
            'order_id': np.int32,
            'product_id': np.uint16,
            'add_to_cart_order': np.int16,
            'reordered': np.int8})

print('loading train')
train = pd.read_csv(IDIR + 'order_products__train.csv', dtype={
            'order_id': np.int32,
            'product_id': np.uint16,
            'add_to_cart_order': np.int16,
            'reordered': np.int8})

print('loading orders')
orders = pd.read_csv(IDIR + 'orders.csv', dtype={
        'order_id': np.int32,
        'user_id': np.int32,
        'order_number': np.int16,
        'order_dow': np.int8,
        'order_hour_of_day': np.int8,
        'days_since_prior_order': np.float32})

print('loading products')
products = pd.read_csv(IDIR + 'products.csv', dtype={
        'product_id': np.uint16,
        'order_id': np.int32,
        'aisle_id': np.uint8,
        'department_id': np.uint8},
        usecols=['product_id', 'aisle_id', 'department_id'])

print('priors {}: {}'.format(priors.shape, ', '.join(priors.columns)))
print('orders {}: {}'.format(orders.shape, ', '.join(orders.columns)))
print('train {}: {}'.format(train.shape, ', '.join(train.columns)))
print('train {}: {}'.format(train.shape, ', '.join(products.columns)))

loading prior
loading train
loading orders
loading products
priors (32434489, 4): order_id, product_id, add_to_cart_order, reordered
orders (3421083, 7): order_id, user_id, eval_set, order_number, order_dow, order_hour_of_day, days_since_prior_order
train (1384617, 4): order_id, product_id, add_to_cart_order, reordered
train (1384617, 4): product_id, aisle_id, department_id


In [5]:
print('computing product f')
prods = pd.DataFrame()
prods['orders'] = priors.groupby(priors.product_id).size().astype(np.int32)
prods['reorders'] = priors['reordered'].groupby(priors.product_id).sum().astype(np.float32)
prods['reorder_rate'] = (prods.reorders / prods.orders).astype(np.float32)
products = products.join(prods, on='product_id')
products.set_index('product_id', drop=False, inplace=True)
del prods
gc.collect()

computing product f


62

In [6]:
print('add order info to priors')
orders.set_index('order_id', inplace=True, drop=False)
priors = priors.join(orders, on='order_id', rsuffix='_')
priors.drop('order_id_', inplace=True, axis=1)

add order info to priors


In [7]:
### user features
print('computing user f')
usr = pd.DataFrame()
usr['average_days_between_orders'] = orders[orders.eval_set == "prior"].groupby('user_id')['days_since_prior_order'].mean().astype(np.float32)
usr['nb_orders'] = orders[orders.eval_set == "prior"].groupby('user_id').size().astype(np.int16)

users = pd.DataFrame()
users['total_items'] = priors.groupby('user_id').size().astype(np.int16)
users['all_products'] = priors.groupby('user_id')['product_id'].apply(set)
users['total_distinct_items'] = (users.all_products.map(len)).astype(np.int16)

users = users.join(usr)
users['average_basket'] = (users.total_items / users.nb_orders).astype(np.float32)
print('user f', users.shape)

del usr, priors
gc.collect()

computing user f
('user f', (206209, 6))


57

In [35]:
### build list of candidate products to reorder, with features ###
def features(selected_orders, labels_given=False):
    print('build candidate list')
    order_list = []
    product_list = []
    labels = []
    i=0
    for row in selected_orders.itertuples():
        i+=1
        if i%10000 == 0: print('order row',i)
        order_id = row.order_id
        user_id = row.user_id
        user_products = users.all_products[user_id]
        product_list += user_products
        order_list += [order_id] * len(user_products)
        if labels_given:
            labels += [(order_id, product) in train.index for product in user_products]
        
    df = pd.DataFrame({'order_id':order_list, 'product_id':product_list}, dtype=np.int32)
    labels = np.array(labels, dtype=np.int8)
    del order_list
    del product_list
    
    print('user related features')
    df['user_id'] = df.order_id.map(orders.user_id)
    df['user_total_orders'] = df.user_id.map(users.nb_orders)
    df['user_total_items'] = df.user_id.map(users.total_items)
    df['total_distinct_items'] = df.user_id.map(users.total_distinct_items)
    df['user_average_days_between_orders'] = df.user_id.map(users.average_days_between_orders)
    df['user_average_basket'] =  df.user_id.map(users.average_basket)
    
    print('order related features')
    df['dow'] = df.order_id.map(orders.order_dow)
    df['order_hour_of_day'] = df.order_id.map(orders.order_hour_of_day)
    df['days_since_prior_order'] = df.order_id.map(orders.days_since_prior_order)
    df['days_since_ratio'] = df.days_since_prior_order / df.user_average_days_between_orders
    
    print('product related features')
    df['aisle_id'] = df.product_id.map(products.aisle_id)
    df['department_id'] = df.product_id.map(products.department_id)
    df['product_orders'] = df.product_id.map(products.orders).astype(np.int32)
    df['product_reorders'] = df.product_id.map(products.reorders)
    df['product_reorder_rate'] = df.product_id.map(products.reorder_rate)

    print(df.dtypes)
    print(df.memory_usage())
    return (df, labels)

In [32]:
### train / test orders ###
print('split orders : train, test')
train_orders = orders[orders.eval_set == 'train']
test_orders = orders[orders.eval_set == 'test']

train.set_index(['order_id', 'product_id'], inplace=True, drop=False)

split orders : train, test


In [10]:
df_train, labels = features(train_orders, labels_given=True)

build candidate list
('order row', 10000)
('order row', 20000)
('order row', 30000)
('order row', 40000)
('order row', 50000)
('order row', 60000)
('order row', 70000)
('order row', 80000)
('order row', 90000)
('order row', 100000)
('order row', 110000)
('order row', 120000)
('order row', 130000)
user related features
order related features
product related features
order_id                              int32
product_id                            int32
user_id                               int32
user_total_orders                     int16
user_total_items                      int16
total_distinct_items                  int16
user_average_days_between_orders    float32
user_average_basket                 float32
dow                                    int8
order_hour_of_day                      int8
days_since_prior_order              float32
days_since_ratio                    float32
aisle_id                              uint8
department_id                         uint8
product_orders  

In [11]:
del train_orders
gc.collect()

92

In [12]:
df_train.shape

(8474661, 17)

In [13]:
df_train.head()

Unnamed: 0,order_id,product_id,user_id,user_total_orders,user_total_items,total_distinct_items,user_average_days_between_orders,user_average_basket,dow,order_hour_of_day,days_since_prior_order,days_since_ratio,aisle_id,department_id,product_orders,product_reorders,product_reorder_rate
0,1187899,17122,1,10,59,18,19.555555,5.9,4,8,14.0,0.715909,24,4,13880,9377.0,0.675576
1,1187899,196,1,10,59,18,19.555555,5.9,4,8,14.0,0.715909,77,7,35791,27791.0,0.77648
2,1187899,26405,1,10,59,18,19.555555,5.9,4,8,14.0,0.715909,54,17,1214,536.0,0.441516
3,1187899,13032,1,10,59,18,19.555555,5.9,4,8,14.0,0.715909,121,14,3751,2465.0,0.657158
4,1187899,39657,1,10,59,18,19.555555,5.9,4,8,14.0,0.715909,45,19,5019,3846.0,0.766288


## Load other pre-calculated user features and product embedding features

In [12]:
# all user feature(others) 
all_users_features_df = pd.read_pickle("../data/processed/cleaned_all_users_features.pickle")

In [14]:
cols = all_users_features_df.columns
dow_cols = cols[cols.str.startswith('dow_')].tolist() + cols[cols.str.startswith('daytime_')].tolist()
most_cols = cols[cols.str.startswith('most_')].tolist()
top_cols = cols[cols.str.startswith('top')].tolist()
emb_cols = cols[cols.str.startswith('emb_')].tolist()

In [15]:
print("join with the user features")
to_join = ["user_id", 'user_avg_reordered', 'user_perc_reordered'] + most_cols + dow_cols + emb_cols + top_cols
df_train = pd.merge(df_train, all_users_features_df[to_join], on ="user_id")

join with the user features


In [16]:
gc.collect()

21

In [17]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8474661 entries, 0 to 8474660
Columns: 137 entries, order_id to top3_reordered_pid
dtypes: float32(108), int16(3), int32(7), int8(2), uint8(17)
memory usage: 3.9 GB


In [15]:
#product_emd = gensim.models.Word2Vec.load("../data/interim/product2vec.model")

In [16]:
#product_emd_dict = {k: product_emd[k]  for k in product_emd.wv.vocab.keys()}

In [17]:
# product_emd_df = np.round(pd.DataFrame.from_dict(product_emd_dict, orient='index', dtype = np.float32),2).\
#                     add_prefix('prod2vec_').reset_index().\
#                     rename(columns = {'index': 'product_id'})

In [23]:
product_emd_df.head()

Unnamed: 0,product_id,prod2vec_0,prod2vec_1,prod2vec_2,prod2vec_3,prod2vec_4,prod2vec_5,prod2vec_6,prod2vec_7,prod2vec_8,...,prod2vec_90,prod2vec_91,prod2vec_92,prod2vec_93,prod2vec_94,prod2vec_95,prod2vec_96,prod2vec_97,prod2vec_98,prod2vec_99
0,13357,-0.0,-0.22,0.19,-0.32,0.16,-0.17,-0.24,0.11,0.19,...,0.13,-0.13,-0.27,0.15,-0.43,0.07,-0.15,0.28,0.02,0.12
1,11542,0.12,0.29,-0.02,-0.07,-0.12,-0.02,-0.22,0.17,0.09,...,-0.21,0.07,-0.19,-0.02,-0.18,0.2,-0.13,-0.02,-0.26,0.18
2,11543,0.77,-0.3,0.9,-0.74,0.47,0.93,0.88,-0.38,0.26,...,-1.75,0.18,-1.21,-1.28,-0.6,0.36,1.37,1.58,-1.18,0.82
3,11540,0.77,-0.38,-0.62,-0.8,0.26,-1.03,-0.77,0.5,-0.51,...,-0.47,-0.02,-0.26,1.6,0.46,0.93,-0.12,-0.64,0.43,-0.51
4,11541,-0.23,-0.08,0.2,-0.07,0.04,-0.36,0.01,0.34,-0.0,...,-0.15,-0.08,0.02,0.37,0.18,-0.37,-0.41,-0.01,0.07,0.38


In [22]:
#prod2vec_cols = product_emd_df.columns[product_emd_df.columns.str.startswith('prod2vec')]

In [23]:
# for col in prod2vec_cols:
#     product_emd_df[col] = product_emd_df[col].astype('float32')

In [32]:
#product_emd_df.to_pickle("../data/interim/prod2vec_df.pickle")
product_emd_df = pd.read_pickle("../data/interim/prod2vec_df.pickle")

In [33]:
product_emd_df.head()

Unnamed: 0,product_id,prod2vec_0,prod2vec_1,prod2vec_2,prod2vec_3,prod2vec_4,prod2vec_5,prod2vec_6,prod2vec_7,prod2vec_8,...,prod2vec_90,prod2vec_91,prod2vec_92,prod2vec_93,prod2vec_94,prod2vec_95,prod2vec_96,prod2vec_97,prod2vec_98,prod2vec_99
0,13357,-0.0,-0.22,0.19,-0.32,0.16,-0.17,-0.24,0.11,0.19,...,0.13,-0.13,-0.27,0.15,-0.43,0.07,-0.15,0.28,0.02,0.12
1,11542,0.12,0.29,-0.02,-0.07,-0.12,-0.02,-0.22,0.17,0.09,...,-0.21,0.07,-0.19,-0.02,-0.18,0.2,-0.13,-0.02,-0.26,0.18
2,11543,0.77,-0.3,0.9,-0.74,0.47,0.93,0.88,-0.38,0.26,...,-1.75,0.18,-1.21,-1.28,-0.6,0.36,1.37,1.58,-1.18,0.82
3,11540,0.77,-0.38,-0.62,-0.8,0.26,-1.03,-0.77,0.5,-0.51,...,-0.47,-0.02,-0.26,1.6,0.46,0.93,-0.12,-0.64,0.43,-0.51
4,11541,-0.23,-0.08,0.2,-0.07,0.04,-0.36,0.01,0.34,-0.0,...,-0.15,-0.08,0.02,0.37,0.18,-0.37,-0.41,-0.01,0.07,0.38


In [39]:
product_emd_df['product_id'] = product_emd_df['product_id'].astype('int32')

In [42]:
print("joint product embedding")
df_train = pd.merge(df_train, product_emd_df, on = "product_id", how = "left")

joint product embedding


In [40]:
#df_train.drop(prod2vec_cols, axis = 1, inplace=True)

In [44]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8474661 entries, 0 to 8474660
Columns: 134 entries, order_id to prod2vec_99
dtypes: float32(108), int16(3), int32(3), int8(2), object(1), uint8(17)
memory usage: 3.8+ GB


In [18]:
del all_users_features_df#, product_emd_df
gc.collect()

24

In [47]:
df_train[prod2vec_cols] = df_train[prod2vec_cols].fillna(df_train[prod2vec_cols].mean())

In [48]:
df_train.shape

(8474661, 134)

## Train LightGBM

### with less features

In [19]:
features = df_train.columns
dow_cols = features[features.str.startswith("dow_")].tolist()
daytime_cols = features[features.str.startswith("daytime_")].tolist()
emb_cols = features[features.str.startswith("emb_")].tolist()
#most_cols = features[features.str.startswith("most_")].tolist()
#top_cols = features[features.str.startswith("top_")].tolist()
#prod2vec_cols = features[features.str.startswith("prod2vec")].tolist()
f_to_use = ['user_total_orders', 'user_total_items', 'total_distinct_items', 'dow',
       'user_average_days_between_orders', 'user_average_basket',
       'order_hour_of_day', 'days_since_prior_order', 'days_since_ratio',
       'aisle_id', 'department_id', 'product_orders', 'product_reorders',
       'product_reorder_rate'] + dow_cols + daytime_cols  + emb_cols 

In [26]:
gc.collect()

7

In [29]:
print("split the train and validation set")
X_train, X_valid, y_train, y_valid = train_test_split(df_train[f_to_use], labels, test_size = 0.3, random_state=2017)

split the train and validation set


In [30]:
X_train = pd.DataFrame(X_train, columns = f_to_use)
X_valid = pd.DataFrame(X_valid, columns = f_to_use)

In [31]:
print('formating training and validation dataset for lgb')
d_train = lgb.Dataset(X_train,
                      label=y_train,
                      categorical_feature=['aisle_id', 'department_id'])  
d_valid = lgb.Dataset(X_valid,
                      label=y_valid,
                      categorical_feature=['aisle_id', 'department_id'])  

formating training and validation dataset for lgb


In [32]:
#del df_train
gc.collect()

19

In [33]:
params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': {'binary_logloss'},
    'num_leaves': 80,
    'max_depth': 10,
    'feature_fraction': 0.85,
    'bagging_fraction': 0.9,
    'bagging_freq': 8
}
ROUNDS = 300

In [34]:
print('Training light GBM ...')
bst = lgb.train(params, d_train, valid_sets= [d_valid], num_boost_round= ROUNDS, early_stopping_rounds = 10)

Training light GBM ...
[1]	valid_0's binary_logloss: 0.629905
Train until valid scores didn't improve in 10 rounds.
[2]	valid_0's binary_logloss: 0.578155
[3]	valid_0's binary_logloss: 0.535303
[4]	valid_0's binary_logloss: 0.499919
[5]	valid_0's binary_logloss: 0.469775
[6]	valid_0's binary_logloss: 0.444261
[7]	valid_0's binary_logloss: 0.422744
[8]	valid_0's binary_logloss: 0.404203
[9]	valid_0's binary_logloss: 0.388396
[10]	valid_0's binary_logloss: 0.374922
[11]	valid_0's binary_logloss: 0.363287
[12]	valid_0's binary_logloss: 0.353133
[13]	valid_0's binary_logloss: 0.344392
[14]	valid_0's binary_logloss: 0.33689
[15]	valid_0's binary_logloss: 0.33057
[16]	valid_0's binary_logloss: 0.325304
[17]	valid_0's binary_logloss: 0.320519
[18]	valid_0's binary_logloss: 0.316307
[19]	valid_0's binary_logloss: 0.312849
[20]	valid_0's binary_logloss: 0.309789
[21]	valid_0's binary_logloss: 0.307106
[22]	valid_0's binary_logloss: 0.304708
[23]	valid_0's binary_logloss: 0.302734
[24]	valid_0's

In [35]:
bst.save_model('../models/lightGBM_80_10.txt', num_iteration=bst.best_iteration)

## add more features related to users purchase patterns

In [20]:
most_cols = features[features.str.startswith("most_")].tolist()
top_cols = features[features.str.startswith("top")].tolist()
f_to_use = ['user_total_orders', 'user_total_items', 'total_distinct_items', 'dow',
       'user_average_days_between_orders', 'user_average_basket',
       'order_hour_of_day', 'days_since_prior_order', 'days_since_ratio',
       'aisle_id', 'department_id', 'product_orders', 'product_reorders',
       'product_reorder_rate'] + dow_cols + daytime_cols  + emb_cols + most_cols + top_cols
print(f_to_use)

['user_total_orders', 'user_total_items', 'total_distinct_items', 'dow', 'user_average_days_between_orders', 'user_average_basket', 'order_hour_of_day', 'days_since_prior_order', 'days_since_ratio', 'aisle_id', 'department_id', 'product_orders', 'product_reorders', 'product_reorder_rate', u'dow_0', u'dow_1', u'dow_2', u'dow_3', u'dow_4', u'dow_5', u'dow_6', u'daytime_sleeping', u'daytime_morning', u'daytime_noon', u'daytime_afternoon', u'daytime_evening', u'daytime_night', u'emb_0', u'emb_1', u'emb_2', u'emb_3', u'emb_4', u'emb_5', u'emb_6', u'emb_7', u'emb_8', u'emb_9', u'emb_10', u'emb_11', u'emb_12', u'emb_13', u'emb_14', u'emb_15', u'emb_16', u'emb_17', u'emb_18', u'emb_19', u'emb_20', u'emb_21', u'emb_22', u'emb_23', u'emb_24', u'emb_25', u'emb_26', u'emb_27', u'emb_28', u'emb_29', u'emb_30', u'emb_31', u'emb_32', u'emb_33', u'emb_34', u'emb_35', u'emb_36', u'emb_37', u'emb_38', u'emb_39', u'emb_40', u'emb_41', u'emb_42', u'emb_43', u'emb_44', u'emb_45', u'emb_46', u'emb_47', u'em

In [37]:
del d_train, d_valid
gc.collect()

63

In [21]:
print("split the train and validation set")
X_train, X_valid, y_train, y_valid = train_test_split(df_train[f_to_use], labels, test_size = 0.3, random_state=2017)

split the train and validation set


In [22]:
X_train = pd.DataFrame(X_train, columns = f_to_use)
X_valid = pd.DataFrame(X_valid, columns = f_to_use)

In [23]:
print('formating training and validation dataset for lgb')
d_train = lgb.Dataset(X_train,
                      label=y_train,
                      categorical_feature=['aisle_id', 'department_id', 'most_reordered_aiesle',
                                           'most_reordered_dpmt','top1_reordered_pid', 'top2_reordered_pid', 
                                           'top3_reordered_pid' ])  
d_valid = lgb.Dataset(X_valid,
                      label=y_valid,
                      categorical_feature=['aisle_id', 'department_id', 'most_reordered_aiesle', 'most_reordered_dpmt',
                                           'top1_reordered_pid', 'top2_reordered_pid', 
                                           'top3_reordered_pid' ])  

formating training and validation dataset for lgb


In [24]:
params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': {'binary_logloss'},
    'num_leaves': 80,
    'max_depth': 10,
    'feature_fraction': 0.85,
    'bagging_fraction': 0.9,
    'bagging_freq': 8
}
ROUNDS = 500

In [25]:
print('Training light GBM ...')
bst = lgb.train(params, d_train, valid_sets= [d_valid], num_boost_round= ROUNDS, early_stopping_rounds = 10)

Training light GBM ...
[1]	valid_0's binary_logloss: 0.629905
Train until valid scores didn't improve in 10 rounds.
[2]	valid_0's binary_logloss: 0.57822
[3]	valid_0's binary_logloss: 0.535383
[4]	valid_0's binary_logloss: 0.499574
[5]	valid_0's binary_logloss: 0.469468
[6]	valid_0's binary_logloss: 0.443913
[7]	valid_0's binary_logloss: 0.422222
[8]	valid_0's binary_logloss: 0.403857
[9]	valid_0's binary_logloss: 0.387963
[10]	valid_0's binary_logloss: 0.374341
[11]	valid_0's binary_logloss: 0.362801
[12]	valid_0's binary_logloss: 0.35273
[13]	valid_0's binary_logloss: 0.344388
[14]	valid_0's binary_logloss: 0.336879
[15]	valid_0's binary_logloss: 0.3306
[16]	valid_0's binary_logloss: 0.324991
[17]	valid_0's binary_logloss: 0.320155
[18]	valid_0's binary_logloss: 0.316072
[19]	valid_0's binary_logloss: 0.31262
[20]	valid_0's binary_logloss: 0.309473
[21]	valid_0's binary_logloss: 0.306745
[22]	valid_0's binary_logloss: 0.304406
[23]	valid_0's binary_logloss: 0.302421
[24]	valid_0's bi

In [26]:
bst.save_model('../models/lightGBM_morefeatures_80_10.txt', num_iteration=bst.best_iteration)

It seems it doesn't add much more values.

## Use specific product vector features

In [54]:
#df_train.drop(emb_cols, axis = 1, inplace = True)
df_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8474661 entries, 0 to 8474660
Columns: 134 entries, order_id to prod2vec_99
dtypes: float32(108), int16(3), int32(3), int8(2), object(1), uint8(17)
memory usage: 3.8+ GB


In [55]:
gc.collect()

7

In [53]:
prod2vec_cols = features[features.str.startswith("prod2vec")].tolist()
f_to_use = ['user_total_orders', 'user_total_items', 'total_distinct_items', 'dow',
       'user_average_days_between_orders', 'user_average_basket',
       'order_hour_of_day', 'days_since_prior_order', 'days_since_ratio',
       'aisle_id', 'department_id', 'product_orders', 'product_reorders',
       'product_reorder_rate'] + dow_cols + daytime_cols + prod2vec_cols
print(f_to_use)

['user_total_orders', 'user_total_items', 'total_distinct_items', 'dow', 'user_average_days_between_orders', 'user_average_basket', 'order_hour_of_day', 'days_since_prior_order', 'days_since_ratio', 'aisle_id', 'department_id', 'product_orders', 'product_reorders', 'product_reorder_rate', u'dow_0', u'dow_1', u'dow_2', u'dow_3', u'dow_4', u'dow_5', u'dow_6', u'daytime_sleeping', u'daytime_morning', u'daytime_noon', u'daytime_afternoon', u'daytime_evening', u'daytime_night', 'prod2vec_0', 'prod2vec_1', 'prod2vec_2', 'prod2vec_3', 'prod2vec_4', 'prod2vec_5', 'prod2vec_6', 'prod2vec_7', 'prod2vec_8', 'prod2vec_9', 'prod2vec_10', 'prod2vec_11', 'prod2vec_12', 'prod2vec_13', 'prod2vec_14', 'prod2vec_15', 'prod2vec_16', 'prod2vec_17', 'prod2vec_18', 'prod2vec_19', 'prod2vec_20', 'prod2vec_21', 'prod2vec_22', 'prod2vec_23', 'prod2vec_24', 'prod2vec_25', 'prod2vec_26', 'prod2vec_27', 'prod2vec_28', 'prod2vec_29', 'prod2vec_30', 'prod2vec_31', 'prod2vec_32', 'prod2vec_33', 'prod2vec_34', 'prod2v

In [56]:
print("split the train and validation set")
X_train, X_valid, y_train, y_valid = train_test_split(df_train[f_to_use], labels, test_size = 0.3, random_state=2017)

split the train and validation set


In [58]:
X_train = pd.DataFrame(X_train, columns = f_to_use)
X_valid = pd.DataFrame(X_valid, columns = f_to_use)

In [59]:
print('formating training and validation dataset for lgb')
d_train = lgb.Dataset(X_train,
                      label=y_train,
                      categorical_feature=['aisle_id', 'department_id'])  
d_valid = lgb.Dataset(X_valid,
                      label=y_valid,
                      categorical_feature=['aisle_id', 'department_id'])  

formating training and validation dataset for lgb


In [60]:
params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': {'binary_logloss'},
    'num_leaves': 100,
    'max_depth': 12,
    'feature_fraction': 0.85,
    'bagging_fraction': 0.9,
    'bagging_freq': 8
}
ROUNDS = 500

In [61]:
gc.collect()

19

In [62]:
print('Training light GBM ...')
bst_all = lgb.train(params, d_train, valid_sets= [d_valid], num_boost_round= ROUNDS, early_stopping_rounds = 10)

Training light GBM ...
[1]	valid_0's binary_logloss: 0.629861
Train until valid scores didn't improve in 10 rounds.
[2]	valid_0's binary_logloss: 0.578073
[3]	valid_0's binary_logloss: 0.535196
[4]	valid_0's binary_logloss: 0.499764
[5]	valid_0's binary_logloss: 0.469615
[6]	valid_0's binary_logloss: 0.44406
[7]	valid_0's binary_logloss: 0.422511
[8]	valid_0's binary_logloss: 0.403941
[9]	valid_0's binary_logloss: 0.388121
[10]	valid_0's binary_logloss: 0.374644
[11]	valid_0's binary_logloss: 0.363028
[12]	valid_0's binary_logloss: 0.352877
[13]	valid_0's binary_logloss: 0.34415
[14]	valid_0's binary_logloss: 0.336623
[15]	valid_0's binary_logloss: 0.33029
[16]	valid_0's binary_logloss: 0.324971
[17]	valid_0's binary_logloss: 0.320195
[18]	valid_0's binary_logloss: 0.316005
[19]	valid_0's binary_logloss: 0.312523
[20]	valid_0's binary_logloss: 0.309494
[21]	valid_0's binary_logloss: 0.306809
[22]	valid_0's binary_logloss: 0.304419
[23]	valid_0's binary_logloss: 0.302441
[24]	valid_0's 

In [63]:
bst_all.save_model('../models/lightGBM_prodfeats_100_12.txt', num_iteration=bst_all.best_iteration)

## Choose the second model for predictions

In [36]:
### build candidates list for test ###
df_test, _ = features(test_orders)

build candidate list
('order row', 10000)
('order row', 20000)
('order row', 30000)
('order row', 40000)
('order row', 50000)
('order row', 60000)
('order row', 70000)
user related features
order related features
product related features
order_id                              int32
product_id                            int32
user_id                               int32
user_total_orders                     int16
user_total_items                      int16
total_distinct_items                  int16
user_average_days_between_orders    float32
user_average_basket                 float32
dow                                    int8
order_hour_of_day                      int8
days_since_prior_order              float32
days_since_ratio                    float32
aisle_id                              uint8
department_id                         uint8
product_orders                        int32
product_reorders                    float32
product_reorder_rate                float32
dtype: object


In [38]:
# all user feature(others) 
all_users_features_df = pd.read_pickle("../data/processed/cleaned_all_users_features.pickle")
cols = all_users_features_df.columns
dow_cols = cols[cols.str.startswith('dow_')].tolist() + cols[cols.str.startswith('daytime_')].tolist()
most_cols = cols[cols.str.startswith('most_')].tolist()
top_cols = cols[cols.str.startswith('top')].tolist()
emb_cols = cols[cols.str.startswith('emb_')].tolist()
print("join with the user features")
to_join = ["user_id", 'user_avg_reordered', 'user_perc_reordered'] + most_cols + dow_cols + emb_cols + top_cols
df_test = pd.merge(df_test, all_users_features_df[to_join], on ="user_id")

join with the user features


In [41]:
df_test.shape

(4833292, 137)

In [42]:
del all_users_features_df, test_orders
gc.collect()

262

In [43]:
len(f_to_use)

132

In [45]:
# load the model
bst_best = lgb.Booster(model_file='../models/lightGBM_morefeatures_80_10.txt')

In [47]:
print('light GBM predict')
preds = bst_best.predict(df_test[f_to_use])

light GBM predict


In [48]:
df_test['pred'] = preds

In [50]:
def generate_submission(df_test, test_orders_ids, file_name, threshold = 0.2, single_thres = True):
    """function to generate label predictions submission format"""
    if single_thres:
        TRESHOLD = threshold

        d = dict()
        for row in df_test.itertuples():
            if row.pred > TRESHOLD:
                try:
                    d[row.order_id] += ' ' + str(row.product_id)
                except:
                    d[row.order_id] = str(row.product_id)

        for order in test_orders_ids:
            if order not in d:
                d[order] = 'None'

        sub = pd.DataFrame.from_dict(d, orient='index')
        sub.reset_index(inplace=True)
        sub.columns = ['order_id', 'products']
        sub.to_csv(file_name, index=False)
    else:
        pass

In [52]:
test_order_ids = orders[orders.eval_set == 'test'].order_id

In [53]:
len(test_order_ids)

75000

In [59]:
df_test[['order_id', 'pred', 'product_id']].to_csv("../data/processed/lightGBM_morefeatures_prob_preds.csv",index=False)

In [60]:
generate_submission(df_test, test_orders_ids, '../models/lightGBM_morefeatures_preds_20%thr.csv', threshold = 0.2)

NameError: name 'test_orders_ids' is not defined