###importing the necessary packages.

In [10]:
import gc
import numpy as np
import pandas as pd
import xgboost as xgb
import lightgbm as lgb
from scipy import stats
from sklearn import preprocessing
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import f1_score, classification_report
from sklearn.model_selection import train_test_split, GridSearchCV, KFold

###Reading the CSV file

In [11]:
aisles=pd.read_csv('aisles.csv')
order_products__train=pd.read_csv('order_products__train.csv')
order_products_prior=pd.read_csv('order_products__prior.csv')
orders=pd.read_csv('orders.csv')
departments=pd.read_csv('departments.csv')
products=pd.read_csv('products.csv')
submission=pd.read_csv('sample_submission.csv')

###merging orders and prior orders datasets

In [12]:
prior_orders=pd.merge(orders,order_products_prior,on='order_id',how='inner')
prior_orders.head()

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered
0,473747,1,prior,3,3,12,21.0,196,1,1.0
1,473747,1,prior,3,3,12,21.0,12427,2,1.0
2,473747,1,prior,3,3,12,21.0,10258,3,1.0
3,473747,1,prior,3,3,12,21.0,25133,4,0.0
4,473747,1,prior,3,3,12,21.0,30450,5,0.0


In [13]:
del order_products_prior
gc.collect()

552

##Users Feature

Number of orders placed by each user.Taking the maximum of the order numbers placed by each user eg: user_id =1 placed the maximum order 7 

In [14]:
users = prior_orders.groupby(by='user_id')['order_number'].aggregate('max').to_frame('u_num_of_orders').reset_index()
users.head()

Unnamed: 0,user_id,u_num_of_orders
0,1,7
1,3,11
2,4,4
3,5,4
4,6,2


Average number of products bought in each orders.

In [15]:
#First getting the total number of products in each order.
total_products_per_order=prior_orders.groupby(by=['user_id','order_id'])['product_id'].aggregate('count').to_frame('total_products_per_order').reset_index()

# Getting the average products purchased by each user
avg_products=total_products_per_order.groupby(by=['user_id'])['total_products_per_order'].mean().to_frame('u_avg_prd').reset_index()
avg_products.head()

Unnamed: 0,user_id,u_avg_prd
0,1,6.0
1,3,6.666667
2,4,2.0
3,5,12.0
4,6,7.0


In [16]:
del total_products_per_order
gc.collect()

62

In [17]:
#day of the week user orders the most
dow=prior_orders.groupby(by=['user_id'])['order_dow'].aggregate(lambda x:stats.mode(x)[0]).to_frame('u_orders_the_most').reset_index()
dow.head()

Unnamed: 0,user_id,u_orders_the_most
0,1,4
1,3,3
2,4,5
3,5,1
4,6,4


In [18]:
#Hour of the day user has ordered the most
hod=prior_orders.groupby(by=['user_id'])['order_hour_of_day'].aggregate(lambda x:stats.mode(x)[0]).to_frame('hod_u_most_orders').reset_index()
hod.head()

Unnamed: 0,user_id,hod_u_most_orders
0,1,15
1,3,19
2,4,13
3,5,18
4,6,16


In [19]:
#Reordered ratio of each user.
reorder_u=prior_orders.groupby(by='user_id')['reordered'].aggregate('mean').to_frame('u_reorder_ratio').reset_index()
reorder_u.head()

#changing the dtype
reorder_u['u_reorder_ratio']=reorder_u['u_reorder_ratio'].astype(np.float16)
reorder_u.head()

Unnamed: 0,user_id,u_reorder_ratio
0,1,0.722168
1,3,0.649902
2,4,0.0
3,5,0.666504
4,6,0.285645


In [20]:
#imputing the NAN values with 0
prior_orders.days_since_prior_order.fillna(0,inplace=True)

#Average days since prior order.
avg_days=prior_orders.groupby(by='user_id')['days_since_prior_order'].aggregate('mean').to_frame('average_days_between_orders').reset_index()
avg_days.head()

Unnamed: 0,user_id,average_days_between_orders
0,1,23.833333
1,3,11.9
2,4,15.0
3,5,19.0
4,6,6.0


In [21]:
#Total items bought.
total_item = prior_orders.groupby(by='user_id').size().to_frame('u_total_items_bought').astype(np.int16)
total_item.head()

Unnamed: 0_level_0,u_total_items_bought
user_id,Unnamed: 1_level_1
1,18
3,20
4,2
5,12
6,7


Merging all the created features into the users dataset

In [22]:
users=users.merge(avg_products,on='user_id',how='left')

users=users.merge(dow,on='user_id',how='left')

users=users.merge(hod,on='user_id',how='left')

users=users.merge(reorder_u,on='user_id',how='left')

users=users.merge(avg_days,on='user_id',how='left')

users=users.merge(total_item,on='user_id',how='left')

users.head()

Unnamed: 0,user_id,u_num_of_orders,u_avg_prd,u_orders_the_most,hod_u_most_orders,u_reorder_ratio,average_days_between_orders,u_total_items_bought
0,1,7,6.0,4,15,0.722168,23.833333,18
1,3,11,6.666667,3,19,0.649902,11.9,20
2,4,4,2.0,5,13,0.0,15.0,2
3,5,4,12.0,1,18,0.666504,19.0,12
4,6,2,7.0,4,16,0.285645,6.0,7


In [23]:
del [reorder_u,dow,hod,avg_products,avg_days,total_item]
gc.collect()

141

#Product Features

In [24]:
#Number of times the product has been purchased by the users
prd=prior_orders.groupby(by='product_id')['order_id'].aggregate('count').to_frame('p_num_of_times').reset_index()
prd.head()

Unnamed: 0,product_id,p_num_of_times
0,1,81
1,2,7
2,3,10
3,4,11
4,6,1


In [25]:
#Reorder ratio of each products. Number of times the product was reordered / number of times it was purchased.
reorder_p=prior_orders.groupby(by='product_id')['reordered'].aggregate('mean').to_frame('p_reorder_ratio').reset_index()
reorder_p.head()

Unnamed: 0,product_id,p_reorder_ratio
0,1,0.493827
1,2,0.142857
2,3,0.7
3,4,0.363636
4,6,0.0


In [26]:
#Average add to cart order for each product.
add_to_cart=prior_orders.groupby(by='product_id')['add_to_cart_order'].aggregate('mean').to_frame('p_avg_cart_position').reset_index()
add_to_cart.head()

Unnamed: 0,product_id,p_avg_cart_position
0,1,5.469136
1,2,10.0
2,3,6.8
3,4,11.181818
4,6,4.0


In [27]:
#Merging all the created product features into the prd dataset.
prd=prd.merge(reorder_p,on='product_id',how='left')
prd=prd.merge(add_to_cart,on='product_id',how='left')
prd.head()

Unnamed: 0,product_id,p_num_of_times,p_reorder_ratio,p_avg_cart_position
0,1,81,0.493827,5.469136
1,2,7,0.142857,10.0
2,3,10,0.7,6.8
3,4,11,0.363636,11.181818
4,6,1,0.0,4.0


In [28]:
del [reorder_p,add_to_cart]
gc.collect()

411

##User and product feature interaction

In [29]:
#how many times the user has bought a product
uux=prior_orders.groupby(by=['user_id','product_id'])['order_id'].aggregate('count').to_frame('uxp_times_bought').reset_index()
uux.head()

Unnamed: 0,user_id,product_id,uxp_times_bought
0,1,196,3
1,1,10258,3
2,1,10326,1
3,1,12427,3
4,1,13032,1


In [30]:
#How many times the user bought the product after its first purchase 
times=prior_orders.groupby(by=['user_id','product_id'])['order_id'].aggregate('count').to_frame('times_bought').reset_index()
times.tail(100)

Unnamed: 0,user_id,product_id,times_bought
872764,40629,8424,4
872765,40629,9407,1
872766,40629,9681,1
872767,40629,13198,2
872768,40629,13605,1
...,...,...,...
872859,40635,1540,1
872860,40635,4605,1
872861,40635,26324,1
872862,40636,10148,1


In [31]:
#total orders
total_orders=prior_orders.groupby('user_id')['order_number'].max().to_frame('total_orders').reset_index()
total_orders.head()

Unnamed: 0,user_id,total_orders
0,1,7
1,3,11
2,4,4
3,5,4
4,6,2


In [32]:
#Finding when the user has bought a product the first time.
first_order_num = prior_orders.groupby(by=['user_id', 'product_id'])['order_number'].aggregate('min').to_frame('first_order_num')

#resetting the index
first_order_num = first_order_num.reset_index()
first_order_num.head()

Unnamed: 0,user_id,product_id,first_order_num
0,1,196,3
1,1,10258,3
2,1,10326,5
3,1,12427,3
4,1,13032,7


In [33]:
#merging both the dataframes
span=pd.merge(total_orders,first_order_num,on='user_id',how='right')
span.head()

Unnamed: 0,user_id,total_orders,product_id,first_order_num
0,1,7,196,3
1,1,7,10258,3
2,1,7,10326,5
3,1,7,12427,3
4,1,7,13032,7


In [34]:
#calculating the order range
span['order_Range_D']=span.total_orders-span.first_order_num+1
span.head()

Unnamed: 0,user_id,total_orders,product_id,first_order_num,order_Range_D
0,1,7,196,3,5
1,1,7,10258,3,5
2,1,7,10326,5,3
3,1,7,12427,3,5
4,1,7,13032,7,1


In [35]:
uxp_ratio=pd.merge(times,span,on=['user_id','product_id'],how='left')
uxp_ratio.head()

Unnamed: 0,user_id,product_id,times_bought,total_orders,first_order_num,order_Range_D
0,1,196,3,7,3,5
1,1,10258,3,7,3,5
2,1,10326,1,7,5,3
3,1,12427,3,7,3,5
4,1,13032,1,7,7,1


In [36]:
uxp_ratio['uux_re_order_ratio']=uxp_ratio.times_bought/uxp_ratio.order_Range_D
uxp_ratio.head()

Unnamed: 0,user_id,product_id,times_bought,total_orders,first_order_num,order_Range_D,uux_re_order_ratio
0,1,196,3,7,3,5,0.6
1,1,10258,3,7,3,5,0.6
2,1,10326,1,7,5,3,0.333333
3,1,12427,3,7,3,5,0.6
4,1,13032,1,7,7,1,1.0


In [37]:
#dropping all the unwanted columns.
uxp_ratio.drop(['times_bought', 'total_orders', 'first_order_num', 'order_Range_D'], axis=1, inplace=True)
uxp_ratio.head()  

Unnamed: 0,user_id,product_id,uux_re_order_ratio
0,1,196,0.6
1,1,10258,0.6
2,1,10326,0.333333
3,1,12427,0.6
4,1,13032,1.0


In [38]:
#deleting all the unwanted df.
del [times, span, first_order_num, total_orders]
gc.collect()

306

In [39]:
#Merging all the created features into the uxp dataset.
uxp=uux.merge(uxp_ratio,on=['user_id','product_id'],how='left')

In [40]:
del uxp_ratio
gc.collect()

260

In [41]:
uxp.head()

Unnamed: 0,user_id,product_id,uxp_times_bought,uux_re_order_ratio
0,1,196,3,0.6
1,1,10258,3,0.6
2,1,10326,1,0.333333
3,1,12427,3,0.6
4,1,13032,1,1.0


In [42]:
#How many times a customer bought a product on its last 5 orders.
#Reversing the order number for each product.
prior_orders['order_number_back'] = prior_orders.groupby(by=['user_id'])['order_number'].transform(max) - prior_orders.order_number + 1
prior_orders.head()

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,order_number_back
0,473747,1,prior,3,3,12,21.0,196,1,1.0,5
1,473747,1,prior,3,3,12,21.0,12427,2,1.0,5
2,473747,1,prior,3,3,12,21.0,10258,3,1.0,5
3,473747,1,prior,3,3,12,21.0,25133,4,0.0,5
4,473747,1,prior,3,3,12,21.0,30450,5,0.0,5


In [43]:
#keeping only the first 5 orders from the order_number_back.
temp = prior_orders.loc[prior_orders.order_number_back <= 5]
temp.head()

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,order_number_back
0,473747,1,prior,3,3,12,21.0,196,1,1.0,5
1,473747,1,prior,3,3,12,21.0,12427,2,1.0,5
2,473747,1,prior,3,3,12,21.0,10258,3,1.0,5
3,473747,1,prior,3,3,12,21.0,25133,4,0.0,5
4,473747,1,prior,3,3,12,21.0,30450,5,0.0,5


In [44]:
#product bought by users in the last_five orders.
last_five = temp.groupby(by=['user_id', 'product_id'])['order_id'].aggregate('count').to_frame('uxp_last_five').reset_index()
last_five.head()

Unnamed: 0,user_id,product_id,uxp_last_five
0,1,196,3
1,1,10258,3
2,1,10326,1
3,1,12427,3
4,1,13032,1


In [45]:
#ratio of the products bought in the last_five orders.
last_five['uxp_ratio_last_five'] = last_five.uxp_last_five / 5.0

# #changing the dtype.
last_five['uxp_ratio_last_five'] = last_five['uxp_ratio_last_five'].astype(np.float16)
last_five.head()

Unnamed: 0,user_id,product_id,uxp_last_five,uxp_ratio_last_five
0,1,196,3,0.600098
1,1,10258,3,0.600098
2,1,10326,1,0.199951
3,1,12427,3,0.600098
4,1,13032,1,0.199951


In [46]:
#merging this feature with uxp df.
uxp = uux.merge(last_five, on=['user_id', 'product_id'], how='left')

del [last_five, temp]
gc.collect()
uxp.head()

Unnamed: 0,user_id,product_id,uxp_times_bought,uxp_last_five,uxp_ratio_last_five
0,1,196,3,3.0,0.600098
1,1,10258,3,3.0,0.600098
2,1,10326,1,1.0,0.199951
3,1,12427,3,3.0,0.600098
4,1,13032,1,1.0,0.199951


In [47]:
#filling the NAN values with 0.
uxp.fillna(0, inplace=True)
uxp.head(10)

Unnamed: 0,user_id,product_id,uxp_times_bought,uxp_last_five,uxp_ratio_last_five
0,1,196,3,3.0,0.600098
1,1,10258,3,3.0,0.600098
2,1,10326,1,1.0,0.199951
3,1,12427,3,3.0,0.600098
4,1,13032,1,1.0,0.199951
5,1,13176,1,1.0,0.199951
6,1,17122,1,1.0,0.199951
7,1,25133,3,3.0,0.600098
8,1,30450,1,1.0,0.199951
9,1,41787,1,1.0,0.199951


In [48]:
#Merging users ,prd and uxp dataframes
#Merge uxp features with the user features
#Store the results on a new DataFrame

data = uxp.merge(users, on='user_id', how='left')
data.head()

Unnamed: 0,user_id,product_id,uxp_times_bought,uxp_last_five,uxp_ratio_last_five,u_num_of_orders,u_avg_prd,u_orders_the_most,hod_u_most_orders,u_reorder_ratio,average_days_between_orders,u_total_items_bought
0,1,196,3,3.0,0.600098,7,6.0,4,15,0.722168,23.833333,18
1,1,10258,3,3.0,0.600098,7,6.0,4,15,0.722168,23.833333,18
2,1,10326,1,1.0,0.199951,7,6.0,4,15,0.722168,23.833333,18
3,1,12427,3,3.0,0.600098,7,6.0,4,15,0.722168,23.833333,18
4,1,13032,1,1.0,0.199951,7,6.0,4,15,0.722168,23.833333,18


##Creating Training and Testing data

In [49]:
orders_future=orders.loc[((orders.eval_set=='train')|(orders.eval_set=='test')),['user_id','eval_set','order_id']]
orders_future.head()

Unnamed: 0,user_id,eval_set,order_id
10,1,train,1187899
25,2,train,1492625
38,3,test,2774568
44,4,test,329954
49,5,train,2196797


In [50]:
data=data.merge(orders_future,on='user_id',how='left')
data.head()

Unnamed: 0,user_id,product_id,uxp_times_bought,uxp_last_five,uxp_ratio_last_five,u_num_of_orders,u_avg_prd,u_orders_the_most,hod_u_most_orders,u_reorder_ratio,average_days_between_orders,u_total_items_bought,eval_set,order_id
0,1,196,3,3.0,0.600098,7,6.0,4,15,0.722168,23.833333,18,train,1187899
1,1,10258,3,3.0,0.600098,7,6.0,4,15,0.722168,23.833333,18,train,1187899
2,1,10326,1,1.0,0.199951,7,6.0,4,15,0.722168,23.833333,18,train,1187899
3,1,12427,3,3.0,0.600098,7,6.0,4,15,0.722168,23.833333,18,train,1187899
4,1,13032,1,1.0,0.199951,7,6.0,4,15,0.722168,23.833333,18,train,1187899


In [51]:
#preparaing the training data
data_train=data[data.eval_set=='train']
data_train.head()

Unnamed: 0,user_id,product_id,uxp_times_bought,uxp_last_five,uxp_ratio_last_five,u_num_of_orders,u_avg_prd,u_orders_the_most,hod_u_most_orders,u_reorder_ratio,average_days_between_orders,u_total_items_bought,eval_set,order_id
0,1,196,3,3.0,0.600098,7,6.0,4,15,0.722168,23.833333,18,train,1187899
1,1,10258,3,3.0,0.600098,7,6.0,4,15,0.722168,23.833333,18,train,1187899
2,1,10326,1,1.0,0.199951,7,6.0,4,15,0.722168,23.833333,18,train,1187899
3,1,12427,3,3.0,0.600098,7,6.0,4,15,0.722168,23.833333,18,train,1187899
4,1,13032,1,1.0,0.199951,7,6.0,4,15,0.722168,23.833333,18,train,1187899


In [52]:
data_train=data_train.merge(order_products__train[['product_id','order_id','reordered']],on=['product_id','order_id'],how='left')
data_train.head()

Unnamed: 0,user_id,product_id,uxp_times_bought,uxp_last_five,uxp_ratio_last_five,u_num_of_orders,u_avg_prd,u_orders_the_most,hod_u_most_orders,u_reorder_ratio,average_days_between_orders,u_total_items_bought,eval_set,order_id,reordered
0,1,196,3,3.0,0.600098,7,6.0,4,15,0.722168,23.833333,18,train,1187899,1.0
1,1,10258,3,3.0,0.600098,7,6.0,4,15,0.722168,23.833333,18,train,1187899,1.0
2,1,10326,1,1.0,0.199951,7,6.0,4,15,0.722168,23.833333,18,train,1187899,
3,1,12427,3,3.0,0.600098,7,6.0,4,15,0.722168,23.833333,18,train,1187899,
4,1,13032,1,1.0,0.199951,7,6.0,4,15,0.722168,23.833333,18,train,1187899,1.0


In [53]:
#filling the NAN values in the reordered
data_train.reordered.fillna(0, inplace=True)

In [54]:
#deleting eval_set, order_id as they are not needed for training.
data_train.drop(['eval_set', 'order_id'], axis=1, inplace=True)

In [55]:
data_train.head()

Unnamed: 0,user_id,product_id,uxp_times_bought,uxp_last_five,uxp_ratio_last_five,u_num_of_orders,u_avg_prd,u_orders_the_most,hod_u_most_orders,u_reorder_ratio,average_days_between_orders,u_total_items_bought,reordered
0,1,196,3,3.0,0.600098,7,6.0,4,15,0.722168,23.833333,18,1.0
1,1,10258,3,3.0,0.600098,7,6.0,4,15,0.722168,23.833333,18,1.0
2,1,10326,1,1.0,0.199951,7,6.0,4,15,0.722168,23.833333,18,0.0
3,1,12427,3,3.0,0.600098,7,6.0,4,15,0.722168,23.833333,18,0.0
4,1,13032,1,1.0,0.199951,7,6.0,4,15,0.722168,23.833333,18,1.0


#preparing the test data

In [56]:
data_test=data[data.eval_set=='test']
data_test.head()

data_test.shape

(316473, 14)

In [57]:
data_test.head()

Unnamed: 0,user_id,product_id,uxp_times_bought,uxp_last_five,uxp_ratio_last_five,u_num_of_orders,u_avg_prd,u_orders_the_most,hod_u_most_orders,u_reorder_ratio,average_days_between_orders,u_total_items_bought,eval_set,order_id
10,3,248,1,0.0,0.0,11,6.666667,3,19,0.649902,11.9,20,test,2774568
11,3,1005,1,1.0,0.199951,11,6.666667,3,19,0.649902,11.9,20,test,2774568
12,3,8021,1,0.0,0.0,11,6.666667,3,19,0.649902,11.9,20,test,2774568
13,3,17668,3,2.0,0.399902,11,6.666667,3,19,0.649902,11.9,20,test,2774568
14,3,18599,1,1.0,0.199951,11,6.666667,3,19,0.649902,11.9,20,test,2774568


In [58]:
#deleting eval_set, order_id as they are not needed for testing.
data_test.drop(['eval_set', 'order_id'], axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [59]:
data_test.head()

Unnamed: 0,user_id,product_id,uxp_times_bought,uxp_last_five,uxp_ratio_last_five,u_num_of_orders,u_avg_prd,u_orders_the_most,hod_u_most_orders,u_reorder_ratio,average_days_between_orders,u_total_items_bought
10,3,248,1,0.0,0.0,11,6.666667,3,19,0.649902,11.9,20
11,3,1005,1,1.0,0.199951,11,6.666667,3,19,0.649902,11.9,20
12,3,8021,1,0.0,0.0,11,6.666667,3,19,0.649902,11.9,20
13,3,17668,3,2.0,0.399902,11,6.666667,3,19,0.649902,11.9,20
14,3,18599,1,1.0,0.199951,11,6.666667,3,19,0.649902,11.9,20


In [60]:
#shape of train and test.
data_train.shape, data_test.shape

((556391, 13), (316473, 12))

In [61]:
#merging product data into data_train and data_test.
data_train = data_train.merge(products, on='product_id', how='left')
data_test = data_test.merge(products, on='product_id', how='left')

In [62]:
#setting the index again
data_train = data_train.set_index(['user_id', 'product_id'])
data_test = data_test.set_index(['user_id', 'product_id'])

In [63]:
data_train.head(30)

Unnamed: 0_level_0,Unnamed: 1_level_0,uxp_times_bought,uxp_last_five,uxp_ratio_last_five,u_num_of_orders,u_avg_prd,u_orders_the_most,hod_u_most_orders,u_reorder_ratio,average_days_between_orders,u_total_items_bought,reordered,product_name,aisle_id,department_id
user_id,product_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
1,196,3,3.0,0.600098,7,6.0,4,15,0.722168,23.833333,18,1.0,Soda,77,7
1,10258,3,3.0,0.600098,7,6.0,4,15,0.722168,23.833333,18,1.0,Pistachios,117,19
1,10326,1,1.0,0.199951,7,6.0,4,15,0.722168,23.833333,18,0.0,Organic Fuji Apples,24,4
1,12427,3,3.0,0.600098,7,6.0,4,15,0.722168,23.833333,18,0.0,Original Beef Jerky,23,19
1,13032,1,1.0,0.199951,7,6.0,4,15,0.722168,23.833333,18,1.0,Cinnamon Toast Crunch,121,14
1,13176,1,1.0,0.199951,7,6.0,4,15,0.722168,23.833333,18,0.0,Bag of Organic Bananas,24,4
1,17122,1,1.0,0.199951,7,6.0,4,15,0.722168,23.833333,18,0.0,Honeycrisp Apples,24,4
1,25133,3,3.0,0.600098,7,6.0,4,15,0.722168,23.833333,18,1.0,Organic String Cheese,21,16
1,30450,1,1.0,0.199951,7,6.0,4,15,0.722168,23.833333,18,0.0,Creamy Almond Butter,88,13
1,41787,1,1.0,0.199951,7,6.0,4,15,0.722168,23.833333,18,0.0,Bartlett Pears,24,4


In [64]:
#mean encoding categorical variables.
columns_mean = ['aisle_id', 'department_id']
for col in columns_mean:
        mean = data_train.groupby(col).reordered.mean()
        data_train[col] = data_train[col].map(mean)
        data_test[col] = data_test[col].map(mean)

In [65]:
#deleting unwanted df and collecting garbage
del [data, orders_future, products]
gc.collect()

425

In [66]:
#Creating X and Y variables 
X=data_train.drop(['reordered','uxp_ratio_last_five'],axis=1)
y=data_train.reordered

#Splitting the data
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3,random_state=10)

In [67]:
X_train.dtypes

uxp_times_bought                 int64
uxp_last_five                  float64
u_num_of_orders                  int64
u_avg_prd                      float64
u_orders_the_most                int64
hod_u_most_orders                int64
u_reorder_ratio                float16
average_days_between_orders    float64
u_total_items_bought             int16
product_name                    object
aisle_id                       float64
department_id                  float64
dtype: object

In [68]:
lbl = preprocessing.LabelEncoder()

In [69]:
X_train['product_name'] = lbl.fit_transform(X_train['product_name'].astype(str))
X_test['product_name'] = lbl.fit_transform(X_test['product_name'].astype(str))

In [None]:
X_train.dtypes

uxp_times_bought                 int64
uxp_last_five                  float64
u_num_of_orders                  int64
u_avg_prd                      float64
u_orders_the_most                int64
hod_u_most_orders                int64
u_reorder_ratio                float16
average_days_between_orders    float64
u_total_items_bought             int16
product_name                     int64
aisle_id                       float64
department_id                  float64
dtype: object

In [70]:
y_train.dtypes

dtype('float64')

Model 1 : Logistic Regression

In [71]:
log_reg = LogisticRegression(random_state=0, n_jobs = -1)
log_reg.fit(X_train,y_train)

#setting a threshold.
y_pred = (log_reg.predict_proba(X_test)[:, 1] >= 0.21).astype('int') 

In [72]:
#Evaluation.
print('accuracy score is ',accuracy_score(y_pred,y_test)) 
print('F1 Score: {}'.format(f1_score(y_pred, y_test))) 
print('Classification Report','\n',classification_report(y_pred,y_test))     

accuracy score is  0.7528187493260163
F1 Score: 0.27970880396641123
Classification Report 
               precision    recall  f1-score   support

           0       0.81      0.89      0.85    131752
           1       0.36      0.23      0.28     35166

    accuracy                           0.75    166918
   macro avg       0.59      0.56      0.57    166918
weighted avg       0.72      0.75      0.73    166918



Model 2 : XGBoost

In [73]:
parameters = {'eval_metric' : 'logloss','max_depth' : 5,'colsample_bytree' : 0.4,'subsample' : 0.8}

In [74]:
#Initializing the model
xgb = xgb.XGBClassifier(objective='binary:logistic', parameters=parameters, num_boost_round=10)

#fitting the model.
xgb.fit(X_train, y_train)

#setting a threshold.
y_pred2 = (xgb.predict_proba(X_test)[:, 1] >= 0.21).astype('int') 

In [75]:
#Evaluation.
print('accuracy score is: ',accuracy_score(y_pred2,y_test)) 
print('F1 Score: {}'.format(f1_score(y_pred2, y_test)))
print('Classification Report:\n',classification_report(y_pred2,y_test))   

accuracy score is:  0.8078277956841083
F1 Score: 0.3904839721055732
Classification Report:
               precision    recall  f1-score   support

           0       0.86      0.91      0.89    136406
           1       0.46      0.34      0.39     30512

    accuracy                           0.81    166918
   macro avg       0.66      0.62      0.64    166918
weighted avg       0.79      0.81      0.80    166918



Model 3 : Decision Tree

In [77]:
param_grid = {}
param_grid['max_depth'] = [5,10,15,20]
param_grid['min_samples_split'] = [2,3,4,5]
dt_clf = DecisionTreeClassifier()

#Hyperparameter Tuning
r_search = RandomizedSearchCV(dt_clf, param_distributions=param_grid, cv = 5, verbose = True, n_jobs = -1)
r_search.fit(X_train, y_train)

#setting a threshold
y_pred3 = (r_search.predict_proba(X_test)[:, 1] >= 0.21).astype('int')

Fitting 5 folds for each of 10 candidates, totalling 50 fits


In [78]:
#Evaluation.
print('accuracy score is ',accuracy_score(y_pred3,y_test)) 
print('F1 Score: {}'.format(f1_score(y_pred3, y_test)))  
print('Classification Report:\n',classification_report(y_pred3,y_test))   

accuracy score is  0.835458129141255
F1 Score: 0.3512460139364592
Classification Report:
               precision    recall  f1-score   support

           0       0.91      0.90      0.91    146698
           1       0.34      0.37      0.35     20220

    accuracy                           0.84    166918
   macro avg       0.62      0.63      0.63    166918
weighted avg       0.84      0.84      0.84    166918



Model 4 : Random Forest Classifier

In [79]:
clf=RandomForestClassifier(n_estimators=25,random_state=42,n_jobs=-1)
clf.fit(X_train, y_train)
sig_clf = CalibratedClassifierCV(clf, method="sigmoid")
sig_clf.fit(X_train, y_train)

#setting a threshold
y_pred4 = (sig_clf.predict_proba(X_test)[:, 1] >= 0.21).astype('int') 

In [80]:
#Evaluation.
print('accuracy score is ',accuracy_score(y_pred4,y_test)) 
print('F1 Score: {}'.format(f1_score(y_pred4, y_test)))  
print('Classification Report:',classification_report(y_pred4,y_test))   

accuracy score is  0.8361171353598773
F1 Score: 0.3886741010570541
Classification Report:               precision    recall  f1-score   support

           0       0.90      0.91      0.91    144286
           1       0.39      0.38      0.39     22632

    accuracy                           0.84    166918
   macro avg       0.65      0.65      0.65    166918
weighted avg       0.83      0.84      0.84    166918



In [81]:
data_test.dtypes

uxp_times_bought                 int64
uxp_last_five                  float64
uxp_ratio_last_five            float16
u_num_of_orders                  int64
u_avg_prd                      float64
u_orders_the_most                int64
hod_u_most_orders                int64
u_reorder_ratio                float16
average_days_between_orders    float64
u_total_items_bought             int16
product_name                    object
aisle_id                       float64
department_id                  float64
dtype: object

In [82]:
lbl = preprocessing.LabelEncoder()
data_test['product_name'] = lbl.fit_transform(data_test['product_name'].astype(str))

In [83]:
#making prdeictions on the test dataset
y_pred_test = (xgb.predict_proba(data_test.drop('uxp_ratio_last_five', axis=1))[:, 1] >= 0.21).astype('int') #setting a threshold.

In [84]:
#saving the prediction as a new column in data_test
data_test['prediction'] = y_pred_test
data_test.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,uxp_times_bought,uxp_last_five,uxp_ratio_last_five,u_num_of_orders,u_avg_prd,u_orders_the_most,hod_u_most_orders,u_reorder_ratio,average_days_between_orders,u_total_items_bought,product_name,aisle_id,department_id,prediction
user_id,product_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
3,248,1,0.0,0.0,11,6.666667,3,19,0.649902,11.9,20,7075,0.091864,0.116104,0
3,1005,1,1.0,0.199951,11,6.666667,3,19,0.649902,11.9,20,4272,0.098301,0.172215,0
3,8021,1,0.0,0.0,11,6.666667,3,19,0.649902,11.9,20,259,0.116178,0.073533,0
3,17668,3,2.0,0.399902,11,6.666667,3,19,0.649902,11.9,20,25589,0.186557,0.16799,1
3,18599,1,1.0,0.199951,11,6.666667,3,19,0.649902,11.9,20,8860,0.097074,0.084148,0


In [85]:
# Reset the index
final = data_test.reset_index()

# Keep only the required columns to create our submission file
final = final[['product_id', 'user_id', 'prediction']]

gc.collect()
final.head()

Unnamed: 0,product_id,user_id,prediction
0,248,3,0
1,1005,3,0
2,8021,3,0
3,17668,3,1
4,18599,3,0


In [86]:
#Creating a submission file
orders = pd.read_csv('orders.csv')
orders_test = orders.loc[orders.eval_set == 'test', ['user_id', 'order_id']]
orders_test.head()

Unnamed: 0,user_id,order_id
38,3,2774568
44,4,329954
53,6,1528013
96,11,1376945
102,12,1356845


In [87]:
#merging our prediction with orders_test
final = final.merge(orders_test, on='user_id', how='left')
final.head()

Unnamed: 0,product_id,user_id,prediction,order_id
0,248,3,0,2774568
1,1005,3,0,2774568
2,8021,3,0,2774568
3,17668,3,1,2774568
4,18599,3,0,2774568


In [88]:
#remove user_id column
final = final.drop('user_id', axis=1)

In [89]:
#convert product_id as integer
final['product_id'] = final.product_id.astype(int)

## Remove all unnecessary objects
del orders
del orders_test
gc.collect()

final.head()

Unnamed: 0,product_id,prediction,order_id
0,248,0,2774568
1,1005,0,2774568
2,8021,0,2774568
3,17668,1,2774568
4,18599,0,2774568


In [90]:
d = dict()
for row in final.itertuples():
    if row.prediction== 1:
        try:
            d[row.order_id] += ' ' + str(row.product_id)
        except:
            d[row.order_id] = str(row.product_id)

for order in final.order_id:
    if order not in d:
        d[order] = 'None'
        
gc.collect()
#We now check how the dictionary were populated (open hidden output)

100

In [91]:
#Convert the dictionary into a DataFrame
sub = pd.DataFrame.from_dict(d, orient='index')

#Reset index
sub.reset_index(inplace=True)
#Set column names
sub.columns = ['order_id', 'products']

sub.head()

Unnamed: 0,order_id,products
0,2774568,17668 21903 32402 39190 47766
1,2161313,196 10441
2,1735923,17008 31487
3,1980631,6184 9387 13575
4,139655,22935 32096


In [92]:
sub.to_csv('sub.csv', index=False, header=True)

In [93]:
del [X, y, X_train, y_train, y_test, X_test, xgb, y_pred]
gc.collect()

129