In [None]:
# Mount Google Drive into the Colab runtime at /content/drive so the data
# files under `root` can be read and written like local files.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import gc
import warnings
# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides pandas deprecation/FutureWarnings.
warnings.filterwarnings('ignore')

# Turn off pandas' chained-assignment (SettingWithCopy) warning.
pd.options.mode.chained_assignment = None

# Base directory on the mounted Drive; all file paths below are built from it.
root = '/content/drive/MyDrive/instacart-data/'

In [None]:
# --- Raw tables stored as CSV -------------------------------------------
orders = pd.read_csv(root + 'orders.csv')
order_products_train = pd.read_csv(root + 'order_products__train.csv')
order_products_prior = pd.read_csv(root + 'order_products__prior.csv')
products = pd.read_csv(root +'products.csv')
aisles = pd.read_csv(root + 'aisles.csv')
departments = pd.read_csv(root + 'departments.csv')

# --- Feature tables stored as pickles -----------------------------------
# NOTE(review): presumably written by an earlier feature-engineering step;
# only unpickle files you produced yourself.
product_features = pd.read_pickle(root + 'product_features.pkl')
user_features = pd.read_pickle(root + 'user_features.pkl')
user_product_features = pd.read_pickle(root + 'user_product_features.pkl')

## reducing memory

In [None]:
def shrink_memory(df, float_rtol=1e-6):
    """
    Reduce the memory usage of a dataframe by downcasting numeric columns.

    Integer columns (int64/int32/int16) are cast to the narrowest of
    int8/int16/int32 whose inclusive range contains the column's min and max.
    Float columns (float64/float32) are cast to float16 or float32 only when
    the column's range fits AND the narrowed values agree with the originals
    within ``float_rtol``; otherwise the column keeps its dtype.

    The dataframe is modified in place and the same object is returned.
    Memory usage before and after is printed in GB.

    Parameters:
        df: Dataframe whose numeric columns should be downcast.
        float_rtol: Largest relative error accepted when narrowing a float
            column. Values that are not representable in the narrower type
            within this tolerance keep the column at a wider dtype.
    Return: Dataframe (the object passed in, with downcast columns)
    """
    int_targets = (np.int8, np.int16, np.int32)
    float_targets = (np.float16, np.float32)

    start_mem_usg = df.memory_usage().sum() / 1024**3
    print("Memory usage of orignal data is :", round(start_mem_usg , 2)," GB")

    for col in df.columns:
        dtype_name = str(df[col].dtype)

        if dtype_name in ("int64", "int32", "int16"):
            cmin, cmax = df[col].min(), df[col].max()
            # Narrowest type first. Bounds are inclusive so a column whose
            # extreme equals a type limit (e.g. 127 or 32767) still fits and
            # an int16 column can never be widened to int32.
            for target in int_targets:
                limits = np.iinfo(target)
                if cmin >= limits.min and cmax <= limits.max:
                    df[col] = df[col].astype(target)
                    break

        elif dtype_name in ("float64", "float32"):
            cmin, cmax = df[col].min(), df[col].max()
            exact = df[col].to_numpy(dtype=np.float64)
            for target in float_targets:
                limits = np.finfo(target)
                # Range check first: avoids overflow to inf during the cast.
                # An all-NaN column has NaN min/max and fails this check, so
                # it is left untouched.
                if not (cmin >= limits.min and cmax <= limits.max):
                    continue
                narrowed = exact.astype(target).astype(np.float64)
                # Fitting the range is not enough: float16 keeps only about
                # three significant digits, so verify the round trip.
                if np.allclose(narrowed, exact, rtol=float_rtol, atol=0.0, equal_nan=True):
                    df[col] = df[col].astype(target)
                    break

    print("")
    print("Memory after reduction without loss in precision")
    mem_usg = df.memory_usage().sum() / 1024**3
    print("Memory usage is: ",round(mem_usg , 2)," GB")
    if start_mem_usg > 0:
        # Scale before rounding so the percentage prints without float noise.
        print("This is ", round(100 * mem_usg / start_mem_usg, 2), "% of the initial size")

    return df

In [None]:
# Downcast every loaded table, in the same order as before, and rebind each
# name to the frame returned by shrink_memory.
(
    orders,
    order_products_train,
    order_products_prior,
    product_features,
    user_features,
    user_product_features,
    products,
    aisles,
    departments,
) = [
    shrink_memory(table)
    for table in (
        orders,
        order_products_train,
        order_products_prior,
        product_features,
        user_features,
        user_product_features,
        products,
        aisles,
        departments,
    )
]

Memory usage of orignal data is : 0.18  GB

Memory after reduction without loss in precision
Memory usage is:  0.07  GB
This is  38.0 % of the initial size
Memory usage of orignal data is : 0.04  GB

Memory after reduction without loss in precision
Memory usage is:  0.01  GB
This is  31.0 % of the initial size
Memory usage of orignal data is : 0.97  GB

Memory after reduction without loss in precision
Memory usage is:  0.33  GB
This is  34.0 % of the initial size
Memory usage of orignal data is : 0.01  GB

Memory after reduction without loss in precision
Memory usage is:  0.0  GB
This is  28.000000000000004 % of the initial size
Memory usage of orignal data is : 0.03  GB

Memory after reduction without loss in precision
Memory usage is:  0.01  GB
This is  27.0 % of the initial size
Memory usage of orignal data is : 0.81  GB

Memory after reduction without loss in precision
Memory usage is:  0.26  GB
This is  32.0 % of the initial size
Memory usage of orignal data is : 0.0  GB

Memory a

merging train order data with orders


In [None]:
# Inner join on order_id: keeps only orders that have rows in
# order_products_train, one output row per (order, product).
train_orders = pd.merge(orders, order_products_train, how='inner', on='order_id')
train_orders.head()

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered
0,1187899,1,train,11,4,8,14.0,196,1,1
1,1187899,1,train,11,4,8,14.0,25133,2,1
2,1187899,1,train,11,4,8,14.0,38928,3,1
3,1187899,1,train,11,4,8,14.0,26405,4,1
4,1187899,1,train,11,4,8,14.0,39657,5,1


removing unnecessary columns from train_orders



In [None]:
# Rebind instead of dropping in place, and ignore already-missing columns,
# so this cell can be re-run without raising KeyError.
train_orders = train_orders.drop(
    columns=['eval_set', 'add_to_cart_order', 'order_id'],
    errors='ignore',
)


Get the unique user_ids present in the train data and keep only those users' rows from the user–product features.





In [None]:
# Users that appear in the train orders.
train_users = train_orders['user_id'].unique()
print(train_users[:7])

# Restrict the user-product feature table to those users.
is_train_user = user_product_features['user_id'].isin(train_users)
df = user_product_features[is_train_user]
df.head()

[ 1  2  5  7  8  9 10]


Unnamed: 0,user_id,product_id,total_product_orders_by_user,total_product_reorders_by_user,user_product_reorder_percentage,avg_add_to_cart_by_user,avg_days_since_last_bought,last_ordered_in,is_reorder_3,is_reorder_2,is_reorder_1
0,1,196,10,9,0.899902,1.400391,17.59375,10,1.0,1.0,1.0
1,1,10258,9,8,0.888672,3.333984,19.5625,10,1.0,1.0,1.0
2,1,10326,1,0,0.0,5.0,28.0,5,0.0,0.0,0.0
3,1,12427,10,9,0.899902,3.300781,17.59375,10,1.0,1.0,1.0
4,1,13032,3,2,0.666504,6.332031,21.671875,10,1.0,0.0,0.0


In [None]:
# Outer join on (user_id, product_id): keeps feature rows with no train match
# (train columns become NaN) as well as train rows with no feature match
# (feature columns become NaN).
df = pd.merge(df, train_orders, how='outer', on=['user_id', 'product_id'])
df.head()

Unnamed: 0,user_id,product_id,total_product_orders_by_user,total_product_reorders_by_user,user_product_reorder_percentage,avg_add_to_cart_by_user,avg_days_since_last_bought,last_ordered_in,is_reorder_3,is_reorder_2,is_reorder_1,order_number,order_dow,order_hour_of_day,days_since_prior_order,reordered
0,1,196,10.0,9.0,0.899902,1.400391,17.59375,10.0,1.0,1.0,1.0,11.0,4.0,8.0,14.0,1.0
1,1,10258,9.0,8.0,0.888672,3.333984,19.5625,10.0,1.0,1.0,1.0,11.0,4.0,8.0,14.0,1.0
2,1,10326,1.0,0.0,0.0,5.0,28.0,5.0,0.0,0.0,0.0,,,,,
3,1,12427,10.0,9.0,0.899902,3.300781,17.59375,10.0,1.0,1.0,1.0,,,,,
4,1,13032,3.0,2.0,0.666504,6.332031,21.671875,10.0,1.0,0.0,0.0,11.0,4.0,8.0,14.0,1.0


In [None]:
# (rows, columns) of the merged frame.
df.shape

(9030454, 16)

For order_number, order_dow, order_hour_of_day and days_since_prior_order, impute the null values with the mean of each column grouped by user, since these products are also potential candidates for the order.

In [None]:
# Fill missing order-level values with the mean of the same column over the
# user's rows (transform('mean') skips NaN and is aligned to df's index).
# The result is assigned back to the column: an in-place fillna on
# `df.<column>` modifies a temporary Series and leaves df unchanged when
# pandas Copy-on-Write is active.
order_level_cols = ['order_number', 'order_dow', 'order_hour_of_day', 'days_since_prior_order']
for order_col in order_level_cols:
    user_mean = df.groupby('user_id')[order_col].transform('mean')
    df[order_col] = df[order_col].fillna(user_mean)

In [None]:
# Number of missing values per column after the imputation above.
df.isna().sum()

Unnamed: 0,0
user_id,0
product_id,0
total_product_orders_by_user,555793
total_product_reorders_by_user,555793
user_product_reorder_percentage,555793
avg_add_to_cart_by_user,555793
avg_days_since_last_bought,555793
last_ordered_in,555793
is_reorder_3,555793
is_reorder_2,555793


Removing those products which were bought the first time in last order by a user



In [None]:
# Distribution of the label; NaN rows (no match in the train order) are not
# counted because value_counts drops NaN by default.
df.reordered.value_counts()


Unnamed: 0_level_0,count
reordered,Unnamed: 1_level_1
1.0,828824
0.0,555793


In [None]:
# Keep rows whose label is 1 or still missing; rows with reordered == 0 are dropped.
not_first_time = df['reordered'].ne(0)
df = df.loc[not_first_time]
df.shape

(8474661, 16)

In [None]:
# Rows still missing a label get 0. Built with assign() rather than an
# in-place fillna on `df.reordered`: df is a filtered selection here, and an
# in-place update of one of its columns is not guaranteed to reach df
# (it is a no-op under pandas Copy-on-Write).
df = df.assign(reordered=df['reordered'].fillna(0))
df.isnull().sum()

Unnamed: 0,0
user_id,0
product_id,0
total_product_orders_by_user,0
total_product_reorders_by_user,0
user_product_reorder_percentage,0
avg_add_to_cart_by_user,0
avg_days_since_last_bought,0
last_ordered_in,0
is_reorder_3,0
is_reorder_2,0


## Merging product and user features


In [None]:
# Attach product-level then user-level features; left joins keep every
# existing row of df.
df = (
    df
    .merge(product_features, how='left', on='product_id')
    .merge(user_features, how='left', on='user_id')
)
df.head()

Unnamed: 0,user_id,product_id,total_product_orders_by_user,total_product_reorders_by_user,user_product_reorder_percentage,avg_add_to_cart_by_user,avg_days_since_last_bought,last_ordered_in,is_reorder_3,is_reorder_2,...,total_reorders_by_user,reorder_propotion_by_user,average_order_size,reorder_in_order,orders_3,orders_2,orders_1,reorder_3,reorder_2,reorder_1
0,1,196,10.0,9.0,0.899902,1.400391,17.59375,10.0,1.0,1.0,...,41,0.694824,5.898438,0.706055,6.0,6.0,9.0,0.666504,1.0,0.666504
1,1,10258,9.0,8.0,0.888672,3.333984,19.5625,10.0,1.0,1.0,...,41,0.694824,5.898438,0.706055,6.0,6.0,9.0,0.666504,1.0,0.666504
2,1,10326,1.0,0.0,0.0,5.0,28.0,5.0,0.0,0.0,...,41,0.694824,5.898438,0.706055,6.0,6.0,9.0,0.666504,1.0,0.666504
3,1,12427,10.0,9.0,0.899902,3.300781,17.59375,10.0,1.0,1.0,...,41,0.694824,5.898438,0.706055,6.0,6.0,9.0,0.666504,1.0,0.666504
4,1,13032,3.0,2.0,0.666504,6.332031,21.671875,10.0,1.0,0.0,...,41,0.694824,5.898438,0.706055,6.0,6.0,9.0,0.666504,1.0,0.666504


The dataframe had null values only where a product had never been bought earlier by a user; those rows were removed above, so no null values should remain.

In [None]:
# Total number of missing values across all columns.
df.isna().sum().sum()

0

In [None]:
# Persist the assembled frame to Drive as a pickle.
df.to_pickle(root + 'Finaldata.pkl')


In [None]:
# Read the saved pickle back and preview it.
# NOTE(review): presumably a sanity check that the file round-trips — confirm.
df2 = pd.read_pickle(root +'Finaldata.pkl')
df2.head()

Unnamed: 0,user_id,product_id,total_product_orders_by_user,total_product_reorders_by_user,user_product_reorder_percentage,avg_add_to_cart_by_user,avg_days_since_last_bought,last_ordered_in,is_reorder_3,is_reorder_2,...,total_reorders_by_user,reorder_propotion_by_user,average_order_size,reorder_in_order,orders_3,orders_2,orders_1,reorder_3,reorder_2,reorder_1
0,1,196,10.0,9.0,0.899902,1.400391,17.59375,10.0,1.0,1.0,...,41,0.694824,5.898438,0.706055,6.0,6.0,9.0,0.666504,1.0,0.666504
1,1,10258,9.0,8.0,0.888672,3.333984,19.5625,10.0,1.0,1.0,...,41,0.694824,5.898438,0.706055,6.0,6.0,9.0,0.666504,1.0,0.666504
2,1,10326,1.0,0.0,0.0,5.0,28.0,5.0,0.0,0.0,...,41,0.694824,5.898438,0.706055,6.0,6.0,9.0,0.666504,1.0,0.666504
3,1,12427,10.0,9.0,0.899902,3.300781,17.59375,10.0,1.0,1.0,...,41,0.694824,5.898438,0.706055,6.0,6.0,9.0,0.666504,1.0,0.666504
4,1,13032,3.0,2.0,0.666504,6.332031,21.671875,10.0,1.0,0.0,...,41,0.694824,5.898438,0.706055,6.0,6.0,9.0,0.666504,1.0,0.666504
