# **Final Data Preparation**<a class="anchor" id="0"></a>

1. [**Data Import**](#1)
2. [**Null Imputation**](#2)
3. [**Data Merge**](#3)
4. [**Pickling**](#4)

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import gc

from _util.custom_mem_opt import custom_mem_opt

from pprint import PrettyPrinter

pp = PrettyPrinter(width=41, compact=True)
pd.options.mode.chained_assignment = None
pd.set_option('future.no_silent_downcasting', True)

### Data Import <a class=anchor id="1"></a>
[Back to top](#0)

Load the raw CSV files and apply memory optimization to each DataFrame as it is read.

In [3]:
root = './_data/'


def _read_optimized(csv_name, base_dir=root):
    """Read the CSV at ``base_dir + csv_name`` and pass it through ``custom_mem_opt``.

    ``base_dir`` is bound when the function is defined, so later reassignment
    of ``root`` does not change where this helper reads from.
    ``verbose=False`` is forwarded to ``custom_mem_opt`` on every call.
    """
    return custom_mem_opt(pd.read_csv(base_dir + csv_name), verbose=False)


# One DataFrame per raw CSV file, loaded in the same order as before.
aisles = _read_optimized('aisles.csv')
departments = _read_optimized('departments.csv')
orders = _read_optimized('orders.csv')
order_products_prior = _read_optimized('order_products__prior.csv')
order_products_train = _read_optimized('order_products__train.csv')
products = _read_optimized('products.csv')

In [4]:
# `root` now points at the pickle directory (it held the CSV directory above).
root = './_pkls/'

# Pre-computed feature tables, presumably written by earlier notebooks — TODO confirm.
# NOTE(review): unpickling can execute arbitrary code; only load files this
# project produced itself.
cust_features, cust_product_features, product_features = (
    pd.read_pickle(root + pkl_name)
    for pkl_name in (
        'cust_features.p',
        'cust_product_features.p',
        'product_features.p',
    )
)

In [5]:
# Join order-level columns onto the train line items. The inner join keeps only
# order_ids that appear in both `orders` and `order_products_train`.
train_orders = pd.merge(
    orders,
    order_products_train,
    how='inner',
    on='order_id',
)
train_orders.head()

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered
0,1187899,1,train,11,4,8,14.0,196,1,1
1,1187899,1,train,11,4,8,14.0,25133,2,1
2,1187899,1,train,11,4,8,14.0,38928,3,1
3,1187899,1,train,11,4,8,14.0,26405,4,1
4,1187899,1,train,11,4,8,14.0,39657,5,1


In [6]:
# Discard columns that are not used after this point. Reassigning (rather than
# `inplace=True`) follows current pandas guidance and keeps the step chainable;
# `columns=` states the axis explicitly instead of relying on `axis=1`.
train_orders = train_orders.drop(columns=['eval_set', 'add_to_cart_order', 'order_id'])

In [7]:
# Distinct user_ids present in the train line items, in order of first appearance.
train_users = train_orders['user_id'].unique()
# Preview the first ten ids only.
pp.pprint(train_users[:10])

array([ 1,  2,  5,  7,  8,  9, 10, 13, 14, 17], dtype=int32)


In [8]:
# (rows, columns) of the full user-product feature table, before it is
# restricted to users that have a train order.
print(cust_product_features.shape)

(13307953, 11)


In [9]:
# Peek at the user-product feature columns that the train order rows are merged onto below.
cust_product_features.head()

Unnamed: 0,user_id,product_id,total_product_orders_by_user,total_product_reorders_by_user,user_product_reorder_percentage,avg_add_to_cart_by_user,avg_days_since_last_bought,last_ordered_in,is_reorder_3,is_reorder_2,is_reorder_1
0,1,196,10,9,0.9,1.4,17.6,10,1.0,1.0,1.0
1,1,10258,9,8,0.888889,3.333333,19.555555,10,1.0,1.0,1.0
2,1,10326,1,0,0.0,5.0,28.0,5,0.0,0.0,0.0
3,1,12427,10,9,0.9,3.3,17.6,10,1.0,1.0,1.0
4,1,13032,3,2,0.666667,6.333333,21.666666,10,1.0,0.0,0.0


In [10]:
# Keep only the feature rows whose user has at least one train line item.
is_train_user = cust_product_features['user_id'].isin(train_users)
df = cust_product_features.loc[is_train_user]
df.head()

Unnamed: 0,user_id,product_id,total_product_orders_by_user,total_product_reorders_by_user,user_product_reorder_percentage,avg_add_to_cart_by_user,avg_days_since_last_bought,last_ordered_in,is_reorder_3,is_reorder_2,is_reorder_1
0,1,196,10,9,0.9,1.4,17.6,10,1.0,1.0,1.0
1,1,10258,9,8,0.888889,3.333333,19.555555,10,1.0,1.0,1.0
2,1,10326,1,0,0.0,5.0,28.0,5,0.0,0.0,0.0
3,1,12427,10,9,0.9,3.3,17.6,10,1.0,1.0,1.0
4,1,13032,3,2,0.666667,6.333333,21.666666,10,1.0,0.0,0.0


In [11]:
# Outer join on (user_id, product_id):
#   - pairs found only in the feature table get NaN in the order columns and in `reordered`;
#   - pairs found only in `train_orders` get NaN in the feature columns.
# NOTE: this cell rebinds `df`, so it must be run exactly once after the filter cell.
df = pd.merge(
    df,
    train_orders,
    how='outer',
    on=['user_id', 'product_id'],
)
df.head()

Unnamed: 0,user_id,product_id,total_product_orders_by_user,total_product_reorders_by_user,user_product_reorder_percentage,avg_add_to_cart_by_user,avg_days_since_last_bought,last_ordered_in,is_reorder_3,is_reorder_2,is_reorder_1,order_number,order_dow,order_hour_of_day,days_since_prior_order,reordered
0,1,196,10.0,9.0,0.9,1.4,17.6,10.0,1.0,1.0,1.0,11.0,4.0,8.0,14.0,1.0
1,1,10258,9.0,8.0,0.888889,3.333333,19.555555,10.0,1.0,1.0,1.0,11.0,4.0,8.0,14.0,1.0
2,1,10326,1.0,0.0,0.0,5.0,28.0,5.0,0.0,0.0,0.0,,,,,
3,1,12427,10.0,9.0,0.9,3.3,17.6,10.0,1.0,1.0,1.0,,,,,
4,1,13032,3.0,2.0,0.666667,6.333333,21.666666,10.0,1.0,0.0,0.0,11.0,4.0,8.0,14.0,1.0


### Null Imputation <a class=anchor id="2"></a>
[Back to top](#0)

Apply memory optimization.

Fill missing values with the per-user mean for the following columns:
- order_number 
- order_dow
- order_hour_of_day
- days_since_prior_order

In [13]:
# Order-level columns are NaN for pairs that had no matching train line item.
# Fill each one with the mean of that column over the same user's rows.
for order_col in ('order_number', 'order_dow', 'order_hour_of_day', 'days_since_prior_order'):
    user_mean = df.groupby('user_id')[order_col].transform('mean')
    df[order_col] = df[order_col].fillna(user_mean)

In [14]:
# Distribution of `reordered` among rows that have a value; value_counts()
# skips NaN by default, so rows without a train match are not counted here.
print(df.reordered.value_counts())

reordered
1.0    828824
0.0    555793
Name: count, dtype: int64


In [15]:
# Number of rows with no `reordered` value — presumably user-product pairs that
# had no matching row in train_orders in the outer merge.
print(df.reordered.isnull().sum())

7645837


In [16]:
# Drop rows whose `reordered` is exactly 0 while KEEPING rows where it is NaN
# (NaN != 0 evaluates to True). Presumably the dropped rows are train-only
# pairs with no user-product features — TODO confirm.
# Order-sensitive: run this before the NaNs in `reordered` are filled with 0;
# re-running it afterwards would also remove those filled rows.
df = df.loc[df['reordered'].ne(0)]

In [17]:
# (rows, columns) after removing the rows with reordered == 0.
print(df.shape)

(8474661, 16)


In [18]:
# Rows still missing `reordered` had no matching train line item; set them to 0.
df['reordered'] = df['reordered'].fillna(0)
# Per-column count of remaining missing values.
print(df.isna().sum())

user_id                            0
product_id                         0
total_product_orders_by_user       0
total_product_reorders_by_user     0
user_product_reorder_percentage    0
avg_add_to_cart_by_user            0
avg_days_since_last_bought         0
last_ordered_in                    0
is_reorder_3                       0
is_reorder_2                       0
is_reorder_1                       0
order_number                       0
order_dow                          0
order_hour_of_day                  0
days_since_prior_order             0
reordered                          0
dtype: int64


In [19]:
# Final look at the first rows after imputation and filling `reordered`.
df.head()

Unnamed: 0,user_id,product_id,total_product_orders_by_user,total_product_reorders_by_user,user_product_reorder_percentage,avg_add_to_cart_by_user,avg_days_since_last_bought,last_ordered_in,is_reorder_3,is_reorder_2,is_reorder_1,order_number,order_dow,order_hour_of_day,days_since_prior_order,reordered
0,1,196,10.0,9.0,0.9,1.4,17.6,10.0,1.0,1.0,1.0,11.0,4.0,8.0,14.0,1.0
1,1,10258,9.0,8.0,0.888889,3.333333,19.555555,10.0,1.0,1.0,1.0,11.0,4.0,8.0,14.0,1.0
2,1,10326,1.0,0.0,0.0,5.0,28.0,5.0,0.0,0.0,0.0,11.0,4.0,8.0,14.0,0.0
3,1,12427,10.0,9.0,0.9,3.3,17.6,10.0,1.0,1.0,1.0,11.0,4.0,8.0,14.0,0.0
4,1,13032,3.0,2.0,0.666667,6.333333,21.666666,10.0,1.0,0.0,0.0,11.0,4.0,8.0,14.0,1.0
