In [1]:
import os

import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline

In [2]:
# Work from the raw-data folder so the CSV files below can be opened by bare filename.
data_dir = '../90_Original_Data/'
os.chdir(data_dir)

In [3]:
# Show which files are available in the current (data) directory.
os.listdir(os.curdir)

['products.csv',
 'orders.csv',
 'order_products__train.csv',
 'departments.csv',
 'aisles.csv',
 'order_products__prior.csv',
 'sample_submission.csv']

In [4]:
# Product lookup table, read from the current working directory.
prod_df = pd.read_csv(filepath_or_buffer='products.csv')

In [6]:
# Order-level table; it is merged onto the order/product rows further down via `order_id`.
ord_df = pd.read_csv(filepath_or_buffer='orders.csv')

In [6]:
# Remove the `eval_set` column from the orders table.
# Reassigning (rather than inplace=True) together with errors='ignore' keeps the
# cell safe to re-run: a second execution no longer raises KeyError once the
# column is already gone.
ord_df = ord_df.drop('eval_set', axis=1, errors='ignore')

In [10]:
# Department lookup table, read from the current working directory.
dep_df = pd.read_csv(filepath_or_buffer='departments.csv')

In [12]:
# Aisle lookup table, read from the current working directory.
ais_df = pd.read_csv(filepath_or_buffer='aisles.csv')

We'll treat the `prior` order set as our training set and the `train` order set as our test set.  
Prior - 3214874 orders (our training set)  
Train - 131209 orders (our test set)

In [8]:
# NOTE: the *prior* file is deliberately loaded as this notebook's training data.
ord_prod_train_df = pd.read_csv(filepath_or_buffer='order_products__prior.csv')

In [10]:
# NOTE: the *train* file is deliberately loaded as this notebook's test data.
ord_prod_test_df = pd.read_csv(filepath_or_buffer='order_products__train.csv')

Here we're combining the order/product information with user information by merging the `order_products` table with the `orders` table.

In [11]:
# Attach the order-level columns from `ord_df` to every order/product row.
# This cell overwrites its own inputs, so a plain re-run would merge the order
# columns a second time and produce `_x`/`_y` suffixed duplicates. Dropping any
# previously merged order columns first makes the cell idempotent; on a fresh
# kernel the drop is a no-op because none of those columns exist yet.
order_cols = [col for col in ord_df.columns if col != 'order_id']

ord_prod_train_df = (ord_prod_train_df.drop(order_cols, axis=1, errors='ignore')
                                      .merge(ord_df, on='order_id'))
ord_prod_test_df = (ord_prod_test_df.drop(order_cols, axis=1, errors='ignore')
                                    .merge(ord_df, on='order_id'))

## Feature Engineering

Below I've created a new DataFrame that aggregates user-product info. I've also taken this as an opportunity to count, for each user-product pair, how many times that user ordered that product in the training orders.

In [13]:
# For every (product, user) pair in the training rows, count the order rows it appears in.
pair_counts = ord_prod_train_df.groupby(['product_id', 'user_id'])['order_id'].count()
df_user_product = pair_counts.reset_index(name='user_product_total_orders')

# Keep only the pairs whose user also appears in the test rows.
# The original row labels are kept (no reset_index), so the index has gaps.
test_ids = ord_prod_test_df['user_id'].unique()
in_test_mask = df_user_product['user_id'].isin(test_ids)
df = df_user_product[in_test_mask]
df.head()

Unnamed: 0,product_id,user_id,user_product_total_orders
0,1,138,2
1,1,709,1
3,1,777,1
6,1,1052,2
9,1,1494,3


In [33]:
# Preview only the first rows; dumping the full multi-million-row frame bloats the notebook.
ord_prod_train_df.head(10)

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,user_id,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2,33120,1,1,202279,3,5,9,8.0
1,2,28985,2,1,202279,3,5,9,8.0
2,2,9327,3,0,202279,3,5,9,8.0
3,2,45918,4,1,202279,3,5,9,8.0
4,2,30035,5,0,202279,3,5,9,8.0
5,2,17794,6,1,202279,3,5,9,8.0
6,2,40141,7,1,202279,3,5,9,8.0
7,2,1819,8,1,202279,3,5,9,8.0
8,2,43668,9,0,202279,3,5,9,8.0
9,3,33754,1,1,205970,16,5,17,12.0


In [24]:
# One row per user in the test rows, holding the set of product ids that user bought there.
# `set` is passed directly; wrapping it in a lambda adds nothing.
train_carts = (ord_prod_test_df.groupby('user_id', as_index=False)
                               .agg({'product_id': set})
                               .rename(columns={'product_id': 'latest_cart'}))

# `df` is re-assigned from itself, so drop the columns a previous execution of this
# cell may have added; otherwise a re-run would merge into latest_cart_x/_y and the
# lookup below would fail. On the first run the drop is a no-op.
df = (df.drop(['latest_cart', 'in_cart'], axis=1, errors='ignore')
        .merge(train_carts, on='user_id'))

# Target label: 1 when the product is in the user's test cart, else 0.
# Zipping the two columns gives the same flags as a row-wise apply(axis=1) but
# avoids building a Series object for each of the millions of rows.
in_cart_flags = [product in cart
                 for product, cart in zip(df['product_id'], df['latest_cart'])]
df['in_cart'] = pd.Series(in_cart_flags, index=df.index, dtype=bool).astype(int)

# Show the first rows so the table rendered under this cell is reproducible.
df.head()

Unnamed: 0,product_id,user_id,user_product_total_orders,latest_cart,in_cart
0,1,138,2,{42475},0
1,907,138,2,{42475},0
2,1000,138,1,{42475},0
3,3265,138,1,{42475},0
4,4913,138,1,{42475},0


In [31]:
# Preview only the first rows; the full frame has millions of rows.
df.head(10)

Unnamed: 0,product_id,user_id,user_product_total_orders,latest_cart,in_cart
0,1,138,2,{42475},0
1,907,138,2,{42475},0
2,1000,138,1,{42475},0
3,3265,138,1,{42475},0
4,4913,138,1,{42475},0
5,6184,138,1,{42475},0
6,6901,138,1,{42475},0
7,7147,138,1,{42475},0
8,7987,138,1,{42475},0
9,8048,138,2,{42475},0


In [26]:
# Absolute class counts of the target label (0 = not in the cart, 1 = in the cart).
df['in_cart'].value_counts()

0    7645837
1     828824
Name: in_cart, dtype: int64

In [27]:
# Number of distinct users left in the modelling frame.
df['user_id'].nunique()

131209

In [None]:
# Class balance of the target as fractions of all rows (normalize=True).
target_pcts = df['in_cart'].value_counts(normalize=True)
print(target_pcts)

# Label the chart so it can be read on its own; the trailing ';' hides the repr.
ax = target_pcts.plot(kind='bar')
ax.set(title='Share of user-product pairs by in_cart',
       xlabel='in_cart',
       ylabel='Fraction of rows');