## Contents

01. Checking the prods and ords_prods_combined data sets
02. Merging the prods and ords_prods_combined data sets
03. Consistency checks on merged data set



In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
# importing order_products_combined data set
#
# The project folder defaults to the original local location, but it can be
# overridden with the INSTACART_PATH environment variable so the notebook can
# run on another machine without editing this cell.
# NOTE: pd.read_pickle can execute arbitrary code while loading; only load
# pickle files that come from a trusted source.

path = os.environ.get('INSTACART_PATH', r'/Users/lindazhang/Instacart Basket Analysis')
df_ords_prods_combined = pd.read_pickle(os.path.join(path, '02 Data', 'Prepared Data', 'orders_products_combined.pkl'))

## 01. Checking the prods and ords_prods_combined data sets

In [3]:
# loading only the product columns needed for the merge from the cleaned products file

vars_list = [
    'product_id',
    'product_name',
    'aisle_id',
    'department_id',
    'prices',
]
prods_csv_path = os.path.join(path, '02 Data', 'Prepared Data', 'prods_cleaned.csv')
df_prods = pd.read_csv(prods_csv_path, usecols=vars_list)


In [4]:
# checking the df_prods dataset: previewing the first rows to confirm that only
# the columns requested through usecols were loaded

df_prods.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,1,Chocolate Sandwich Cookies,61,19,5.8
1,2,All-Seasons Salt,104,13,9.3
2,3,Robust Golden Unsweetened Oolong Tea,94,7,4.5
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1,10.5
4,5,Green Chile Anytime Sauce,5,13,4.3


In [5]:
# checking the shape of the df_prods data set, returned as (rows, columns)

df_prods.shape

(49688, 5)

In [6]:
# checking the df_ords_prods_combined data set: previewing the first rows.
# The loaded pickle still carries a _merge indicator column, which is removed
# in the next step.

df_ords_prods_combined.head()

Unnamed: 0,order_id,customer_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,product_id,order_added_to_cart,reordered,_merge
0,2539329,1,1,2,8,,196,1,0,both
1,2539329,1,1,2,8,,14084,2,0,both
2,2539329,1,1,2,8,,12427,3,0,both
3,2539329,1,1,2,8,,26088,4,0,both
4,2539329,1,1,2,8,,26405,5,0,both


In [7]:
# dropping the _merge column from the df_ords_prods_combined dataset
#
# The column has to go before the next merge, because indicator=True creates a
# new _merge column and cannot reuse an existing column name.
# errors='ignore' keeps this cell idempotent: re-running it after the column is
# already gone no longer raises a KeyError.

df_ords_prods_combined = df_ords_prods_combined.drop(columns = ['_merge'], errors = 'ignore')

In [8]:
# checking the shape of the df_ords_prods_combined data set, returned as (rows, columns);
# the row count is the baseline to compare against after the merge

df_ords_prods_combined.shape

(32434489, 9)

## 02. Merging the prods and ords_prods_combined data sets

In [9]:
# inner joining the df_prods and df_ords_prods_combined data set using product_id
#
# NOTE: with an inner join the indicator column can only ever contain 'both',
# so it cannot reveal order lines whose product_id has no match in df_prods;
# those rows are simply dropped by the join. They are counted explicitly here,
# before merging, so that the loss is visible instead of silent.

unmatched_order_lines = ~df_ords_prods_combined['product_id'].isin(df_prods['product_id'])
print('order lines without a matching product (dropped by the inner join):', int(unmatched_order_lines.sum()))

# how='inner' is the pandas default; it is spelled out so the join type is explicit.
df_merged_large = df_ords_prods_combined.merge(df_prods, on = ['product_id'], how = 'inner', indicator = True)


In [10]:
# previewing the merged data set; the rendered output is truncated to the first
# and last rows of the frame
df_merged_large 

Unnamed: 0,order_id,customer_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,product_id,order_added_to_cart,reordered,product_name,aisle_id,department_id,prices,_merge
0,2539329,1,1,2,8,,196,1,0,Soda,77,7,9.0,both
1,2398795,1,2,3,7,15.0,196,1,1,Soda,77,7,9.0,both
2,473747,1,3,3,12,21.0,196,1,1,Soda,77,7,9.0,both
3,2254736,1,4,4,7,29.0,196,1,1,Soda,77,7,9.0,both
4,431534,1,5,4,15,28.0,196,1,1,Soda,77,7,9.0,both
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32433025,1320836,202557,17,2,15,1.0,43553,2,1,Orange Energy Shots,64,7,3.7,both
32433026,31526,202557,18,5,11,3.0,43553,2,1,Orange Energy Shots,64,7,3.7,both
32433027,758936,203436,1,2,7,,42338,4,0,"Zucchini Chips, Pesto",50,19,6.9,both
32433028,2745165,203436,2,3,5,15.0,42338,16,1,"Zucchini Chips, Pesto",50,19,6.9,both


In [11]:
# checking counts on merge
# NOTE(review): the merge above is an inner join, and an inner join can only
# yield 'both' -- 'left_only' / 'right_only' will always be 0 here. This check
# therefore cannot confirm a full match on its own; compare the row counts
# before and after the merge as well (the recorded run shows 32,434,489 rows
# before the merge vs 32,433,030 rows after it).

df_merged_large['_merge'].value_counts()

both          32433030
left_only            0
right_only           0
Name: _merge, dtype: int64

In [12]:
# dropping _merge column
#
# The indicator column is no longer needed once the merge counts have been
# checked. errors='ignore' keeps this cell idempotent: re-running it after the
# column is already gone no longer raises a KeyError.

df_merged_large = df_merged_large.drop(columns = ['_merge'], errors = 'ignore')

In [13]:
# previewing the merged data set to confirm the _merge column is gone
df_merged_large.head()

Unnamed: 0,order_id,customer_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,product_id,order_added_to_cart,reordered,product_name,aisle_id,department_id,prices
0,2539329,1,1,2,8,,196,1,0,Soda,77,7,9.0
1,2398795,1,2,3,7,15.0,196,1,1,Soda,77,7,9.0
2,473747,1,3,3,12,21.0,196,1,1,Soda,77,7,9.0
3,2254736,1,4,4,7,29.0,196,1,1,Soda,77,7,9.0
4,431534,1,5,4,15,28.0,196,1,1,Soda,77,7,9.0


## 03. Consistency checks on merged data set

In [14]:
# checking for missing values: counts the nulls in each column
# NOTE(review): the missing days_since_prior_order values are presumably first
# orders (the sample rows shown earlier with order_number 1 have no value) --
# confirm before treating them as a data problem.

df_merged_large.isnull().sum()

order_id                        0
customer_id                     0
order_number                    0
orders_day_of_week              0
order_hour_of_day               0
days_since_prior_order    2077979
product_id                      0
order_added_to_cart             0
reordered                       0
product_name                28171
aisle_id                        0
department_id                   0
prices                          0
dtype: int64

In [15]:
# changed all the missing product_name values to "unknown"
#
# The filled column is assigned back to the DataFrame instead of calling
# fillna(..., inplace=True) on the selected column. The in-place call operates
# on an intermediate Series (chained assignment), which pandas warns about and
# which does not update the DataFrame when Copy-on-Write is enabled.
df_merged_large['product_name'] = df_merged_large['product_name'].fillna('unknown')

In [16]:
# re-checking missing values to confirm product_name no longer contains nulls
df_merged_large.isnull().sum()


order_id                        0
customer_id                     0
order_number                    0
orders_day_of_week              0
order_hour_of_day               0
days_since_prior_order    2077979
product_id                      0
order_added_to_cart             0
reordered                       0
product_name                    0
aisle_id                        0
department_id                   0
prices                          0
dtype: int64

In [17]:
# checking for fully duplicated rows -- the resulting frame is empty, so there are no duplicates

duplicate_row_mask = df_merged_large.duplicated()
df_merged_large_dups = df_merged_large[duplicate_row_mask]
df_merged_large_dups

Unnamed: 0,order_id,customer_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,product_id,order_added_to_cart,reordered,product_name,aisle_id,department_id,prices


In [18]:
# exporting the merged data set as a pickle file in the Prepared Data folder
merged_pickle_path = os.path.join(path, '02 Data', 'Prepared Data', 'orders_products_merged.pkl')
df_merged_large.to_pickle(merged_pickle_path)