# Table of Content
01. Import Libraries
02. Import Data
03. Change Data Types to Save Memory
04. Merge Data
05. Export Data

# 01. Import Libraries

In [1]:
# Import libraries
import pandas as pd
import numpy as np
import os

# 02. Import Data

In [2]:
# Define the main project folder path
path = r'C:\Users\saich\Desktop\CareerFoundry\Data Immersion\Achievement 4 Python Fundamentals for Data Analysts\04-2023 Instacart Basket Analysis (github)'

In [3]:
# Import 'orders_products_combined' data set from 'Prepared Data' folder
ords_prods_combined = pd.read_pickle(os.path.join(path, '02 Data', 'Prepared Data', 'orders_products_combined.pkl'))

In [4]:
ords_prods_combined.head()

Unnamed: 0,order_id,user_id,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,_merge
0,2539329,1,1,2,8,,196,1,0,both
1,2539329,1,1,2,8,,14084,2,0,both
2,2539329,1,1,2,8,,12427,3,0,both
3,2539329,1,1,2,8,,26088,4,0,both
4,2539329,1,1,2,8,,26405,5,0,both


In [5]:
ords_prods_combined.shape

(32434489, 10)

In [6]:
# Import 'products_checked' data set from 'Prepared Data' folder
prods = pd.read_csv(os.path.join(path, '02 Data', 'Prepared Data', 'products_checked.csv'), index_col = 0)

In [7]:
prods.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,1,Chocolate Sandwich Cookies,61,19,5.8
1,2,All-Seasons Salt,104,13,9.3
2,3,Robust Golden Unsweetened Oolong Tea,94,7,4.5
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1,10.5
4,5,Green Chile Anytime Sauce,5,13,4.3


In [8]:
prods.shape

(49670, 5)

# 03. Change Data Types to Save Memory

In [9]:
# Check the memory usage and column data types of 'ords_prods_combined' dataframe
ords_prods_combined.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 32434489 entries, 0 to 32434488
Data columns (total 10 columns):
 #   Column                  Dtype   
---  ------                  -----   
 0   order_id                int32   
 1   user_id                 int32   
 2   order_number            int8    
 3   order_day_of_week       int8    
 4   order_hour_of_day       int8    
 5   days_since_prior_order  float16 
 6   product_id              int32   
 7   add_to_cart_order       int32   
 8   reordered               int8    
 9   _merge                  category
dtypes: category(1), float16(1), int32(4), int8(4)
memory usage: 958.9 MB


In [10]:
# Check the memory usage and column data types of 'prods' dataframe
prods.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 49670 entries, 0 to 49692
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   product_id     49670 non-null  int64  
 1   product_name   49670 non-null  object 
 2   aisle_id       49670 non-null  int64  
 3   department_id  49670 non-null  int64  
 4   prices         49670 non-null  float64
dtypes: float64(1), int64(3), object(1)
memory usage: 2.3+ MB


In [11]:
# Change data types for 'prods' dataframe
prods['product_id'] = prods['product_id'].astype('int32')
prods['aisle_id'] = prods['aisle_id'].astype('int8')
prods['department_id'] = prods['department_id'].astype('int8')
prods['prices'] = prods['prices'].astype('float32')

In [12]:
# Check the memory usage and column data types of 'prods' dataframe after data types changes
prods.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 49670 entries, 0 to 49692
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   product_id     49670 non-null  int32  
 1   product_name   49670 non-null  object 
 2   aisle_id       49670 non-null  int8   
 3   department_id  49670 non-null  int8   
 4   prices         49670 non-null  float32
dtypes: float32(1), int32(1), int8(2), object(1)
memory usage: 1.2+ MB


# 04. Merge Data

In [13]:
# Before merging data, drop the '_merge' column in 'ords_prods_combined' dataframe, else will get an error
ords_prods_combined.drop(columns = ['_merge'], inplace = True)

In [14]:
# Merge 'ords_prods_combined' dataframe with 'prods' dataframe using 'product_id' as the key column
ords_prods_merged = ords_prods_combined.merge(prods, on = 'product_id', indicator = True)

In [15]:
# Check the output
ords_prods_merged.head()

Unnamed: 0,order_id,user_id,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,prices,_merge
0,2539329,1,1,2,8,,196,1,0,Soda,77,7,9.0,both
1,2398795,1,2,3,7,15.0,196,1,1,Soda,77,7,9.0,both
2,473747,1,3,3,12,21.0,196,1,1,Soda,77,7,9.0,both
3,2254736,1,4,4,7,29.0,196,1,1,Soda,77,7,9.0,both
4,431534,1,5,4,15,28.0,196,1,1,Soda,77,7,9.0,both


In [16]:
ords_prods_merged['_merge'].value_counts(dropna = False)

both          32399732
left_only            0
right_only           0
Name: _merge, dtype: int64

From the frequency check above, there are only entries that have a value of “both,” leading you to think that your key column, “order_id” exists completely in both dataframes. However, this conclusion is wrong. This is because you chose the default option of inner join in this merge. This means that the resulting table will only contain observations found in both dataframes. As such, the merge flag here will only show entries that have a value of “both.”

In [17]:
# To check whether it is a full match, use outer join
# Merge 'ords_prods_combined' dataframe with 'prods' dataframe using 'product_id' as the key column and outer join
ords_prods_merged_outer = ords_prods_combined.merge(prods, on = 'product_id', how = 'outer', indicator = True)

In [18]:
ords_prods_merged_outer['_merge'].value_counts(dropna = False)

both          32399732
left_only        35327
right_only          11
Name: _merge, dtype: int64

From the frequency check above, the merge rate is actually not 100%. For this Instacart project, we’ll only be working with data sets that have a full merge rate.

# 05. Export Data

In [19]:
# Dimension check for 'ords_prods_merged' dataframe after data merging
ords_prods_merged.shape

(32399732, 14)

In [20]:
# Export 'ords_prods_merged' dataframe to 'Prepared Data' folder in pickle format
ords_prods_merged.to_pickle(os.path.join(path, '02 Data', 'Prepared Data', 'orders_products_merged.pkl'))