# Table of Contents
01. Import Libraries
02. Import Data
03. Change Data Types to Save Memory
04. Merge Data
05. Export Data

# 01. Import Libraries

In [1]:
# Import libraries
import pandas as pd
import numpy as np
import os

# 02. Import Data

In [2]:
# Define the main project folder path
# To run this notebook on another machine, set the INSTACART_PROJECT_PATH environment
# variable to the project folder; when it is unset, the original local folder is used.
path = os.environ.get(
    'INSTACART_PROJECT_PATH',
    r'C:\Users\saich\Desktop\CareerFoundry\Data Immersion\Achievement 4 Python Fundamentals for Data Analysts\04-2023 Instacart Basket Analysis (github)',
)

In [3]:
# Load the checked orders table from the 'Prepared Data' folder;
# the first CSV column is used as the dataframe index
ords_csv = os.path.join(path, '02 Data', 'Prepared Data', 'orders_checked.csv')
ords = pd.read_csv(ords_csv, index_col=0)

In [4]:
# Preview the first rows of 'ords' to confirm the import
ords.head()

Unnamed: 0,order_id,user_id,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order
0,2539329,1,1,2,8,
1,2398795,1,2,3,7,15.0
2,473747,1,3,3,12,21.0
3,2254736,1,4,4,7,29.0
4,431534,1,5,4,15,28.0


In [5]:
# Dimension check for 'ords' dataframe (rows, columns)
ords.shape

(3421083, 6)

In [6]:
# Load the checked prior order-products table from the 'Prepared Data' folder;
# the first CSV column is used as the dataframe index
ords_prods_prior_csv = os.path.join(
    path, '02 Data', 'Prepared Data', 'orders_products_prior_checked.csv'
)
ords_prods_prior = pd.read_csv(ords_prods_prior_csv, index_col=0)

In [7]:
# Preview the first rows of 'ords_prods_prior' to confirm the import
ords_prods_prior.head()

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered
0,2,33120,1,1
1,2,28985,2,1
2,2,9327,3,0
3,2,45918,4,1
4,2,30035,5,0


In [8]:
# Dimension check for 'ords_prods_prior' dataframe (rows, columns)
ords_prods_prior.shape

(32434489, 4)

# 03. Change Data Types to Save Memory

In [9]:
# Check the memory usage and column data types of 'ords' dataframe
# (baseline to compare against after the data type changes)
ords.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3421083 entries, 0 to 3421082
Data columns (total 6 columns):
 #   Column                  Dtype  
---  ------                  -----  
 0   order_id                int64  
 1   user_id                 int64  
 2   order_number            int64  
 3   order_day_of_week       int64  
 4   order_hour_of_day       int64  
 5   days_since_prior_order  float64
dtypes: float64(1), int64(5)
memory usage: 182.7 MB


In [10]:
# Change data types for 'ords' dataframe
# Narrower target types reduce the memory footprint of the frame
ords_dtypes = {
    'order_id': 'int32',
    'user_id': 'int32',
    'order_number': 'int8',
    'order_day_of_week': 'int8',
    'order_hour_of_day': 'int8',
    # float16 keeps the missing values (NaN) of first orders;
    # assumes small whole-day counts, which float16 stores exactly - TODO confirm the maximum
    'days_since_prior_order': 'float16',
}

# Guard before casting: astype() does not check for integer overflow, so a value outside the
# target range would be corrupted silently. Verify that every integer column fits first.
for col_name, target in ords_dtypes.items():
    target = np.dtype(target)
    if np.issubdtype(target, np.integer):
        limits = np.iinfo(target)
        if ords[col_name].min() < limits.min or ords[col_name].max() > limits.max:
            raise ValueError(f"'{col_name}' has values outside the {target} range")

# Cast all columns in a single call
ords = ords.astype(ords_dtypes)

In [11]:
# Check the memory usage and column data types of 'ords' dataframe after data types changes
# (confirms the new dtypes were applied and shows the reduced memory usage)
ords.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3421083 entries, 0 to 3421082
Data columns (total 6 columns):
 #   Column                  Dtype  
---  ------                  -----  
 0   order_id                int32  
 1   user_id                 int32  
 2   order_number            int8   
 3   order_day_of_week       int8   
 4   order_hour_of_day       int8   
 5   days_since_prior_order  float16
dtypes: float16(1), int32(2), int8(3)
memory usage: 68.5 MB


In [12]:
# Check the memory usage and column data types of 'ords_prods_prior' dataframe
# (baseline to compare against after the data type changes)
ords_prods_prior.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 32434489 entries, 0 to 32434488
Data columns (total 4 columns):
 #   Column             Dtype
---  ------             -----
 0   order_id           int64
 1   product_id         int64
 2   add_to_cart_order  int64
 3   reordered          int64
dtypes: int64(4)
memory usage: 1.2 GB


In [13]:
# Change data types for 'ords_prods_prior' dataframe
# Narrower target types reduce the memory footprint of the frame
ords_prods_prior_dtypes = {
    'order_id': 'int32',
    'product_id': 'int32',
    'add_to_cart_order': 'int32',
    'reordered': 'int8',
}

# Guard before casting: astype() does not check for integer overflow, so a value outside the
# target range would be corrupted silently. Verify that every integer column fits first.
for col_name, target in ords_prods_prior_dtypes.items():
    target = np.dtype(target)
    if np.issubdtype(target, np.integer):
        limits = np.iinfo(target)
        if (ords_prods_prior[col_name].min() < limits.min
                or ords_prods_prior[col_name].max() > limits.max):
            raise ValueError(f"'{col_name}' has values outside the {target} range")

# Cast all columns in a single call
ords_prods_prior = ords_prods_prior.astype(ords_prods_prior_dtypes)

In [14]:
# Check the memory usage and column data types of 'ords_prods_prior' dataframe after data types changes
# (confirms the new dtypes were applied and shows the reduced memory usage)
ords_prods_prior.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 32434489 entries, 0 to 32434488
Data columns (total 4 columns):
 #   Column             Dtype
---  ------             -----
 0   order_id           int32
 1   product_id         int32
 2   add_to_cart_order  int32
 3   reordered          int8 
dtypes: int32(3), int8(1)
memory usage: 649.6 MB


# 04. Merge Data

In [15]:
# Join 'ords' (left) with 'ords_prods_prior' (right) on the shared 'order_id' key.
# The join type is the default inner join, spelled out here; indicator=True adds
# a '_merge' flag column recording where each row's key was found.
ords_prods_combined = pd.merge(
    ords,
    ords_prods_prior,
    how='inner',
    on='order_id',
    indicator=True,
)

In [16]:
# Check the output: first rows of the merged dataframe, including the '_merge' flag column
ords_prods_combined.head()

Unnamed: 0,order_id,user_id,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,_merge
0,2539329,1,1,2,8,,196,1,0,both
1,2539329,1,1,2,8,,14084,2,0,both
2,2539329,1,1,2,8,,12427,3,0,both
3,2539329,1,1,2,8,,26088,4,0,both
4,2539329,1,1,2,8,,26405,5,0,both


In [17]:
# Frequency check of the merge flag for the inner-join result
ords_prods_combined['_merge'].value_counts(dropna = False)

both          32434489
left_only            0
right_only           0
Name: _merge, dtype: int64

The frequency check above shows only “both” entries, which might suggest that every “order_id” exists in both dataframes. That conclusion is wrong: the merge used the default inner join, so the resulting table contains only observations whose key is found in both dataframes. As such, the merge flag here can only ever show the value “both,” regardless of how many keys failed to match.

In [18]:
# An inner join cannot reveal unmatched keys, so repeat the merge as an outer join:
# same frames and same 'order_id' key, but rows found in only one frame are kept
# and flagged in '_merge' as 'left_only' or 'right_only'.
ords_prods_combined_outer = pd.merge(
    ords,
    ords_prods_prior,
    how='outer',
    on='order_id',
    indicator=True,
)

In [19]:
# Frequency check of the merge flag for the outer-join result
ords_prods_combined_outer['_merge'].value_counts(dropna = False)

both          32434489
left_only       206209
right_only           0
Name: _merge, dtype: int64

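From the frequency check above, the merge rate is actually not 100%: 206,209 rows are flagged “left_only,” meaning they come from `ords` and have no matching “order_id” in `ords_prods_prior`. For this Instacart project, we’ll only be working with data sets that have a full merge rate, so the inner-join result (`ords_prods_combined`) is the one exported below.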

# 05. Export Data

In [20]:
# Dimension check for 'ords_prods_combined' dataframe after data merging
# (the column count includes the '_merge' flag column added by the merge)
ords_prods_combined.shape

(32434489, 10)

In [21]:
# Save the inner-join result to the 'Prepared Data' folder as a pickle file.
# Note: the '_merge' flag column is still part of the dataframe being exported.
combined_pkl = os.path.join(path, '02 Data', 'Prepared Data', 'orders_products_combined.pkl')
ords_prods_combined.to_pickle(combined_pkl)