# Table of Contents
01. Import Libraries
02. Import Data
03. Change Data Types to Save Memory
04. Merge Data
05. Export Data

# 01. Import Libraries

In [1]:
# Import libraries
import pandas as pd
import numpy as np
import os

In [2]:
# Show every column when a dataframe is displayed in this notebook (None = no column limit)
pd.set_option('display.max_columns', None)

# 02. Import Data

In [3]:
# Define the main project folder path.
# If the INSTACART_PROJECT_PATH environment variable is set, it overrides the default below,
# so the notebook can run on another machine without editing this cell.
path = os.environ.get(
    'INSTACART_PROJECT_PATH',
    r'C:\Users\saich\Desktop\CareerFoundry\Data Immersion\Achievement 4 Python Fundamentals for Data Analysts\04-2023 Instacart Basket Analysis (github)',
)

In [4]:
# Load the 'orders_products_merged_flagged' pickle from the 'Prepared Data' folder
# NOTE: unpickling executes code stored in the file, so only load pickles from a trusted source
ords_prods_merged = pd.read_pickle(
    os.path.join(path, '02 Data', 'Prepared Data', 'orders_products_merged_flagged.pkl')
)

In [5]:
# Preview the first rows of 'ords_prods_merged' to confirm the import
ords_prods_merged.head()

Unnamed: 0,order_id,user_id,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,prices,_merge,price_range,busiest_day,busiest_days,busiest_period_of_day,max_order,loyalty_flag,avg_price,spender_flag,median_order_interval,order_frequency_flag
0,2539329,1,1,2,8,,196,1,0,Soda,77,7,9.0,both,Mid-range product,Regular busy,Regular busy,Average orders,10,New customer,6.367797,Low spender,20.5,Non-frequent customer
1,2398795,1,2,3,7,15.0,196,1,1,Soda,77,7,9.0,both,Mid-range product,Regular busy,Least busy,Average orders,10,New customer,6.367797,Low spender,20.5,Non-frequent customer
2,473747,1,3,3,12,21.0,196,1,1,Soda,77,7,9.0,both,Mid-range product,Regular busy,Least busy,Most orders,10,New customer,6.367797,Low spender,20.5,Non-frequent customer
3,2254736,1,4,4,7,29.0,196,1,1,Soda,77,7,9.0,both,Mid-range product,Least busy,Least busy,Average orders,10,New customer,6.367797,Low spender,20.5,Non-frequent customer
4,431534,1,5,4,15,28.0,196,1,1,Soda,77,7,9.0,both,Mid-range product,Least busy,Least busy,Most orders,10,New customer,6.367797,Low spender,20.5,Non-frequent customer


In [6]:
# Check the dimensions (rows, columns) of 'ords_prods_merged' dataframe
ords_prods_merged.shape

(32399732, 24)

In [7]:
# Load the 'customers_checked' CSV from the 'Prepared Data' folder;
# the file's first column is used as the dataframe index
cust = pd.read_csv(
    os.path.join(path, '02 Data', 'Prepared Data', 'customers_checked.csv'),
    index_col = 0,
)

In [8]:
# Preview the first rows of 'cust' to confirm the import
cust.head()

Unnamed: 0,user_id,gender,state,age,date_joined,dependant_counts,family_status,income
0,26711,Female,Missouri,48,1/1/2017,3,married,165665
1,33890,Female,New Mexico,36,1/1/2017,0,single,59285
2,65803,Male,Idaho,35,1/1/2017,2,married,99568
3,125935,Female,Iowa,40,1/1/2017,0,single,42049
4,130797,Female,Maryland,26,1/1/2017,1,married,40374


In [9]:
# Check the dimensions (rows, columns) of 'cust' dataframe
cust.shape

(206209, 8)

# 03. Change Data Types to Save Memory

In [10]:
# Check the memory usage and column data types of 'ords_prods_merged' dataframe
# to see whether any columns still need a smaller data type before the merge
ords_prods_merged.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 32399732 entries, 0 to 32399731
Data columns (total 24 columns):
 #   Column                  Dtype   
---  ------                  -----   
 0   order_id                int32   
 1   user_id                 int32   
 2   order_number            int8    
 3   order_day_of_week       int8    
 4   order_hour_of_day       int8    
 5   days_since_prior_order  float16 
 6   product_id              int32   
 7   add_to_cart_order       int32   
 8   reordered               int8    
 9   product_name            object  
 10  aisle_id                int8    
 11  department_id           int8    
 12  prices                  float32 
 13  _merge                  category
 14  price_range             object  
 15  busiest_day             object  
 16  busiest_days            object  
 17  busiest_period_of_day   object  
 18  max_order               int8    
 19  loyalty_flag            object  
 20  avg_price               float32 
 21  spende

In [11]:
# Check the memory usage and column data types of 'cust' dataframe
# (baseline before the data types are changed)
cust.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 206209 entries, 0 to 206208
Data columns (total 8 columns):
 #   Column            Non-Null Count   Dtype 
---  ------            --------------   ----- 
 0   user_id           206209 non-null  int64 
 1   gender            206209 non-null  object
 2   state             206209 non-null  object
 3   age               206209 non-null  int64 
 4   date_joined       206209 non-null  object
 5   dependant_counts  206209 non-null  int64 
 6   family_status     206209 non-null  object
 7   income            206209 non-null  int64 
dtypes: int64(4), object(4)
memory usage: 14.2+ MB


In [12]:
# Change the data types of the numeric columns in 'cust' dataframe to smaller integer types
cust = cust.astype({
    'user_id': 'int32',
    'age': 'int8',
    'dependant_counts': 'int8',
    'income': 'int32',
})

In [13]:
# Check the memory usage and column data types of 'cust' dataframe after the data type changes,
# to compare against the figures reported before the change
cust.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 206209 entries, 0 to 206208
Data columns (total 8 columns):
 #   Column            Non-Null Count   Dtype 
---  ------            --------------   ----- 
 0   user_id           206209 non-null  int32 
 1   gender            206209 non-null  object
 2   state             206209 non-null  object
 3   age               206209 non-null  int8  
 4   date_joined       206209 non-null  object
 5   dependant_counts  206209 non-null  int8  
 6   family_status     206209 non-null  object
 7   income            206209 non-null  int32 
dtypes: int32(2), int8(2), object(4)
memory usage: 9.8+ MB


# 04. Merge Data

In [14]:
# Before merging data, drop the existing '_merge' column in 'ords_prods_merged' dataframe;
# merging with indicator = True would otherwise raise an error because the column name is already taken.
# Reassigning the result (instead of inplace = True) together with errors = 'ignore'
# keeps this cell safe to re-run after the column has already been removed.
ords_prods_merged = ords_prods_merged.drop(columns = ['_merge'], errors = 'ignore')

In [15]:
# Confirm that the '_merge' column is no longer in 'ords_prods_merged'
ords_prods_merged.head()

Unnamed: 0,order_id,user_id,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,prices,price_range,busiest_day,busiest_days,busiest_period_of_day,max_order,loyalty_flag,avg_price,spender_flag,median_order_interval,order_frequency_flag
0,2539329,1,1,2,8,,196,1,0,Soda,77,7,9.0,Mid-range product,Regular busy,Regular busy,Average orders,10,New customer,6.367797,Low spender,20.5,Non-frequent customer
1,2398795,1,2,3,7,15.0,196,1,1,Soda,77,7,9.0,Mid-range product,Regular busy,Least busy,Average orders,10,New customer,6.367797,Low spender,20.5,Non-frequent customer
2,473747,1,3,3,12,21.0,196,1,1,Soda,77,7,9.0,Mid-range product,Regular busy,Least busy,Most orders,10,New customer,6.367797,Low spender,20.5,Non-frequent customer
3,2254736,1,4,4,7,29.0,196,1,1,Soda,77,7,9.0,Mid-range product,Least busy,Least busy,Average orders,10,New customer,6.367797,Low spender,20.5,Non-frequent customer
4,431534,1,5,4,15,28.0,196,1,1,Soda,77,7,9.0,Mid-range product,Least busy,Least busy,Most orders,10,New customer,6.367797,Low spender,20.5,Non-frequent customer


In [16]:
# Merge 'ords_prods_merged' dataframe with 'cust' dataframe using 'user_id' as the key column.
# how = 'inner' is the pandas default, written out so the join type is explicit.
# validate = 'many_to_one' makes pandas raise a MergeError if 'user_id' is duplicated in 'cust',
# which would otherwise silently multiply the order rows.
ords_prods_all = ords_prods_merged.merge(
    cust,
    on = 'user_id',
    how = 'inner',
    validate = 'many_to_one',
    indicator = True,
)

In [17]:
# Check the output: preview the first 20 rows of the merged dataframe,
# which should now carry the customer columns and the '_merge' flag
ords_prods_all.head(20)

Unnamed: 0,order_id,user_id,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,prices,price_range,busiest_day,busiest_days,busiest_period_of_day,max_order,loyalty_flag,avg_price,spender_flag,median_order_interval,order_frequency_flag,gender,state,age,date_joined,dependant_counts,family_status,income,_merge
0,2539329,1,1,2,8,,196,1,0,Soda,77,7,9.0,Mid-range product,Regular busy,Regular busy,Average orders,10,New customer,6.367797,Low spender,20.5,Non-frequent customer,Female,Alabama,31,2/17/2019,3,married,40423,both
1,2398795,1,2,3,7,15.0,196,1,1,Soda,77,7,9.0,Mid-range product,Regular busy,Least busy,Average orders,10,New customer,6.367797,Low spender,20.5,Non-frequent customer,Female,Alabama,31,2/17/2019,3,married,40423,both
2,473747,1,3,3,12,21.0,196,1,1,Soda,77,7,9.0,Mid-range product,Regular busy,Least busy,Most orders,10,New customer,6.367797,Low spender,20.5,Non-frequent customer,Female,Alabama,31,2/17/2019,3,married,40423,both
3,2254736,1,4,4,7,29.0,196,1,1,Soda,77,7,9.0,Mid-range product,Least busy,Least busy,Average orders,10,New customer,6.367797,Low spender,20.5,Non-frequent customer,Female,Alabama,31,2/17/2019,3,married,40423,both
4,431534,1,5,4,15,28.0,196,1,1,Soda,77,7,9.0,Mid-range product,Least busy,Least busy,Most orders,10,New customer,6.367797,Low spender,20.5,Non-frequent customer,Female,Alabama,31,2/17/2019,3,married,40423,both
5,3367565,1,6,2,7,19.0,196,1,1,Soda,77,7,9.0,Mid-range product,Regular busy,Regular busy,Average orders,10,New customer,6.367797,Low spender,20.5,Non-frequent customer,Female,Alabama,31,2/17/2019,3,married,40423,both
6,550135,1,7,1,9,20.0,196,1,1,Soda,77,7,9.0,Mid-range product,Regular busy,Most busy,Most orders,10,New customer,6.367797,Low spender,20.5,Non-frequent customer,Female,Alabama,31,2/17/2019,3,married,40423,both
7,3108588,1,8,1,14,14.0,196,2,1,Soda,77,7,9.0,Mid-range product,Regular busy,Most busy,Most orders,10,New customer,6.367797,Low spender,20.5,Non-frequent customer,Female,Alabama,31,2/17/2019,3,married,40423,both
8,2295261,1,9,1,16,0.0,196,4,1,Soda,77,7,9.0,Mid-range product,Regular busy,Most busy,Most orders,10,New customer,6.367797,Low spender,20.5,Non-frequent customer,Female,Alabama,31,2/17/2019,3,married,40423,both
9,2550362,1,10,4,8,30.0,196,1,1,Soda,77,7,9.0,Mid-range product,Least busy,Least busy,Average orders,10,New customer,6.367797,Low spender,20.5,Non-frequent customer,Female,Alabama,31,2/17/2019,3,married,40423,both


In [18]:
# Count the values of the merge flag (missing values included) to see how the rows matched
ords_prods_all['_merge'].value_counts(dropna = False)

both          32399732
left_only            0
right_only           0
Name: _merge, dtype: int64

From the frequency check above, every entry has a value of “both,” which might suggest that the key column, “user_id,” exists completely in both dataframes. However, that conclusion cannot be drawn from this merge alone, because the merge used an inner join (the pandas default). An inner join keeps only the observations found in both dataframes, so the merge flag here can only ever show entries with a value of “both.”

In [19]:
# Full-match check: repeat the merge of 'ords_prods_merged' and 'cust' on 'user_id' as an outer join,
# so that keys present in only one dataframe show up as 'left_only' / 'right_only' in the merge flag
ords_prods_all_outer = pd.merge(
    ords_prods_merged,
    cust,
    how = 'outer',
    on = 'user_id',
    indicator = True,
)

In [20]:
# Count the merge flag values of the outer join; any 'left_only' / 'right_only' rows would be unmatched keys
ords_prods_all_outer['_merge'].value_counts(dropna = False)

both          32399732
left_only            0
right_only           0
Name: _merge, dtype: int64

The outer join shows no “left_only” or “right_only” rows, so the actual merge rate of both dataframes is 100%.

# 05. Export Data

In [21]:
# Dimension check for 'ords_prods_all' dataframe after data merging.
# Each order row should match exactly one customer, so the merge must not add or lose rows;
# fail here, before the export, if the row count differs from 'ords_prods_merged'.
assert len(ords_prods_all) == len(ords_prods_merged), 'Row count changed during the merge'
ords_prods_all.shape

(32399732, 31)

In [22]:
# Save the merged 'ords_prods_all' dataframe as a pickle file in the 'Prepared Data' folder
ords_prods_all.to_pickle(
    os.path.join(path, '02 Data', 'Prepared Data', 'orders_products_all.pkl')
)