# Contents List

01. Import libraries
02. Import and inspect data
03. Merge dataframes
04. Export merge as pkl

# 01. Import libraries

In [1]:
# import libraries
import pandas as pd
import numpy as np
import os

# 02. Import and inspect data

In [2]:
# create shortcut for data imports
# The project root can be overridden by setting the INSTACART_PROJECT_DIR
# environment variable, so the notebook can run on another machine without
# editing this cell. When the variable is not set, the original local folder
# is used unchanged.
path = os.environ.get(
    'INSTACART_PROJECT_DIR',
    r'C:\Users\jacym\Desktop\Career Foundry projects\04-2023 Instacart basket analysis',
)

In [4]:
# load the already-checked orders table from the prepared-data folder;
# index_col=False tells pandas not to use the first column as the index
df_ords = pd.read_csv(
    os.path.join(path, '02 Data', 'Prepared data', 'all_orders_checked.csv'),
    index_col=False,
)

In [3]:
# load the new, not yet checked order_products_prior table from the
# original-data folder; index_col=False keeps the default integer index
df_ords_prior = pd.read_csv(
    os.path.join(path, '02 Data', 'Original data', 'order_products_prior.csv'),
    index_col=False,
)

In [4]:
# consistency check: number of missing values in each column
df_ords_prior.isna().sum()

order_id             0
product_id           0
add_to_cart_order    0
reordered            0
dtype: int64

In [5]:
# consistency check: keep every row that exactly repeats an earlier row
df_ords_prior_dups = df_ords_prior.loc[df_ords_prior.duplicated()]

In [6]:
# display the duplicated rows collected above; an empty frame means
# no fully duplicated rows were found
df_ords_prior_dups

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered


In [7]:
# check summary statistics (count, mean, std, min, quartiles, max) of each
# column to spot implausible values
df_ords_prior.describe()

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered
count,32434490.0,32434490.0,32434490.0,32434490.0
mean,1710749.0,25576.34,8.351076,0.5896975
std,987300.7,14096.69,7.126671,0.4918886
min,2.0,1.0,1.0,0.0
25%,855943.0,13530.0,3.0,0.0
50%,1711048.0,25256.0,6.0,1.0
75%,2565514.0,37935.0,11.0,1.0
max,3421083.0,49688.0,145.0,1.0


In [8]:
# inspect the rows whose add_to_cart_order value is above 50, to understand
# the large maximum reported by describe()
df_ords_prior.loc[df_ords_prior['add_to_cart_order'].gt(50.0)]

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered
11332,1171,2677,51,1
11333,1171,6583,52,1
11334,1171,45636,53,1
11335,1171,38652,54,1
11336,1171,29627,55,1
...,...,...,...,...
32423239,3419937,21616,64,0
32425187,3420120,27243,51,1
32425188,3420120,651,52,1
32425189,3420120,42736,53,0


# Note: It looks like this column represents the order in which the item was added to the virtual cart, so the high values are only present for very large orders. Check out one large order to confirm...

In [10]:
# look at every row of one large order (order_id 1171) to see how
# add_to_cart_order behaves within a single order
df_ords_prior.loc[df_ords_prior['order_id'].eq(1171)]

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered
11282,1171,43086,1,1
11283,1171,47209,2,1
11284,1171,49683,3,0
11285,1171,24852,4,1
11286,1171,22825,5,1
...,...,...,...,...
11338,1171,25718,57,1
11339,1171,3913,58,1
11340,1171,42356,59,1
11341,1171,4656,60,0


# Confirmed: the column value increases as the customer adds items to the cart.

In [6]:
# check output: preview the first five rows of the order-products table
df_ords_prior.head()

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered
0,2,33120,1,1
1,2,28985,2,1
2,2,9327,3,0
3,2,45918,4,1
4,2,30035,5,0


In [7]:
# dimensions (rows, columns) of the order-products table before merging
df_ords_prior.shape

(32434489, 4)

In [8]:
# dimensions (rows, columns) of the orders table before merging
df_ords.shape

(3421083, 6)

In [9]:
# preview the first five rows of the orders table
df_ords.head()

Unnamed: 0,order_id,user_id,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order
0,2539329,1,1,2,8,
1,2398795,1,2,3,7,15.0
2,473747,1,3,3,12,21.0
3,2254736,1,4,4,7,29.0
4,431534,1,5,4,15,28.0


# 03. Merge dataframes

In [10]:
# Combine order-level data with the product rows of each order, joining on
# the shared 'order_id' key.
# - how='inner' spells out the pandas default: only order_ids present in
#   both frames are kept.
# - validate='one_to_many' makes pandas raise a MergeError if 'order_id' is
#   not unique in df_ords, instead of silently multiplying product rows.
# - indicator=True adds the '_merge' column that is inspected further down.
df_merged_large = df_ords.merge(
    df_ords_prior,
    on='order_id',
    how='inner',
    validate='one_to_many',
    indicator=True,
)

In [11]:
# preview the first five rows of the merged frame, including the '_merge'
# indicator column
df_merged_large.head()

Unnamed: 0,order_id,user_id,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,_merge
0,2539329,1,1,2,8,,196,1,0,both
1,2539329,1,1,2,8,,14084,2,0,both
2,2539329,1,1,2,8,,12427,3,0,both
3,2539329,1,1,2,8,,26088,4,0,both
4,2539329,1,1,2,8,,26405,5,0,both


In [12]:
# count the merged rows per indicator category
# NOTE(review): the merge is an inner join, which keeps only keys found in
# both frames, so every row is flagged 'both' and this tally cannot reveal
# unmatched rows - confirm whether an outer join was intended for this check
df_merged_large['_merge'].value_counts()

both          32434489
left_only            0
right_only           0
Name: _merge, dtype: int64

In [14]:
# dimensions (rows, columns) of the merged frame
df_merged_large.shape

(32434489, 10)

# 04. Export merge as pkl

In [13]:
# export the merged frame to pkl in the prepared-data folder
# The folder name is spelled 'Prepared data', exactly as in the read_csv call
# that loads all_orders_checked.csv, so the same directory is used on
# case-sensitive file systems too.
# NOTE: the '_merge' indicator column is still part of the frame and is
# therefore saved with it.

df_merged_large.to_pickle(os.path.join(path, '02 Data', 'Prepared data', 'orders_products_combined.pkl'))