### Step 1: Import Libraries

In [2]:
import pandas as pd
import os


### Step 2: Define Project Paths

In [3]:
# Root of the project's data folder. Set the INSTACART_DATA_DIR environment
# variable to run this notebook on another machine; when it is unset, the
# original local folder is used, so existing runs behave exactly as before.
project_path = os.environ.get(
    "INSTACART_DATA_DIR",
    r"C:\Users\mshhan\Documents\05-2024 Instacart Basket Analysis\02 Data",
)

# Input CSVs, located in the "Prepared Data" and "Original Data" subfolders
orders_path = os.path.join(project_path, "Prepared Data", "orders_wrangled.csv")
orders_prior_path = os.path.join(project_path, "Original Data", "orders_products_prior.csv")


### Step 3: Load DataFrames

In [4]:
# Read both source CSVs into DataFrames, in order:
#   df_ords       <- orders_path
#   df_ords_prior <- orders_prior_path
df_ords, df_ords_prior = [
    pd.read_csv(csv_file) for csv_file in (orders_path, orders_prior_path)
]


### Step 4: Inspect DataFrames

In [5]:
# Preview the first five rows of each loaded table: df_ords, then df_ords_prior
for preview in (df_ords.head(), df_ords_prior.head()):
    print(preview)


   order_id  user_id  order_number  day_of_week  order_hour_of_day  \
0   2539329        1             1            2                  8   
1   2398795        1             2            3                  7   
2    473747        1             3            3                 12   
3   2254736        1             4            4                  7   
4    431534        1             5            4                 15   

   days_since_prior_order  
0                     NaN  
1                    15.0  
2                    21.0  
3                    29.0  
4                    28.0  
   order_id  product_id  add_to_cart_order  reordered
0         2       33120                  1          1
1         2       28985                  2          1
2         2        9327                  3          0
3         2       45918                  4          1
4         2       30035                  5          0


### Step 5: Check Dimensions

In [6]:
# Report (rows, columns) for df_ords and then df_ords_prior, one per line
print(df_ords.shape, df_ords_prior.shape, sep="\n")


(3421083, 6)
(32434489, 4)


### Step 6: Merge DataFrames

In [8]:
# Attach each order's attributes to its product rows, joining on 'order_id'.
# - how='inner' is pandas' default, spelled out so the join type is explicit.
# - validate='one_to_many' makes pandas raise MergeError if 'order_id' repeats
#   in df_ords, which would otherwise silently duplicate product rows.
# - indicator=True adds the '_merge' column; for an inner join every row is
#   labelled 'both', so the column cannot reveal unmatched rows by itself.
df_merged_large = df_ords.merge(
    df_ords_prior,
    on='order_id',
    how='inner',
    validate='one_to_many',
    indicator=True,
)



### Step 7: Inspect Merged DataFrame

In [9]:
# Inspect the first few rows of df_merged_large
print(df_merged_large.head())

# Show the merge indicator counts. The merge that built df_merged_large is an
# inner join (pandas' default), so every surviving row is labelled 'both' and
# these counts alone cannot reveal rows that failed to match.
print(df_merged_large['_merge'].value_counts())

# Real completeness check: an inner join silently drops unmatched rows, so
# verify directly that every product row's order_id exists in df_ords.
unmatched_rows = int((~df_ords_prior['order_id'].isin(df_ords['order_id'])).sum())
assert unmatched_rows == 0, (
    f"{unmatched_rows} rows of df_ords_prior have no matching order_id in df_ords"
)


   order_id  user_id  order_number  day_of_week  order_hour_of_day  \
0   2539329        1             1            2                  8   
1   2539329        1             1            2                  8   
2   2539329        1             1            2                  8   
3   2539329        1             1            2                  8   
4   2539329        1             1            2                  8   

   days_since_prior_order  product_id  add_to_cart_order  reordered _merge  
0                     NaN         196                  1          0   both  
1                     NaN       14084                  2          0   both  
2                     NaN       12427                  3          0   both  
3                     NaN       26088                  4          0   both  
4                     NaN       26405                  5          0   both  
_merge
both          32434489
left_only            0
right_only           0
Name: count, dtype: int64


### Step 8: Save the Merged DataFrame

In [10]:
# Persist the merged table to the "Prepared Data" folder in two formats:
# CSV (written without the index column) and pickle.
prepared_dir = os.path.join(project_path, "Prepared Data")
csv_target = os.path.join(prepared_dir, "orders_products_combined.csv")
pickle_target = os.path.join(prepared_dir, "orders_products_combined.pkl")

df_merged_large.to_csv(csv_target, index=False)
df_merged_large.to_pickle(pickle_target)
