### This script contains the following points:

#### 1. Importing libraries
#### 2. Importing original order data, cleaned order data, and cleaned product data
#### 3. Check shape of original order and cleaned order data
#### 4. Merging original and cleaned order data frames
#### 5. Import products checked data
#### 6. Combine orders_products_combined with products data set
#### 7. Exporting df_merged_ord_prod to pkl

# 1. Importing libraries

In [8]:
# Import libraries
import pandas as pd
import numpy as np
import os

# 2. Importing original order data, cleaned order data, and cleaned product data

In [9]:
# Load the original (unprepared) order_products_prior file.
# index_col=False tells pandas not to use the first CSV column as the index.
df_ords_prior = pd.read_csv(
    filepath_or_buffer=r'C:\Users\kevan\Documents\Career Foundry\Data Immersion\Achievement 4\Instacart Basket Analysis\02 Data\Original Data\order_products_prior.csv',
    index_col=False,
)

In [10]:
# Load the checked orders file from the Prepared Data folder.
# index_col=[0] uses the first CSV column as the index instead of loading it
# as an extra data column.
df_ords_clean = pd.read_csv(
    filepath_or_buffer=r'C:\Users\kevan\Documents\Career Foundry\Data Immersion\Achievement 4\Instacart Basket Analysis\02 Data\Prepared Data\orders_checked.csv',
    index_col=[0],
)

In [11]:
# Load the checked products file from the Prepared Data folder.
# index_col=0 uses the first CSV column as the index, matching the later load
# of this same file in section 5. With index_col=False that column would be
# read as an extra data column instead (presumably the saved index — the later
# load yields exactly the five product columns).
df_prods_clean = pd.read_csv(r'C:\Users\kevan\Documents\Career Foundry\Data Immersion\Achievement 4\Instacart Basket Analysis\02 Data\Prepared Data\products_checked.csv', index_col=0)

In [28]:
# Root folder of the project; used with os.path.join when exporting results.
# Adjacent raw-string literals are concatenated into one path at compile time.
path = (
    r'C:\Users\kevan\Documents\Career Foundry\Data Immersion'
    r'\Achievement 4\Instacart Basket Analysis'
)

# 3. Check shape of original order and cleaned order data

In [12]:
# Inspect the (rows, columns) dimensions of the prior order-products frame
df_ords_prior.shape

(32434489, 4)

In [13]:
# Preview the first row to read off the column names and spot a shared merge key
df_ords_prior.head(1)

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered
0,2,33120,1,1


In [14]:
# Inspect the (rows, columns) dimensions of the cleaned orders frame
df_ords_clean.shape

(3421083, 6)

In [15]:
# Preview the first row to read off the column names and spot a shared merge key
df_ords_clean.head(1)

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order
0,2539329,1,1,2,8,


# 4. Merging original and cleaned order data frames

In [16]:
# Attach the order-level columns to each prior order line, matching on order_id.
# how='inner' is the pandas default and is only spelled out here for clarity;
# indicator=True adds a `_merge` column recording where each row's key was found.
orders_products_combined = pd.merge(
    df_ords_clean,
    df_ords_prior,
    how='inner',
    on='order_id',
    indicator=True,
)
orders_products_combined

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,_merge
0,2539329,1,1,2,8,,196,1,0,both
1,2539329,1,1,2,8,,14084,2,0,both
2,2539329,1,1,2,8,,12427,3,0,both
3,2539329,1,1,2,8,,26088,4,0,both
4,2539329,1,1,2,8,,26405,5,0,both
...,...,...,...,...,...,...,...,...,...,...
32434484,2977660,206209,13,1,12,7.0,14197,5,1,both
32434485,2977660,206209,13,1,12,7.0,38730,6,0,both
32434486,2977660,206209,13,1,12,7.0,31477,7,0,both
32434487,2977660,206209,13,1,12,7.0,6567,8,0,both


In [17]:
# Count rows per merge-indicator category.
# NOTE: the merge above uses the default inner join, which keeps only keys
# present in both frames, so every row is necessarily flagged 'both'. This
# count therefore cannot reveal unmatched rows; an outer join would be needed
# to see 'left_only' / 'right_only' rows.
orders_products_combined['_merge'].value_counts()

both          32434489
left_only            0
right_only           0
Name: _merge, dtype: int64

In [18]:
# Remove the temporary `_merge` indicator column now that it has been inspected.
# Dropping the column yields the same frame as re-running the merge without the
# indicator, but avoids repeating a join over tens of millions of rows.
# errors='ignore' keeps the cell safe to re-run once the column is already gone.
orders_products_combined = orders_products_combined.drop(columns=['_merge'], errors='ignore')
orders_products_combined

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered
0,2539329,1,1,2,8,,196,1,0
1,2539329,1,1,2,8,,14084,2,0
2,2539329,1,1,2,8,,12427,3,0
3,2539329,1,1,2,8,,26088,4,0
4,2539329,1,1,2,8,,26405,5,0
...,...,...,...,...,...,...,...,...,...
32434484,2977660,206209,13,1,12,7.0,14197,5,1
32434485,2977660,206209,13,1,12,7.0,38730,6,0
32434486,2977660,206209,13,1,12,7.0,31477,7,0
32434487,2977660,206209,13,1,12,7.0,6567,8,0


In [19]:
# Inspect the (rows, columns) dimensions of the merged orders/order-products frame
orders_products_combined.shape

(32434489, 9)

# 5. Import products checked data

In [20]:
# Load the checked products file from the Prepared Data folder.
# index_col=0 uses the first CSV column as the index instead of loading it
# as an extra data column.
df_prods_clean = pd.read_csv(
    filepath_or_buffer=r'C:\Users\kevan\Documents\Career Foundry\Data Immersion\Achievement 4\Instacart Basket Analysis\02 Data\Prepared Data\products_checked.csv',
    index_col=0,
)

# 6. Combine orders_products_combined with products data set

In [21]:
# Inspect the (rows, columns) dimensions of the cleaned products frame
df_prods_clean.shape

(49672, 5)

In [22]:
# Preview the first row of the products frame to read off its column names
df_prods_clean.head(1)

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,1,Chocolate Sandwich Cookies,61,19,5.8


In [23]:
# Preview the first row of orders_products_combined to read off its column names
orders_products_combined.head(1)

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered
0,2539329,1,1,2,8,,196,1,0


### `product_id` is the column common to both data sets.
### An inner join is performed using `product_id` as the key, so only rows whose `product_id` appears in both data sets are kept in the final data frame.

In [24]:
# Join the product attributes onto every order line via the shared product_id key.
# how='inner' is the pandas default and is only spelled out here for clarity;
# indicator=True adds a `_merge` column recording where each row's key was found.
df_merged_ord_prod = pd.merge(
    orders_products_combined,
    df_prods_clean,
    how='inner',
    on='product_id',
    indicator=True,
)

### Checking counts of merge values to verify that merge was successful

In [25]:
# Count rows per merge-indicator category.
# NOTE: with the default inner join every surviving row is flagged 'both', so
# this count does not prove that nothing was dropped. Rows whose product_id is
# absent from df_prods_clean are excluded by the join; compare the total here
# with orders_products_combined.shape[0] to see how many. In the recorded run
# that is 32,404,859 rows after the merge versus 32,434,489 before it.

df_merged_ord_prod['_merge'].value_counts()

both          32404859
left_only            0
right_only           0
Name: _merge, dtype: int64

In [26]:
# Remove the temporary `_merge` indicator column now that it has been inspected.
# Dropping the column yields the same frame as re-running the merge without the
# indicator, but avoids repeating a join over tens of millions of rows.
# errors='ignore' keeps the cell safe to re-run once the column is already gone.
df_merged_ord_prod = df_merged_ord_prod.drop(columns=['_merge'], errors='ignore')

# 7. Exporting df_merged_ord_prod to pkl

In [29]:
# Save the merged frame as a pickle inside the project's Prepared Data folder.
export_file = os.path.join(path, '02 Data', 'Prepared Data', 'orders_products_merged.pkl')
df_merged_ord_prod.to_pickle(export_file)