# 4.6 - Combining & Exporting Data - Part 2

### This script contains the following points:

* 01 - Importing Libraries
* 02 - Importing Data
   * a) Check the output of df_ords_prods_comb
   * b) Check the output of df_prods
* 03 - Merging Data
* 04 - Checking Data
* 05 - Exporting Data

---

## 01 - Importing Libraries

In [55]:
# Import libraries

import pandas as pd
import numpy as np
import os

---

## 02 - Importing Data

In [56]:
# Define path
# NOTE(review): absolute, machine-specific project root. The data paths used for
# import and export are joined onto it, so adjust it before running elsewhere.

path = r'/Users/juanigalvalisi/01-07-2022 - Instacart Basket Analysis'

In [57]:
# Load both inputs from '<path>/02 - Data/Prepared Data':
# the combined orders/products frame (pickle) and the checked products table (CSV)

prepared_dir = os.path.join(path, '02 - Data', 'Prepared Data')

df_ords_prods_comb = pd.read_pickle(os.path.join(prepared_dir, 'orders_products_combined.pkl'))
df_prods = pd.read_csv(os.path.join(prepared_dir, 'products_checked.csv'))

### a) Check the output of df_ords_prods_comb

In [58]:
# Preview the first rows of df_ords_prods_comb

df_ords_prods_comb.head()

Unnamed: 0.1,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_last_order,new_customer,Unnamed: 0,product_id,add_to_cart_order,reordered,_merge
0,2539329,1,1,2,8,,True,24076664,196,1,0,both
1,2539329,1,1,2,8,,True,24076665,14084,2,0,both
2,2539329,1,1,2,8,,True,24076666,12427,3,0,both
3,2539329,1,1,2,8,,True,24076667,26088,4,0,both
4,2539329,1,1,2,8,,True,24076668,26405,5,0,both


In [59]:
# Check the dimensions (rows, columns) of df_ords_prods_comb
df_ords_prods_comb.shape

(32434489, 12)

### b) Check the output of df_prods

In [60]:
# Preview the first rows of df_prods

df_prods.head()

Unnamed: 0.1,Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,0,1,Chocolate Sandwich Cookies,61,19,5.8
1,1,2,All-Seasons Salt,104,13,9.3
2,2,3,Robust Golden Unsweetened Oolong Tea,94,7,4.5
3,3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1,10.5
4,4,5,Green Chile Anytime Sauce,5,13,4.3


In [61]:
# Check the dimensions (rows, columns) of df_prods
df_prods.shape

(49672, 6)

In [62]:
# Remove the 'Unnamed: 0' column from df_prods (in place)

df_prods.drop(columns = ['Unnamed: 0'], inplace = True)

In [63]:
# Confirm that 'Unnamed: 0' is no longer among the df_prods columns

df_prods.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,1,Chocolate Sandwich Cookies,61,19,5.8
1,2,All-Seasons Salt,104,13,9.3
2,3,Robust Golden Unsweetened Oolong Tea,94,7,4.5
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1,10.5
4,5,Green Chile Anytime Sauce,5,13,4.3


---

## 03 - Merging Data

In [64]:
# Remove the existing '_merge' column (presumably left over from an earlier merge)
# so that the next merge can create its own '_merge' indicator column

df_ords_prods_comb = df_ords_prods_comb.drop('_merge', axis = 'columns')

In [65]:
# Confirm that '_merge' is no longer among the df_ords_prods_comb columns

df_ords_prods_comb.head()

Unnamed: 0.1,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_last_order,new_customer,Unnamed: 0,product_id,add_to_cart_order,reordered
0,2539329,1,1,2,8,,True,24076664,196,1,0
1,2539329,1,1,2,8,,True,24076665,14084,2,0
2,2539329,1,1,2,8,,True,24076666,12427,3,0
3,2539329,1,1,2,8,,True,24076667,26088,4,0
4,2539329,1,1,2,8,,True,24076668,26405,5,0


In [66]:
# Preview df_prods once more before merging (shared key column: 'product_id')
df_prods.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,1,Chocolate Sandwich Cookies,61,19,5.8
1,2,All-Seasons Salt,104,13,9.3
2,3,Robust Golden Unsweetened Oolong Tea,94,7,4.5
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1,10.5
4,5,Green Chile Anytime Sauce,5,13,4.3


In [67]:
# Join the product details onto every order line via 'product_id'.
# how = 'inner' is the pandas default, spelled out here: rows whose product_id
# has no match in the other frame are dropped from the result.

df_ords_prods_comb = pd.merge(df_ords_prods_comb, df_prods, how = 'inner', on = 'product_id', indicator = True)

In [68]:
# Preview the merged frame; the product columns and '_merge' are appended on the right

df_ords_prods_comb.head()

Unnamed: 0.1,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_last_order,new_customer,Unnamed: 0,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,prices,_merge
0,2539329,1,1,2,8,,True,24076664,196,1,0,Soda,77,7,9.0,both
1,2398795,1,2,3,7,15.0,False,22742744,196,1,1,Soda,77,7,9.0,both
2,473747,1,3,3,12,21.0,False,4488095,196,1,1,Soda,77,7,9.0,both
3,2254736,1,4,4,7,29.0,False,21376074,196,1,1,Soda,77,7,9.0,both
4,431534,1,5,4,15,28.0,False,4089398,196,1,1,Soda,77,7,9.0,both


In [69]:
# Check the dimensions (rows, columns) of the merged frame

df_ords_prods_comb.shape

(32404859, 16)

In [70]:
# Check the output using the merge flag
# NOTE(review): the merge was run with the default inner join, so only 'both'
# can occur here; this count cannot reveal rows that failed to match.

df_ords_prods_comb.value_counts(['_merge'])

_merge    
both          32404859
left_only            0
right_only           0
dtype: int64

In [71]:
# Drop the _merge column because it is no longer needed

df_ords_prods_comb.drop(columns = ['_merge'], inplace = True)

---

## 04 - Checking Data

### Looking for Outliers

In [72]:
# Summarise 'prices' per department (mean, min, max, count) to look for outliers

departmemts_agg = df_ords_prods_comb.groupby('department_id')[['prices']].agg(['mean', 'min', 'max', 'count'])

In [73]:
# Display the per-department price summary
departmemts_agg

Unnamed: 0_level_0,prices,prices,prices,prices
Unnamed: 0_level_1,mean,min,max,count
department_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1,7.736553,1.0,15.0,2234743
2,6.990934,1.1,15.0,36291
3,7.853809,1.0,15.0,1172428
4,7.981708,1.0,15.0,9479291
5,8.143701,1.0,15.0,153696
6,7.682264,1.0,15.0,269253
7,7.680502,1.0,15.0,2688123
8,7.88907,1.0,15.0,97716
9,7.350283,1.0,15.0,866627
10,8.34896,1.4,14.1,34573


In [74]:
# Create a flag to find outliers within department 16 ('dairy eggs')
#
# A row is an 'Outlier' only when its price is one of the two known bad values
# (99999 or 14900) AND it belongs to department 16; every other row is 'Not outlier'.
# 'Not outlier' is assigned as a plain default: separate `!=` filters per bad value
# would jointly match every row anyway, because no price equals both values.
# NOTE(review): the same bad prices in any other department stay 'Not outlier' -
# confirm that restricting the flag to department 16 is intended.

# Not outlier (default for all rows)

df_ords_prods_comb['outlier_prices'] = 'Not outlier'

# Outlier

is_bad_price = df_ords_prods_comb['prices'].isin([99999, 14900])
in_dept_16 = df_ords_prods_comb['department_id'] == 16
df_ords_prods_comb.loc[is_bad_price & in_dept_16, 'outlier_prices'] = 'Outlier'

In [75]:
# Count the rows per flag value (missing values included)

df_ords_prods_comb['outlier_prices'].value_counts(dropna = False)

Not outlier    32399732
Outlier            5127
Name: outlier_prices, dtype: int64

In [76]:
# Correct the 5127 outliers based on neighboring prices:
# 99999 is replaced with 9.99 and 14900 with 14.90

price_fixes = {99999: 9.99, 14900: 14.90}
df_ords_prods_comb['prices'] = df_ords_prods_comb['prices'].replace(price_fixes)

In [77]:
# Re-run the per-department price summary (mean, min, max, count) after the correction

df_ords_prods_comb.groupby('department_id')[['prices']].agg(['mean', 'min', 'max', 'count'])

Unnamed: 0_level_0,prices,prices,prices,prices
Unnamed: 0_level_1,mean,min,max,count
department_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1,7.736553,1.0,15.0,2234743
2,6.990934,1.1,15.0,36291
3,7.853809,1.0,15.0,1172428
4,7.981708,1.0,15.0,9479291
5,8.143701,1.0,15.0,153696
6,7.682264,1.0,15.0,269253
7,7.680502,1.0,15.0,2688123
8,7.88907,1.0,15.0,97716
9,7.350283,1.0,15.0,866627
10,8.34896,1.4,14.1,34573


---

## 05 - Exporting Data

In [78]:
# Export the merged df_ords_prods_comb data frame as .pkl

export_path = os.path.join(path, '02 - Data', 'Prepared Data', 'orders_products_merged.pkl')
df_ords_prods_comb.to_pickle(export_path)