# Instacart Grocery Basket data wrangling and subsetting

In [66]:
# Import libraries
import pandas as pd
import numpy as np
import os

In [67]:
# Establish data path
# The project root defaults to the original local folder. Set the
# INSTACART_PROJECT_PATH environment variable to run the notebook from a
# different location without editing this cell.
path = os.environ.get(
    'INSTACART_PROJECT_PATH',
    r'C:\Users\danie\Desktop\CareerFoundry\Achievement 4-Python\11-2023 Instacart Basket Analysis',
)
path

'C:\\Users\\danie\\Desktop\\CareerFoundry\\Achievement 4-Python\\11-2023 Instacart Basket Analysis'

In [68]:
# Load departments.csv from the Original Data folder
departments_csv = os.path.join(path, '02 Data', 'Original Data', 'departments.csv')
df_dep = pd.read_csv(departments_csv, index_col=False)

In [69]:
# Transpose df_dep (rows become columns and vice versa), keep the result
# under its own name, and display it
df_dep_t = df_dep.T
df_dep_t

Unnamed: 0,0
department_id,department
1,frozen
2,other
3,bakery
4,produce
5,alcohol
6,international
7,beverages
8,pets
9,dry goods pasta


In [70]:
# Add an index
# Display-only: reset_index() returns a new frame and the result is not
# assigned here, so df_dep_t itself keeps its original row labels. The
# following cells depend on that (the labels become the data_dict keys).
df_dep_t.reset_index()

Unnamed: 0,index,0
0,department_id,department
1,1,frozen
2,2,other
3,3,bakery
4,4,produce
5,5,alcohol
6,6,international
7,7,beverages
8,8,pets
9,9,dry goods pasta


In [71]:
# Promote the first row of the transposed frame to column headers
new_header = df_dep_t.iloc[0]      # the first row holds the intended column labels
df_dep_t_new = df_dep_t.iloc[1:]   # keep only the rows below it, so the header is not duplicated
df_dep_t_new.columns = new_header  # relabel the columns with that first row

In [72]:
# Build a data dictionary: one entry per row label of df_dep_t_new,
# each mapping column name -> value
data_dict = df_dep_t_new.to_dict(orient='index')
data_dict

{'1': {'department': 'frozen'},
 '2': {'department': 'other'},
 '3': {'department': 'bakery'},
 '4': {'department': 'produce'},
 '5': {'department': 'alcohol'},
 '6': {'department': 'international'},
 '7': {'department': 'beverages'},
 '8': {'department': 'pets'},
 '9': {'department': 'dry goods pasta'},
 '10': {'department': 'bulk'},
 '11': {'department': 'personal care'},
 '12': {'department': 'meat seafood'},
 '13': {'department': 'pantry'},
 '14': {'department': 'breakfast'},
 '15': {'department': 'canned goods'},
 '16': {'department': 'dairy eggs'},
 '17': {'department': 'household'},
 '18': {'department': 'babies'},
 '19': {'department': 'snacks'},
 '20': {'department': 'deli'},
 '21': {'department': 'missing'}}

In [73]:
# Load products.csv from the Original Data folder
products_csv = os.path.join(path, '02 Data', 'Original Data', 'products.csv')
df_prods = pd.read_csv(products_csv, index_col=False)

In [74]:
# Preview the first five rows of the products data
df_prods.head(5)

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,1,Chocolate Sandwich Cookies,61,19,5.8
1,2,All-Seasons Salt,104,13,9.3
2,3,Robust Golden Unsweetened Oolong Tea,94,7,4.5
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1,10.5
4,5,Green Chile Anytime Sauce,5,13,4.3


In [75]:
# What does the department_id of 19 for Chocolate Sandwich Cookies stand for? It's snacks!
# The keys of data_dict are strings, so the id is looked up as '19', not 19.
print(data_dict.get('19'))

{'department': 'snacks'}


In [76]:
# Load orders.csv from the Original Data folder
orders_csv = os.path.join(path, '02 Data', 'Original Data', 'orders.csv')
df_ords = pd.read_csv(orders_csv, index_col=False)

In [77]:
# Drop the eval_set column from the orders data.
# drop() returns a new frame, so the result is assigned back to df_ords.
# A single call is enough: an unassigned drop() would only build a copy
# of the whole frame and discard it.
df_ords = df_ords.drop(columns = ['eval_set'])

In [78]:
# Give the order_dow column a more descriptive name; df_ords is modified in place
df_ords.rename({'order_dow': 'orders_day_of_week'}, axis='columns', inplace=True)

In [79]:
# order_id is an identifier, not a quantity, so store it as a string
df_ords['order_id'] = df_ords['order_id'].astype(str)

In [80]:
# Subset df_prods to the snacks department (department_id 19 in data_dict)
df_snacks = df_prods.loc[df_prods['department_id'].eq(19)]

In [81]:
# Inspect the boolean mask behind the snacks subset: True where department_id is 19
df_prods['department_id'].eq(19)

0         True
1        False
2        False
3        False
4        False
         ...  
49688    False
49689    False
49690    False
49691    False
49692    False
Name: department_id, Length: 49693, dtype: bool

In [82]:
# Find another identifier variable in the df_ords dataframe that doesn't need to be a numeric variable
# Listing the column dtypes shows which columns are still stored as numbers.
df_ords.dtypes

order_id                   object
user_id                     int64
order_number                int64
orders_day_of_week          int64
order_hour_of_day           int64
days_since_prior_order    float64
dtype: object

In [83]:
# user_id is also an identifier, so convert it from integer to string
df_ords['user_id'] = df_ords['user_id'].astype(str)

In [84]:
# Rename days_since_prior_order to days_since_last_order.
# inplace = True modifies df_ords directly instead of reassigning the name to a
# new frame, so every later cell (and the export) sees the new column name.
df_ords.rename(columns = {'days_since_prior_order' : 'days_since_last_order'}, inplace = True)

In [85]:
# Find the busiest hour for placing orders: 10 o'clock
# The counts are listed from most to least frequent, so the first row is the
# busiest hour. dropna = False would also list missing values; none appear here.
df_ords['order_hour_of_day'].value_counts(dropna = False)

order_hour_of_day
10    288418
11    284728
15    283639
14    283042
13    277999
12    272841
16    272553
9     257812
17    228795
18    182912
8     178201
19    140569
20    104292
7      91868
21     78109
22     61468
23     40043
6      30529
0      22758
1      12398
5       9569
2       7539
4       5527
3       5474
Name: count, dtype: int64

In [86]:
# Determine the meaning behind a value of 4 in the department_id column of df_prods: Produce
# data_dict is keyed by strings, so the id is passed as '4'.
print(data_dict.get('4'))

{'department': 'produce'}


In [87]:
# Look up the department_id for breakfast items by displaying the data dictionary:
# 'breakfast' is listed under key '14'.
data_dict

{'1': {'department': 'frozen'},
 '2': {'department': 'other'},
 '3': {'department': 'bakery'},
 '4': {'department': 'produce'},
 '5': {'department': 'alcohol'},
 '6': {'department': 'international'},
 '7': {'department': 'beverages'},
 '8': {'department': 'pets'},
 '9': {'department': 'dry goods pasta'},
 '10': {'department': 'bulk'},
 '11': {'department': 'personal care'},
 '12': {'department': 'meat seafood'},
 '13': {'department': 'pantry'},
 '14': {'department': 'breakfast'},
 '15': {'department': 'canned goods'},
 '16': {'department': 'dairy eggs'},
 '17': {'department': 'household'},
 '18': {'department': 'babies'},
 '19': {'department': 'snacks'},
 '20': {'department': 'deli'},
 '21': {'department': 'missing'}}

In [88]:
# Subset df_prods to breakfast items (department_id 14 in data_dict)
df_breakfast = df_prods.loc[df_prods['department_id'].eq(14)]

In [89]:
# Build the dinner parties subset from four departments
dinner_party_dept_ids = [5, 20, 7, 12]  # alcohol, deli, beverages, meat seafood
df_dinner_parties = df_prods[df_prods['department_id'].isin(dinner_party_dept_ids)]

In [90]:
# Determine number of rows in df_dinner_parties: 7650 rows
# shape is (rows, columns).
df_dinner_parties.shape

(7650, 5)

In [91]:
# Pull every order placed by user_id 1.
# user_id was converted to string earlier, so the comparison value is '1'.
df_userid_1 = df_ords[df_ords['user_id'].eq('1')]
df_userid_1

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_last_order
0,2539329,1,1,2,8,
1,2398795,1,2,3,7,15.0
2,473747,1,3,3,12,21.0
3,2254736,1,4,4,7,29.0
4,431534,1,5,4,15,28.0
5,3367565,1,6,2,7,19.0
6,550135,1,7,1,9,20.0
7,3108588,1,8,1,14,14.0
8,2295261,1,9,1,16,0.0
9,2550362,1,10,4,8,30.0


In [92]:
# Provide basic stats on purchasing behavior of user_id of 1
# Only the numeric columns are summarised; order_id and user_id are strings
# and do not appear in the result.
df_userid_1.describe()

Unnamed: 0,order_number,orders_day_of_week,order_hour_of_day,days_since_last_order
count,11.0,11.0,11.0,10.0
mean,6.0,2.636364,10.090909,19.0
std,3.316625,1.286291,3.477198,9.030811
min,1.0,1.0,7.0,0.0
25%,3.5,1.5,7.5,14.25
50%,6.0,3.0,8.0,19.5
75%,8.5,4.0,13.0,26.25
max,11.0,4.0,16.0,30.0


In [94]:
# Write the wrangled orders data to orders_wrangled.csv in the Prepared Data folder
orders_out = os.path.join(path, '02 Data', 'Prepared Data', 'orders_wrangled.csv')
df_ords.to_csv(orders_out)

In [95]:
# Write the wrangled departments data to departments_wrangled.csv in the Prepared Data folder
departments_out = os.path.join(path, '02 Data', 'Prepared Data', 'departments_wrangled.csv')
df_dep_t_new.to_csv(departments_out)