# 1. Importing libraries

In [1]:
import pandas as pd
import numpy as np
import os

# 2. Importing data

In [2]:
# Project root folder. Set the INSTACART_PROJECT_DIR environment variable to
# point the notebook at another location; when it is not set, the original
# local folder is used.
path = os.environ.get(
    'INSTACART_PROJECT_DIR',
    r'/Users/test/Desktop/Data Analysis/11-2024 Instacart Basket Analysis',
)

In [12]:
# Load the raw orders table from the 'Original Data' folder.
df_ords = pd.read_csv(
    os.path.join(path, '02 Data', 'Original Data', 'orders.csv'),
    index_col=False,
)

In [13]:
# Load the raw products table from the 'Original Data' folder.
df_prod = pd.read_csv(
    os.path.join(path, '02 Data', 'Original Data', 'products.csv'),
    index_col=False,
)

In [14]:
# Load the raw departments table from the 'Original Data' folder.
df_dep = pd.read_csv(
    os.path.join(path, '02 Data', 'Original Data', 'departments.csv'),
    index_col=False,
)

# 3. Data Wrangling 

In [15]:
# Remove the 'eval_set' column from the orders dataframe.
df_ords = df_ords.drop(columns='eval_set')

In [16]:
# Store order_id and user_id as strings in the orders dataframe.
df_ords = df_ords.astype({'order_id': 'str', 'user_id': 'str'})

In [17]:
# Rename column 'order_dow' to 'orders_day_of_week'.
# The result is reassigned instead of using inplace=True, matching the
# reassign pattern used when 'eval_set' is dropped from this dataframe.
df_ords = df_ords.rename(columns={'order_dow': 'orders_day_of_week'})

In [19]:
# Preview the first rows of the wrangled orders dataframe.
df_ords.head()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order
0,2539329,1,1,2,8,
1,2398795,1,2,3,7,15.0
2,473747,1,3,3,12,21.0
3,2254736,1,4,4,7,29.0
4,431534,1,5,4,15,28.0


In [21]:
# Calculate the frequency of orders for each hour of the day.
# The counts are taken from the orders dataframe (df_ords); the visible cells
# never define a dataframe named `df`, so referencing it fails on a fresh kernel.
order_hour_frequency = df_ords['order_hour_of_day'].value_counts()

In [22]:
# Show the number of orders per hour of day (as produced by value_counts()).
print(order_hour_frequency)

order_hour_of_day
10    288418
11    284728
15    283639
14    283042
13    277999
12    272841
16    272553
9     257812
17    228795
18    182912
8     178201
19    140569
20    104292
7      91868
21     78109
22     61468
23     40043
6      30529
0      22758
1      12398
5       9569
2       7539
4       5527
3       5474
Name: count, dtype: int64


The busiest hour for placing orders is: 10 with 288418 orders.

In [24]:
# Transpose the departments dataframe so that rows and columns swap places.
df_dep_t = df_dep.transpose()

In [25]:
# Take the first row of the transposed departments dataframe; it becomes the new header.
new_header = df_dep_t.iloc[0, :]

In [26]:
# Keep every row except the first one, which is used as the header.
df_dep_t_new = df_dep_t.iloc[1:]

In [27]:
# Replace the column labels with the header row stored in new_header.
df_dep_t_new.columns = new_header

# 4. Creating Data Dictionary

In [28]:
# Build the data dictionary: one entry per index label, mapping to that row's column values.
data_dict = df_dep_t_new.to_dict(orient='index')

In [29]:
# Look up the meaning of the value 4 in the "department_id" column of the df_prod dataframe.
# The lookup uses the string key '4' rather than the integer 4.
print(data_dict.get('4'))

{'department': 'produce'}


The department_id "4" corresponds to the category "produce".

# 5. Creating Subset

In [30]:
# Display the data dictionary to find the department_id needed for the breakfast subset.
data_dict

{'1': {'department': 'frozen'},
 '2': {'department': 'other'},
 '3': {'department': 'bakery'},
 '4': {'department': 'produce'},
 '5': {'department': 'alcohol'},
 '6': {'department': 'international'},
 '7': {'department': 'beverages'},
 '8': {'department': 'pets'},
 '9': {'department': 'dry goods pasta'},
 '10': {'department': 'bulk'},
 '11': {'department': 'personal care'},
 '12': {'department': 'meat seafood'},
 '13': {'department': 'pantry'},
 '14': {'department': 'breakfast'},
 '15': {'department': 'canned goods'},
 '16': {'department': 'dairy eggs'},
 '17': {'department': 'household'},
 '18': {'department': 'babies'},
 '19': {'department': 'snacks'},
 '20': {'department': 'deli'},
 '21': {'department': 'missing'}}

In [32]:
# Subset of products whose department_id is 14 ('breakfast' in data_dict).
df_prod_breakfast = df_prod[df_prod['department_id'] == 14]

In [33]:
# Preview the first rows of the breakfast subset.
df_prod_breakfast.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
27,28,Wheat Chex Cereal,121,14,10.1
33,34,,121,14,12.2
67,68,"Pancake Mix, Buttermilk",130,14,13.7
89,90,Smorz Cereal,121,14,3.9
210,211,Gluten Free Organic Cereal Coconut Maple Vanilla,130,14,3.6


In [34]:
# Creating "dinner party" subset. The department_ids for "alcohol", "beverages",
# "meat seafood" and "deli" are 5, 7, 12 and 20 respectively.
df_prod_dinner_party = df_prod[df_prod['department_id'].isin([5, 7, 12, 20])]

In [36]:
# Check how many rows the dinner party subset has; shape is (rows, columns).
df_prod_dinner_party.shape

(7650, 5)

The new dinner party dataframe has 7650 rows.

In [37]:
# Extract all orders placed by the user with user_id 1.
# user_id is stored as a string in df_ords, so the comparison uses '1'.
df_user1 = df_ords[df_ords['user_id'] == '1']
df_user1

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order
0,2539329,1,1,2,8,
1,2398795,1,2,3,7,15.0
2,473747,1,3,3,12,21.0
3,2254736,1,4,4,7,29.0
4,431534,1,5,4,15,28.0
5,3367565,1,6,2,7,19.0
6,550135,1,7,1,9,20.0
7,3108588,1,8,1,14,14.0
8,2295261,1,9,1,16,0.0
9,2550362,1,10,4,8,30.0


In [38]:
# Summary statistics describing this user's ordering behaviour.
df_user1.describe()

Unnamed: 0,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order
count,11.0,11.0,11.0,10.0
mean,6.0,2.636364,10.090909,19.0
std,3.316625,1.286291,3.477198,9.030811
min,1.0,1.0,7.0,0.0
25%,3.5,1.5,7.5,14.25
50%,6.0,3.0,8.0,19.5
75%,8.5,4.0,13.0,26.25
max,11.0,4.0,16.0,30.0


Insights from the table:
1). The user places an order every 19 days on average. At least once the user placed two orders on the same day (the minimum gap is 0 days), and the longest gap between orders is 30 days.
2). The user places orders mostly on Wednesday and Thursday, never from Friday to Sunday.
3). The user has placed 11 orders, all between 7:00 and 16:00.

# 6. Export Data

In [44]:
# Write the wrangled orders dataframe to the 'Prepared Data' folder.
# NOTE(review): to_csv also writes the row index as an unnamed first column by
# default; confirm downstream readers expect that column before changing it.
df_ords.to_csv(
    os.path.join(path, '02 Data', 'Prepared Data', 'orders_wrangled.csv')
)

In [45]:
# Write the transposed departments dataframe to the 'Prepared Data' folder.
# The index is written as well (to_csv default), so the index labels end up in the file.
df_dep_t_new.to_csv(
    os.path.join(path, '02 Data', 'Prepared Data', 'departments_wrangled.csv')
)