# Import Libraries

In [19]:
# Import libraries
import pandas as pd
import numpy as np
import os

# Import data sets

In [22]:
# Project root folder used to build every data path in this notebook.
# Set the INSTACART_PROJECT_DIR environment variable to run the notebook on another
# machine; when it is not set, the original local folder is used.
path = os.environ.get('INSTACART_PROJECT_DIR', r"C:\Users\HP\Documents\06-2024 Instacart Basket Analysis")

In [5]:
# load the raw orders and products tables from '02 Data/Original Data'
df_ords = pd.read_csv(
    os.path.join(path, '02 Data', 'Original Data', 'orders.csv'),
    index_col=False,
)
df_prods = pd.read_csv(
    os.path.join(path, '02 Data', 'Original Data', 'products.csv'),
    index_col=False,
)

# Data Wrangling

## Dropping columns

In [8]:
# removing the 'eval_set' column as it is not relevant data
# errors='ignore' keeps this cell re-runnable: without it a second run raises KeyError
# because the column has already been dropped
df_ords = df_ords.drop(columns=['eval_set'], errors='ignore')

## Renaming columns

In [10]:
# changing 'order_dow' to 'orders_day_of_week'
# the result is reassigned instead of using inplace=True so the step stays explicit and chainable
df_ords = df_ords.rename(columns={'order_dow': 'orders_day_of_week'})

In [11]:
# preview the first rows to confirm 'eval_set' is gone and the day-of-week column is renamed
df_ords.head()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order
0,2539329,1,1,2,8,
1,2398795,1,2,3,7,15.0
2,473747,1,3,3,12,21.0
3,2254736,1,4,4,7,29.0
4,431534,1,5,4,15,28.0


## Changing a variable's data type

In [13]:
# store 'order_id' as text so describe() no longer summarises it as a number
df_ords['order_id'] = df_ords['order_id'].astype(str)

In [14]:
# store 'user_id' as text so describe() no longer summarises it as a number
df_ords['user_id'] = df_ords['user_id'].astype(str)

In [15]:
# summary statistics; 'order_id' and 'user_id' are now strings, so they are not listed
df_ords.describe()

Unnamed: 0,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order
count,3421083.0,3421083.0,3421083.0,3214874.0
mean,17.15486,2.776219,13.45202,11.11484
std,17.73316,2.046829,4.226088,9.206737
min,1.0,0.0,0.0,0.0
25%,5.0,1.0,10.0,4.0
50%,11.0,3.0,13.0,7.0
75%,23.0,5.0,16.0,15.0
max,100.0,6.0,23.0,30.0


## Transposing Data

In [25]:
# load the raw departments table from '02 Data/Original Data'
df_dep = pd.read_csv(
    os.path.join(path, '02 Data', 'Original Data', 'departments.csv'),
    index_col=False,
)

In [26]:
# the file is stored wide: a single row, with the department ids as column headers
df_dep.head()

Unnamed: 0,department_id,1,2,3,4,5,6,7,8,9,...,12,13,14,15,16,17,18,19,20,21
0,department,frozen,other,bakery,produce,alcohol,international,beverages,pets,dry goods pasta,...,meat seafood,pantry,breakfast,canned goods,dairy eggs,household,babies,snacks,deli,missing


In [27]:
# preview the transpose: department ids become the row index
df_dep.transpose()

Unnamed: 0,0
department_id,department
1,frozen
2,other
3,bakery
4,produce
5,alcohol
6,international
7,beverages
8,pets
9,dry goods pasta


In [28]:
# keep the transposed (long-format) departments table for the following steps
df_dep_t = df_dep.transpose()

In [29]:
# inspect the stored transpose
df_dep_t

Unnamed: 0,0
department_id,department
1,frozen
2,other
3,bakery
4,produce
5,alcohol
6,international
7,beverages
8,pets
9,dry goods pasta


In [30]:
# preview only: the result of reset_index() is displayed but not assigned,
# so df_dep_t itself is not changed by this cell
df_dep_t.reset_index()

Unnamed: 0,index,0
0,department_id,department
1,1,frozen
2,2,other
3,3,bakery
4,4,produce
5,5,alcohol
6,6,international
7,7,beverages
8,8,pets
9,9,dry goods pasta


### Creating new header

In [32]:
# take the first row of df_dep_t for the header
# (that row holds the label 'department' rather than an actual department)
new_header = df_dep_t.iloc[0]

In [33]:
# display the header row: a Series holding the single label 'department'
new_header

0    department
Name: department_id, dtype: object

In [34]:
# drop the first row, which holds the header label rather than a department
df_dep_t_new = df_dep_t.iloc[1:]

In [35]:
# confirm the 'department_id' row is gone; the column is still labelled 0 at this point
df_dep_t_new

Unnamed: 0,0
1,frozen
2,other
3,bakery
4,produce
5,alcohol
6,international
7,beverages
8,pets
9,dry goods pasta
10,bulk


In [36]:
# add the new header
# new_header is a Series named 'department_id' holding the label 'department', so the
# single column becomes 'department' and 'department_id' shows up in the displayed header
df_dep_t_new.columns = new_header

In [37]:
# confirm the column is now labelled 'department'
df_dep_t_new

department_id,department
1,frozen
2,other
3,bakery
4,produce
5,alcohol
6,international
7,beverages
8,pets
9,dry goods pasta
10,bulk


# Data Dictionary

In [31]:
# build a lookup keyed by department id: {id: {'department': name}}
data_dict = df_dep_t_new.to_dict(orient='index')

In [32]:
# display the full department lookup
data_dict

{'1': {'department': 'frozen'},
 '2': {'department': 'other'},
 '3': {'department': 'bakery'},
 '4': {'department': 'produce'},
 '5': {'department': 'alcohol'},
 '6': {'department': 'international'},
 '7': {'department': 'beverages'},
 '8': {'department': 'pets'},
 '9': {'department': 'dry goods pasta'},
 '10': {'department': 'bulk'},
 '11': {'department': 'personal care'},
 '12': {'department': 'meat seafood'},
 '13': {'department': 'pantry'},
 '14': {'department': 'breakfast'},
 '15': {'department': 'canned goods'},
 '16': {'department': 'dairy eggs'},
 '17': {'department': 'household'},
 '18': {'department': 'babies'},
 '19': {'department': 'snacks'},
 '20': {'department': 'deli'},
 '21': {'department': 'missing'}}

In [33]:
# preview df_prods to see how 'department_id' appears in the products table
df_prods.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,1,Chocolate Sandwich Cookies,61,19,5.8
1,2,All-Seasons Salt,104,13,9.3
2,3,Robust Golden Unsweetened Oolong Tea,94,7,4.5
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1,10.5
4,5,Green Chile Anytime Sauce,5,13,4.3


In [34]:
# look up department 19; the keys of data_dict are strings, so the id is passed as '19'
print(data_dict.get('19'))

{'department': 'snacks'}


# Subsetting

In [36]:
# boolean mask: True for products whose 'department_id' is 19 (snacks)
df_prods['department_id']==19

0         True
1        False
2        False
3        False
4        False
         ...  
49688    False
49689    False
49690    False
49691    False
49692    False
Name: department_id, Length: 49693, dtype: bool

In [37]:
# use the mask to keep only the rows of department 19 (snacks)
df_prods[df_prods['department_id']==19]

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,1,Chocolate Sandwich Cookies,61,19,5.8
15,16,Mint Chocolate Flavored Syrup,103,19,5.2
24,25,Salted Caramel Lean Protein & Fiber Bar,3,19,1.9
31,32,Nacho Cheese White Bean Chips,107,19,4.9
40,41,Organic Sourdough Einkorn Crackers Rosemary,78,19,6.5
...,...,...,...,...,...
49666,49662,Bacon Cheddar Pretzel Pieces,107,19,3.6
49669,49665,Super Dark Coconut Ash & Banana Chocolate Bar,45,19,6.9
49670,49666,Ginger Snaps Snacking Cookies,61,19,5.2
49675,49671,Milk Chocolate Drops,45,19,3.0


In [38]:
# store the snacks subset (department 19) selected with a boolean mask
df_snacks = df_prods[df_prods['department_id'] == 19]

In [39]:
# check the first rows of the snacks subset
df_snacks.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,1,Chocolate Sandwich Cookies,61,19,5.8
15,16,Mint Chocolate Flavored Syrup,103,19,5.2
24,25,Salted Caramel Lean Protein & Fiber Bar,3,19,1.9
31,32,Nacho Cheese White Bean Chips,107,19,4.9
40,41,Organic Sourdough Einkorn Crackers Rosemary,78,19,6.5


In [40]:
# alternative syntax: the same department 19 filter, written with .loc and a boolean mask
df_snacks_2 = df_prods.loc[df_prods['department_id'] == 19]

In [41]:
# display the .loc-based subset
df_snacks_2

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,1,Chocolate Sandwich Cookies,61,19,5.8
15,16,Mint Chocolate Flavored Syrup,103,19,5.2
24,25,Salted Caramel Lean Protein & Fiber Bar,3,19,1.9
31,32,Nacho Cheese White Bean Chips,107,19,4.9
40,41,Organic Sourdough Einkorn Crackers Rosemary,78,19,6.5
...,...,...,...,...,...
49666,49662,Bacon Cheddar Pretzel Pieces,107,19,3.6
49669,49665,Super Dark Coconut Ash & Banana Chocolate Bar,45,19,6.9
49670,49666,Ginger Snaps Snacking Cookies,61,19,5.2
49675,49671,Milk Chocolate Drops,45,19,3.0


In [42]:
# alternative syntax: the same department 19 filter, written with isin(), which takes a list of ids
df_snacks_3 = df_prods.loc[df_prods['department_id'].isin([19])]

In [43]:
# display the isin()-based subset
df_snacks_3

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,1,Chocolate Sandwich Cookies,61,19,5.8
15,16,Mint Chocolate Flavored Syrup,103,19,5.2
24,25,Salted Caramel Lean Protein & Fiber Bar,3,19,1.9
31,32,Nacho Cheese White Bean Chips,107,19,4.9
40,41,Organic Sourdough Einkorn Crackers Rosemary,78,19,6.5
...,...,...,...,...,...
49666,49662,Bacon Cheddar Pretzel Pieces,107,19,3.6
49669,49665,Super Dark Coconut Ash & Banana Chocolate Bar,45,19,6.9
49670,49666,Ginger Snaps Snacking Cookies,61,19,5.2
49675,49671,Milk Chocolate Drops,45,19,3.0


# Task 4.4

## 02. Changing a df_ords data type

In [46]:
# baseline summary before converting 'order_number'
df_ords.describe()

Unnamed: 0,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order
count,3421083.0,3421083.0,3421083.0,3214874.0
mean,17.15486,2.776219,13.45202,11.11484
std,17.73316,2.046829,4.226088,9.206737
min,1.0,0.0,0.0,0.0
25%,5.0,1.0,10.0,4.0
50%,11.0,3.0,13.0,7.0
75%,23.0,5.0,16.0,15.0
max,100.0,6.0,23.0,30.0


In [47]:
# store 'order_number' as text
df_ords['order_number'] = df_ords['order_number'].astype(str)

In [48]:
# verify the conversion: the column should now report the object dtype ('O')
df_ords['order_number'].dtype

dtype('O')

In [49]:
# 'order_number' should no longer be listed in the summary now that it is a string
df_ords.describe()

Unnamed: 0,orders_day_of_week,order_hour_of_day,days_since_prior_order
count,3421083.0,3421083.0,3214874.0
mean,2.776219,13.45202,11.11484
std,2.046829,4.226088,9.206737
min,0.0,0.0,0.0
25%,1.0,10.0,4.0
50%,3.0,13.0,7.0
75%,5.0,16.0,15.0
max,6.0,23.0,30.0


## 03. Changing a df_ords column name

In [51]:
# review the current column names before renaming
df_ords.head()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order
0,2539329,1,1,2,8,
1,2398795,1,2,3,7,15.0
2,473747,1,3,3,12,21.0
3,2254736,1,4,4,7,29.0
4,431534,1,5,4,15,28.0


In [52]:
# removing the 's' from 'orders_day_of_week' (I did not think the other column names were
# unintuitive, so I updated the column that was previously renamed)
# the result is reassigned instead of using inplace=True so the step stays explicit
df_ords = df_ords.rename(columns={'orders_day_of_week': 'order_day_of_week'})

In [53]:
# confirm the column is now named 'order_day_of_week'
df_ords.head()

Unnamed: 0,order_id,user_id,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order
0,2539329,1,1,2,8,
1,2398795,1,2,3,7,15.0
2,473747,1,3,3,12,21.0
3,2254736,1,4,4,7,29.0
4,431534,1,5,4,15,28.0


## 04. Finding the busiest hour for placing orders

In [55]:
# count the orders placed in each hour of the day, most frequent first (missing values included)
df_ords['order_hour_of_day'].value_counts(dropna=False)

order_hour_of_day
10    288418
11    284728
15    283639
14    283042
13    277999
12    272841
16    272553
9     257812
17    228795
18    182912
8     178201
19    140569
20    104292
7      91868
21     78109
22     61468
23     40043
6      30529
0      22758
1      12398
5       9569
2       7539
4       5527
3       5474
Name: count, dtype: int64

In [56]:
# The busiest hour for placing orders is 10am

## 05. Finding value of 4 in 'department_id' within df_prods

In [102]:
# look up department 4; the keys of data_dict are strings, so the id is passed as '4'
print(data_dict.get('4'))

{'department': 'produce'}


In [104]:
# the department_id of 4 is the produce category

## 06. Creating subset for breakfast item sales

In [113]:
# subset of the products in department 14 (breakfast)
df_breakfast = df_prods.loc[df_prods['department_id'] == 14]

In [115]:
# checking result: every row shown should have 'department_id' 14
df_breakfast.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
27,28,Wheat Chex Cereal,121,14,10.1
33,34,,121,14,12.2
67,68,"Pancake Mix, Buttermilk",130,14,13.7
89,90,Smorz Cereal,121,14,3.9
210,211,Gluten Free Organic Cereal Coconut Maple Vanilla,130,14,3.6


## 07. Creating subset for alcohol, beverages, meat/seafood, deli

In [120]:
# subset of the products in the alcohol (5), beverages (7), meat seafood (12) and deli (20) departments
df_dinner_party = df_prods.loc[df_prods['department_id'].isin([5, 7, 12, 20])]

In [122]:
# checking result: 'department_id' should only contain 5, 7, 12 or 20
df_dinner_party.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
2,3,Robust Golden Unsweetened Oolong Tea,94,7,4.5
6,7,Pure Coconut Water With Orange,98,7,4.4
9,10,Sparkling Orange Juice & Prickly Pear Beverage,115,7,8.4
10,11,Peach Mango Juice,31,7,2.8
16,17,Rendered Duck Fat,35,12,17.1


## 08. Dimensions of df_dinner_party

In [127]:
# using the '.shape' attribute (no parentheses) to find the dimensions as (rows, columns)
df_dinner_party.shape

(7650, 5)

In [129]:
# the 'df_dinner_party' has 7650 rows

## 09. Extracting information of user_id '1'

In [132]:
# preview df_ords to see how 'user_id' is stored before filtering on it
df_ords.head()

Unnamed: 0,order_id,user_id,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order
0,2539329,1,1,2,8,
1,2398795,1,2,3,7,15.0
2,473747,1,3,3,12,21.0
3,2254736,1,4,4,7,29.0
4,431534,1,5,4,15,28.0


In [146]:
# subset of the orders placed by user '1' ('user_id' was converted to a string, so compare with '1')
df_user_id1 = df_ords.loc[df_ords['user_id'] == '1']

In [150]:
# checking result: every row should have 'user_id' equal to '1'
df_user_id1

Unnamed: 0,order_id,user_id,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order
0,2539329,1,1,2,8,
1,2398795,1,2,3,7,15.0
2,473747,1,3,3,12,21.0
3,2254736,1,4,4,7,29.0
4,431534,1,5,4,15,28.0
5,3367565,1,6,2,7,19.0
6,550135,1,7,1,9,20.0
7,3108588,1,8,1,14,14.0
8,2295261,1,9,1,16,0.0
9,2550362,1,10,4,8,30.0


## 10. Pulling descriptive stats of user_id '1'

In [153]:
# using '.describe()' function
# only the numeric columns are summarised; 'order_id', 'user_id' and 'order_number'
# were converted to strings earlier in this notebook
df_user_id1.describe()

Unnamed: 0,order_day_of_week,order_hour_of_day,days_since_prior_order
count,11.0,11.0,10.0
mean,2.636364,10.090909,19.0
std,1.286291,3.477198,9.030811
min,1.0,7.0,0.0
25%,1.5,7.5,14.25
50%,3.0,8.0,19.5
75%,4.0,13.0,26.25
max,4.0,16.0,30.0


#### Observations

In [165]:
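# The user only places orders on days 1 to 4 of 'order_day_of_week' (Sunday to Wednesday, assuming 0 = Saturday).
# On average the user orders around 10am (mean 10.1), but half of the orders are placed by 8am (median 8); the latest order was placed at 4pm.
# They place an order roughly every 19 days (mean 19, median 19.5 days since the prior order).
# Only once did they place two orders on the same day (days_since_prior_order = 0): at 2pm and then at 4pm.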

## 12. Exporting 'df_ords'

In [168]:
# export the wrangled orders table to '02 Data/Prepared Data'
# NOTE(review): index=False is not passed, so the row index is written as an extra
# unnamed first column — confirm that downstream notebooks expect this
df_ords.to_csv(os.path.join(path, '02 Data','Prepared Data', 'orders_wrangled.csv'))

## 13. Exporting 'df_dep_t_new'

In [51]:
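# CSV export kept for reference only (disabled); the departments table is exported as a pickle in the next cell
# df_dep_t_new.to_csv(os.path.join(path, '02 Data', 'Prepared Data', 'departments_wrangled.csv'))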

In [53]:
# export the transposed departments table as a pickle file in '02 Data/Prepared Data'
df_dep_t_new.to_pickle(os.path.join(path, '02 Data', 'Prepared Data', 'departments_wrangled.pkl'))