# Contents List

01. Import libraries
02. Import data
03. Data wrangling exercise
04. Data wrangling task

# 01. Import libraries

In [1]:
# import libraries
import pandas as pd
import numpy as np
import os

# 02. Import data

In [2]:
# create shortcut for data imports
path = r'C:\Users\jacym\Desktop\Career Foundry projects\04-2023 Instacart basket analysis'

In [3]:
# import orders.csv
df_ords = pd.read_csv(os.path.join(path, '02 Data', 'Original data', 'orders.csv'), index_col = False)

In [4]:
# import products.csv
df_prods = pd.read_csv(os.path.join(path, '02 Data', 'Original data', 'products.csv'), index_col = False)

In [5]:
# import departments.csv
df_dep = pd.read_csv(os.path.join(path, '02 Data', 'Original data', 'departments.csv'), index_col = False)

# 03. Data wrangling exercise

In [6]:
# drop eval_set column
df_ords = df_ords.drop(columns = 'eval_set')

In [7]:
# count blanks in days_since_prior_order columns
df_ords['days_since_prior_order'].value_counts(dropna = False)

30.0    369323
7.0     320608
6.0     240013
4.0     221696
3.0     217005
5.0     214503
NaN     206209
2.0     193206
8.0     181717
1.0     145247
9.0     118188
14.0    100230
10.0     95186
13.0     83214
11.0     80970
12.0     76146
0.0      67755
15.0     66579
16.0     46941
21.0     45470
17.0     39245
20.0     38527
18.0     35881
19.0     34384
22.0     32012
28.0     26777
23.0     23885
27.0     22013
24.0     20712
25.0     19234
29.0     19191
26.0     19016
Name: days_since_prior_order, dtype: int64

In [8]:
# rename column
df_ords.rename(columns = {'order_dow' : 'order_day_of_week'}, inplace = True)

In [9]:
df_ords.head()

Unnamed: 0,order_id,user_id,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order
0,2539329,1,1,2,8,
1,2398795,1,2,3,7,15.0
2,473747,1,3,3,12,21.0
3,2254736,1,4,4,7,29.0
4,431534,1,5,4,15,28.0


In [10]:
# change data type for order_id to string so describe() disregards it
df_ords['order_id'] = df_ords['order_id'].astype('str')

In [11]:
# check that it worked
df_ords['order_id'].dtype

dtype('O')

In [12]:
df_dep.head()

Unnamed: 0,department_id,1,2,3,4,5,6,7,8,9,...,12,13,14,15,16,17,18,19,20,21
0,department,frozen,other,bakery,produce,alcohol,international,beverages,pets,dry goods pasta,...,meat seafood,pantry,breakfast,canned goods,dairy eggs,household,babies,snacks,deli,missing


In [13]:
#transpose df (switch rows/columns)
df_dep.T

Unnamed: 0,0
department_id,department
1,frozen
2,other
3,bakery
4,produce
5,alcohol
6,international
7,beverages
8,pets
9,dry goods pasta


In [14]:
# overwrite df with transposed df
df_dep_t = df_dep.T

In [15]:
# add index
df_dep_t.reset_index()

Unnamed: 0,index,0
0,department_id,department
1,1,frozen
2,2,other
3,3,bakery
4,4,produce
5,5,alcohol
6,6,international
7,7,beverages
8,8,pets
9,9,dry goods pasta


In [16]:
# create new header
new_header = df_dep_t.iloc[0]

In [17]:
# create new dataset without first row
df_dep_t_new = df_dep_t[1:]

In [18]:
# set column names with new_header variable established earlier
df_dep_t_new.columns = new_header

In [19]:
df_dep_t_new.head()

department_id,department
1,frozen
2,other
3,bakery
4,produce
5,alcohol


In [20]:
# turn df_dep into data dictionary
data_dict = df_dep_t_new.to_dict('index')

In [21]:
data_dict

{'1': {'department': 'frozen'},
 '2': {'department': 'other'},
 '3': {'department': 'bakery'},
 '4': {'department': 'produce'},
 '5': {'department': 'alcohol'},
 '6': {'department': 'international'},
 '7': {'department': 'beverages'},
 '8': {'department': 'pets'},
 '9': {'department': 'dry goods pasta'},
 '10': {'department': 'bulk'},
 '11': {'department': 'personal care'},
 '12': {'department': 'meat seafood'},
 '13': {'department': 'pantry'},
 '14': {'department': 'breakfast'},
 '15': {'department': 'canned goods'},
 '16': {'department': 'dairy eggs'},
 '17': {'department': 'household'},
 '18': {'department': 'babies'},
 '19': {'department': 'snacks'},
 '20': {'department': 'deli'},
 '21': {'department': 'missing'}}

In [22]:
# view first 5 rows
df_prods.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,1,Chocolate Sandwich Cookies,61,19,5.8
1,2,All-Seasons Salt,104,13,9.3
2,3,Robust Golden Unsweetened Oolong Tea,94,7,4.5
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1,10.5
4,5,Green Chile Anytime Sauce,5,13,4.3


In [23]:
# retrieve value from data dictionary
print(data_dict.get('19'))

{'department': 'snacks'}


In [24]:
# produce df that lists snacks - items with dept id 19
df_snacks = df_prods[df_prods['department_id'] == 19]

In [25]:
# view first 10 rows of snacks df
df_snacks.head(10)

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,1,Chocolate Sandwich Cookies,61,19,5.8
15,16,Mint Chocolate Flavored Syrup,103,19,5.2
24,25,Salted Caramel Lean Protein & Fiber Bar,3,19,1.9
31,32,Nacho Cheese White Bean Chips,107,19,4.9
40,41,Organic Sourdough Einkorn Crackers Rosemary,78,19,6.5
55,56,Healthy Pop Butter Popcorn,23,19,4.4
58,59,Medium Taqueria Style Chipotle Salsa,50,19,6.8
76,77,Coconut Chocolate Chip Energy Bar,3,19,6.4
77,78,Nutter Butter Cookie Bites Go-Pak,61,19,3.0
92,93,Uncured Cracked Pepper Beef,23,19,2.4


# 04. Data wrangling task

In [26]:
df_ords.head()

Unnamed: 0,order_id,user_id,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order
0,2539329,1,1,2,8,
1,2398795,1,2,3,7,15.0
2,473747,1,3,3,12,21.0
3,2254736,1,4,4,7,29.0
4,431534,1,5,4,15,28.0


# Step 2 - change data type

In [27]:
# change user_id to string because the numbers can't be usefully analyzed
df_ords['user_id'] = df_ords['user_id'].astype('str')

In [28]:
#check that it worked. O = success!
df_ords['user_id'].dtype

dtype('O')

# Step 3 - rename column w/out overwriting

In [29]:
# attempt to rename user_id column as customer_id without overwriting data frame
df_ords_renamed = df_ords.rename(columns = {'user_id' : 'customer_id'}, inplace = False)

In [30]:
# check alternate df
df_ords_renamed.head()

Unnamed: 0,order_id,customer_id,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order
0,2539329,1,1,2,8,
1,2398795,1,2,3,7,15.0
2,473747,1,3,3,12,21.0
3,2254736,1,4,4,7,29.0
4,431534,1,5,4,15,28.0


# Step 4 - busiest hour for orders

In [31]:
# display frequency counts of hour_of_day column
df_ords['order_hour_of_day'].value_counts(dropna = False)

10    288418
11    284728
15    283639
14    283042
13    277999
12    272841
16    272553
9     257812
17    228795
18    182912
8     178201
19    140569
20    104292
7      91868
21     78109
22     61468
23     40043
6      30529
0      22758
1      12398
5       9569
2       7539
4       5527
3       5474
Name: order_hour_of_day, dtype: int64

Answer- Hour 10

# Step 5- use dictionary to find dept name

In [32]:
print(data_dict.get('4'))

{'department': 'produce'}


# Step 6- pull up breakfast sales

In [33]:
# view data dictionary
data_dict

{'1': {'department': 'frozen'},
 '2': {'department': 'other'},
 '3': {'department': 'bakery'},
 '4': {'department': 'produce'},
 '5': {'department': 'alcohol'},
 '6': {'department': 'international'},
 '7': {'department': 'beverages'},
 '8': {'department': 'pets'},
 '9': {'department': 'dry goods pasta'},
 '10': {'department': 'bulk'},
 '11': {'department': 'personal care'},
 '12': {'department': 'meat seafood'},
 '13': {'department': 'pantry'},
 '14': {'department': 'breakfast'},
 '15': {'department': 'canned goods'},
 '16': {'department': 'dairy eggs'},
 '17': {'department': 'household'},
 '18': {'department': 'babies'},
 '19': {'department': 'snacks'},
 '20': {'department': 'deli'},
 '21': {'department': 'missing'}}

In [34]:
# create subset of breakfast items
df_breakfast = df_prods[df_prods['department_id'] == 14]

# Step 7- create multi-dept subset

In [35]:
df_dinner_parties = df_prods.loc[df_prods['department_id'].isin([5,20,7,12])]

# Step 8- count rows in dataframe

In [36]:
df_dinner_parties

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
2,3,Robust Golden Unsweetened Oolong Tea,94,7,4.5
6,7,Pure Coconut Water With Orange,98,7,4.4
9,10,Sparkling Orange Juice & Prickly Pear Beverage,115,7,8.4
10,11,Peach Mango Juice,31,7,2.8
16,17,Rendered Duck Fat,35,12,17.1
...,...,...,...,...,...
49676,49672,Cafe Mocha K-Cup Packs,26,7,6.5
49679,49675,Cinnamon Dolce Keurig Brewed K Cups,26,7,14.0
49680,49676,Ultra Red Energy Drink,64,7,14.5
49686,49682,California Limeade,98,7,4.3


Answer: 7650 rows

# Step 9- extract info about user 1

In [37]:
# create subset for user 1
df_user_1 = df_ords.loc[df_ords['user_id'] == 1]

In [38]:
#view subset
df_user_1

Unnamed: 0,order_id,user_id,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order


# Step 10- View stats about subset

In [39]:
# view descriptive stats of subset
df_user_1.describe()

Unnamed: 0,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order
count,0.0,0.0,0.0,0.0
mean,,,,
std,,,,
min,,,,
25%,,,,
50%,,,,
75%,,,,
max,,,,


# Step 12- Export orders data

In [40]:
df_ords.to_csv(r'C:\Users\jacym\Desktop\Career Foundry projects\04-2023 Instacart basket analysis\02 Data\Prepared data\orders_wrangled.csv', index=False)

# Step 13- Export departments data

In [46]:
df_dep_t_new.to_csv(os.path.join(path, '02 Data','Prepared Data','departments_wrangled.csv'))