# Contents

01. Importing Libraries
02. Importing Datasets
03. Data Wrangling Procedures
04. Data Dictionaries
05. Subsetting
06. Exercise 4.4
07. Exporting Dataframes

# 01. Importing Libraries

In [28]:
import os  # os.path.join builds the dataset paths in the import/export cells

import numpy as np  # alias must be 'np': the version check reads np.__version__
import pandas as pd

In [29]:
# Sanity check: report the installed pandas and numpy versions (one per line)

print(pd.__version__, np.__version__, sep='\n')

2.1.4
1.26.4


# 02. Importing Datasets

In [31]:
# Root folder of the Instacart project; every import/export path below is joined onto it
# NOTE(review): absolute, machine-specific path -- must be edited before running elsewhere

path = r'/home/0668a905-109d-4403-be59-0a04abf51dd9/Instacart Basket Analysis'

### Import Orders Dataset

In [33]:
# Loads orders.csv from '02 Data/Original Data' (under the project path) into df_ords

df_ords = pd.read_csv(
    os.path.join(path, '02 Data', 'Original Data', 'orders.csv'),
    index_col=False,
)

### Import Products Dataset

In [35]:
# Loads products.csv from '02 Data/Original Data' (under the project path) into df_prods

df_prods = pd.read_csv(
    os.path.join(path, '02 Data', 'Original Data', 'products.csv'),
    index_col=False,
)

### Import Departments Dataset

In [37]:
# Loads departments.csv from '02 Data/Original Data' (under the project path) into df_dep

df_dep = pd.read_csv(
    os.path.join(path, '02 Data', 'Original Data', 'departments.csv'),
    index_col=False,
)

# 03. Data Wrangling Procedures

### Drop Function Test

In [39]:
# Preview of the orders data without 'eval_set' (display only -- the result is not
# assigned, so df_ords itself still contains the column)

df_ords.drop(columns='eval_set')

Unnamed: 0,order_id,user_id,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2539329,1,1,2,8,
1,2398795,1,2,3,7,15.0
2,473747,1,3,3,12,21.0
3,2254736,1,4,4,7,29.0
4,431534,1,5,4,15,28.0
...,...,...,...,...,...,...
3421078,2266710,206209,10,5,18,29.0
3421079,1854736,206209,11,4,10,30.0
3421080,626363,206209,12,1,12,18.0
3421081,2977660,206209,13,1,12,7.0


### Value Counts Test

In [41]:
# Counts each value in 'days_since_prior_order'; dropna=False keeps NaN as its own
# row so the number of missing values is visible

df_ords['days_since_prior_order'].value_counts(dropna = False)

days_since_prior_order
30.0    369323
7.0     320608
6.0     240013
4.0     221696
3.0     217005
5.0     214503
NaN     206209
2.0     193206
8.0     181717
1.0     145247
9.0     118188
14.0    100230
10.0     95186
13.0     83214
11.0     80970
12.0     76146
0.0      67755
15.0     66579
16.0     46941
21.0     45470
17.0     39245
20.0     38527
18.0     35881
19.0     34384
22.0     32012
28.0     26777
23.0     23885
27.0     22013
24.0     20712
25.0     19234
29.0     19191
26.0     19016
Name: count, dtype: int64

### Renaming Columns

In [43]:
# Renames the orders column 'order_dow' to 'orders_day_of_week' (in place, so df_ords is modified)

df_ords.rename(columns = {'order_dow' : 'orders_day_of_week'}, inplace = True)

In [45]:
# Shows the first 5 rows of df_ords to confirm the 'order_dow' column was renamed

df_ords.head(5)

Unnamed: 0,order_id,user_id,eval_set,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order
0,2539329,1,prior,1,2,8,
1,2398795,1,prior,2,3,7,15.0
2,473747,1,prior,3,3,12,21.0
3,2254736,1,prior,4,4,7,29.0
4,431534,1,prior,5,4,15,28.0


### Changing a Variable's Data Type

In [47]:
# Casts 'order_id' to string: it is an identifier rather than a quantity, and as a
# non-numeric column it is left out of describe()'s numeric summary

df_ords['order_id'] = df_ords['order_id'].astype('str')

In [49]:
# Shows the data type of the 'order_id' column only (to confirm the cast above it)

df_ords['order_id'].dtype

dtype('O')

### Transposing Data, Indexing, & Creating Headers

In [51]:
# Shows the first 5 rows of the departments dataset

df_dep.head()

Unnamed: 0,department_id,1,2,3,4,5,6,7,8,9,...,12,13,14,15,16,17,18,19,20,21
0,department,frozen,other,bakery,produce,alcohol,international,beverages,pets,dry goods pasta,...,meat seafood,pantry,breakfast,canned goods,dairy eggs,household,babies,snacks,deli,missing


In [53]:
# Transposes the departments table so its columns become rows
# (display only -- the result is not assigned, so df_dep is unchanged)

df_dep.T

Unnamed: 0,0
department_id,department
1,frozen
2,other
3,bakery
4,produce
5,alcohol
6,international
7,beverages
8,pets
9,dry goods pasta


In [55]:
# Stores the transposed departments table as df_dep_t so it can be used for analysis

df_dep_t = df_dep.transpose()

In [57]:
# Displays df_dep_t to check the transposed layout

df_dep_t

Unnamed: 0,0
department_id,department
1,frozen
2,other
3,bakery
4,produce
5,alcohol
6,international
7,beverages
8,pets
9,dry goods pasta


In [59]:
# Previews df_dep_t with its current index moved into a regular 'index' column
# (display only -- the result is not assigned, so df_dep_t is unchanged)

df_dep_t.reset_index()

Unnamed: 0,index,0
0,department_id,department
1,1,frozen
2,2,other
3,3,bakery
4,4,produce
5,5,alcohol
6,6,international
7,7,beverages
8,8,pets
9,9,dry goods pasta


In [61]:
# Takes the first row of df_dep_t (by position) to use as the column header

new_header = df_dep_t.iloc[0]

In [63]:
# Displays new_header to check the value that will become the column name

new_header

0    department
Name: department_id, dtype: object

In [65]:
# Keeps every row of df_dep_t after the first (the header row) as a new data frame

df_dep_t_new = df_dep_t.iloc[1:]

In [67]:
# Displays df_dep_t_new to confirm the header row was removed

df_dep_t_new

Unnamed: 0,0
1,frozen
2,other
3,bakery
4,produce
5,alcohol
6,international
7,beverages
8,pets
9,dry goods pasta
10,bulk


In [69]:
# Replaces the column labels of df_dep_t_new with new_header

df_dep_t_new.columns = new_header

In [71]:
# Displays df_dep_t_new to confirm the new column header

df_dep_t_new

department_id,department
1,frozen
2,other
3,bakery
4,produce
5,alcohol
6,international
7,beverages
8,pets
9,dry goods pasta
10,bulk


# 04. Data Dictionaries

In [73]:
# Builds the data dictionary from df_dep_t_new: one entry per index label,
# each mapping column name -> value

data_dict = df_dep_t_new.to_dict(orient='index')

In [75]:
# Displays the full data dictionary

data_dict

{'1': {'department': 'frozen'},
 '2': {'department': 'other'},
 '3': {'department': 'bakery'},
 '4': {'department': 'produce'},
 '5': {'department': 'alcohol'},
 '6': {'department': 'international'},
 '7': {'department': 'beverages'},
 '8': {'department': 'pets'},
 '9': {'department': 'dry goods pasta'},
 '10': {'department': 'bulk'},
 '11': {'department': 'personal care'},
 '12': {'department': 'meat seafood'},
 '13': {'department': 'pantry'},
 '14': {'department': 'breakfast'},
 '15': {'department': 'canned goods'},
 '16': {'department': 'dairy eggs'},
 '17': {'department': 'household'},
 '18': {'department': 'babies'},
 '19': {'department': 'snacks'},
 '20': {'department': 'deli'},
 '21': {'department': 'missing'}}

In [77]:
# Shows the first 5 rows of df_prods

df_prods.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,1,Chocolate Sandwich Cookies,61,19,5.8
1,2,All-Seasons Salt,104,13,9.3
2,3,Robust Golden Unsweetened Oolong Tea,94,7,4.5
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1,10.5
4,5,Green Chile Anytime Sauce,5,13,4.3


In [79]:
# Looks up department_id '19' in the data dictionary and prints its entry
# (the key is passed as the string '19'; .get returns None for an unknown key)

print(data_dict.get('19'))

{'department': 'snacks'}


# 05. Subsetting

In [81]:
# Boolean mask over df_prods: True where 'department_id' equals 19

df_prods['department_id'].eq(19)

0         True
1        False
2        False
3        False
4        False
         ...  
49688    False
49689    False
49690    False
49691    False
49692    False
Name: department_id, Length: 49693, dtype: bool

In [83]:
# Displays only the df_prods rows whose 'department_id' is 19 (result is not stored)

df_prods[df_prods['department_id']==19]

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,1,Chocolate Sandwich Cookies,61,19,5.8
15,16,Mint Chocolate Flavored Syrup,103,19,5.2
24,25,Salted Caramel Lean Protein & Fiber Bar,3,19,1.9
31,32,Nacho Cheese White Bean Chips,107,19,4.9
40,41,Organic Sourdough Einkorn Crackers Rosemary,78,19,6.5
...,...,...,...,...,...
49666,49662,Bacon Cheddar Pretzel Pieces,107,19,3.6
49669,49665,Super Dark Coconut Ash & Banana Chocolate Bar,45,19,6.9
49670,49666,Ginger Snaps Snacking Cookies,61,19,5.2
49675,49671,Milk Chocolate Drops,45,19,3.0


In [85]:
# Stores the df_prods rows with 'department_id' 19 as the subset df_snacks

df_snacks = df_prods.loc[df_prods['department_id'] == 19]

In [87]:
# Shows the first 5 rows of df_snacks

df_snacks.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,1,Chocolate Sandwich Cookies,61,19,5.8
15,16,Mint Chocolate Flavored Syrup,103,19,5.2
24,25,Salted Caramel Lean Protein & Fiber Bar,3,19,1.9
31,32,Nacho Cheese White Bean Chips,107,19,4.9
40,41,Organic Sourdough Einkorn Crackers Rosemary,78,19,6.5


# 06. Exercise 4.4

### Find another identifier (user_id) in df_ords & change format

In [89]:
# Casts the 'user_id' column of df_ords to string

df_ords['user_id'] = df_ords['user_id'].astype(str)

In [91]:
# Shows the data type of the 'user_id' column only (to confirm the cast above it)

df_ords['user_id'].dtype

dtype('O')

### Change column name in df_ords

In [93]:
# Renames the orders column 'eval_set' to 'evaluation_set' (in place, so df_ords is modified)

df_ords.rename(columns = {'eval_set' : 'evaluation_set'}, inplace = True)

In [95]:
# Displays the df_ords column names to confirm the rename

df_ords.columns

Index(['order_id', 'user_id', 'evaluation_set', 'order_number',
       'orders_day_of_week', 'order_hour_of_day', 'days_since_prior_order'],
      dtype='object')

### Busiest period for placing orders is 10am

In [97]:
# Counts how often each hour occurs in 'order_hour_of_day'
# (dropna=False would also list NaN as its own row if any were present)

df_ords['order_hour_of_day'].value_counts(dropna = False)

order_hour_of_day
10    288418
11    284728
15    283639
14    283042
13    277999
12    272841
16    272553
9     257812
17    228795
18    182912
8     178201
19    140569
20    104292
7      91868
21     78109
22     61468
23     40043
6      30529
0      22758
1      12398
5       9569
2       7539
4       5527
3       5474
Name: count, dtype: int64

### The value of '4' in department_id is Produce

In [99]:
# Looks up department_id '4' in the data dictionary and prints its entry
# (the key is passed as the string '4'; .get returns None for an unknown key)

print(data_dict.get('4'))

{'department': 'produce'}


### Subset of Breakfast Items Sales

In [101]:
# Stores the df_prods rows with 'department_id' 14 (breakfast) as the subset df_breakfast

df_breakfast =  df_prods[df_prods['department_id']==14]

In [103]:
# Displays df_breakfast (the notebook truncates a long frame to its first and last rows)

df_breakfast

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
27,28,Wheat Chex Cereal,121,14,10.1
33,34,,121,14,12.2
67,68,"Pancake Mix, Buttermilk",130,14,13.7
89,90,Smorz Cereal,121,14,3.9
210,211,Gluten Free Organic Cereal Coconut Maple Vanilla,130,14,3.6
...,...,...,...,...,...
49330,49326,Cereal Variety Fun Pack,121,14,9.1
49395,49391,Light and Fluffy Buttermilk Pancake Mix,130,14,2.0
49547,49543,Chocolate Cheerios Cereal,121,14,10.8
49637,49633,Shake 'N Pour Buttermilk Pancake Mix,130,14,14.2


In [105]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for the df_breakfast subset

df_breakfast.describe()

Unnamed: 0,product_id,aisle_id,department_id,prices
count,1116.0,1116.0,1116.0,1116.0
mean,25464.490143,101.517921,14.0,7.848208
std,14271.353221,33.890362,0.0,3.971678
min,28.0,48.0,14.0,1.0
25%,12754.25,57.0,14.0,4.4
50%,26268.0,121.0,14.0,7.7
75%,38035.25,130.0,14.0,11.3
max,49663.0,130.0,14.0,14.9


### Dinner Party Data Subset

In [107]:
# Displays the data dictionary again as a reference for choosing department ids

data_dict

{'1': {'department': 'frozen'},
 '2': {'department': 'other'},
 '3': {'department': 'bakery'},
 '4': {'department': 'produce'},
 '5': {'department': 'alcohol'},
 '6': {'department': 'international'},
 '7': {'department': 'beverages'},
 '8': {'department': 'pets'},
 '9': {'department': 'dry goods pasta'},
 '10': {'department': 'bulk'},
 '11': {'department': 'personal care'},
 '12': {'department': 'meat seafood'},
 '13': {'department': 'pantry'},
 '14': {'department': 'breakfast'},
 '15': {'department': 'canned goods'},
 '16': {'department': 'dairy eggs'},
 '17': {'department': 'household'},
 '18': {'department': 'babies'},
 '19': {'department': 'snacks'},
 '20': {'department': 'deli'},
 '21': {'department': 'missing'}}

In [109]:
# Subset of dinner-party products: alcohol (5), beverages (7), meat/seafood (12) and deli (20)

df_dinnerparty = df_prods.loc[df_prods['department_id'].isin([5, 7, 12, 20])]

In [111]:
# Displays df_dinnerparty (the notebook truncates a long frame to its first and last rows)

df_dinnerparty

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
2,3,Robust Golden Unsweetened Oolong Tea,94,7,4.5
6,7,Pure Coconut Water With Orange,98,7,4.4
9,10,Sparkling Orange Juice & Prickly Pear Beverage,115,7,8.4
10,11,Peach Mango Juice,31,7,2.8
16,17,Rendered Duck Fat,35,12,17.1
...,...,...,...,...,...
49676,49672,Cafe Mocha K-Cup Packs,26,7,6.5
49679,49675,Cinnamon Dolce Keurig Brewed K Cups,26,7,14.0
49680,49676,Ultra Red Energy Drink,64,7,14.5
49686,49682,California Limeade,98,7,4.3


### The df_dinnerparty subset dataframe has 7650 rows

In [113]:
# Shows the (rows, columns) size of df_dinnerparty

df_dinnerparty.shape

(7650, 5)

### Extracted information on customer 1

In [115]:
# Subset of the orders placed by customer 1; the id is compared as the string '1'
# because 'user_id' was converted to string earlier

df_customer1 = df_ords[df_ords['user_id'].eq('1')]

In [117]:
# Shows the (rows, columns) size of df_customer1

df_customer1.shape

(11, 7)

In [119]:
# Displays all orders of customer 1

df_customer1

Unnamed: 0,order_id,user_id,evaluation_set,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order
0,2539329,1,prior,1,2,8,
1,2398795,1,prior,2,3,7,15.0
2,473747,1,prior,3,3,12,21.0
3,2254736,1,prior,4,4,7,29.0
4,431534,1,prior,5,4,15,28.0
5,3367565,1,prior,6,2,7,19.0
6,550135,1,prior,7,1,9,20.0
7,3108588,1,prior,8,1,14,14.0
8,2295261,1,prior,9,1,16,0.0
9,2550362,1,prior,10,4,8,30.0


### Descriptive Statistics on Customer 1

In [121]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for df_customer1

df_customer1.describe()

Unnamed: 0,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order
count,11.0,11.0,11.0,10.0
mean,6.0,2.636364,10.090909,19.0
std,3.316625,1.286291,3.477198,9.030811
min,1.0,1.0,7.0,0.0
25%,3.5,1.5,7.5,14.25
50%,6.0,3.0,8.0,19.5
75%,8.5,4.0,13.0,26.25
max,11.0,4.0,16.0,30.0


# 07. Exporting Dataframes

In [123]:
# Writes the wrangled df_ords to 'orders_wrangled.csv' in the Prepared Data folder

df_ords.to_csv(
    os.path.join(path, '02 Data', 'Prepared Data', 'orders_wrangled.csv')
)

In [125]:
# Writes the wrangled df_dep_t_new to 'departments_wrangled.csv' in the Prepared Data folder

df_dep_t_new.to_csv(
    os.path.join(path, '02 Data', 'Prepared Data', 'departments_wrangled.csv')
)

In [182]:
# Also saves df_dep_t_new as a pickle file ('departments_wrangled.pkl') in the Prepared Data folder
df_dep_t_new.to_pickle(os.path.join(path, '02 Data', 'Prepared Data', 'departments_wrangled.pkl'))