# 01. Importing Libraries

In [1]:
# Import libraries
import pandas as pd
import numpy as np
import os

# 02. Importing Data Sets

In [2]:
# Creating a path to main project folder.
# The folder can be overridden per machine by setting the INSTACART_PROJECT_DIR
# environment variable; when it is not set, the original local folder is used,
# so existing behaviour is unchanged.
path = os.environ.get(
    'INSTACART_PROJECT_DIR',
    r'C:\Users\jboer\OneDrive\Documents\Career Foundry\Instacart Basket Analysis',
)

In [3]:
# Loading orders.csv from the project's "Original Data" folder.
# index_col=False tells pandas not to use the first column as the index.
orders_csv = os.path.join(path, '02 Data', 'Original Data', 'orders.csv')
df_ords = pd.read_csv(orders_csv, index_col=False)

In [4]:
# Loading products.csv from the project's "Original Data" folder.
# index_col=False tells pandas not to use the first column as the index.
products_csv = os.path.join(path, '02 Data', 'Original Data', 'products.csv')
df_prods = pd.read_csv(products_csv, index_col=False)

# 03. Data Wrangling

### 03.1. Dropping columns

In [5]:
# Previewing the orders dataframe without the "eval_set" column.
# drop() returns a new dataframe; df_ords itself is not changed by this cell.
df_ords.drop(labels=['eval_set'], axis='columns')

Unnamed: 0,order_id,user_id,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2539329,1,1,2,8,
1,2398795,1,2,3,7,15.0
2,473747,1,3,3,12,21.0
3,2254736,1,4,4,7,29.0
4,431534,1,5,4,15,28.0
...,...,...,...,...,...,...
3421078,2266710,206209,10,5,18,29.0
3421079,1854736,206209,11,4,10,30.0
3421080,626363,206209,12,1,12,18.0
3421081,2977660,206209,13,1,12,7.0


In [6]:
# Redefining the dataframe after dropping columns.
# errors='ignore' keeps this cell safe to re-run: once "eval_set" has been
# removed, running the cell again is a no-op instead of raising a KeyError.
df_ords = df_ords.drop(columns=['eval_set'], errors='ignore')

### 03.2. Renaming columns

In [7]:
# Renaming the "order_dow" column to the more descriptive "orders_day_of_week".
# The result is rebound to df_ords instead of using inplace=True, which keeps
# the step chainable and follows the recommended pandas style.
df_ords = df_ords.rename(columns={'order_dow': 'orders_day_of_week'})

In [8]:
# Confirming the rename: the header should now show "orders_day_of_week"
# in place of "order_dow"
df_ords.head()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order
0,2539329,1,1,2,8,
1,2398795,1,2,3,7,15.0
2,473747,1,3,3,12,21.0
3,2254736,1,4,4,7,29.0
4,431534,1,5,4,15,28.0


### 03.3. Changing a Variable's Data Type

In [23]:
# Casting "order_id" to string so it is treated as an identifier and left out
# of the numeric descriptive statistics
order_id_as_text = df_ords['order_id'].astype(str)
df_ords['order_id'] = order_id_as_text

In [24]:
# Checking the "order_id" data type change (string columns report the
# object dtype, displayed as dtype('O'))
df_ords['order_id'].dtype

dtype('O')

### 03.4. Transposing Data

In [9]:
# Loading departments.csv from the project's "Original Data" folder.
# index_col=False tells pandas not to use the first column as the index.
departments_csv = os.path.join(path, '02 Data', 'Original Data', 'departments.csv')
df_dep = pd.read_csv(departments_csv, index_col=False)

In [10]:
# Checking the newly imported departments dataframe; it arrives in wide form,
# with the department ids as column headers and a single "department" row
df_dep.head()

Unnamed: 0,department_id,1,2,3,4,5,6,7,8,9,...,12,13,14,15,16,17,18,19,20,21
0,department,frozen,other,bakery,produce,alcohol,international,beverages,pets,dry goods pasta,...,meat seafood,pantry,breakfast,canned goods,dairy eggs,household,babies,snacks,deli,missing


In [11]:
# Previewing df_dep transposed (rows and columns swapped); the result is only
# displayed here, not stored
df_dep.transpose()

Unnamed: 0,0
department_id,department
1,frozen
2,other
3,bakery
4,produce
5,alcohol
6,international
7,beverages
8,pets
9,dry goods pasta


In [12]:
# Storing the transposed departments table in its own variable so the
# following steps can work with it
df_dep_t = df_dep.transpose()

In [13]:
# Displaying the transposed departments dataframe
df_dep_t

Unnamed: 0,0
department_id,department
1,frozen
2,other
3,bakery
4,produce
5,alcohol
6,international
7,beverages
8,pets
9,dry goods pasta


In [14]:
# Previewing df_dep_t with its current index moved into a regular "index"
# column and a fresh 0..n index added.
# reset_index() returns a new dataframe and the result is not assigned here,
# so df_dep_t itself is left unchanged.
df_dep_t.reset_index()

Unnamed: 0,index,0
0,department_id,department
1,1,frozen
2,2,other
3,3,bakery
4,4,produce
5,5,alcohol
6,6,international
7,7,beverages
8,8,pets
9,9,dry goods pasta


In [15]:
# Take the first row of df_dep_t (the "department_id" row) to use as the header
new_header = df_dep_t.iloc[0, :]

In [16]:
# Displaying the header row that was just extracted
new_header

0    department
Name: department_id, dtype: object

In [17]:
# Keeping every row after the first one (the first row is the header row,
# not department data)
df_dep_t_new = df_dep_t.iloc[1:]

In [18]:
# Displaying the departments dataframe without the header row
df_dep_t_new

Unnamed: 0,0
1,frozen
2,other
3,bakery
4,produce
5,alcohol
6,international
7,beverages
8,pets
9,dry goods pasta
10,bulk


In [19]:
# Using the "new_header" variable to set the headers for the df_dep_t_new dataframe.
# new_header is a Series named "department_id", so that name appears as the
# label above the index in the displayed output; the only actual column is
# "department".
df_dep_t_new.columns = new_header

In [20]:
# Displaying the departments dataframe with its new "department" header
df_dep_t_new

department_id,department
1,frozen
2,other
3,bakery
4,produce
5,alcohol
6,international
7,beverages
8,pets
9,dry goods pasta
10,bulk


### 03.5. Data Dictionary

In [21]:
# Building the data dictionary from df_dep_t_new: one entry per index label
# (the department id), each mapping column name -> value
data_dict = df_dep_t_new.to_dict(orient='index')

In [22]:
# Displaying the data dictionary (keys are the department ids as strings)
data_dict

{'1': {'department': 'frozen'},
 '2': {'department': 'other'},
 '3': {'department': 'bakery'},
 '4': {'department': 'produce'},
 '5': {'department': 'alcohol'},
 '6': {'department': 'international'},
 '7': {'department': 'beverages'},
 '8': {'department': 'pets'},
 '9': {'department': 'dry goods pasta'},
 '10': {'department': 'bulk'},
 '11': {'department': 'personal care'},
 '12': {'department': 'meat seafood'},
 '13': {'department': 'pantry'},
 '14': {'department': 'breakfast'},
 '15': {'department': 'canned goods'},
 '16': {'department': 'dairy eggs'},
 '17': {'department': 'household'},
 '18': {'department': 'babies'},
 '19': {'department': 'snacks'},
 '20': {'department': 'deli'},
 '21': {'department': 'missing'}}

# 04. Task Questions

### 2. Find another identifier variable in the df_ords dataframe that doesn’t need to be included in your analysis as a numeric variable and change it to a suitable format.

In [25]:
# Casting "user_id" to string so it is treated as an identifier and left out
# of the numeric descriptive statistics
user_id_as_text = df_ords['user_id'].astype(str)
df_ords['user_id'] = user_id_as_text

In [26]:
# Checking the "user_id" data type change (string columns report the
# object dtype, displayed as dtype('O'))
df_ords['user_id'].dtype

dtype('O')

### 3. Look for a variable in your df_ords dataframe with an unintuitive name and change its name without overwriting the dataframe.

In [27]:
# Reviewing the current column names of df_ords
df_ords.head()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order
0,2539329,1,1,2,8,
1,2398795,1,2,3,7,15.0
2,473747,1,3,3,12,21.0
3,2254736,1,4,4,7,29.0
4,431534,1,5,4,15,28.0


##### All names seem clear and I don't think any need to be changed, but if I were to change one without overwriting the dataframe I would do the following:

In [28]:
# Renaming "days_since_prior_order" in a separate dataframe; df_ords keeps its
# original column name because rename() returns a new dataframe
column_renames = {'days_since_prior_order': 'days_since_last_order'}
df_ords_new = df_ords.rename(columns=column_renames)

In [29]:
# Confirming the renamed column appears in the new dataframe
df_ords_new.head()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_last_order
0,2539329,1,1,2,8,
1,2398795,1,2,3,7,15.0
2,473747,1,3,3,12,21.0
3,2254736,1,4,4,7,29.0
4,431534,1,5,4,15,28.0


### 4. Your client wants to know what the busiest hour is for placing orders. Find the frequency of the corresponding variable and share your findings.

In [30]:
# Counting orders per hour of day, most frequent hour first
# (sort=True and ascending=False are the defaults, spelled out for clarity)
df_ords['order_hour_of_day'].value_counts(sort=True, ascending=False)

order_hour_of_day
10    288418
11    284728
15    283639
14    283042
13    277999
12    272841
16    272553
9     257812
17    228795
18    182912
8     178201
19    140569
20    104292
7      91868
21     78109
22     61468
23     40043
6      30529
0      22758
1      12398
5       9569
2       7539
4       5527
3       5474
Name: count, dtype: int64

##### The busiest hour for placing orders is 10 AM with 288,418 orders placed at this time.

### 5. Determine the meaning behind a value of 4 in the "department_id" column within the df_prods dataframe using a data dictionary.

In [32]:
# Accessing the data dictionary; its keys are department ids stored as
# strings, so department 4 is found under the key '4'
data_dict

{'1': {'department': 'frozen'},
 '2': {'department': 'other'},
 '3': {'department': 'bakery'},
 '4': {'department': 'produce'},
 '5': {'department': 'alcohol'},
 '6': {'department': 'international'},
 '7': {'department': 'beverages'},
 '8': {'department': 'pets'},
 '9': {'department': 'dry goods pasta'},
 '10': {'department': 'bulk'},
 '11': {'department': 'personal care'},
 '12': {'department': 'meat seafood'},
 '13': {'department': 'pantry'},
 '14': {'department': 'breakfast'},
 '15': {'department': 'canned goods'},
 '16': {'department': 'dairy eggs'},
 '17': {'department': 'household'},
 '18': {'department': 'babies'},
 '19': {'department': 'snacks'},
 '20': {'department': 'deli'},
 '21': {'department': 'missing'}}

##### A value of 4 in the "department_id" column stands for the "produce" department.

### 6. The sales team in your client’s organization wants to know more about breakfast item sales. Create a subset containing only the required information.

In [33]:
# Subsetting the products to the breakfast department
# (department_id 14 is "breakfast" in the data dictionary)
is_breakfast = df_prods['department_id'].eq(14)
df_breakfast = df_prods.loc[is_breakfast]

In [34]:
# Previewing the first rows of the breakfast subset
df_breakfast.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
27,28,Wheat Chex Cereal,121,14,10.1
33,34,,121,14,12.2
67,68,"Pancake Mix, Buttermilk",130,14,13.7
89,90,Smorz Cereal,121,14,3.9
210,211,Gluten Free Organic Cereal Coconut Maple Vanilla,130,14,3.6


### 7. They’d also like to see details about products that customers might use to throw dinner parties. Your task is to find all observations from the entire dataframe that include items from the following departments: alcohol, deli, beverages, and meat/seafood. You’ll need to present this subset to your client.

In [36]:
# Subsetting the products to the dinner-party departments.
# Ids per the data dictionary: 5 = alcohol, 7 = beverages,
# 12 = meat seafood, 20 = deli
dinner_party_dept_ids = [5, 7, 12, 20]
in_dinner_party_dept = df_prods['department_id'].isin(dinner_party_dept_ids)
df_dinnerparty = df_prods.loc[in_dinner_party_dept]

In [37]:
# Previewing the first rows of the dinner-party subset
df_dinnerparty.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
2,3,Robust Golden Unsweetened Oolong Tea,94,7,4.5
6,7,Pure Coconut Water With Orange,98,7,4.4
9,10,Sparkling Orange Juice & Prickly Pear Beverage,115,7,8.4
10,11,Peach Mango Juice,31,7,2.8
16,17,Rendered Duck Fat,35,12,17.1


### 8. It’s important that you keep track of total counts in your dataframes. How many rows does the last dataframe you created have?

In [38]:
# Checking the size of the last dataframe created (df_dinnerparty);
# shape is returned as (rows, columns)
df_dinnerparty.shape

(7650, 5)

##### The last dataframe I created, the df_dinnerparty, has 7,650 rows and 5 columns

### 9. Someone from the data engineers team in Instacart thinks they’ve spotted something strange about the customer with a "user_id" of “1.” Extract all the information you can about this user.

In [43]:
# Extracting all orders for the customer with "user_id" of "1".
# user_id is stored as a string at this point, so the match is against '1'.
is_user_1 = df_ords['user_id'].eq('1')
df_user1 = df_ords.loc[is_user_1]

In [45]:
# Showing every order row for the customer with "user_id" of "1"
df_user1

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order
0,2539329,1,1,2,8,
1,2398795,1,2,3,7,15.0
2,473747,1,3,3,12,21.0
3,2254736,1,4,4,7,29.0
4,431534,1,5,4,15,28.0
5,3367565,1,6,2,7,19.0
6,550135,1,7,1,9,20.0
7,3108588,1,8,1,14,14.0
8,2295261,1,9,1,16,0.0
9,2550362,1,10,4,8,30.0


### 10. You also need to provide some details about this user’s behavior. What basic stats can you provide based on the information you have?

In [46]:
# Showing descriptive statistics for customer 1.
# Only the numeric columns are summarised; the string-typed "order_id" and
# "user_id" columns do not appear in the output.
df_user1.describe()

Unnamed: 0,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order
count,11.0,11.0,11.0,10.0
mean,6.0,2.636364,10.090909,19.0
std,3.316625,1.286291,3.477198,9.030811
min,1.0,1.0,7.0,0.0
25%,3.5,1.5,7.5,14.25
50%,6.0,3.0,8.0,19.5
75%,8.5,4.0,13.0,26.25
max,11.0,4.0,16.0,30.0


##### Some basic stats regarding this user's behavior are that they have placed a total of 11 orders with an average of 19 days between orders.
##### The average day they like to place orders is either Monday or Tuesday and the average time of day they place an order is 10AM.
##### The earliest time of day they ever placed an order was 7AM and the latest they placed an order was 4PM.
##### The other thing I noticed is that they placed their 8th order at 2PM and then placed another order at 4PM the same day, creating their 9th order. This makes me believe they forgot to order something and had to place another order the same day.

# 05. Exporting Dataframes

In [47]:
# Exporting the wrangled orders dataframe to the Prepared Data folder.
# NOTE(review): to_csv is called with its default index setting, so the row
# index is written as an extra unnamed first column -- confirm that the
# notebooks reading this file expect it.
orders_out = os.path.join(path, '02 Data', 'Prepared Data', 'orders_wrangled.csv')
df_ords.to_csv(orders_out)

In [48]:
# Exporting the wrangled departments dataframe to the Prepared Data folder.
# The department ids live in the index, so they are written as the first column.
# NOTE(review): the index appears to have no name, so that column will likely
# have a blank header in the CSV -- confirm when re-importing.
departments_out = os.path.join(path, '02 Data', 'Prepared Data', 'departments_wrangled.csv')
df_dep_t_new.to_csv(departments_out)