# 01. Notebook Setup

In [1]:
# Import libraries

import pandas as pd
import numpy as np
import os

In [2]:
# Create quick path to project folder

projpath = r'/Users/laineyodette/Documents/proDocs/professional development/Learning/Learning - Data Analytics/CareerFoundry - Become a Data Analyst/Data Immersion Course/A4 - Python Fundamentals for Data Analysts/Submitted Tasks/2024-01 Instacart Basket Analysis'

# 02. File Import and Check

In [3]:
# Import orders_products_prior.csv data file into new df

df_ords_prods_all = pd.read_pickle(os.path.join(projpath, '02 Data', 'Prepared Data', 'ords_prods_merged.pkl'))

In [4]:
# Check dimensions of new df - should have 32,404,859 rows

df_ords_prods_all.shape

(32404859, 15)

**Confirmed** Number of rows is as expected.

In [5]:
df_ords_prods_all.head()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,customers_first_order,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,prices,_merge
0,2539329,1,1,2,8,,1,196,1,0,Soda,77,7,9.0,both
1,2398795,1,2,3,7,15.0,0,196,1,1,Soda,77,7,9.0,both
2,473747,1,3,3,12,21.0,0,196,1,1,Soda,77,7,9.0,both
3,2254736,1,4,4,7,29.0,0,196,1,1,Soda,77,7,9.0,both
4,431534,1,5,4,15,28.0,0,196,1,1,Soda,77,7,9.0,both


# 03. Create subset to reduce amount of data

In [6]:
# Create subset of only first 1,000,000 rows

df = df_ords_prods_all[:1000000]

In [7]:
df.shape

(1000000, 15)

# 04. Define a 'price range' function with if-else and apply to subset

In [8]:
# Define the price range function

def price_label(row):

  if row['prices'] <= 5:
    return 'Low-range product'
  elif (row['prices'] > 5) and (row['prices'] <= 15):
    return 'Mid-range product'
  elif row['prices'] > 15:
    return 'High range'
  else: return 'Not enough data'

In [9]:
# Apply price range function to subset and create new column for the price range results

df['price_range'] = df.apply(price_label, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['price_range'] = df.apply(price_label, axis=1)


In [10]:
# Check the value counts on the price range column

df['price_range'].value_counts()

price_range
Mid-range product    756450
Low-range product    243550
Name: count, dtype: int64

**Note** No high-range products found. Reminder: this is a subset.

In [11]:
# Check to see what the most expensive item in the subset is

df['prices'].max()

14.8

# 05. Define a 'price range' function with loc() and apply to subset

In [12]:
df.loc[df['prices'] > 15, 'price_range_loc'] = 'High-range product'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[df['prices'] > 15, 'price_range_loc'] = 'High-range product'


In [13]:
df.loc[(df['prices'] <= 15) & (df['prices'] > 5), 'price_range_loc'] = 'Mid-range product' 

In [14]:
df.loc[df['prices'] <= 5, 'price_range_loc'] = 'Low-range product'

In [15]:
df['price_range_loc'].value_counts()

price_range_loc
Mid-range product    756450
Low-range product    243550
Name: count, dtype: int64

# 06. Define a 'price range' function with loc() and apply to whole df

In [16]:
df_ords_prods_all.loc[df_ords_prods_all['prices'] > 15, 'price_range_loc'] = 'High-range product'

In [17]:
df_ords_prods_all.loc[(df_ords_prods_all['prices'] <= 15) & (df_ords_prods_all['prices'] > 5), 'price_range_loc'] = 'Mid-range product' 

In [18]:
df_ords_prods_all.loc[df_ords_prods_all['prices'] <= 5, 'price_range_loc'] = 'Low-range product'

In [19]:
df_ords_prods_all['price_range_loc'].value_counts()

price_range_loc
Mid-range product     21860860
Low-range product     10126321
High-range product      417678
Name: count, dtype: int64

# 07. Categorize daily busyness using a for-loop

In [20]:
# Verify the busyness of each day of the week

df_ords_prods_all['orders_day_of_week'].value_counts(dropna = False)

orders_day_of_week
0    6204182
1    5660230
6    4496490
2    4213830
5    4205791
3    3840534
4    3783802
Name: count, dtype: int64

## Create "Busiest Day" column categorizing busiest and least busy days

In [21]:
# Create a "busiest day" function and populate with a busyness categorization

result = []

for value in df_ords_prods_all["orders_day_of_week"]:
  if value == 0:
    result.append("Busiest day")
  elif value == 4:
    result.append("Least busy")
  else:
    result.append("Regularly busy")

In [22]:
result

['Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Least busy',
 'Least busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Least busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Least busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Least busy',
 'Least busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Least busy',
 'Regularly busy',
 'Busiest day',
 'Regularly busy',
 'Reg

In [23]:
df_ords_prods_all['busiest_day'] = result

In [24]:
df_ords_prods_all['busiest_day'].value_counts(dropna = False)

busiest_day
Regularly busy    22416875
Busiest day        6204182
Least busy         3783802
Name: count, dtype: int64

## Add "Busiest Days" column for top and bottom two days

### TASK 4.7 Step 2

In [54]:
# Create a "busiest day" function and populate with a busyness categorization of top two and bottom two busiest days

result2 = []

for value in df_ords_prods_all["orders_day_of_week"]:
  if value == 0 or value == 1:
    result2.append("Busiest day")
  elif value == 4 or value == 3:
    result2.append("Least busy")
  else:
    result2.append("Regularly busy")

In [55]:
result2

['Regularly busy',
 'Least busy',
 'Least busy',
 'Least busy',
 'Least busy',
 'Regularly busy',
 'Busiest day',
 'Busiest day',
 'Busiest day',
 'Least busy',
 'Busiest day',
 'Regularly busy',
 'Regularly busy',
 'Busiest day',
 'Busiest day',
 'Regularly busy',
 'Regularly busy',
 'Least busy',
 'Least busy',
 'Least busy',
 'Least busy',
 'Least busy',
 'Least busy',
 'Busiest day',
 'Busiest day',
 'Busiest day',
 'Regularly busy',
 'Regularly busy',
 'Busiest day',
 'Regularly busy',
 'Regularly busy',
 'Busiest day',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Least busy',
 'Regularly busy',
 'Least busy',
 'Busiest day',
 'Busiest day',
 'Regularly busy',
 'Least busy',
 'Least busy',
 'Regularly busy',
 'Regularly busy',
 'Busiest day',
 'Busiest day',
 'Regularly busy',
 'Busiest day',
 'Busiest day',
 'Least busy',
 'Regularly busy',
 'Busiest day',
 'Busiest day',
 'Busiest day',
 'Busiest day',
 'Busiest day',
 'Regularly busy',
 'Least busy',
 'Regularly b

In [56]:
#Places results in new column

df_ords_prods_all['busiest_days'] = result2

In [57]:
# Check count of results

df_ords_prods_all['busiest_days'].value_counts(dropna = False)

busiest_days
Regularly busy    12916111
Busiest day       11864412
Least busy         7624336
Name: count, dtype: int64

### TASK 4.7 Step 3

**Expectations and Observations**

Expected:

- Busiest Days = 11,864,412
- Least Busy Days = 7,624,336

Results: As expected. Took awhile to figure out how to figure out the "OR" part vs creating new elif statements, but I got it! CELEBRATE!

# 08. Categorize hourly busyness using a for-loop

In [41]:
# Verify the busyness of each hour of the day

df_ords_prods_all['order_hour_of_day'].value_counts(dropna = False)

order_hour_of_day
10    2761760
11    2736140
14    2689136
15    2662144
13    2660954
12    2618532
16    2535202
9     2454203
17    2087654
8     1718118
18    1636502
19    1258305
20     976156
7      891054
21     795637
22     634225
23     402316
6      290493
0      218769
1      115700
5       87961
2       69375
4       53242
3       51281
Name: count, dtype: int64

## TASK 4.7 Step 4

**Most and Least Orders Logic** I decided to look at the numbers and select a cut off for busiest and least busy based on significant changes from the order counts by hour. Based on the results above, it looks like the difference between hour 9 and hour 17 was significant, so I used that as the cutoff for busiest. Then looking at the least busy, the difference between hour 5 and 1 was significant, so I used that cutoff.

In [63]:
# Create a "busiest period of the day" function and populate with an hourly busyness categorization

result3 = []

for value in df_ords_prods_all["order_hour_of_day"]:
  if value == 10 or value == 11 or value == 14 or value == 15 or value == 13 or value == 12 or value == 16 or value == 9: 
    result3.append("Most orders")
  elif value == 5 or value == 2 or value == 4 or value == 3:
    result3.append("Least orders")
  else:
    result3.append("Average orders")

In [64]:
result3

['Average orders',
 'Average orders',
 'Most orders',
 'Average orders',
 'Most orders',
 'Average orders',
 'Most orders',
 'Most orders',
 'Most orders',
 'Average orders',
 'Most orders',
 'Most orders',
 'Most orders',
 'Most orders',
 'Most orders',
 'Most orders',
 'Most orders',
 'Average orders',
 'Most orders',
 'Average orders',
 'Most orders',
 'Most orders',
 'Most orders',
 'Most orders',
 'Most orders',
 'Most orders',
 'Most orders',
 'Average orders',
 'Most orders',
 'Most orders',
 'Most orders',
 'Most orders',
 'Most orders',
 'Most orders',
 'Most orders',
 'Average orders',
 'Most orders',
 'Most orders',
 'Most orders',
 'Most orders',
 'Most orders',
 'Most orders',
 'Most orders',
 'Most orders',
 'Most orders',
 'Most orders',
 'Most orders',
 'Most orders',
 'Most orders',
 'Most orders',
 'Most orders',
 'Most orders',
 'Most orders',
 'Most orders',
 'Most orders',
 'Most orders',
 'Most orders',
 'Average orders',
 'Most orders',
 'Most orders',
 'Most ord

In [65]:
#Places results in new column

df_ords_prods_all['busiest_period_of_day'] = result3

## TASK 4.7 Step 5

In [66]:
# Check count of results

df_ords_prods_all['busiest_period_of_day'].value_counts(dropna = False)

busiest_period_of_day
Most orders       21118071
Average orders    11024929
Least orders        261859
Name: count, dtype: int64

**Expectations and Observations**

Expected:

- Most Orders = 21,118,071
- Least Orders = 261,859

Results: As expected. Hooray!

# 09. Export updated df as pickle file

## TASK 4.7 Step 7

In [67]:
#Export updated df as pickle to Prepared Data folder

df_ords_prods_all.to_pickle(os.path.join(projpath, '02 Data', 'Prepared Data', 'ords_prods_derived.pkl'))