# Table of contents

01. Importing libraries and merged products and orders data
02. Creating price flag

- Creating new subset of data
- Defining price segments
- Counting values per price segment

03. Creating busiest day flag

- Defining busyness level segments
- Counting values per busyness segment

04. Creating busiest dayS flag

- Defining busyness level segments for most busy days (plural)
- Counting values per busyness segment

05. Creating order level flag

- Defining order level segments per hour
- Counting values of orders per hour

06. Exporting flagged products and orders dataset

# 01. Importing libraries and merged products and orders data

In [1]:
# Importing libraries

import pandas as pd
import numpy as np
import os

In [2]:
#Importing merged products and orders pkl file

df_prods_ords_merged = pd.read_pickle(r'/Users/zoey/Career Foundry Stuff/Instacart Basket Analysis - 14.12.2023/02 Data/Prepared Data/ords_prods_merge.pkl')

In [3]:
df_prods_ords_merged.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_last_order,add_to_cart_order,reordered,_merge
0,1,Chocolate Sandwich Cookies,61,19,5.8,3139998,138,28,6,11,3.0,5,0,both
1,1,Chocolate Sandwich Cookies,61,19,5.8,1977647,138,30,6,17,20.0,1,1,both
2,1,Chocolate Sandwich Cookies,61,19,5.8,389851,709,2,0,21,6.0,20,0,both
3,1,Chocolate Sandwich Cookies,61,19,5.8,652770,764,1,3,13,,10,0,both
4,1,Chocolate Sandwich Cookies,61,19,5.8,1813452,764,3,4,17,9.0,11,1,both


In [4]:
df_prods_ords_merged

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_last_order,add_to_cart_order,reordered,_merge
0,1,Chocolate Sandwich Cookies,61,19,5.8,3139998,138,28,6,11,3.0,5,0,both
1,1,Chocolate Sandwich Cookies,61,19,5.8,1977647,138,30,6,17,20.0,1,1,both
2,1,Chocolate Sandwich Cookies,61,19,5.8,389851,709,2,0,21,6.0,20,0,both
3,1,Chocolate Sandwich Cookies,61,19,5.8,652770,764,1,3,13,,10,0,both
4,1,Chocolate Sandwich Cookies,61,19,5.8,1813452,764,3,4,17,9.0,11,1,both
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32404854,49688,Fresh Foaming Cleanser,73,11,13.5,1788356,200215,2,0,9,5.0,27,0,both
32404855,49688,Fresh Foaming Cleanser,73,11,13.5,3401313,200377,1,4,11,,5,0,both
32404856,49688,Fresh Foaming Cleanser,73,11,13.5,809510,200873,5,3,8,15.0,12,0,both
32404857,49688,Fresh Foaming Cleanser,73,11,13.5,2359893,200873,9,3,15,5.0,11,1,both


# 02. Creating price flag

- Creating new subset of data
- Defining price segments
- Counting values per price segment

In [5]:
#Creating df subset consisting of the first 1,000,000 rows

df = df_prods_ords_merged[:1000000]

In [6]:
df.shape

(1000000, 14)

In [7]:
#Defining the price segments

def price_label(row):

  if row['prices'] <= 5:
    return 'Low-range product'
  elif (row['prices'] > 5) and (row['prices'] <= 15):
    return 'Mid-range product'
  elif row['prices'] > 15:
    return 'High range'
  else: return 'Not enough data'

In [8]:
#Applying the function defined above

df['price_range'] = df.apply(price_label, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['price_range'] = df.apply(price_label, axis=1)


In [9]:
# Checking the values in the new column

df['price_range'].value_counts(dropna = False)

price_range
Mid-range product    652638
Low-range product    338018
High range             9344
Name: count, dtype: int64

In [10]:
#Looking for highest priced item in the subset

df['prices'].max()

24.5

In [12]:
#Writing conditions for high-range products

df.loc[df['prices'] > 15, 'price_range_loc'] = 'High-range product'

In [13]:
#Writing conditions for mid-range products

df.loc[(df['prices'] <= 15) & (df['prices'] > 5), 'price_range_loc'] = 'Mid-range product'

In [14]:
#Writing conditions for low-range products

df.loc[df['prices'] <= 5, 'price_range_loc'] = 'Low-range product'

In [15]:
#Counting products per label

df['price_range_loc'].value_counts(dropna = False)

price_range_loc
Mid-range product     652638
Low-range product     338018
High-range product      9344
Name: count, dtype: int64

In [16]:
#Looking through entire merged dataset for high-range products

df_prods_ords_merged.loc[df_prods_ords_merged['prices'] > 15, 'price_range_loc'] = 'High-range product'

In [17]:
#Looking through entire merged dataset for mid-range products

df_prods_ords_merged.loc[(df_prods_ords_merged['prices'] <= 15) & (df_prods_ords_merged['prices'] > 5), 'price_range_loc'] = 'Mid-range product'

In [18]:
#Looking through entire merged dataset for low-range products

df_prods_ords_merged.loc[df_prods_ords_merged['prices'] <= 5, 'price_range_loc'] = 'Low-range product'

In [19]:
#Counting products per label in the entire merged dataset

df_prods_ords_merged['price_range_loc'].value_counts(dropna = False)

price_range_loc
Mid-range product     21860860
Low-range product     10126321
High-range product      417678
Name: count, dtype: int64

# 03. Creating busiest day flag

- Defining busyness level segments
- Counting values per busyness segment

In [20]:
#Counting busiest day in the entire merged dataset

df_prods_ords_merged['orders_day_of_week'].value_counts(dropna = False)

orders_day_of_week
0    6204182
1    5660230
6    4496490
2    4213830
5    4205791
3    3840534
4    3783802
Name: count, dtype: int64

In [21]:
#Organizing days of the week by busyness level

result = []

for value in df_prods_ords_merged["orders_day_of_week"]:
  if value == 0:
    result.append("Busiest day")
  elif value == 4:
    result.append("Least busy")
  else:
    result.append("Regularly busy")

In [22]:
result

['Regularly busy',
 'Regularly busy',
 'Busiest day',
 'Regularly busy',
 'Least busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Least busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Busiest day',
 'Busiest day',
 'Busiest day',
 'Busiest day',
 'Busiest day',
 'Least busy',
 'Regularly busy',
 'Regularly busy',
 'Busiest day',
 'Regularly busy',
 'Regularly busy',
 'Least busy',
 'Regularly busy',
 'Busiest day',
 'Regularly busy',
 'Busiest day',
 'Least busy',
 'Busiest day',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Bus

In [23]:
#Creating a new column in my dataframe for busyness of day

df_prods_ords_merged['busiest_day']=result

In [24]:
# Counting values in busiest_day column

df_prods_ords_merged['busiest_day'].value_counts(dropna=False)

busiest_day
Regularly busy    22416875
Busiest day        6204182
Least busy         3783802
Name: count, dtype: int64

# 04. Creating busiest dayS flag

- Defining busyness level segments for most busy days (plural)
- Counting values per busyness segment

In [25]:
#Organizing days of the week by busyness level for the 2 busiest days and 2 least busy days

result_b = []

for value in df_prods_ords_merged["orders_day_of_week"]:
  if value == 0 or value == 1:
    result_b.append("Busiest days")
  elif value == 3 or value == 4:
    result_b.append("Least busy days")
  else:
    result_b.append("Regularly busy")

In [26]:
result_b

['Regularly busy',
 'Regularly busy',
 'Busiest days',
 'Least busy days',
 'Least busy days',
 'Busiest days',
 'Regularly busy',
 'Least busy days',
 'Busiest days',
 'Busiest days',
 'Regularly busy',
 'Least busy days',
 'Least busy days',
 'Regularly busy',
 'Least busy days',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Busiest days',
 'Busiest days',
 'Regularly busy',
 'Busiest days',
 'Busiest days',
 'Busiest days',
 'Busiest days',
 'Busiest days',
 'Busiest days',
 'Busiest days',
 'Busiest days',
 'Busiest days',
 'Least busy days',
 'Regularly busy',
 'Busiest days',
 'Busiest days',
 'Regularly busy',
 'Regularly busy',
 'Least busy days',
 'Busiest days',
 'Busiest days',
 'Busiest days',
 'Busiest days',
 'Least busy days',
 'Busiest days',
 'Busiest days',
 'Busiest days',
 'Least busy days',
 'Regularly busy',
 'Regularly busy',
 'Busiest days',
 'Busiest days',
 'Busiest days',
 'Least busy days',
 'Regularly busy',
 'Busiest days',
 'Regularly busy',


In [27]:
#Creating a new column in my dataframe for busiest and least busy 2 days

df_prods_ords_merged['busiest_days']=result_b

In [28]:
# Counting values

df_prods_ords_merged['busiest_days'].value_counts(dropna=False)

busiest_days
Regularly busy     12916111
Busiest days       11864412
Least busy days     7624336
Name: count, dtype: int64

In [29]:
#Looking at the dataframe with the above changes made

df_prods_ords_merged.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_last_order,add_to_cart_order,reordered,_merge,price_range_loc,busiest_day,busiest_days
0,1,Chocolate Sandwich Cookies,61,19,5.8,3139998,138,28,6,11,3.0,5,0,both,Mid-range product,Regularly busy,Regularly busy
1,1,Chocolate Sandwich Cookies,61,19,5.8,1977647,138,30,6,17,20.0,1,1,both,Mid-range product,Regularly busy,Regularly busy
2,1,Chocolate Sandwich Cookies,61,19,5.8,389851,709,2,0,21,6.0,20,0,both,Mid-range product,Busiest day,Busiest days
3,1,Chocolate Sandwich Cookies,61,19,5.8,652770,764,1,3,13,,10,0,both,Mid-range product,Regularly busy,Least busy days
4,1,Chocolate Sandwich Cookies,61,19,5.8,1813452,764,3,4,17,9.0,11,1,both,Mid-range product,Least busy,Least busy days


In [30]:
#The sums from "busiest_day" (32,404,859)

#busiest_day
#Regularly busy    22416875
#Busiest day        6204182
#Least busy         3783802

#Match the sums from "busiest_days" (32,404,859)

#busiest_days
#Regularly busy     12916111
#Busiest days       11864412
#Least busy days     7624336

# 05. Creating order level flag

- Defining order level segments per hour
- Counting values of orders per hour

In [31]:
#Determining order activity by hour of day

df_prods_ords_merged['order_hour_of_day'].value_counts(dropna = False)

order_hour_of_day
10    2761760
11    2736140
14    2689136
15    2662144
13    2660954
12    2618532
16    2535202
9     2454203
17    2087654
8     1718118
18    1636502
19    1258305
20     976156
7      891054
21     795637
22     634225
23     402316
6      290493
0      218769
1      115700
5       87961
2       69375
4       53242
3       51281
Name: count, dtype: int64

Since we are looking to avoid app crashes at busiest hours, the top 8 values appear relatively close together, and 24 is divisible by 8 into 3 sections, this is how we will divide the day into periods of time labeled “Most orders,” “Average orders,” and “Fewest orders" in a new column labeled “busiest_period_of_day”.

In [32]:
#Organizing days by order level

result_c = []

for value in df_prods_ords_merged['order_hour_of_day']:
  if value in [10, 11, 14, 15, 13, 12, 16, 9]:
    result_c.append("Most orders")
  elif value in [23, 6, 0, 1, 5, 2, 4, 3]:
    result_c.append("Fewest orders")
  else:
    result_c.append("Average orders")

In [33]:
result_c

['Most orders',
 'Average orders',
 'Average orders',
 'Most orders',
 'Average orders',
 'Average orders',
 'Most orders',
 'Most orders',
 'Average orders',
 'Most orders',
 'Most orders',
 'Most orders',
 'Most orders',
 'Most orders',
 'Most orders',
 'Average orders',
 'Average orders',
 'Fewest orders',
 'Average orders',
 'Fewest orders',
 'Fewest orders',
 'Fewest orders',
 'Fewest orders',
 'Most orders',
 'Most orders',
 'Average orders',
 'Average orders',
 'Average orders',
 'Average orders',
 'Average orders',
 'Average orders',
 'Average orders',
 'Fewest orders',
 'Average orders',
 'Most orders',
 'Most orders',
 'Average orders',
 'Most orders',
 'Most orders',
 'Most orders',
 'Average orders',
 'Most orders',
 'Most orders',
 'Most orders',
 'Average orders',
 'Most orders',
 'Average orders',
 'Most orders',
 'Most orders',
 'Most orders',
 'Most orders',
 'Most orders',
 'Most orders',
 'Most orders',
 'Average orders',
 'Average orders',
 'Most orders',
 'Most ord

In [34]:
#Creating a new column in my dataframe for busiest period of the day

df_prods_ords_merged['busiest_period_of_day']=result_c

In [35]:
#Checking

df_prods_ords_merged.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_last_order,add_to_cart_order,reordered,_merge,price_range_loc,busiest_day,busiest_days,busiest_period_of_day
0,1,Chocolate Sandwich Cookies,61,19,5.8,3139998,138,28,6,11,3.0,5,0,both,Mid-range product,Regularly busy,Regularly busy,Most orders
1,1,Chocolate Sandwich Cookies,61,19,5.8,1977647,138,30,6,17,20.0,1,1,both,Mid-range product,Regularly busy,Regularly busy,Average orders
2,1,Chocolate Sandwich Cookies,61,19,5.8,389851,709,2,0,21,6.0,20,0,both,Mid-range product,Busiest day,Busiest days,Average orders
3,1,Chocolate Sandwich Cookies,61,19,5.8,652770,764,1,3,13,,10,0,both,Mid-range product,Regularly busy,Least busy days,Most orders
4,1,Chocolate Sandwich Cookies,61,19,5.8,1813452,764,3,4,17,9.0,11,1,both,Mid-range product,Least busy,Least busy days,Average orders


In [36]:
#Determining frequency of busiest period of the day

df_prods_ords_merged['busiest_period_of_day'].value_counts(dropna = False)

busiest_period_of_day
Most orders       21118071
Average orders     9997651
Fewest orders      1289137
Name: count, dtype: int64

# 06. Exporting flagged products and orders dataset

In [37]:
#Exporting flagged dataset to pickle

df_prods_ords_merged.to_pickle(r'/Users/zoey/Career Foundry Stuff/Instacart Basket Analysis - 14.12.2023/02 Data/Prepared Data/busy_ords_prods_merge.pkl')