# Deriving New Variables

## Import Libraries and Data


In [7]:
# Import necessary libraries
import pandas as pd
import os

# Folder that holds every prepared data file used in this notebook
path = r'C:\Users\mshhan\Documents\05-2024 Instacart Basket Analysis\02 Data\Prepared Data'

# Load the combined orders/products frame from its pickle and report its size
df_combined = pd.read_pickle(os.path.join(path, 'orders_products_combined.pkl'))
print('Shape of df_combined:', df_combined.shape)

# Load the cleaned products table and report its size
df_prods = pd.read_csv(os.path.join(path, 'products_cleaned.csv'))
print('Shape of df_prods:', df_prods.shape)

# A '_merge' column left by an earlier merge would clash with the indicator
# column requested below, so move it out of the way first
if '_merge' in df_combined:
    df_combined = df_combined.rename(columns={'_merge': '_merge_existing'})

# Inner join on 'product_id' (inner is pandas' default `how`); rows whose
# product_id is missing from either frame are dropped. The recorded output
# shows 1,459 fewer rows in df_final than in df_combined.
df_final = pd.merge(df_combined, df_prods, how='inner', on='product_id', indicator=True)

# Report the size of the merged frame
print('Shape of df_final:', df_final.shape)


Shape of df_combined: (32434489, 10)
Shape of df_prods: (49688, 5)
Shape of df_final: (32433030, 15)


## Create a 'price_label' Column


In [8]:
# Function to label products based on price
def price_label(row):
    """Return the price-range label for a single row.

    Reads ``row['prices']`` and returns:

    * 'Low-range product'  for prices up to and including 5
    * 'Mid-range product'  for prices above 5, up to and including 15
    * 'High-range product' for prices above 15
    * 'Not enough data'    when none of the comparisons hold (e.g. NaN)
    """
    price = row['prices']

    # Test from the top band downwards; each guard returns as soon as it matches
    if price > 15:
        return 'High-range product'
    if price > 5:
        return 'Mid-range product'
    if price <= 5:
        return 'Low-range product'

    # Only reached when every comparison is False, as happens with NaN
    return 'Not enough data'

# Build the 'price_label' column with vectorised boolean masks instead of a
# row-wise DataFrame.apply, which calls a Python function once per row.
# The thresholds and label strings are the same ones price_label uses.
# Start from the fallback label so rows matching no mask (e.g. NaN prices) keep it.
df_final['price_label'] = 'Not enough data'
df_final.loc[df_final['prices'] <= 5, 'price_label'] = 'Low-range product'
df_final.loc[(df_final['prices'] > 5) & (df_final['prices'] <= 15), 'price_label'] = 'Mid-range product'
df_final.loc[df_final['prices'] > 15, 'price_label'] = 'High-range product'

# Check the value counts for the new column
print(df_final['price_label'].value_counts())


price_label
Mid-range product     21889009
Low-range product     10126339
High-range product      417682
Name: count, dtype: int64


## Create a 'busiest_days_updated' Column


In [9]:
# List the column names of df_final so the day-of-week column name can be
# confirmed before it is used in the next cell
print(df_final.columns)


Index(['order_id', 'user_id', 'order_number', 'day_of_week',
       'order_hour_of_day', 'days_since_prior_order', 'product_id',
       'add_to_cart_order', 'reordered', '_merge_existing', 'product_name',
       'aisle_id', 'department_id', 'prices', '_merge', 'price_label'],
      dtype='object')


In [10]:
# Frequency of days of the week
day_counts = df_final['day_of_week'].value_counts()
print(day_counts)

# Creating the 'busiest_days_updated' column.
# Vectorised membership masks replace the per-row Python loop; the labels are
# the same: days 0 and 1 -> 'Busiest days', days 3 and 4 -> 'Slowest days',
# everything else -> 'Regularly busy'.
df_final['busiest_days_updated'] = 'Regularly busy'
df_final.loc[df_final['day_of_week'].isin([3, 4]), 'busiest_days_updated'] = 'Slowest days'
# Assigned last so the 'Busiest days' rule takes precedence, as it did in the if/elif chain
df_final.loc[df_final['day_of_week'].isin([0, 1]), 'busiest_days_updated'] = 'Busiest days'

# Check the values
df_final[['day_of_week', 'busiest_days_updated']].head(20)


day_of_week
0    6209410
1    5665604
6    4500101
2    4217610
5    4209334
3    3843929
4    3787042
Name: count, dtype: int64


Unnamed: 0,day_of_week,busiest_days_updated
0,2,Regularly busy
1,3,Slowest days
2,3,Slowest days
3,4,Slowest days
4,4,Slowest days
5,2,Regularly busy
6,1,Busiest days
7,1,Busiest days
8,1,Busiest days
9,4,Slowest days


# Updating 'busiest_day' Column to Reflect Two Busiest and Two Slowest Days


In [11]:
# Folder that holds the prepared data files
path = r'C:\Users\mshhan\Documents\05-2024 Instacart Basket Analysis\02 Data\Prepared Data'

# Replace the in-memory df_final with the frame stored in 'ords_prods_merge.pkl'.
# NOTE(review): the export cell further down writes df_final back to this same
# file name, so what is loaded here depends on earlier runs - confirm intended.
merged_pickle = os.path.join(path, 'ords_prods_merge.pkl')
df_final = pd.read_pickle(merged_pickle)

# Report the size of the loaded frame
print('Shape of df_final:', df_final.shape)

Shape of df_final: (32433030, 17)


## Identify Two Busiest and Two Slowest Days


In [12]:
# Count rows per day of the week and rank the days from most to fewest
order_freq = df_final['day_of_week'].value_counts().sort_values(ascending=False)
print('Order frequency by day of week:', order_freq)

# The first two entries of the ranking are the two busiest days
busiest_days = order_freq.head(2).index.tolist()
print('Busiest days:', busiest_days)

# The last two entries of the ranking are the two slowest days
slowest_days = order_freq.tail(2).index.tolist()
print('Slowest days:', slowest_days)


Order frequency by day of week: day_of_week
0    6209410
1    5665604
6    4500101
2    4217610
5    4209334
3    3843929
4    3787042
Name: count, dtype: int64
Busiest days: [0, 1]
Slowest days: [3, 4]


## Create New Column with Updated Labels

In [13]:
# Label every row from its day of week using vectorised membership masks
# instead of a per-row Python loop. Rows whose day is in neither list keep
# the default 'Regularly busy' label.
df_final['busiest_days_updated'] = 'Regularly busy'
df_final.loc[df_final['day_of_week'].isin(slowest_days), 'busiest_days_updated'] = 'Slowest days'
# Assigned last so a day present in both lists ends up 'Busiest days',
# matching the precedence of the original if/elif chain
df_final.loc[df_final['day_of_week'].isin(busiest_days), 'busiest_days_updated'] = 'Busiest days'

# Check the value counts for the new column
print(df_final['busiest_days_updated'].value_counts())


busiest_days_updated
Regularly busy    12927045
Busiest days      11875014
Slowest days       7630971
Name: count, dtype: int64


In [14]:
# Display a sample of rows to manually verify the new column values.
# A fixed random_state makes the same 20 rows come back on every run, so the
# manual check can be repeated against identical output.
sample = df_final[['day_of_week', 'busiest_days_updated']].sample(20, random_state=42)
print(sample)


          day_of_week busiest_days_updated
1535190             5       Regularly busy
18772978            1         Busiest days
7519348             4         Slowest days
30104107            0         Busiest days
12960970            6       Regularly busy
15354319            4         Slowest days
18387073            2       Regularly busy
11751154            1         Busiest days
8783716             3         Slowest days
8160334             2       Regularly busy
15243296            2       Regularly busy
9957983             1         Busiest days
21610985            3         Slowest days
2189360             6       Regularly busy
25081236            2       Regularly busy
27351433            2       Regularly busy
9455580             2       Regularly busy
11009577            4         Slowest days
19502053            0         Busiest days
8433832             0         Busiest days


In [15]:
# Recompute the per-day row counts, largest first, so they can be read
# side by side with the labels in 'busiest_days_updated'
order_freq = df_final['day_of_week'].value_counts()
order_freq = order_freq.sort_values(ascending=False)
print(order_freq)


day_of_week
0    6209410
1    5665604
6    4500101
2    4217610
5    4209334
3    3843929
4    3787042
Name: count, dtype: int64


## Exporting the Final Dataframe


In [16]:
# Write df_final to the prepared-data folder in two formats.
# NOTE(review): 'ords_prods_merge.pkl' is also the file this notebook reads
# df_final from, so this export overwrites that input - confirm intended.
pickle_target = os.path.join(path, 'ords_prods_merge.pkl')
csv_target = os.path.join(path, 'ords_prods_merge.csv')

# Pickle copy
df_final.to_pickle(pickle_target)

# CSV copy
df_final.to_csv(csv_target)



### Analysis

- **Busiest Days (0 and 1)**:
  - The days with the highest order counts are day 0 (Saturday) with 6,209,410 orders and day 1 (Sunday) with 5,665,604 orders.
  - In the sample, these days are correctly labeled as "Busiest days".

- **Slowest Days (3 and 4)**:
  - The days with the lowest order counts are day 3 (Tuesday) with 3,843,929 orders and day 4 (Wednesday) with 3,787,042 orders.
  - In the sample, these days are correctly labeled as "Slowest days".

- **Regularly Busy Days (2, 5, 6)**:
  - The remaining days are day 2 (Monday), day 5 (Thursday), and day 6 (Friday) with order counts of 4,217,610, 4,209,334, and 4,500,101 respectively.
  - In the sample, these days are correctly labeled as "Regularly busy".

The values in the new column 'busiest_days_updated' appear to be accurate based on the day of the week order frequencies and the sample inspection. The labels correctly correspond to the two busiest days, the two slowest days, and the regularly busy days.


# Identifying the Busiest Periods of the Day

## Introduction
To prevent the Instacart app from freezing during peak times, we need to identify the busiest hours of the day. We'll create a new column, `busiest_period_of_day`, with the labels "Most orders," "Average orders," and "Fewest orders" based on the number of orders per hour.

In [17]:
# Preview the first rows of df_final to see the columns available,
# including 'order_hour_of_day', before deriving the new column
df_final.head()


Unnamed: 0,order_id,user_id,order_number,day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,_merge_existing,product_name,aisle_id,department_id,prices,_merge,price_label,busiest_day,busiest_days_updated
0,2539329,1,1,2,8,,196,1,0,both,Soda,77,7,9.0,both,Mid-range product,Regularly busy,Regularly busy
1,2398795,1,2,3,7,15.0,196,1,1,both,Soda,77,7,9.0,both,Mid-range product,Regularly busy,Slowest days
2,473747,1,3,3,12,21.0,196,1,1,both,Soda,77,7,9.0,both,Mid-range product,Regularly busy,Slowest days
3,2254736,1,4,4,7,29.0,196,1,1,both,Soda,77,7,9.0,both,Mid-range product,Least busy,Slowest days
4,431534,1,5,4,15,28.0,196,1,1,both,Soda,77,7,9.0,both,Mid-range product,Least busy,Slowest days


## Calculate Order Frequency by Hour of Day

In [18]:
# Count the rows recorded for each hour of the day
order_hour_frequency = df_final['order_hour_of_day'].value_counts()

# Reorder by the hour value itself rather than by count
order_hour_frequency = order_hour_frequency.sort_index()

# Show the per-hour counts
print(order_hour_frequency)


order_hour_of_day
0      218925
1      115780
2       69429
3       51317
4       53280
5       88054
6      290763
7      891900
8     1719888
9     2456591
10    2764288
11    2738483
12    2620719
13    2663169
14    2691448
15    2664420
16    2537358
17    2089385
18    1637858
19    1259335
20     976991
21     796341
22     634715
23     402593
Name: count, dtype: int64


## Determine the Thresholds for "Most orders," "Average orders," and "Fewest orders"

In [19]:
# Summary statistics of the per-hour counts (includes the quartiles used below)
order_hour_description = order_hour_frequency.describe()

# Upper quartile of the hourly counts: cut-off for the top 25%
most_orders_threshold = order_hour_description.loc['75%']
# Lower quartile of the hourly counts: cut-off for the bottom 25%
fewest_orders_threshold = order_hour_description.loc['25%']

print("Most orders threshold:", most_orders_threshold)
print("Fewest orders threshold:", fewest_orders_threshold)


Most orders threshold: 2558198.25
Fewest orders threshold: 272803.5


## Create the busiest_period_of_day Column

In [24]:
# Frequency of order hours
hour_counts = df_final['order_hour_of_day'].value_counts()
print(hour_counts)

# Creating the 'busiest_period_of_day' column with vectorised boolean masks
# instead of a per-row Python loop. The hour ranges are unchanged:
#   Most orders:    10 AM to 4 PM (hours 10-16)
#   Average orders: 6 AM to 9 AM and 5 PM to 8 PM (hours 6-9 and 17-20)
#   Fewest orders:  all other times
# NOTE(review): these ranges are hard-coded and do not use
# most_orders_threshold / fewest_orders_threshold. In the recorded outputs
# hour 16 falls below the 75% threshold yet is labelled 'Most orders', and
# hours 21-23 sit above the 25% threshold yet are labelled 'Fewest orders' -
# confirm which definition is intended.
order_hour = df_final['order_hour_of_day']
is_most = (order_hour >= 10) & (order_hour <= 16)
is_average = ((order_hour >= 6) & (order_hour < 10)) | ((order_hour >= 17) & (order_hour < 21))

# Default first, then overwrite the rows matched by each (disjoint) mask
df_final['busiest_period_of_day'] = 'Fewest orders'
df_final.loc[is_average, 'busiest_period_of_day'] = 'Average orders'
df_final.loc[is_most, 'busiest_period_of_day'] = 'Most orders'

# Check the values
df_final[['order_hour_of_day', 'busiest_period_of_day']].head(20)



order_hour_of_day
10    2764288
11    2738483
14    2691448
15    2664420
13    2663169
12    2620719
16    2537358
9     2456591
17    2089385
8     1719888
18    1637858
19    1259335
20     976991
7      891900
21     796341
22     634715
23     402593
6      290763
0      218925
1      115780
5       88054
2       69429
4       53280
3       51317
Name: count, dtype: int64


Unnamed: 0,order_hour_of_day,busiest_period_of_day
0,8,Average orders
1,7,Average orders
2,12,Most orders
3,7,Average orders
4,15,Most orders
5,7,Average orders
6,9,Average orders
7,14,Most orders
8,16,Most orders
9,8,Average orders


### Verify the New Column

Print the Frequency for `busiest_period_of_day` Column

In [26]:
# Tally how many rows received each 'busiest_period_of_day' label and show it
period_labels = df_final['busiest_period_of_day']
busiest_period_freq = period_labels.value_counts()
print(busiest_period_freq)


busiest_period_of_day
Most orders       18679885
Average orders    11322711
Fewest orders      2430434
Name: count, dtype: int64


In [27]:
# Export the Dataframe as a Pickle File
# Build the target path with os.path.join, as the other reads and writes in
# this notebook do, instead of hard-coding a backslash separator in an f-string
df_final.to_pickle(os.path.join(path, 'ords_prods_updated.pkl'))

# Confirm the export by listing the files in the directory
print(os.listdir(path))

['departments_wrangled.csv', 'orders_cleaned.csv', 'orders_products_combined.csv', 'orders_products_combined.pkl', 'orders_wrangled.csv', 'ords_prods_merge.csv', 'ords_prods_merge.pkl', 'ords_prods_updated.pkl', 'products_cleaned.csv']
