# 4.7 Deriving New Variables
## 4.7.0 Initialization
## 4.7.1 Path Setup and Import 
## 4.7.2 Function Definition
## 4.7.3 Apply Function to Data Frame
## 4.7.4 Using loc()
## 4.7.5 Using For-loops
## 4.7.6 Deriving New Variables Task
### 4.7.6.1 Busiest hour of the day for-loop

## 4.7.0 Initialization

In [None]:
import pandas as pd
import numpy as np
import os

## 4.7.1 Path Setup and Import 

In [None]:
# Project root folder. Set the INSTACART_PROJECT_DIR environment variable to run this
# notebook on another machine; when it is unset, the original local path is used.
path = os.environ.get(
    'INSTACART_PROJECT_DIR',
    '/Users/matthewmacbook/Documents/CareerFoundry/Data Immersion/Achievement 4 - Python Fundamentals for Data Analysts/Instacart Basket Analysis 26-07-2023',
)

In [None]:
# load the merged orders/products frame from the project's '02 Data/Prepared Data' folder
# NOTE: unpickling can execute arbitrary code, so only read pickle files from a trusted source
ords_prods_merge = pd.read_pickle(os.path.join(path, '02 Data', 'Prepared Data', 'orders_products_merged.pkl'))

In [None]:
# Take the first 1,000,000 rows as an independent copy, so that the column assignments
# made on df act on a frame that owns its data and do not trigger pandas'
# SettingWithCopyWarning (a bare slice may be a view of ords_prods_merge).
df = ords_prods_merge[:1000000].copy()

## 4.7.2 Function Definition

In [None]:
# create a function that returns a string related to the price of a product
def price_label(row):
    """Return a price-range label for one row.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must support ``row['prices']``.

    Returns
    -------
    str
        'Low-range product' for prices up to and including 5,
        'Mid-range product' for prices above 5 and up to and including 15,
        'High-range product' for prices above 15,
        'Not enough data' when none of the comparisons hold (e.g. a NaN price).
    """
    price = row['prices']

    # checked from the highest band down; each test only needs its lower bound
    if price > 15:
        return 'High-range product'
    if price > 5:
        return 'Mid-range product'
    if price <= 5:
        return 'Low-range product'

    # reached only when every comparison is False, as happens with NaN
    return 'Not enough data'

## 4.7.3 Apply Function to Data Frame

In [None]:
# label every row by calling price_label once per row (axis=1 passes each row to the function)
# pandas may emit a SettingWithCopyWarning on this assignment when df is a slice of another frame
# the .loc-based assignment in section 4.7.4 builds the same labels without a row-wise function

df['price_range'] = df.apply(price_label, axis=1)

In [None]:
# check frequency
df['price_range'].value_counts(dropna = False)

In [None]:
# the frequency check showed no rows labelled 'High-range product' in this subset
# inspect the maximum price to compare it against the 15 cutoff used by price_label
df['prices'].max()

## 4.7.4 Using loc()

In [None]:
# build the same labels with boolean masks and .loc instead of the row-wise function
# this first assignment creates 'price_range_loc'; rows the mask does not match are left
# as NaN until one of the following assignments fills them

df.loc[df['prices'] > 15, 'price_range_loc'] = 'High-range product'

In [None]:
# prices above 5 and up to and including 15 are mid-range
df.loc[(df['prices'] <= 15) & (df['prices'] > 5), 'price_range_loc'] = 'Mid-range product'

In [None]:
# prices of 5 or less are low-range
df.loc[df['prices'] <= 5, 'price_range_loc'] = 'Low-range product'

In [None]:
# compare with the 'price_range' frequencies from the user-defined function: the counts
# should match, except that a row with a missing price stays NaN here instead of being
# labelled 'Not enough data'

df['price_range_loc'].value_counts(dropna = False)

## 4.7.5 Using For-loops

In [None]:
# count the rows for each day of the week to see which days are the busiest and least busy
# the labelling step that follows treats day 0 as the busiest and day 4 as the least busy

ords_prods_merge['orders_day_of_week'].value_counts(dropna = False)

In [None]:
# walk through every value of 'orders_day_of_week' and pick a label for it:
# day 0 is the busiest day, day 4 the least busy, every other value is regular
# the labels are collected in a plain list, one entry per row and in row order

result = [
    "Busiest day" if day == 0
    else "Least busy" if day == 4
    else "Regularly busy"
    for day in ords_prods_merge["orders_day_of_week"]
]

In [None]:
# attach the labels as a new column; the list holds one entry per row of ords_prods_merge

ords_prods_merge['busiest_day'] = result

In [None]:
# check frequency of newly added column

ords_prods_merge['busiest_day'].value_counts(dropna = False)

## 4.7.6 Deriving New Variables Task

In [None]:
# same labelling idea, widened to the two busiest days (0 and 1) and the two
# least busy days (3 and 4); every other value is labelled as regular

new_result = [
    "Busiest days" if day in (0, 1)
    else "Least busy" if day in (3, 4)
    else "Regularly busy"
    for day in ords_prods_merge["orders_day_of_week"]
]

In [None]:
ords_prods_merge['busiest_days'] = new_result

In [None]:
ords_prods_merge.head()

In [None]:
ords_prods_merge['busiest_days'].value_counts(dropna = False)

### 4.7.6.1 Busiest hour of the day for-loop

In [None]:
# check frequency of orders by hour of day

ords_prods_merge['order_hour_of_day'].value_counts(dropna = False)

Split the 24 hours listed above (ordered from most to fewest orders) into three groups of eight hours each:

Top section: 'Most orders' [10, 11, 14, 15, 13, 12, 16, 9]

Middle section: 'Average orders' [17, 8, 18, 19, 20, 7, 21, 22]

Bottom section: 'Least orders' [23, 6, 0, 1, 5, 2, 4, 3]

In [27]:
# assign each order hour to one of three activity bands using membership tests
# any value outside the first two groups falls through to "Least orders"

hour_result = [
    "Most orders" if hour in (9, 10, 11, 12, 13, 14, 15, 16)
    else "Average orders" if hour in (7, 8, 17, 18, 19, 20, 21, 22)
    else "Least orders"
    for hour in ords_prods_merge["order_hour_of_day"]
]

In [28]:
ords_prods_merge['busiest_period_of_day'] = hour_result

In [29]:
ords_prods_merge.head()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,first_order,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,prices,_merge,busiest_day,busiest_days,busiest_period_of_day
0,2539329,1,1,2,8,,True,196,1,0,Soda,77,7,9.0,both,Regularly busy,Regularly busy,Average orders
1,2398795,1,2,3,7,15.0,False,196,1,1,Soda,77,7,9.0,both,Regularly busy,Least busy,Average orders
2,473747,1,3,3,12,21.0,False,196,1,1,Soda,77,7,9.0,both,Regularly busy,Least busy,Most orders
3,2254736,1,4,4,7,29.0,False,196,1,1,Soda,77,7,9.0,both,Least busy,Least busy,Average orders
4,431534,1,5,4,15,28.0,False,196,1,1,Soda,77,7,9.0,both,Least busy,Least busy,Most orders


In [30]:
# check frequencies
ords_prods_merge['busiest_period_of_day'].value_counts(dropna = False)

Most orders       21118071
Average orders     9997651
Least orders       1289137
Name: busiest_period_of_day, dtype: int64

In [31]:
# save the merged frame, now carrying the 'busiest_day', 'busiest_days' and
# 'busiest_period_of_day' columns, as a new pickle file in '02 Data/Prepared Data'


ords_prods_merge.to_pickle(os.path.join(path, '02 Data','Prepared Data', 'orders_products_merged_busy.pkl'))