# Contents
1. Importing libraries and files

2. Subsetting data to first 1 million rows

3. Creating price ranges for products with User-Defined Functions
   
4. Finding the Busiest time of day for orders using For-loop Functions  

5. Identifying the busiest and least busiest days of the week
   
6. Identifying Busiest hours of the day

7. Exporting Data

# 1. Importing Libraries and Files

In [1]:
#Import libraries
import pandas as pd
import numpy as np
import os

In [2]:
path= r'C:\Users\spada\OneDrive\Data Analytics\02-2023 Instacart Basket Analysis'

In [3]:
df_ords_prods_merged=pd.read_pickle(os.path.join(path, '02 Data', 'Prepared Data', 'orders_products_merged.pkl'))

# 2. Subsetting data to first 1 million rows

In [4]:
#subsetting dataframe to only use the first 1 million rows 
df = df_ords_prods_merged[:1000000]

In [5]:
df.shape

(1000000, 15)

In [6]:
df_ords_prods_merged.shape

(32404859, 15)

# 3. Creating price ranges for products with User-Defined Functions

In [7]:
#define a function
def price_label(row):

  if row['prices'] <= 5:
    return 'Low-range product'
  elif (row['prices'] > 5) and (row['prices'] <= 15):
    return 'Mid-range product'
  elif row['prices'] > 15:
    return 'High range'
  else: return 'Not enough data'

In [11]:
#creating new column for price range in dataframe
df['price_range'] = df.apply(price_label, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['price_range'] = df.apply(price_label, axis=1)


In [9]:
df['price_range'].value_counts(dropna=False)

Mid-range product    756450
Low-range product    243550
Name: price_range, dtype: int64

In [10]:
#Find highest priced item
df['prices'].max()

14.8

# loc() Functions

In [13]:
#define a function using loc
df.loc[df['prices'] > 15, 'price_range_loc'] = 'High-range product'

In [14]:
df.loc[(df['prices'] <= 15) & (df['prices'] > 5), 'price_range_loc'] = 'Mid-range product'

In [15]:
df.loc[df['prices'] <= 5, 'price_range_loc'] = 'Low-range product'

In [16]:
df['price_range_loc'].value_counts(dropna=False)

Mid-range product    756450
Low-range product    243550
Name: price_range_loc, dtype: int64

In [17]:
#define a function using loc on entire dataframe
df_ords_prods_merged.loc[df_ords_prods_merged['prices'] > 15, 'price_range_loc'] = 'High-range product'

In [18]:
df_ords_prods_merged.loc[(df_ords_prods_merged['prices'] <= 15) & (df_ords_prods_merged['prices']>5),'price_range_loc'] = 'Mid-range product'

In [19]:
df_ords_prods_merged.loc[df_ords_prods_merged['prices'] <=5, 'price_range_loc']= 'Low-range product'

In [20]:
df_ords_prods_merged['price_range_loc'].value_counts(dropna=False)

Mid-range product     21860860
Low-range product     10126321
High-range product      417678
Name: price_range_loc, dtype: int64

In [21]:
df_ords_prods_merged['price_range_loc']

0           Mid-range product
1           Mid-range product
2           Mid-range product
3           Mid-range product
4           Mid-range product
                  ...        
32404854    Low-range product
32404855    Low-range product
32404856    Mid-range product
32404857    Mid-range product
32404858    Mid-range product
Name: price_range_loc, Length: 32404859, dtype: object

# 4. Finding the Busiest time of day for orders using For-loop Functions

In [20]:
#Finding what products are bought on the busiest and slowest days of the week using for-loops
df_ords_prods_merged['order_day_of_week'].value_counts(dropna=False)

0    6204182
1    5660230
6    4496490
2    4213830
5    4205791
3    3840534
4    3783802
Name: order_day_of_week, dtype: int64

In [21]:
result=[]

In [22]:
for value in df_ords_prods_merged["order_day_of_week"]:
  if value == 0:
    result.append("Busiest day")
  elif value == 4:
    result.append("Least busy")
  else:
    result.append("Regularly busy")

In [26]:
result

['Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Least busy',
 'Least busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Least busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Least busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Least busy',
 'Least busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Least busy',
 'Regularly busy',
 'Busiest day',
 'Regularly busy',
 'Reg

In [193]:
print(type(result))

<class 'list'>


In [23]:
df_ords_prods_merged['busiest_day'] = result

In [24]:
df_ords_prods_merged ['busiest_day'].value_counts(dropna=False)

Regularly busy    22416875
Busiest day        6204182
Least busy         3783802
Name: busiest_day, dtype: int64

# 5. Identifying the busiest and least busiest days of the week

In [195]:
#Finding what products are bought on the 2 busiest and 2 slowest days of the week using for-loops
df2 = df_ords_prods_merged['order_day_of_week'].reset_index()

In [196]:
print(df2['order_day_of_week'].head(10))

0    2
1    3
2    3
3    4
4    4
5    2
6    1
7    1
8    1
9    4
Name: order_day_of_week, dtype: int64


In [197]:
df2.value_counts(dropna=False)

index     order_day_of_week
0         2                    1
21603250  6                    1
21603248  1                    1
21603247  2                    1
21603246  1                    1
                              ..
10801614  1                    1
10801613  5                    1
10801612  3                    1
10801611  6                    1
32404858  4                    1
Length: 32404859, dtype: int64

In [198]:
result2 = []

In [199]:
for value in df2['order_day_of_week']:
  if value == 0 or value == 1:
    result2.append("Busiest days")
  elif value == 3 or value == 4:
    result2.append("Least busy")
  else:
    result2.append("Regularly busy")

In [200]:
#Checking the length
len(result2)

32404859

In [201]:
#Checking shape
df2.shape

(32404859, 2)

In [202]:
df_ords_prods_merged["Busiest days"] = result2

In [203]:
result2

['Regularly busy',
 'Least busy',
 'Least busy',
 'Least busy',
 'Least busy',
 'Regularly busy',
 'Busiest days',
 'Busiest days',
 'Busiest days',
 'Least busy',
 'Busiest days',
 'Regularly busy',
 'Regularly busy',
 'Busiest days',
 'Busiest days',
 'Regularly busy',
 'Regularly busy',
 'Least busy',
 'Least busy',
 'Least busy',
 'Least busy',
 'Least busy',
 'Least busy',
 'Busiest days',
 'Busiest days',
 'Busiest days',
 'Regularly busy',
 'Regularly busy',
 'Busiest days',
 'Regularly busy',
 'Regularly busy',
 'Busiest days',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Least busy',
 'Regularly busy',
 'Least busy',
 'Busiest days',
 'Busiest days',
 'Regularly busy',
 'Least busy',
 'Least busy',
 'Regularly busy',
 'Regularly busy',
 'Busiest days',
 'Busiest days',
 'Regularly busy',
 'Busiest days',
 'Busiest days',
 'Least busy',
 'Regularly busy',
 'Busiest days',
 'Busiest days',
 'Busiest days',
 'Busiest days',
 'Busiest days',
 'Regularly busy',
 'Leas

In [204]:
print(type(result2))

<class 'list'>


In [209]:
#checking for accuracy
df_ords_prods_merged['Busiest days'].value_counts(dropna=False)

Regularly busy    12916111
Busiest days      11864412
Least busy         7624336
Name: Busiest days, dtype: int64

# 6. Identifying Busiest hours of the day

In [224]:
#Finding the busiest time of day that orders are placed
df_ords_prods_merged['order_hour_of_day'].value_counts(dropna=False)

10    2761760
11    2736140
14    2689136
15    2662144
13    2660954
12    2618532
16    2535202
9     2454203
17    2087654
8     1718118
18    1636502
19    1258305
20     976156
7      891054
21     795637
22     634225
23     402316
6      290493
0      218769
1      115700
5       87961
2       69375
4       53242
3       51281
Name: order_hour_of_day, dtype: int64

### The hours of the day will be split into 3 equal parts to account for the periods of the day when there were the Most orders placed, Average Orders placed, and Fewest orders placed.

Most orders placed= 10, 11, 14, 15, 13, 12, 16, 9
Average orders placed = 17, 8, 18, 19, 20, 7, 21, 22
Fewest orders placed= 23, 6, 0, 1, 5, 2, 4, 3

In [225]:
df3 = df_ords_prods_merged['order_hour_of_day'].reset_index()

In [227]:
df3.value_counts(dropna=False)

index     order_hour_of_day
0         8                    1
21603250  16                   1
21603248  11                   1
21603247  11                   1
21603246  15                   1
                              ..
10801614  17                   1
10801613  15                   1
10801612  18                   1
10801611  14                   1
32404858  14                   1
Length: 32404859, dtype: int64

In [228]:
result3=[]

In [229]:
for value in df_ords_prods_merged["order_hour_of_day"]:
  if value in [10, 11, 14, 15, 13, 12, 16, 9]:
    result3.append("Most orders placed")
  elif value in [17, 8, 18, 19, 20, 7, 21, 22]:
    result3.append("Average orders placed")
  else:
    result3.append("Fewest orders placed")

In [230]:
result3

['Average orders placed',
 'Average orders placed',
 'Most orders placed',
 'Average orders placed',
 'Most orders placed',
 'Average orders placed',
 'Most orders placed',
 'Most orders placed',
 'Most orders placed',
 'Average orders placed',
 'Most orders placed',
 'Most orders placed',
 'Most orders placed',
 'Most orders placed',
 'Most orders placed',
 'Most orders placed',
 'Most orders placed',
 'Average orders placed',
 'Most orders placed',
 'Average orders placed',
 'Most orders placed',
 'Most orders placed',
 'Most orders placed',
 'Most orders placed',
 'Most orders placed',
 'Most orders placed',
 'Most orders placed',
 'Average orders placed',
 'Most orders placed',
 'Most orders placed',
 'Most orders placed',
 'Most orders placed',
 'Most orders placed',
 'Most orders placed',
 'Most orders placed',
 'Average orders placed',
 'Most orders placed',
 'Most orders placed',
 'Most orders placed',
 'Most orders placed',
 'Most orders placed',
 'Most orders placed',
 'Most 

In [231]:
#Checking the length
len(result3)

32404859

In [233]:
#renaming column to busiest_period_of_day
df_ords_prods_merged["busiest_period_of_day"] = result3

In [237]:
# check the output of the 'order_hour_of_day' and the 'busiest_period_of_day' column for accuracy
df_ords_prods_merged[['order_hour_of_day', 'busiest_period_of_day']].head(10)

Unnamed: 0,order_hour_of_day,busiest_period_of_day
0,8,Average orders placed
1,7,Average orders placed
2,12,Most orders placed
3,7,Average orders placed
4,15,Most orders placed
5,7,Average orders placed
6,9,Most orders placed
7,14,Most orders placed
8,16,Most orders placed
9,8,Average orders placed


In [238]:
#checking the frequency
df_ords_prods_merged['busiest_period_of_day'].value_counts(dropna=False)

Most orders placed       21118071
Average orders placed     9997651
Fewest orders placed      1289137
Name: busiest_period_of_day, dtype: int64

# 7. Exporting Data

In [240]:
# Export df_ords_prods_merged dataframe
df_ords_prods_merged.to_pickle(os.path.join(path, '02 Data','Prepared Data', 'df_ords_prods_merged2.pkl'))

In [5]:
df = pd.read_pickle(os.path.join(path, '02 Data','Prepared Data', 'df_ords_prods_merged2.pkl'))