# 4.8: Grouping Data & Aggregating Variables

## This notebook contains
1. Import data
2. Grouping data

In [1]:
# Import libraries

import pandas as pd
import numpy as np
import os

# 01. Import Data

In [2]:
# Set folder path
# The data folder can be overridden with the INSTACART_DATA_DIR environment variable so the
# notebook also runs on other machines; without it, the original local folder is used.
path = os.environ.get(
    'INSTACART_DATA_DIR',
    r'C:\Users\hachl\Downloads\4.3_orders_products\02 Data\Prepared Data',
)

In [3]:
# Load the merged orders/products dataframe from its pickle file in the data folder
# NOTE(review): unpickling can execute arbitrary code - only load files from a trusted source
merged_pickle_file = os.path.join(path, 'updated_data.pkl')
ords_prods_merge = pd.read_pickle(merged_pickle_file)

In [4]:
# Work on a subset: keep only the first 1,000,000 rows of the merged dataframe

subset_rows = 1000000
df = ords_prods_merge[:subset_rows]

In [5]:
# Check shape of subset: returns a (number of rows, number of columns) tuple
df.shape

(1000000, 19)

In [6]:
# Check overview of subset: display its first rows
df.head()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_last_order,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,prices,_merge,price_range_loc,busiest_day,busiest_days,busiest_hour,busiest_hours
0,2539329,1,1,2,8,,196,1,0,Soda,77,7,9.0,both,Mid-range product,Regularly day,Regularly busy,Average orders,Average orders
1,2398795,1,2,3,7,15.0,196,1,1,Soda,77,7,9.0,both,Mid-range product,Regularly day,Slowest days,Average orders,Average orders
2,473747,1,3,3,12,21.0,196,1,1,Soda,77,7,9.0,both,Mid-range product,Regularly day,Slowest days,Most orders,Most orders
3,2254736,1,4,4,7,29.0,196,1,1,Soda,77,7,9.0,both,Mid-range product,Least day,Slowest days,Average orders,Average orders
4,431534,1,5,4,15,28.0,196,1,1,Soda,77,7,9.0,both,Mid-range product,Least day,Slowest days,Most orders,Most orders


# 02. Grouping Data

In [7]:
# Group the data based on the 'product_name' variable
# No aggregation is applied here, so the cell only displays the GroupBy object itself

df.groupby('product_name')

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x0000020E4EC4E450>

In [8]:
# Mean of 'order_number' over all rows of each department: group by 'department_id', then aggregate

orders_by_department = df.groupby('department_id')
orders_by_department.agg({'order_number': ['mean']})

Unnamed: 0_level_0,order_number
Unnamed: 0_level_1,mean
department_id,Unnamed: 1_level_2
4,18.82578
7,17.472355
13,17.993423
14,19.246334
16,19.463012
17,11.294069
19,19.305237
20,17.599636


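### Interpretation: this is the mean order number (how far along customers are in their order history) of the items bought in each department, not a measure of sales volume. For example, items from produce (4) are bought at a considerably higher average order number than items from household goods (17), i.e. by customers with longer order histories.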

In [10]:
# Cross-check the result above: select the column on the grouped data, then call mean() directly

order_numbers_by_department = df.groupby('department_id')['order_number']
order_numbers_by_department.mean()

department_id
4     18.825780
7     17.472355
13    17.993423
14    19.246334
16    19.463012
17    11.294069
19    19.305237
20    17.599636
Name: order_number, dtype: float64

In [11]:
# Several aggregations at once: mean, minimum and maximum of 'order_number' per department

summary_functions = ['mean', 'min', 'max']
df.groupby('department_id').agg({'order_number': summary_functions})

Unnamed: 0_level_0,order_number,order_number,order_number
Unnamed: 0_level_1,mean,min,max
department_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
4,18.82578,1,99
7,17.472355,1,99
13,17.993423,1,99
14,19.246334,1,99
16,19.463012,1,99
17,11.294069,1,98
19,19.305237,1,99
20,17.599636,1,99


#### Identify loyal customers at Instacart so that the business strategy team can set up a bonus-point program for them.

In [12]:
# Step 1: create a new column containing the max. of the “order_number” column
# The data is grouped by user, and each user's highest order number is written onto every row of that user.
# The string alias 'max' is used instead of np.max: recent pandas versions emit a FutureWarning
# when a NumPy callable is passed to transform(), while the alias yields the same result.

ords_prods_merge['max_order'] = ords_prods_merge.groupby('user_id')['order_number'].transform('max')

#### Explanation: a new column called 'max_order' is created, which contains the maximum order number for each user. The dataframe is grouped by the 'user_id' column, and the transform() function is then applied to the 'order_number' column with the max function as its argument, so that every row receives the highest order number of the user it belongs to.

In [13]:
# Check output: the first 15 rows should now include the new 'max_order' column
ords_prods_merge.head(15)

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_last_order,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,prices,_merge,price_range_loc,busiest_day,busiest_days,busiest_hour,busiest_hours,max_order
0,2539329,1,1,2,8,,196,1,0,Soda,77,7,9.0,both,Mid-range product,Regularly day,Regularly busy,Average orders,Average orders,10
1,2398795,1,2,3,7,15.0,196,1,1,Soda,77,7,9.0,both,Mid-range product,Regularly day,Slowest days,Average orders,Average orders,10
2,473747,1,3,3,12,21.0,196,1,1,Soda,77,7,9.0,both,Mid-range product,Regularly day,Slowest days,Most orders,Most orders,10
3,2254736,1,4,4,7,29.0,196,1,1,Soda,77,7,9.0,both,Mid-range product,Least day,Slowest days,Average orders,Average orders,10
4,431534,1,5,4,15,28.0,196,1,1,Soda,77,7,9.0,both,Mid-range product,Least day,Slowest days,Most orders,Most orders,10
5,3367565,1,6,2,7,19.0,196,1,1,Soda,77,7,9.0,both,Mid-range product,Regularly day,Regularly busy,Average orders,Average orders,10
6,550135,1,7,1,9,20.0,196,1,1,Soda,77,7,9.0,both,Mid-range product,Regularly day,Busiest days,Most orders,Most orders,10
7,3108588,1,8,1,14,14.0,196,2,1,Soda,77,7,9.0,both,Mid-range product,Regularly day,Busiest days,Most orders,Most orders,10
8,2295261,1,9,1,16,0.0,196,4,1,Soda,77,7,9.0,both,Mid-range product,Regularly day,Busiest days,Most orders,Most orders,10
9,2550362,1,10,4,8,30.0,196,1,1,Soda,77,7,9.0,both,Mid-range product,Least day,Slowest days,Average orders,Average orders,10


In [15]:
# Lift the row display limit so that pandas prints every requested row of a dataframe

pd.set_option('display.max_rows', None)

In [None]:
# Check first 100 rows of the full merged dataframe (ords_prods_merge, not the df subset)
ords_prods_merge.head(100)

#### Expanding the number of displayed rows allows me to check whether the aggregation procedure was successful. I will now delete the output to manage the notebook size.

In [18]:
# Step 2: add a 'loyalty_flag' column that labels each row by the customer's highest order number
# Highest order number above 40 --> 'Loyal customer'

is_loyal = ords_prods_merge['max_order'] > 40
ords_prods_merge.loc[is_loyal, 'loyalty_flag'] = 'Loyal customer'

In [19]:
# Highest order number above 10 and at most 40 --> 'Regular customer'
is_regular = (ords_prods_merge['max_order'] > 10) & (ords_prods_merge['max_order'] <= 40)
ords_prods_merge.loc[is_regular, 'loyalty_flag'] = 'Regular customer'

In [20]:
# Highest order number of 10 or less --> 'New customer'
is_new = ords_prods_merge['max_order'] <= 10
ords_prods_merge.loc[is_new, 'loyalty_flag'] = 'New customer'

In [21]:
# Frequency of each 'loyalty_flag' value; dropna=False also counts rows left without a flag

loyalty_flags = ords_prods_merge['loyalty_flag']
loyalty_flags.value_counts(dropna=False)

loyalty_flag
Regular customer    15876776
Loyal customer      10284093
New customer         6243990
Name: count, dtype: int64

In [23]:
# Check output: last 60 rows, restricted to the columns needed to verify the flag

flag_check_columns = ['user_id', 'loyalty_flag', 'order_number']
ords_prods_merge[flag_check_columns].tail(60)

Unnamed: 0,user_id,loyalty_flag,order_number
32404799,183369,Regular customer,18
32404800,183369,Regular customer,26
32404801,183867,Loyal customer,19
32404802,185887,New customer,1
32404803,186019,Regular customer,10
32404804,198888,Regular customer,34
32404805,186019,Regular customer,14
32404806,186227,Loyal customer,17
32404807,186321,New customer,4
32404808,186870,Regular customer,17
