# 4.8 - Grouping Data & Aggregating Variables

### This script contains the following points:

* 01 - Importing Libraries
* 02 - Importing Data
* 03 - Creating a Subset of 1 Million
* 04 - Grouping Data with Pandas
* 05 - Aggregating Data with agg()
* 06 - Aggregating Data with transform()
* 07 - Deriving Columns with loc()
* 08 - Exporting Data

---

## 01 - Importing Libraries

In [None]:
# Import libraries

import pandas as pd
import numpy as np
import os

---

## 02 - Importing Data

In [None]:
# Define path
# Project root folder; the import and export cells join their data sub-folders onto it.
# NOTE(review): this is an absolute, machine-specific location - adjust it before running elsewhere.

path = r'/Users/juanigalvalisi/01-07-2022 - Instacart Basket Analysis/'

In [None]:
# Import .PKL
# Read the pickled dataframe 'orders_products_merged_v2.pkl' from '02 - Data/Prepared Data' under `path`.
# NOTE(review): loading a pickle can execute code - only read files you created or trust.

ords_prods_merge = pd.read_pickle(os.path.join(path, '02 - Data', 'Prepared Data', 'orders_products_merged_v2.pkl'))

---

## 03 - Creating a Subset of 1 million

In [None]:
# Take the first 1,000,000 rows of ords_prods_merge as a smaller frame
# for the grouping and aggregation examples

df = ords_prods_merge[:1_000_000]

In [None]:
# Check output
# shape is (rows, columns); the row count should be at most 1,000,000 after the slice

df.shape

In [None]:
# Preview the first 10 rows of the subset
df.head(10)

---

## 04 - Grouping Data with Pandas

In [None]:
# 1) Split the data into groups based on the given criteria
# groupby() on its own only builds a GroupBy object keyed on 'product_name';
# nothing is aggregated until a function is applied to it.

df.groupby('product_name')

---

## 05 - Aggregating Data with agg()

In [None]:
# 2) Apply a function to each group separately
# Group the subset by 'department_id' and use agg() to obtain the mean of the
# 'order_number' column for each group

df.groupby('department_id').agg({'order_number': ['mean']})

In [None]:
# Aggregations that can be conducted without the use of agg()
# A) .mean() + SQUARE BRACKETS
# Select the 'order_number' column from the grouped data with [...] and call .mean() on it directly

df.groupby('department_id')['order_number'].mean()

In [None]:
# B) .mean() + DOT NOTATION
# Here the 'order_number' column is selected as an attribute of the grouped data before calling .mean()

df.groupby('department_id').order_number.mean()

In [None]:
# Performing Multiple Aggregations
# A list of function names passed to agg() is applied to 'order_number' within each
# 'department_id' group: here the mean, the minimum and the maximum

df.groupby('department_id').agg({'order_number': ['mean', 'min', 'max']})

In [None]:
# 3) Combine the results into a dataframe or alternative data
# structure or create a new column in the current dataframe.

---

## 06 - Aggregating Data with transform()

In [None]:
# Create a new column, 'max_order', holding the highest 'order_number' of each 'user_id'.
# transform() returns a result aligned with the original rows, so every row of a user
# receives that user's maximum and the result can be assigned straight back as a column.
# The aggregation is given by name ('max') instead of np.max: recent pandas versions
# emit a FutureWarning when a NumPy callable is passed here, and the string alias
# keeps the current group-wise maximum behavior.

ords_prods_merge['max_order'] = ords_prods_merge.groupby(['user_id'])['order_number'].transform('max')

In [None]:
# Check the output
# Preview the first 15 rows of ords_prods_merge

ords_prods_merge.head(15)

In [None]:
# Check the output without row truncation
# Raise the display limit just far enough to show all 100 previewed rows.
# Setting max_rows to None would remove the limit for every later cell as well,
# so displaying a whole column of this dataframe would try to render every row.

pd.options.display.max_rows = 100
ords_prods_merge.head(100)

---

## 07 - Deriving Columns with loc()



In [None]:
# Derive a 'loyalty_flag' label for each row from its 'max_order' value.
# Rows with max_order above 40 are labelled 'Loyal customer'.

ords_prods_merge.loc[ords_prods_merge['max_order'].gt(40), 'loyalty_flag'] = 'Loyal customer'

In [None]:
# Rows with max_order above 10 and up to 40 (inclusive) are labelled 'Regular customer'
ords_prods_merge.loc[
    ords_prods_merge['max_order'].gt(10) & ords_prods_merge['max_order'].le(40),
    'loyalty_flag',
] = 'Regular customer'

In [None]:
# Rows with max_order of 10 or less are labelled 'New customer'
ords_prods_merge.loc[ords_prods_merge['max_order'].le(10), 'loyalty_flag'] = 'New customer'

In [None]:
# Check the frequency of the new column 'loyalty_flag'
# dropna = False also counts rows that were left without a label (NaN), if any

ords_prods_merge['loyalty_flag'].value_counts(dropna = False)

In [None]:
# Check the output of this specific column
# NOTE(review): this displays the column of the full dataframe; how many rows are
# rendered depends on the current pd.options.display.max_rows setting

ords_prods_merge['loyalty_flag']

In [None]:
# Check the output of multiple columns at the same time
# A list of column names inside [] selects several columns; head(10) limits the preview to 10 rows

ords_prods_merge[['user_id', 'loyalty_flag', 'order_number']].head(10)

---

## 08 - Exporting Data



In [None]:
# Export ords_prods_merge as .pkl
# Writes the dataframe to 'orders_products_merged_v3.pkl' in '02 - Data/Prepared Data' under `path`

ords_prods_merge.to_pickle(os.path.join(path, '02 - Data', 'Prepared Data', 'orders_products_merged_v3.pkl'))