In [1]:
# ============================================================================
# NOTEBOOK 2: DATA CLEANING & MASTER DATASET CREATION
# Purpose: Clean data, handle missing values, create unified dataset
# ============================================================================

# Cell 1: Import Libraries & Load Data
# ============================================================================
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# Never truncate the column list when a DataFrame is rendered.
pd.set_option('display.max_columns', None)

print("üìÇ Loading datasets...")

# Read every raw CSV (paths are relative to the working directory) in a
# single pass; the file order below matches the order of the target names.
(
    customers,
    orders,
    order_items,
    products,
    sellers,
    payments,
    reviews,
    category_translation,
) = (
    pd.read_csv(csv_name)
    for csv_name in (
        'olist_customers_dataset.csv',
        'olist_orders_dataset.csv',
        'olist_order_items_dataset.csv',
        'olist_products_dataset.csv',
        'olist_sellers_dataset.csv',
        'olist_order_payments_dataset.csv',
        'olist_order_reviews_dataset.csv',
        'product_category_name_translation.csv',
    )
)

print("‚úÖ All datasets loaded!")

üìÇ Loading datasets...
‚úÖ All datasets loaded!


In [2]:
# Cell 2: Convert Date Columns
# ============================================================================
print("üìÖ Converting date columns...")

# Timestamp columns of the orders table that are parsed to datetimes.
date_columns = [
    'order_purchase_timestamp',
    'order_approved_at',
    'order_delivered_carrier_date',
    'order_delivered_customer_date',
    'order_estimated_delivery_date'
]

# errors='coerce' turns any value that cannot be parsed into NaT instead of
# raising, so bad entries surface later as missing values.
orders = orders.assign(
    **{name: pd.to_datetime(orders[name], errors='coerce') for name in date_columns}
)

# The two review timestamp columns get the same treatment.
for review_col in ('review_creation_date', 'review_answer_timestamp'):
    reviews[review_col] = pd.to_datetime(reviews[review_col], errors='coerce')

print("‚úÖ Date columns converted!")
purchase_ts = orders['order_purchase_timestamp']
print(f"\nDate range: {purchase_ts.min().date()} to {purchase_ts.max().date()}")

üìÖ Converting date columns...
‚úÖ Date columns converted!

Date range: 2016-09-04 to 2018-10-17


In [3]:
# Cell 3: Clean Orders Dataset
# ============================================================================
rule = "=" * 70
print(rule)
print("üßπ CLEANING ORDERS DATASET")
print(rule)

print(f"\nOriginal orders: {len(orders):,}")

# Flag every repeat of an order_id; the first occurrence is not flagged.
repeat_mask = orders['order_id'].duplicated()
duplicates = repeat_mask.sum()
print(f"Duplicate order_ids: {duplicates}")

# Drop the flagged repeats (if any) so each order_id appears exactly once.
if duplicates > 0:
    orders = orders.loc[~repeat_mask]
    print(f"After removing duplicates: {len(orders):,}")

# Show the status breakdown, then keep only delivered orders for the analysis.
print("\nOrder status distribution:")
print(orders['order_status'].value_counts())

delivered_mask = orders['order_status'].eq('delivered')
orders_delivered = orders.loc[delivered_mask].copy()
print(f"\n‚úÖ Filtered to delivered orders: {len(orders_delivered):,}")

# Null count and null percentage per column of the delivered subset;
# only columns with at least one null are printed.
print("\n‚ùì Missing values in delivered orders:")
missing = orders_delivered.isna().sum()
missing_pct = missing / len(orders_delivered) * 100
missing_df = missing.to_frame('Missing Count').assign(Percentage=missing_pct)
print(missing_df.loc[missing_df['Missing Count'] > 0])

# A delivered order is expected to carry a customer delivery date;
# rows lacking one are discarded.
n_without_delivery_date = orders_delivered['order_delivered_customer_date'].isna().sum()
print(f"\nOrders with missing delivery date: {n_without_delivery_date}")
orders_delivered = orders_delivered.dropna(subset=['order_delivered_customer_date'])
print(f"After removing missing delivery dates: {len(orders_delivered):,}")

üßπ CLEANING ORDERS DATASET

Original orders: 99,441
Duplicate order_ids: 0

Order status distribution:
order_status
delivered      96478
shipped         1107
canceled         625
unavailable      609
invoiced         314
processing       301
created            5
approved           2
Name: count, dtype: int64

‚úÖ Filtered to delivered orders: 96,478

‚ùì Missing values in delivered orders:
                               Missing Count  Percentage
order_approved_at                         14    0.014511
order_delivered_carrier_date               2    0.002073
order_delivered_customer_date              8    0.008292

Orders with missing delivery date: 8
After removing missing delivery dates: 96,470


In [4]:
# Cell 4: Clean Order Items Dataset
# ============================================================================
rule = "=" * 70
print("\n" + rule)
print("üßπ CLEANING ORDER ITEMS DATASET")
print(rule)

print(f"\nOriginal order items: {len(order_items):,}")

# Count rows that are exact copies of an earlier row (reported, not removed).
duplicates = order_items.duplicated().sum()
print(f"Duplicate rows: {duplicates}")

# Null count per column.
print("\n‚ùì Missing values:")
print(order_items.isna().sum())

# Report non-positive prices, then keep only strictly positive ones.
item_price = order_items['price']
print("\nPrice statistics:")
print(f"Zero prices: {item_price.eq(0).sum()}")
print(f"Negative prices: {item_price.lt(0).sum()}")

order_items_clean = order_items.loc[item_price.gt(0)].copy()
print(f"After removing zero/negative prices: {len(order_items_clean):,}")

# Report freight anomalies on the price-filtered rows.
freight = order_items_clean['freight_value']
print("\nFreight statistics:")
print(f"Zero freight: {freight.eq(0).sum()}")
print(f"Negative freight: {freight.lt(0).sum()}")

# Zero freight stays (it could be free shipping); negative freight is dropped.
order_items_clean = order_items_clean.loc[freight.ge(0)].copy()
print(f"After cleaning freight: {len(order_items_clean):,}")


üßπ CLEANING ORDER ITEMS DATASET

Original order items: 112,650
Duplicate rows: 0

‚ùì Missing values:
order_id               0
order_item_id          0
product_id             0
seller_id              0
shipping_limit_date    0
price                  0
freight_value          0
dtype: int64

Price statistics:
Zero prices: 0
Negative prices: 0
After removing zero/negative prices: 112,650

Freight statistics:
Zero freight: 383
Negative freight: 0
After cleaning freight: 112,650


In [5]:
# Cell 5: Clean Products Dataset
# ============================================================================
rule = "=" * 70
print("\n" + rule)
print("üßπ CLEANING PRODUCTS DATASET")
print(rule)

print(f"\nOriginal products: {len(products):,}")

# Null count and null percentage per column; only columns with nulls are shown.
print("\n‚ùì Missing values:")
missing = products.isna().sum()
missing_pct = missing / len(products) * 100
missing_df = missing.to_frame('Missing Count').assign(Percentage=missing_pct)
print(missing_df.loc[missing_df['Missing Count'] > 0])

# Left join keeps every product and attaches the English category name
# wherever the translation table has a matching product_category_name.
products_clean = pd.merge(
    products,
    category_translation,
    how='left',
    on='product_category_name',
)

# Products with no translated name (missing or unmatched category) are
# labelled 'unknown'.
english_name = products_clean['product_category_name_english']
products_clean['product_category_name_english'] = english_name.where(
    english_name.notna(), 'unknown'
)

n_unknown = products_clean['product_category_name_english'].eq('unknown').sum()
print(f"\n‚úÖ Products with English category names: {len(products_clean):,}")
print(f"Unknown categories: {n_unknown}")


üßπ CLEANING PRODUCTS DATASET

Original products: 32,951

‚ùì Missing values:
                            Missing Count  Percentage
product_category_name                 610    1.851234
product_name_lenght                   610    1.851234
product_description_lenght            610    1.851234
product_photos_qty                    610    1.851234
product_weight_g                        2    0.006070
product_length_cm                       2    0.006070
product_height_cm                       2    0.006070
product_width_cm                        2    0.006070

‚úÖ Products with English category names: 32,951
Unknown categories: 623


In [6]:
# Cell 6: Clean Payments Dataset
# ============================================================================
rule = "=" * 70
print("\n" + rule)
print("üßπ CLEANING PAYMENTS DATASET")
print(rule)

print(f"\nOriginal payments: {len(payments):,}")

# Null count per column.
print("\n‚ùì Missing values:")
print(payments.isna().sum())

# Report negative and zero payment amounts.
pay_value = payments['payment_value']
print(f"\nNegative payment values: {pay_value.lt(0).sum()}")
print(f"Zero payment values: {pay_value.eq(0).sum()}")

# Only negative amounts are dropped; zero-value payments are kept.
payments_clean = payments.loc[pay_value.ge(0)].copy()
print(f"After removing negative payments: {len(payments_clean):,}")


def _most_common_payment_type(types):
    """Return the most frequent payment type of one order.

    Takes the first entry of ``Series.mode()``; if the mode is empty,
    falls back to the first value of the group.
    """
    modes = types.mode()
    if len(modes) > 0:
        return modes[0]
    return types.iloc[0]


# Collapse to one row per order (an order can have several payment rows):
# highest sequence number, most common type, highest installment count and
# the summed payment value.
payments_agg = (
    payments_clean
    .groupby('order_id')
    .agg(
        payment_sequential=('payment_sequential', 'max'),
        payment_type=('payment_type', _most_common_payment_type),
        payment_installments=('payment_installments', 'max'),
        payment_value=('payment_value', 'sum'),
    )
    .reset_index()
)

print(f"\n‚úÖ Aggregated payments by order: {len(payments_agg):,}")


üßπ CLEANING PAYMENTS DATASET

Original payments: 103,886

‚ùì Missing values:
order_id                0
payment_sequential      0
payment_type            0
payment_installments    0
payment_value           0
dtype: int64

Negative payment values: 0
Zero payment values: 9
After removing negative payments: 103,886

‚úÖ Aggregated payments by order: 99,440


In [7]:
# Cell 7: Clean Reviews Dataset
# ============================================================================
rule = "=" * 70
print("\n" + rule)
print("üßπ CLEANING REVIEWS DATASET")
print(rule)

print(f"\nOriginal reviews: {len(reviews):,}")

# Null count and null percentage per column; only columns with nulls are shown.
print("\n‚ùì Missing values:")
missing = reviews.isna().sum()
missing_pct = missing / len(reviews) * 100
missing_df = missing.to_frame('Missing Count').assign(Percentage=missing_pct)
print(missing_df.loc[missing_df['Missing Count'] > 0])

# Keep the four columns used downstream and at most one review per order:
# when an order_id repeats, the row that appears first in the table wins.
kept_columns = ['review_id', 'order_id', 'review_score', 'review_creation_date']
reviews_clean = reviews.loc[:, kept_columns].drop_duplicates(subset='order_id', keep='first')

print(f"\n‚úÖ Cleaned reviews: {len(reviews_clean):,}")


üßπ CLEANING REVIEWS DATASET

Original reviews: 99,224

‚ùì Missing values:
                        Missing Count  Percentage
review_comment_title            87656   88.341530
review_comment_message          58247   58.702532

‚úÖ Cleaned reviews: 98,673


In [8]:
# Cell 8: Create Master Dataset - Step 1 (Orders + Customers)
# ============================================================================
rule = "=" * 70
print("\n" + rule)
print("üîó CREATING MASTER DATASET")
print(rule)

print("\nStep 1: Merging Orders + Customers...")

# Left join on customer_id: every delivered order is kept and gains the
# columns of its matching customer record.
master_df = pd.merge(orders_delivered, customers, how='left', on='customer_id')

print(f"‚úÖ After merging with customers: {len(master_df):,} rows")


üîó CREATING MASTER DATASET

Step 1: Merging Orders + Customers...
‚úÖ After merging with customers: 96,470 rows


In [9]:
# Cell 9: Create Master Dataset - Step 2 (Add Order Items)
# ============================================================================
print("\nStep 2: Merging with Order Items...")

# Inner join on order_id: orders without any item row are dropped, and an
# order with several items yields one row per item.
master_df = pd.merge(master_df, order_items_clean, how='inner', on='order_id')

print(f"‚úÖ After merging with order items: {len(master_df):,} rows")


Step 2: Merging with Order Items...
‚úÖ After merging with order items: 110,189 rows


In [10]:
# Cell 10: Create Master Dataset - Step 3 (Add Products)
# ============================================================================
print("\nStep 3: Merging with Products...")

# Product attributes carried into the master table (join key first).
product_columns = [
    'product_id',
    'product_category_name',
    'product_category_name_english',
    'product_weight_g',
    'product_length_cm',
    'product_height_cm',
    'product_width_cm',
]

# Left join on product_id: every item row is kept, with or without a match.
master_df = pd.merge(
    master_df,
    products_clean[product_columns],
    how='left',
    on='product_id',
)

print(f"‚úÖ After merging with products: {len(master_df):,} rows")


Step 3: Merging with Products...
‚úÖ After merging with products: 110,189 rows


In [11]:
# Cell 11: Create Master Dataset - Step 4 (Add Payments)
# ============================================================================
print("\nStep 4: Merging with Payments...")

# Left join on order_id: every item row is kept and receives the aggregated
# payment columns of its order (NaN where the order has no payment record).
#
# validate='many_to_one' makes pandas raise MergeError if payments_agg ever
# contains a repeated order_id, instead of silently multiplying item rows.
#
# CAUTION: payments_agg is order-level while master_df is item-level, so
# payment_value (and the other payment columns) is repeated on every item row
# of a multi-item order. Do not sum payment_value over master_df rows;
# de-duplicate on order_id first, or use price + freight_value for
# item-level revenue.
master_df = master_df.merge(
    payments_agg,
    on='order_id',
    how='left',
    validate='many_to_one',
)

print(f"‚úÖ After merging with payments: {len(master_df):,} rows")


Step 4: Merging with Payments...
‚úÖ After merging with payments: 110,189 rows


In [12]:
# Cell 12: Create Master Dataset - Step 5 (Add Reviews)
# ============================================================================
print("\nStep 5: Merging with Reviews...")

# Left join on order_id: item rows whose order has no review keep NaN in the
# review columns.
master_df = pd.merge(master_df, reviews_clean, how='left', on='order_id')

print(f"‚úÖ After merging with reviews: {len(master_df):,} rows")

# Summary of the assembled table.
rule = "=" * 70
print("\n" + rule)
print("‚úÖ MASTER DATASET CREATED!")
print(rule)

n_rows, n_cols = master_df.shape
print(f"\nFinal shape: {master_df.shape}")
print(f"Rows: {n_rows:,}")
print(f"Columns: {n_cols}")


Step 5: Merging with Reviews...
‚úÖ After merging with reviews: 110,189 rows

‚úÖ MASTER DATASET CREATED!

Final shape: (110189, 31)
Rows: 110,189
Columns: 31


In [13]:
# Cell 13: Feature Engineering
# ============================================================================
print("\n" + "="*70)
print("‚öôÔ∏è FEATURE ENGINEERING")
print("="*70)

# 1. Delivery time features
print("\n1Ô∏è‚É£ Creating delivery time features...")

# Whole days from purchase to delivery at the customer (.dt.days drops the
# fractional part of the timedelta).
master_df['delivery_days'] = (
    master_df['order_delivered_customer_date'] - master_df['order_purchase_timestamp']
).dt.days

# Whole days from purchase to the estimated delivery date.
master_df['expected_delivery_days'] = (
    master_df['order_estimated_delivery_date'] - master_df['order_purchase_timestamp']
).dt.days

# Positive = delivered after the estimate, zero or negative = on or before it.
master_df['delivery_delay_days'] = (
    master_df['order_delivered_customer_date'] - master_df['order_estimated_delivery_date']
).dt.days

# 1 when the delay is zero or negative, otherwise 0.
master_df['delivered_on_time'] = (master_df['delivery_delay_days'] <= 0).astype(int)

print(f"   Average delivery time: {master_df['delivery_days'].mean():.1f} days")
print(f"   On-time delivery rate: {master_df['delivered_on_time'].mean() * 100:.1f}%")

# 2. Date/time features
print("\n2Ô∏è‚É£ Creating date/time features...")

master_df['order_year'] = master_df['order_purchase_timestamp'].dt.year
master_df['order_month'] = master_df['order_purchase_timestamp'].dt.month
master_df['order_day'] = master_df['order_purchase_timestamp'].dt.day
master_df['order_dayofweek'] = master_df['order_purchase_timestamp'].dt.dayofweek
master_df['order_hour'] = master_df['order_purchase_timestamp'].dt.hour
master_df['order_quarter'] = master_df['order_purchase_timestamp'].dt.quarter

# Day name: dayofweek is 0 (Monday) .. 6 (Sunday), matching the list order.
# .map with a lookup dict replaces the per-row list-indexing lambda and yields
# NaN (instead of raising) should a weekday number ever be missing.
day_names = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
master_df['order_day_name'] = master_df['order_dayofweek'].map(dict(enumerate(day_names)))

# Month name: month numbers are 1..12, hence the lookup starts at 1.
month_names = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
master_df['order_month_name'] = master_df['order_month'].map(dict(enumerate(month_names, start=1)))

# Weekend flag: 1 for Saturday (5) and Sunday (6), otherwise 0.
master_df['is_weekend'] = (master_df['order_dayofweek'] >= 5).astype(int)

print("   ‚úÖ Date/time features created")

# 3. Financial features
print("\n3Ô∏è‚É£ Creating financial features...")

# Item price plus the freight charged for that item.
master_df['total_item_value'] = master_df['price'] + master_df['freight_value']

# Order-level aggregations will be done later for analysis
print("   ‚úÖ Financial features created")

# 4. Product features
print("\n4Ô∏è‚É£ Creating product features...")

# Product volume (NaN whenever any of the three dimensions is missing).
master_df['product_volume_cm3'] = (
    master_df['product_length_cm'] *
    master_df['product_height_cm'] *
    master_df['product_width_cm']
)

# Fill missing weight and volume with the column median. The individual
# length/height/width columns are left as they are.
for col in ['product_weight_g', 'product_volume_cm3']:
    master_df[col] = master_df[col].fillna(master_df[col].median())

print("   ‚úÖ Product features created")

# 5. Customer satisfaction features
print("\n5Ô∏è‚É£ Creating satisfaction features...")


def categorize_satisfaction(score):
    """Map a review score to a satisfaction label.

    Args:
        score: Numeric review score, or NaN when there is no review.

    Returns:
        'No Review' for NaN, 'Satisfied' for scores >= 4, 'Neutral' for a
        score of exactly 3, and 'Unsatisfied' for everything else.
    """
    if pd.isna(score):
        return 'No Review'
    elif score >= 4:
        return 'Satisfied'
    elif score == 3:
        return 'Neutral'
    else:
        return 'Unsatisfied'


# Categorise BEFORE imputing the score. Previously the score was filled with
# 3.0 first, which made the 'No Review' branch unreachable and labelled every
# unreviewed order 'Neutral'. Rows without a matched review have no review_id;
# masking on it also keeps this cell correct if it is re-run after the
# imputation below has already happened.
has_review = master_df['review_id'].notna()
master_df['satisfaction_category'] = (
    master_df['review_score'].where(has_review).apply(categorize_satisfaction)
)

# Fill missing review scores with a fixed neutral placeholder of 3.0 so the
# numeric column has no gaps. This is a hard-coded mid-scale value, NOT a
# median computed from the data; use satisfaction_category == 'No Review'
# (or a null review_id) to exclude imputed rows from score statistics.
master_df['review_score'] = master_df['review_score'].fillna(3.0)

print("   ‚úÖ Satisfaction features created")

print("\n" + "="*70)
print("‚úÖ FEATURE ENGINEERING COMPLETE!")
print("="*70)


‚öôÔ∏è FEATURE ENGINEERING

1Ô∏è‚É£ Creating delivery time features...
   Average delivery time: 12.0 days
   On-time delivery rate: 93.4%

2Ô∏è‚É£ Creating date/time features...
   ‚úÖ Date/time features created

3Ô∏è‚É£ Creating financial features...
   ‚úÖ Financial features created

4Ô∏è‚É£ Creating product features...
   ‚úÖ Product features created

5Ô∏è‚É£ Creating satisfaction features...
   ‚úÖ Satisfaction features created

‚úÖ FEATURE ENGINEERING COMPLETE!


In [14]:
# Cell 14: Final Data Quality Check
# ============================================================================
rule = "=" * 70
print("\n" + rule)
print("üîç FINAL DATA QUALITY CHECK")
print(rule)

print(f"\nüìä Master Dataset Shape: {master_df.shape}")

# Null count and null percentage per column; the 20 columns with the most
# nulls are printed, worst first.
print("\n‚ùì Missing Values:")
missing = master_df.isna().sum()
missing_pct = missing / len(master_df) * 100
missing_df = missing.to_frame('Missing Count').assign(Percentage=missing_pct)
worst_columns = (
    missing_df
    .loc[missing_df['Missing Count'] > 0]
    .sort_values('Missing Count', ascending=False)
    .head(20)
)
print(worst_columns)

# Full column listing.
print(f"\nüìã Column Names ({master_df.shape[1]} total):")
print(list(master_df.columns))

# How many columns there are of each dtype.
print("\nüìä Data Types:")
print(master_df.dtypes.value_counts())


üîç FINAL DATA QUALITY CHECK

üìä Master Dataset Shape: (110189, 47)

‚ùì Missing Values:
                              Missing Count  Percentage
product_category_name                  1537    1.394876
review_id                               827    0.750529
review_creation_date                    827    0.750529
product_length_cm                        18    0.016336
product_height_cm                        18    0.016336
product_width_cm                         18    0.016336
order_approved_at                        15    0.013613
payment_type                              3    0.002723
payment_value                             3    0.002723
payment_sequential                        3    0.002723
payment_installments                      3    0.002723
order_delivered_carrier_date              1    0.000908

üìã Column Names (47 total):
['order_id', 'customer_id', 'order_status', 'order_purchase_timestamp', 'order_approved_at', 'order_delivered_carrier_date', 'order_delivered_custom

In [15]:
# Cell 15: Save Master Dataset
# ============================================================================
print("\nüíæ Saving master dataset...")

# Save to CSV (written to the working directory, without the index column)
master_df.to_csv('master_dataset.csv', index=False)
print(f"‚úÖ Master dataset saved: 'master_dataset.csv'")
print(f"   Size: {len(master_df):,} rows √ó {len(master_df.columns)} columns")

# Save a sample for quick testing.
# - random_state fixes the draw, so a Restart & Run All reproduces the same
#   sample file instead of a different one on every run.
# - the size is capped at the number of available rows, because
#   DataFrame.sample raises ValueError when n exceeds the frame length
#   (sampling is without replacement).
SAMPLE_SIZE = 10_000
SAMPLE_SEED = 42
sample_rows = min(SAMPLE_SIZE, len(master_df))
master_df.sample(n=sample_rows, random_state=SAMPLE_SEED).to_csv(
    'master_dataset_sample.csv', index=False
)
print(f"‚úÖ Sample dataset saved: 'master_dataset_sample.csv' ({sample_rows:,} rows)")

# Display first few rows
print(f"\nüîç First 5 rows of master dataset:")
display(master_df.head())

print("\n" + "="*70)
print("üéâ DATA CLEANING & MASTER DATASET CREATION COMPLETE!")
print("="*70)
print("\nüìù NEXT STEPS:")
print("   1. Exploratory Data Analysis (EDA)")
print("   2. Sales & Revenue Analysis")
print("   3. Customer Segmentation (RFM)")
print("   4. Market Basket Analysis")
print("   5. Cohort Analysis")
print("="*70)


üíæ Saving master dataset...
‚úÖ Master dataset saved: 'master_dataset.csv'
   Size: 110,189 rows √ó 47 columns
‚úÖ Sample dataset saved: 'master_dataset_sample.csv' (10,000 rows)

üîç First 5 rows of master dataset:


Unnamed: 0,order_id,customer_id,order_status,order_purchase_timestamp,order_approved_at,order_delivered_carrier_date,order_delivered_customer_date,order_estimated_delivery_date,customer_unique_id,customer_zip_code_prefix,customer_city,customer_state,order_item_id,product_id,seller_id,shipping_limit_date,price,freight_value,product_category_name,product_category_name_english,product_weight_g,product_length_cm,product_height_cm,product_width_cm,payment_sequential,payment_type,payment_installments,payment_value,review_id,review_score,review_creation_date,delivery_days,expected_delivery_days,delivery_delay_days,delivered_on_time,order_year,order_month,order_day,order_dayofweek,order_hour,order_quarter,order_day_name,order_month_name,is_weekend,total_item_value,product_volume_cm3,satisfaction_category
0,e481f51cbdc54678b7cc49136f2d6af7,9ef432eb6251297304e76186b10a928d,delivered,2017-10-02 10:56:33,2017-10-02 11:07:15,2017-10-04 19:55:00,2017-10-10 21:25:13,2017-10-18,7c396fd4830fd04220f754e42b4e5bff,3149,sao paulo,SP,1,87285b34884572647811a353c7ac498a,3504c0cb71d7fa48d967e0e4c94d59d9,2017-10-06 11:07:15,29.99,8.72,utilidades_domesticas,housewares,500.0,19.0,8.0,13.0,3.0,voucher,1.0,38.71,a54f0611adc9ed256b57ede6b6eb5114,4.0,2017-10-11,8,15,-8,1,2017,10,2,0,10,4,Monday,Oct,0,38.71,1976.0,Satisfied
1,53cdb2fc8bc7dce0b6741e2150273451,b0830fb4747a6c6d20dea0b8c802d7ef,delivered,2018-07-24 20:41:37,2018-07-26 03:24:27,2018-07-26 14:31:00,2018-08-07 15:27:45,2018-08-13,af07308b275d755c9edb36a90c618231,47813,barreiras,BA,1,595fac2a385ac33a80bd5114aec74eb8,289cdb325fb7e7f891c38608bf9e0962,2018-07-30 03:24:27,118.7,22.76,perfumaria,perfumery,400.0,19.0,13.0,19.0,1.0,boleto,1.0,141.46,8d5266042046a06655c8db133d120ba5,4.0,2018-08-08,13,19,-6,1,2018,7,24,1,20,3,Tuesday,Jul,0,141.46,4693.0,Satisfied
2,47770eb9100c2d0c44946d9cf07ec65d,41ce2a54c0b03bf3443c3d931a367089,delivered,2018-08-08 08:38:49,2018-08-08 08:55:23,2018-08-08 13:50:00,2018-08-17 18:06:29,2018-09-04,3a653a41f6f9fc3d2a113cf8398680e8,75265,vianopolis,GO,1,aa4383b373c6aca5d8797843e5594415,4869f7a5dfa277a7dca6462dcf3b52b2,2018-08-13 08:55:23,159.9,19.22,automotivo,auto,420.0,24.0,19.0,21.0,1.0,credit_card,3.0,179.12,e73b67b67587f7644d5bd1a52deb1b01,5.0,2018-08-18,9,26,-18,1,2018,8,8,2,8,3,Wednesday,Aug,0,179.12,9576.0,Satisfied
3,949d5b44dbf5de918fe9c16f97b45f8a,f88197465ea7920adcdbec7375364d82,delivered,2017-11-18 19:28:06,2017-11-18 19:45:59,2017-11-22 13:39:59,2017-12-02 00:28:42,2017-12-15,7c142cf63193a1473d2e66489a9ae977,59296,sao goncalo do amarante,RN,1,d0b61bfb1de832b15ba9d266ca96e5b0,66922902710d126a0e7d26b0e3805106,2017-11-23 19:45:59,45.0,27.2,pet_shop,pet_shop,450.0,30.0,10.0,20.0,1.0,credit_card,1.0,72.2,359d03e676b3c069f62cadba8dd3f6e8,5.0,2017-12-03,13,26,-13,1,2017,11,18,5,19,4,Saturday,Nov,1,72.2,6000.0,Satisfied
4,ad21c59c0840e6cb83a9ceb5573f8159,8ab97904e6daea8866dbdbc4fb7aad2c,delivered,2018-02-13 21:18:39,2018-02-13 22:20:29,2018-02-14 19:46:34,2018-02-16 18:17:02,2018-02-26,72632f0f9dd73dfee390c9b22eb56dd6,9195,santo andre,SP,1,65266b2da20d04dbe00c5c2d3bb7859e,2c9e548be18521d1c43cde1c582c6de8,2018-02-19 20:31:37,19.9,8.72,papelaria,stationery,250.0,51.0,15.0,15.0,1.0,credit_card,1.0,28.62,e50934924e227544ba8246aeb3770dd4,5.0,2018-02-17,2,12,-10,1,2018,2,13,1,21,1,Tuesday,Feb,0,28.62,11475.0,Satisfied



üéâ DATA CLEANING & MASTER DATASET CREATION COMPLETE!

üìù NEXT STEPS:
   1. Exploratory Data Analysis (EDA)
   2. Sales & Revenue Analysis
   3. Customer Segmentation (RFM)
   4. Market Basket Analysis
   5. Cohort Analysis
