# H&M Recommendation System - Exploratory Data Analysis

This notebook explores the H&M dataset to understand:
- Data structure and quality
- Customer behavior patterns
- Product characteristics
- Transaction trends
- Key insights for recommendation system design

In [None]:
# Imports and notebook-wide configuration.
# Standard library
import sys
import warnings
from datetime import datetime, timedelta
from pathlib import Path

# Third-party
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Suppress all warnings to keep cell output readable.
# NOTE(review): this also hides deprecation and dtype warnings from pandas.
warnings.filterwarnings('ignore')

# Set display options
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100)
plt.style.use('seaborn-v0_8-darkgrid')

# Add project root to path. The root is taken to be the parent of the current
# working directory; the membership check keeps sys.path free of duplicate
# entries when this cell is re-run.
project_root = Path('.').resolve().parent
if str(project_root) not in sys.path:
    sys.path.insert(0, str(project_root))

# Import only the constants this notebook uses, so it is clear where each
# name comes from (instead of a wildcard import).
from src.utils.constants import (
    ARTICLES_PATH,
    CUSTOMERS_PATH,
    DATA_DIR,
    TRANSACTIONS_PATH,
)
print(f"Data directory: {DATA_DIR}")

## 1. Load and Inspect Data

In [None]:
# Read the article table into memory and preview it.
print("Loading articles data...")
articles_df = pd.read_csv(ARTICLES_PATH)
articles_shape = articles_df.shape  # (rows, columns)
print(f"Articles shape: {articles_shape}")
# Last expression of the cell: rendered as a rich table preview.
articles_df.head()

In [None]:
# Read the customer table into memory and preview it.
print("Loading customers data...")
customers_df = pd.read_csv(CUSTOMERS_PATH)
customers_shape = customers_df.shape  # (rows, columns)
print(f"Customers shape: {customers_shape}")
# Last expression of the cell: rendered as a rich table preview.
customers_df.head()

In [None]:
# Load a sample of transactions (due to large size)
print("Loading transactions data (sampling)...")

# Count the data rows by streaming the file line by line (minus one for the
# header). The context manager guarantees the file handle is closed again.
with open(TRANSACTIONS_PATH) as transactions_file:
    total_rows = sum(1 for _ in transactions_file) - 1
print(f"Total transactions: {total_rows:,}")

# Load a sample for initial exploration.
# NOTE(review): `nrows` reads the FIRST `sample_size` rows of the file, not a
# random sample. If the file is ordered (e.g. by date), every statistic derived
# from `transactions_sample` reflects only that leading slice — confirm.
sample_size = 1_000_000
transactions_sample = pd.read_csv(TRANSACTIONS_PATH, nrows=sample_size)
transactions_sample['t_dat'] = pd.to_datetime(transactions_sample['t_dat'])
print(f"Sample transactions shape: {transactions_sample.shape}")
transactions_sample.head()

## 2. Data Quality Check

In [None]:
# Report null counts per column. For articles and customers only the columns
# that actually contain nulls are listed; for transactions every column is shown.
articles_missing = articles_df.isnull().sum()
customers_missing = customers_df.isnull().sum()
transactions_missing = transactions_sample.isnull().sum()

print("Missing values in articles:")
print(articles_missing[articles_missing > 0])
print("\nMissing values in customers:")
print(customers_missing[customers_missing > 0])
print("\nMissing values in transactions:")
print(transactions_missing)

In [None]:
# Print the column dtypes of each loaded frame under its own heading.
dtype_reports = (
    ("Articles data types:", articles_df),
    ("\nCustomers data types:", customers_df),
    ("\nTransactions data types:", transactions_sample),
)
for heading, frame in dtype_reports:
    print(heading)
    print(frame.dtypes)

## 3. Customer Analysis

In [None]:
# Customer age: histogram on the left, summary statistics as text on the right.
fig, axes = plt.subplots(1, 2, figsize=(15, 5))
hist_ax, stats_ax = axes
ages = customers_df['age']

# Left panel: 50-bin histogram of customer age.
ages.hist(bins=50, ax=hist_ax, edgecolor='black')
hist_ax.set_title('Customer Age Distribution')
hist_ax.set_xlabel('Age')
hist_ax.set_ylabel('Count')

# Right panel: describe() output rendered as text on a blank axes.
age_stats = ages.describe()
stats_ax.text(
    0.1,
    0.5,
    f"Age Statistics:\n\n{age_stats}",
    transform=stats_ax.transAxes,
    fontsize=12,
    verticalalignment='center',
)
stats_ax.axis('off')

plt.tight_layout()
plt.show()

In [None]:
# Customer membership and activity status: three side-by-side panels.
fig, axes = plt.subplots(1, 3, figsize=(18, 5))

# Club member status
customers_df['club_member_status'].value_counts().plot(kind='bar', ax=axes[0])
axes[0].set_title('Club Member Status')
axes[0].set_xlabel('Status')
axes[0].set_ylabel('Count')

# Fashion news frequency
customers_df['fashion_news_frequency'].value_counts().plot(kind='bar', ax=axes[1])
axes[1].set_title('Fashion News Frequency')
axes[1].set_xlabel('Frequency')
axes[1].set_ylabel('Count')

# Active status.
# value_counts() drops missing values by default, which would make the pie
# percentages cover only customers with a recorded value. Keep the missing
# group so the shares are over ALL customers.
# NOTE(review): presumably a missing 'Active' value means "not active" — confirm.
active_counts = customers_df['Active'].value_counts(dropna=False)
active_counts.index = ['Missing' if pd.isna(value) else value for value in active_counts.index]
active_counts.plot(kind='pie', ax=axes[2], autopct='%1.1f%%')
axes[2].set_title('Customer Active Status')
axes[2].set_ylabel('')  # drop the series-name label pandas puts on pie charts

plt.tight_layout()
plt.show()

## 4. Product Analysis

In [None]:
# Article catalogue breakdown: a 2x2 grid of category frequency charts.
fig, axes = plt.subplots(2, 2, figsize=(15, 12))

# One row per panel: (axes, column, top-N limit or None for all, plot kind, title).
panel_specs = [
    (axes[0, 0], 'product_group_name', 15, 'barh', 'Top 15 Product Groups'),
    (axes[0, 1], 'garment_group_name', 15, 'barh', 'Top 15 Garment Groups'),
    (axes[1, 0], 'department_name', None, 'bar', 'Department Distribution'),
    (axes[1, 1], 'colour_group_name', 10, 'bar', 'Top 10 Color Groups'),
]

for panel_ax, column, top_n, plot_kind, panel_title in panel_specs:
    category_counts = articles_df[column].value_counts()
    if top_n is not None:
        category_counts = category_counts.head(top_n)
    category_counts.plot(kind=plot_kind, ax=panel_ax)
    panel_ax.set_title(panel_title)
    # Horizontal bars carry the count on the x axis, vertical bars on the y axis.
    if plot_kind == 'barh':
        panel_ax.set_xlabel('Count')
    else:
        panel_ax.set_ylabel('Count')

plt.tight_layout()
plt.show()

## 5. Transaction Analysis

In [None]:
# Earliest and latest transaction date in the loaded sample, and the span in days.
first_transaction_day = transactions_sample['t_dat'].min()
last_transaction_day = transactions_sample['t_dat'].max()
print(f"Transaction date range: {first_transaction_day} to {last_transaction_day}")
print(f"Total days: {(last_transaction_day - first_transaction_day).days}")

In [None]:
# Number of sampled transactions per calendar day, drawn as a line chart.
transaction_days = transactions_sample['t_dat'].dt.date
daily_transactions = transactions_sample.groupby(transaction_days).size()

fig, ax = plt.subplots(figsize=(15, 6))
daily_transactions.plot(ax=ax)
ax.set_title('Daily Transaction Count (Sample)')
ax.set_xlabel('Date')
ax.set_ylabel('Number of Transactions')
plt.xticks(rotation=45)  # acts on `ax`, the current (only) axes
plt.tight_layout()
plt.show()

In [None]:
# Price histograms on the raw scale and on a log1p scale, then summary statistics.
prices = transactions_sample['price']
fig, (raw_ax, log_ax) = plt.subplots(1, 2, figsize=(12, 5))

# Left: raw prices.
prices.hist(bins=50, edgecolor='black', ax=raw_ax)
raw_ax.set_title('Price Distribution')
raw_ax.set_xlabel('Price')
raw_ax.set_ylabel('Frequency')

# Right: log(1 + price), applied to the whole column at once.
log_prices = np.log1p(prices)
log_prices.hist(bins=50, edgecolor='black', ax=log_ax)
log_ax.set_title('Log Price Distribution')
log_ax.set_xlabel('Log(Price + 1)')
log_ax.set_ylabel('Frequency')

plt.tight_layout()
plt.show()

print("Price statistics:")
print(prices.describe())

In [None]:
# Number of sampled transactions per sales channel id, as a bar chart.
sales_channel_counts = transactions_sample['sales_channel_id'].value_counts()

fig, ax = plt.subplots(figsize=(8, 6))
sales_channel_counts.plot(kind='bar', ax=ax)
ax.set_title('Sales Channel Distribution')
ax.set_xlabel('Sales Channel ID')
ax.set_ylabel('Number of Transactions')
plt.tight_layout()
plt.show()

## 6. Customer Purchase Behavior

In [None]:
# Transactions per customer in the sample: histogram (log y axis) plus summary text.
customer_purchase_counts = transactions_sample['customer_id'].value_counts()
purchase_stats = customer_purchase_counts.describe()

fig, (freq_ax, stats_ax) = plt.subplots(1, 2, figsize=(12, 5))

# Left: distribution of per-customer purchase counts.
customer_purchase_counts.hist(bins=50, edgecolor='black', ax=freq_ax)
freq_ax.set_title('Customer Purchase Frequency Distribution')
freq_ax.set_xlabel('Number of Purchases')
freq_ax.set_ylabel('Number of Customers')
freq_ax.set_yscale('log')

# Right: describe() output rendered as text on a blank axes.
stats_ax.text(
    0.1,
    0.5,
    f"Purchase Frequency Stats:\n\n{purchase_stats}",
    transform=stats_ax.transAxes,
    fontsize=12,
    verticalalignment='center',
)
stats_ax.axis('off')

plt.tight_layout()
plt.show()

In [None]:
# Transactions per article in the sample: top-20 bar chart and overall histogram.
product_purchase_counts = transactions_sample['article_id'].value_counts()

fig, (top_ax, dist_ax) = plt.subplots(1, 2, figsize=(12, 5))

# Left: the 20 most frequently purchased articles.
top_products = product_purchase_counts.head(20)
top_products.plot(kind='bar', ax=top_ax)
top_ax.set_title('Top 20 Most Popular Products')
top_ax.set_xlabel('Article ID')
top_ax.set_ylabel('Number of Purchases')
plt.setp(top_ax.get_xticklabels(), rotation=45)

# Right: distribution of per-article purchase counts on a log y axis.
product_purchase_counts.hist(bins=50, edgecolor='black', ax=dist_ax)
dist_ax.set_title('Product Purchase Frequency Distribution')
dist_ax.set_xlabel('Number of Purchases')
dist_ax.set_ylabel('Number of Products')
dist_ax.set_yscale('log')

plt.tight_layout()
plt.show()

## 7. Key Insights for Recommendation System

In [None]:
# Headline numbers for the dataset. Row counts come from the full article and
# customer tables; per-customer and per-product figures come from the sample.
n_customers = customers_df.shape[0]
n_articles = articles_df.shape[0]

print("=== Key Dataset Statistics ===")
print(f"\nTotal unique customers: {n_customers:,}")
print(f"Total unique articles: {n_articles:,}")
print(f"Total transactions (sample): {transactions_sample.shape[0]:,}")

print(f"\nAverage purchases per customer (sample): {customer_purchase_counts.mean():.2f}")
print(f"Median purchases per customer (sample): {customer_purchase_counts.median():.2f}")

print(f"\nAverage purchases per product (sample): {product_purchase_counts.mean():.2f}")
print(f"Median purchases per product (sample): {product_purchase_counts.median():.2f}")

# A cell of the customer x article interaction matrix is non-zero when the pair
# occurs at least once, so count DISTINCT (customer, article) pairs. Counting
# transaction rows would treat repeat purchases of the same article as extra
# non-zero cells and understate sparsity.
observed_pairs = (
    transactions_sample[['customer_id', 'article_id']].drop_duplicates().shape[0]
)
sample_sparsity = 1 - observed_pairs / (n_customers * n_articles)
print(f"\nSparsity of interaction matrix (sample): {sample_sparsity:.6f}")

In [None]:
# Identify cold start problems: how many customers/products have very little
# interaction history in the sample.
print("=== Cold Start Analysis ===")

# Customers with few purchases
few_purchase_customers = (customer_purchase_counts <= 2).sum()
print(f"\nCustomers with ≤2 purchases: {few_purchase_customers:,} ({few_purchase_customers/len(customer_purchase_counts)*100:.1f}%)")

# Products with few purchases
few_purchase_products = (product_purchase_counts <= 5).sum()
print(f"Products with ≤5 purchases: {few_purchase_products:,} ({few_purchase_products/len(product_purchase_counts)*100:.1f}%)")

# New customers (based on sample - would need full data for accurate count).
# A customer is "new" when their FIRST purchase in the sample falls inside the
# final 7-day window; merely being active in that window is not enough, since
# that would also count long-standing customers.
last_week = transactions_sample['t_dat'].max() - timedelta(days=7)
first_purchase_dates = transactions_sample.groupby('customer_id')['t_dat'].min()
new_customers = int((first_purchase_dates >= last_week).sum())
print(f"\nNew customers in last week (sample): {new_customers:,}")

## 8. Save Processed Data Info

In [None]:
# Save data statistics for later use
import json

# Determine the transaction date range from the FULL file rather than from
# `transactions_sample`, which holds only the first rows of the file; the other
# saved figures (e.g. n_transactions_total) also describe the full dataset.
# Only the date column is read, in chunks, to keep memory bounded.
date_min = None
date_max = None
for date_chunk in pd.read_csv(TRANSACTIONS_PATH, usecols=['t_dat'], chunksize=2_000_000):
    chunk_dates = pd.to_datetime(date_chunk['t_dat'])
    chunk_min, chunk_max = chunk_dates.min(), chunk_dates.max()
    date_min = chunk_min if date_min is None else min(date_min, chunk_min)
    date_max = chunk_max if date_max is None else max(date_max, chunk_max)

data_stats = {
    'n_customers': customers_df.shape[0],
    'n_articles': articles_df.shape[0],
    'n_transactions_total': total_rows,
    'date_min': str(date_min),
    'date_max': str(date_max),
    # Cast to a plain float so the value is JSON-serializable regardless of
    # the numeric type pandas returns.
    'avg_customer_age': float(customers_df['age'].mean()),
    # total_rows counts repeat purchases of the same customer-article pair,
    # so this is a lower bound on the true sparsity of the interaction matrix.
    'sparsity': 1 - (total_rows / (customers_df.shape[0] * articles_df.shape[0]))
}

stats_path = project_root / 'experiments' / 'data_stats.json'
# Create the output directory (and any missing parents) if needed.
stats_path.parent.mkdir(parents=True, exist_ok=True)
with open(stats_path, 'w', encoding='utf-8') as f:
    json.dump(data_stats, f, indent=2)

print(f"Data statistics saved to: {stats_path}")
print(json.dumps(data_stats, indent=2))

## Summary of Key Findings

1. **Data Scale**: Large-scale dataset with ~1.37M customers, ~105K products, and ~31M transactions
2. **Sparsity**: Extremely sparse interaction matrix (about 99.98% sparse given the counts above: 1 − 31M / (1.37M × 105K) ≈ 0.9998)
3. **Customer Behavior**: Most customers have few purchases (long-tail distribution)
4. **Product Popularity**: Few products dominate purchases (power law distribution)
5. **Cold Start**: Significant cold start problem for both users and items
6. **Temporal Patterns**: Clear seasonal/temporal patterns in purchase behavior (observed in the loaded transaction sample; confirm on the full transaction history)

These insights will guide our recommendation system design:
- Need efficient sparse matrix handling
- Must address cold start problem (content-based features important)
- Should consider temporal dynamics
- Popular items baseline likely to be strong