# Exploratory Data Analysis (EDA)

Amazon Product Classification - DATA304 Final Project

**Objective:** Analyze dataset characteristics, class distribution, hierarchy structure, and text properties.

## Setup and Data Loading

In [None]:
import sys
sys.path.append('..')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from collections import Counter
import pickle
import networkx as nx

# Project imports
from src.data_preprocessing import DataLoader

# Plot appearance: whitegrid style sheet first, then the seaborn 'husl' palette
# (kept in this order so the palette is applied on top of the style sheet).
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette('husl')

# Every figure produced by this notebook is written below this directory;
# create it (and any missing parents) up front so savefig never fails on it.
fig_dir = Path('../results/images/eda')
fig_dir.mkdir(parents=True, exist_ok=True)

# Header banner for the notebook output
banner = '=' * 60
print(banner)
print("  Exploratory Data Analysis")
print(banner)
print(f"✓ Images will be saved to: {fig_dir}\n")

In [None]:
# Read the raw dataset through the project's DataLoader; load_all() populates
# the attributes reported below (num_classes, train/test corpora, hierarchy).
data_loader = DataLoader(data_dir='../data/raw/Amazon_products')
data_loader.load_all()

# One summary line per loaded artifact
load_summary = [
    f"✓ Loaded {data_loader.num_classes} classes",
    f"✓ Training samples: {len(data_loader.train_corpus)}",
    f"✓ Test samples: {len(data_loader.test_corpus)}",
    f"✓ Hierarchy edges: {len(data_loader.hierarchy)}",
]
print("\n".join(load_summary))

## 1. Text Length Distribution

In [None]:
# Text length = number of whitespace-separated tokens in each document
train_lengths = [len(doc.split()) for doc in data_loader.train_corpus.values()]
test_lengths = [len(doc.split()) for doc in data_loader.test_corpus.values()]

fig, axes = plt.subplots(1, 2, figsize=(16, 6))

# (split name, word counts, bar colour) for each side-by-side panel
panels = [
    ('Training Set', train_lengths, '#2E86AB'),
    ('Test Set', test_lengths, '#F18F01'),
]
axis_font = {'fontsize': 13, 'fontweight': 'bold'}

for ax, (split_name, lengths, bar_color) in zip(axes, panels):
    mean_len = np.mean(lengths)
    median_len = np.median(lengths)

    ax.hist(lengths, bins=50, alpha=0.7, edgecolor='black', color=bar_color)
    # Dashed reference lines: mean in red, median in orange
    ax.axvline(mean_len, color='red', linestyle='--', linewidth=2,
               label=f'Mean: {mean_len:.1f}')
    ax.axvline(median_len, color='orange', linestyle='--', linewidth=2,
               label=f'Median: {median_len:.1f}')
    ax.set_xlabel('Number of Words', **axis_font)
    ax.set_ylabel('Frequency', **axis_font)
    ax.set_title(f'{split_name} - Text Length Distribution', fontsize=15, fontweight='bold')
    ax.legend(fontsize=11)
    ax.grid(alpha=0.3)

text_length_png = fig_dir / 'text_length_distribution.png'
plt.tight_layout()
plt.savefig(text_length_png, dpi=300, bbox_inches='tight')
plt.show()

# Numeric summary of the same distributions
print(f"✓ Saved: {text_length_png}")
for split_name, lengths, _ in panels:
    print(f"\n{split_name}:")
    print(f"  Mean: {np.mean(lengths):.2f} words")
    print(f"  Median: {np.median(lengths):.1f} words")
    print(f"  Min/Max: {min(lengths)} / {max(lengths)}")

## 2. Class Distribution (from Silver Labels)

In [None]:
# Load the silver labels for the training set and summarize how they are
# distributed over classes and over samples.
# NOTE(review): pickle.load can execute arbitrary code stored in the file. Only
# load a file generated by this project's own pipeline, never an untrusted one.
silver_labels_path = Path('../data/intermediate/train_silver_labels.pkl')

if silver_labels_path.exists():
    with open(silver_labels_path, 'rb') as f:
        silver_labels = pickle.load(f)

    # Count samples per class and labels per sample.
    # assumes silver_labels maps sample id -> collection of class ids — TODO confirm
    class_counts = Counter()
    labels_per_sample = []

    for labels in silver_labels.values():
        class_counts.update(labels)
        labels_per_sample.append(len(labels))

    if not class_counts:
        # Nothing to plot: unpacking zip(*[]) below would fail with an obscure
        # "not enough values to unpack" error, so report the problem instead.
        print(f"⚠️  Silver labels file contains no labels: {silver_labels_path}")
    else:
        mean_labels = np.mean(labels_per_sample)

        # Top 30 classes by number of labelled samples
        top_classes = class_counts.most_common(30)
        classes, counts = zip(*top_classes)

        fig, axes = plt.subplots(1, 2, figsize=(18, 8))

        # Class frequency (most frequent class at the top after invert_yaxis)
        bars = axes[0].barh(range(len(classes)), counts, alpha=0.8, edgecolor='black',
                            color=plt.cm.viridis(np.linspace(0, 1, len(classes))))
        axes[0].set_yticks(range(len(classes)))
        axes[0].set_yticklabels([f'Class {c}' for c in classes], fontsize=10)
        axes[0].set_xlabel('Number of Training Samples', fontsize=13, fontweight='bold')
        axes[0].set_ylabel('Class ID', fontsize=13, fontweight='bold')
        axes[0].set_title('Top 30 Classes Distribution', fontsize=15, fontweight='bold')
        axes[0].invert_yaxis()
        axes[0].grid(axis='x', alpha=0.3)

        # Labels per sample: one unit-width bin per integer label count.
        # Bins start at 1 unless some sample has zero labels; in that case they
        # start at 0 so those samples are not silently dropped from the
        # histogram while still being included in the mean line.
        first_bin = min(1, min(labels_per_sample))
        axes[1].hist(labels_per_sample, bins=range(first_bin, max(labels_per_sample) + 2),
                     alpha=0.8, edgecolor='black', color='#C1666B')
        axes[1].axvline(mean_labels, color='red', linestyle='--', linewidth=2,
                        label=f'Mean: {mean_labels:.2f}')
        axes[1].set_xlabel('Labels per Sample', fontsize=13, fontweight='bold')
        axes[1].set_ylabel('Frequency', fontsize=13, fontweight='bold')
        axes[1].set_title('Multi-Label Distribution', fontsize=15, fontweight='bold')
        axes[1].legend(fontsize=11)
        axes[1].grid(alpha=0.3)

        plt.tight_layout()
        plt.savefig(fig_dir / 'class_distribution.png', dpi=300, bbox_inches='tight')
        plt.show()

        print(f"✓ Saved: {fig_dir / 'class_distribution.png'}")
        print(f"\nClass Statistics:")
        # len(class_counts) is the number of distinct classes that occur in the
        # silver labels, which may be fewer than data_loader.num_classes.
        print(f"  Total classes: {len(class_counts)}")
        print(f"  Most frequent: Class {classes[0]} ({counts[0]} samples)")
        print(f"  Average labels per sample: {mean_labels:.2f}")
else:
    print(f"⚠️  Silver labels not found: {silver_labels_path}")

## 3. Hierarchy Structure and Level Distribution

In [None]:
# Directed graph of the class hierarchy: one edge per (parent, child) pair
G = nx.DiGraph()
G.add_edges_from((parent, child) for parent, child in data_loader.hierarchy)

print("Hierarchy Graph:")
print(f"  Nodes: {G.number_of_nodes()}")
print(f"  Edges: {G.number_of_edges()}")
print(f"  Max depth: {nx.dag_longest_path_length(G)}")

# Level of a node = 1 + the deepest level among its parents. Nodes without
# parents have no predecessor levels, so default=-1 places them at level 0.
# Topological order guarantees every parent is assigned before its children.
levels = {}
for node in nx.topological_sort(G):
    parent_levels = (levels[parent] for parent in G.predecessors(node))
    levels[node] = max(parent_levels, default=-1) + 1

level_counts = Counter(levels.values())

# Bar chart: number of classes found at each hierarchy level
level_ids = sorted(level_counts)
counts = [level_counts[level] for level in level_ids]

plt.figure(figsize=(12, 6))
bars = plt.bar(level_ids, counts, alpha=0.8, edgecolor='black', color='#48A9A6')
plt.xlabel('Hierarchy Level', fontsize=13, fontweight='bold')
plt.ylabel('Number of Classes', fontsize=13, fontweight='bold')
plt.title('Class Distribution by Hierarchy Level', fontsize=15, fontweight='bold')
plt.xticks(level_ids)
plt.grid(axis='y', alpha=0.3)

# Write the class count just above each bar
for bar in bars:
    bar_top = bar.get_height()
    bar_center = bar.get_x() + bar.get_width() / 2.
    plt.text(bar_center, bar_top, f'{int(bar_top)}',
             ha='center', va='bottom', fontsize=11, fontweight='bold')

hierarchy_png = fig_dir / 'hierarchy_levels.png'
plt.tight_layout()
plt.savefig(hierarchy_png, dpi=300, bbox_inches='tight')
plt.show()

print(f"\n✓ Saved: {hierarchy_png}")

## 4. Keyword Coverage Analysis

In [None]:
# Number of keywords attached to each class (only the keyword lists are needed,
# not the class ids)
keywords_per_class = [len(kw_list) for kw_list in data_loader.class_keywords.values()]
mean_keywords = np.mean(keywords_per_class)
keyword_png = fig_dir / 'keyword_coverage.png'

# Histogram of keyword counts with the mean marked as a dashed red line
plt.figure(figsize=(12, 6))
plt.hist(keywords_per_class, bins=30, alpha=0.8, edgecolor='black', color='#E76F51')
plt.axvline(mean_keywords, color='red', linestyle='--', linewidth=2,
            label=f'Mean: {mean_keywords:.1f}')
plt.xlabel('Keywords per Class', fontsize=13, fontweight='bold')
plt.ylabel('Frequency', fontsize=13, fontweight='bold')
plt.title('Keyword Coverage Distribution', fontsize=15, fontweight='bold')
plt.legend(fontsize=11)
plt.grid(alpha=0.3)
plt.tight_layout()
plt.savefig(keyword_png, dpi=300, bbox_inches='tight')
plt.show()

print(f"✓ Saved: {keyword_png}")
print(f"\nKeyword Statistics:")
print(f"  Average keywords per class: {mean_keywords:.2f}")
print(f"  Min/Max: {min(keywords_per_class)} / {max(keywords_per_class)}")

## Summary

✅ All EDA visualizations have been saved to: `results/images/eda/`

**Generated Files:**
- `text_length_distribution.png` - Train/Test text length histograms with mean and median
- `class_distribution.png` - Top 30 classes frequency + Multi-label distribution (only generated when the silver-labels file exists)
- `hierarchy_levels.png` - Class distribution across hierarchy levels
- `keyword_coverage.png` - Keywords per class distribution

**Key Insights:**
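- Training samples contain diverse text lengths with clear distribution patterns (see the mean, median, and min/max reported in Section 1)
- Multi-label classification: the number of labels per sample varies across the dataset (the average is reported in Section 2)
- Hierarchical structure: classes are spread unevenly across the hierarchy levels (see `Max depth` and the per-level class counts in Section 3)
- Keyword coverage provides a strong baseline for silver label generation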

**Next Steps:**
1. Use these insights for model selection and hyperparameter tuning
2. Consider text length when setting max_length for tokenization
3. Address class imbalance if needed (focal loss, class weights)
4. Leverage hierarchy structure for hierarchical loss functions