# 02 - Data Generation & Exploratory Analysis
## 📊 E-commerce Dataset Creation and Analysis

This notebook demonstrates:
- Synthetic e-commerce data generation
- Key performance metrics calculation
- Visual exploration of product portfolio


In [None]:
# Setup (run from previous notebook or standalone)

# --- standard library ---
import sys
import warnings
from pathlib import Path

# Suppress all warnings for the rest of the session; applied before the
# third-party imports below so it is already active when they load.
warnings.filterwarnings('ignore')

# The project's `src` directory is resolved relative to the parent of the
# current working directory and placed at the front of the import path,
# which is what makes the `retailsense_ai` import below resolvable.
project_root = Path.cwd().parent
src_dir = project_root / 'src'
sys.path.insert(0, str(src_dir))

# --- third-party ---
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# --- project ---
from retailsense_ai import RetailSenseAIDemo

# Plot defaults shared by every figure created in this notebook
plt.style.use('default')
sns.set_palette('husl')
plt.rcParams.update({'figure.figsize': (12, 6)})

print('📊 Data Analysis Environment Ready!')

## Step 1: Generate E-commerce Dataset

In [None]:
# Build the demo object that exposes the sample-data generator
demo = RetailSenseAIDemo()

# Create the product table that the remaining cells analyse
print('🏭 Generating E-commerce Dataset...')
products_df = demo.create_sample_data(n_products=100)

# Headline metrics over the whole table
total_products = len(products_df)
categories = products_df['category'].nunique()
total_revenue = products_df['total_revenue'].sum()
avg_conversion = products_df['view_to_purchase_rate'].mean()  # scaled by 100 only when printed

# Report the metrics, one line each
print(
    '\n✅ Dataset Generated Successfully!',
    f'   📦 Products: {total_products:,}',
    f'   💰 Total Revenue: ${total_revenue:,.2f}',
    f'   📊 Avg Conversion: {avg_conversion*100:.2f}%',
    f'   🏷️ Categories: {categories}',
    sep='\n',
)

# Show the first eight rows, restricted to the columns of interest
preview_columns = ['product_name', 'category', 'price', 'total_revenue',
                   'view_to_purchase_rate']
print('\n🔍 Sample Product Data:')
display(products_df[preview_columns].head(8))

## Step 2: Revenue Performance Analysis

In [None]:
# Revenue analysis visualization:
#   left  - total revenue per category, largest first
#   right - the ten products with the highest revenue, best product on top
fig, axes = plt.subplots(1, 2, figsize=(16, 6))
fig.suptitle('💰 Revenue Performance Analysis', fontsize=14, fontweight='bold')

# 1. Revenue by Category
category_revenue = products_df.groupby('category')['total_revenue'].sum().sort_values(ascending=False)
colors = sns.color_palette('viridis', len(category_revenue))

bars = axes[0].bar(category_revenue.index, category_revenue.values, color=colors)
axes[0].set_title('📊 Revenue by Category', fontweight='bold')
axes[0].set_ylabel('Revenue ($)')
axes[0].tick_params(axis='x', rotation=45)

# Add revenue labels on bars (lifted by 1% of the bar's own value)
for bar, value in zip(bars, category_revenue.values):
    axes[0].text(bar.get_x() + bar.get_width()/2, bar.get_height() + value*0.01,
                f'${value:,.0f}', ha='center', va='bottom', fontweight='bold')
# Text does not take part in autoscaling, so reserve headroom for the
# label that sits above the tallest bar.
axes[0].margins(y=0.1)

# 2. Top 10 Products by Revenue
top_products = products_df.nlargest(10, 'total_revenue')
top_revenue = top_products['total_revenue']
positions = range(len(top_products))

axes[1].barh(positions, top_revenue, color='gold')
axes[1].set_title('🏆 Top 10 Products by Revenue', fontweight='bold')
axes[1].set_xlabel('Revenue ($)')
axes[1].set_yticks(positions)
# Long product names are cut to 25 characters to keep the tick labels compact
axes[1].set_yticklabels([name[:25] + '...' if len(name) > 25 else name
                        for name in top_products['product_name']])
# barh draws position 0 at the bottom; flip the axis so the best-selling
# product (row 0 of `top_products`) is shown at the top of the ranking.
axes[1].invert_yaxis()

# Add revenue values; the horizontal offset is the same for every bar,
# so compute it once instead of on every iteration.
label_offset = top_revenue.max() * 0.01
for i, value in enumerate(top_revenue):
    axes[1].text(value + label_offset, i,
                f'${value:,.0f}', va='center', fontweight='bold')
# Reserve room on the right so the label of the longest bar stays inside the axes
axes[1].margins(x=0.15)

plt.tight_layout()
plt.show()

print(f'🎯 Top performing category: {category_revenue.index[0]} (${category_revenue.iloc[0]:,.2f})')
print(f'🏆 Best product: {top_products.iloc[0]["product_name"]} (${top_products.iloc[0]["total_revenue"]:,.2f})')

## Step 3: Conversion Rate Analysis

In [None]:
# Conversion rate analysis:
#   left  - histogram of per-product conversion rates with mean/median markers
#   right - price against conversion rate, coloured by revenue, with a linear trend
fig, axes = plt.subplots(1, 2, figsize=(16, 6))
fig.suptitle('📈 Conversion Rate Analysis', fontsize=14, fontweight='bold')
dist_ax, price_ax = axes

# The stored rate is scaled by 100 so everything in this cell is in percent
conversion_rates = products_df['view_to_purchase_rate'] * 100
mean_rate = conversion_rates.mean()
median_rate = conversion_rates.median()

# 1. Conversion Rate Distribution
dist_ax.hist(conversion_rates, bins=20, edgecolor='black', alpha=0.7, color='lightcoral')
dist_ax.axvline(mean_rate, color='red', linestyle='--', linewidth=2,
                label=f'Mean: {mean_rate:.1f}%')
dist_ax.axvline(median_rate, color='blue', linestyle='--', linewidth=2,
                label=f'Median: {median_rate:.1f}%')
dist_ax.set_title('📊 Conversion Rate Distribution', fontweight='bold')
dist_ax.set_xlabel('Conversion Rate (%)')
dist_ax.set_ylabel('Number of Products')
dist_ax.legend()

# 2. Price vs Conversion Rate
prices = products_df['price']
scatter = price_ax.scatter(
    prices,
    conversion_rates,
    c=products_df['total_revenue'],
    cmap='viridis',
    alpha=0.7,
    s=60,
)
price_ax.set_title('💎 Price vs Conversion Rate\n(Color = Revenue)', fontweight='bold')
price_ax.set_xlabel('Price ($)')
price_ax.set_ylabel('Conversion Rate (%)')

# Degree-1 least-squares fit, evaluated at the observed prices
trend = np.poly1d(np.polyfit(prices, conversion_rates, 1))
price_ax.plot(prices, trend(prices),
              'r--', alpha=0.8, linewidth=2, label='Trend Line')
price_ax.legend()

fig.colorbar(scatter, ax=price_ax, label='Revenue ($)')
fig.tight_layout()
plt.show()

# Performance insights
# `conversion_rates` holds the rate in percent (raw column * 100). The 80th
# percentile cut-off is therefore compared against that same percent series;
# comparing the raw fractional column with a percent threshold would select
# (almost) no rows and make the average below print as nan.
top_quintile_cutoff = conversion_rates.quantile(0.8)
high_performers = products_df[conversion_rates > top_quintile_cutoff]
print(f'\n🎯 Conversion Insights:')
print(f'   📊 Average conversion rate: {conversion_rates.mean():.2f}%')
print(f'   🏆 Top 20% performers: {len(high_performers)} products')
print(f'   💰 High performers avg revenue: ${high_performers["total_revenue"].mean():,.2f}')

## Step 4: Brand Performance Matrix

In [None]:
# Brand performance analysis: one row per brand with summed revenue and
# views, the mean conversion rate, and the number of products
# (non-null `product_sku` values).
brand_performance = (
    products_df.groupby('brand')
    .agg(
        total_revenue=('total_revenue', 'sum'),
        view_to_purchase_rate=('view_to_purchase_rate', 'mean'),
        product_count=('product_sku', 'count'),
        total_views=('total_views', 'sum'),
    )
    .reset_index()
)

# Create brand performance visualization
fig, ax = plt.subplots(1, 1, figsize=(12, 8))

# Bubble chart: Conversion vs Revenue (bubble size = views)
# Sizes are scaled relative to the most-viewed brand, which gets s=1000.
bubble_sizes = (brand_performance['total_views'] / brand_performance['total_views'].max()) * 1000

scatter = ax.scatter(brand_performance['view_to_purchase_rate'] * 100,
                    brand_performance['total_revenue'],
                    s=bubble_sizes, alpha=0.6,
                    c=range(len(brand_performance)), cmap='tab10')

ax.set_title('🏢 Brand Performance Matrix\n(Bubble size = Total Views)',
            fontsize=14, fontweight='bold')
ax.set_xlabel('Average Conversion Rate (%)')
ax.set_ylabel('Total Revenue ($)')

# Add brand labels next to each bubble
for row in brand_performance.itertuples(index=False):
    ax.annotate(f"{row.brand}\n({row.product_count} products)",
               (row.view_to_purchase_rate * 100, row.total_revenue),
               xytext=(5, 5), textcoords='offset points',
               fontsize=9, ha='left')

# Add quadrant lines at the across-brand averages.
# NOTE: this rebinds `avg_conversion`. In Step 1 it is the product-level mean
# as a fraction; from here on it is the mean of the per-brand rates in percent.
avg_conversion = brand_performance['view_to_purchase_rate'].mean() * 100
avg_revenue = brand_performance['total_revenue'].mean()

ax.axhline(avg_revenue, color='gray', linestyle='--', alpha=0.5)
ax.axvline(avg_conversion, color='gray', linestyle='--', alpha=0.5)

# Add quadrant label. It is anchored in axes-fraction coordinates (top-right
# corner) rather than at a data position derived from the averages, so it
# stays inside the plot regardless of how the brand revenues are spread.
ax.text(0.98, 0.98, 'High Revenue\nHigh Conversion',
        transform=ax.transAxes,
        ha='right', va='top', fontsize=10, style='italic',
        bbox=dict(boxstyle='round', facecolor='lightgreen', alpha=0.5))

ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

# Brand insights
best_brand = brand_performance.loc[brand_performance['total_revenue'].idxmax()]
print(f'\n🏆 Brand Performance Insights:')
print(f'   🥇 Top revenue brand: {best_brand["brand"]} (${best_brand["total_revenue"]:,.2f})')
print(f'   📊 Average brand conversion: {avg_conversion:.2f}%')
print(f'   💰 Average brand revenue: ${avg_revenue:,.2f}')

## Summary: Dataset Ready for AI Analysis

✅ **Dataset Generated**: 100 products across 5 categories and 5 brands  
✅ **Key Metrics Calculated**: Revenue, conversion rates, performance scores  
✅ **Visual Analysis Complete**: Category, product, and brand performance  

**Next**: AI-powered similarity search (Semantic Detective approach)

---