# Customer Segmentation - Data Generation & Exploration
## Phase 1: Data Acquisition and Understanding

This notebook generates synthetic customer data, explores it through univariate, bivariate, and categorical analyses, and saves the resulting figures and dataset to disk.

In [None]:
# Import libraries
import sys
import warnings
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

sys.path.append('../src')
warnings.filterwarnings('ignore')

from data_loader import CustomerDataLoader

# Plot defaults: seaborn's white-grid style plus a wide default figure size
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

print("Libraries imported successfully!")

## 1. Generate Synthetic Customer Data

In [None]:
# Generation parameters, named so they are easy to spot and tune
N_CUSTOMERS = 2000
RANDOM_STATE = 42

# Loader pointed at '../data'
loader = CustomerDataLoader(data_dir='../data')

# Build the synthetic customer table with a fixed random_state
df = loader.generate_synthetic_data(
    n_customers=N_CUSTOMERS,
    random_state=RANDOM_STATE,
)

print(f"\nDataset shape: {df.shape}")
print(f"Features: {df.columns.tolist()}")

In [None]:
# Preview the top ten rows of the generated table
df.head(n=10)

## 2. Data Understanding

In [None]:
# Comprehensive data summary: hand the generated frame to the loader's summary printer.
# NOTE(review): print_data_summary presumably lives in the data_loader module imported
# above; what exactly it reports is not visible from this notebook - confirm there.
loader.print_data_summary(df)

In [None]:
# Show the dtype of every column under a short heading
print("Data Types:", df.dtypes, sep="\n")

In [None]:
# Missing values: per-column null count and its share of all rows
missing = df.isnull().sum()
missing_pct = (missing / len(df)) * 100

missing_df = pd.DataFrame({'Missing Count': missing, 'Percentage': missing_pct})

# Keep only the columns that actually have gaps, worst offenders first
has_gaps = missing_df['Missing Count'] > 0
missing_df.loc[has_gaps].sort_values('Percentage', ascending=False)

## 3. Univariate Analysis

In [None]:
# Distribution of numerical features
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
# Drop identifier-like columns; they carry no distributional meaning.
# NOTE(review): this is a substring match, so any numeric column whose name merely
# contains "id" (in any letter case) is dropped as well - confirm that is acceptable.
numeric_cols = [col for col in numeric_cols if 'ID' not in col.upper()]

# Size the grid from the number of features instead of a fixed 4x4, so that no
# column is silently skipped (more than 16 features) and no empty frames remain
# (fewer than 16). With exactly 16 features this reproduces the 4x4, 18x16 figure.
n_grid_cols = 4
n_grid_rows = max(1, int(np.ceil(len(numeric_cols) / n_grid_cols)))
fig, axes = plt.subplots(n_grid_rows, n_grid_cols,
                         figsize=(18, 4 * n_grid_rows), squeeze=False)
axes = axes.ravel()

for ax, col in zip(axes, numeric_cols):
    ax.hist(df[col].dropna(), bins=30, edgecolor='black', alpha=0.7)
    ax.set_title(f'Distribution of {col}', fontweight='bold')
    ax.set_xlabel(col)
    ax.set_ylabel('Frequency')
    ax.grid(True, alpha=0.3)

# Hide any panels left over in the last grid row
for ax in axes[len(numeric_cols):]:
    ax.set_visible(False)

plt.tight_layout()

# Create the output directory first: savefig raises FileNotFoundError otherwise
figures_dir = Path('../reports/figures')
figures_dir.mkdir(parents=True, exist_ok=True)
plt.savefig(figures_dir / 'univariate_distributions.png', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Box plots for outlier detection, one panel per key feature
key_features = ['Age', 'Income', 'Recency', 'Frequency', 'Monetary',
                'AvgOrderValue', 'WebsiteVisits', 'TenureDays', 'NumCategories']

# 3x3 grid: exactly one panel for each of the nine key features
fig, axes = plt.subplots(3, 3, figsize=(16, 14))
axes = axes.ravel()

for ax, col in zip(axes, key_features):
    # Vertical orientation is matplotlib's default; the explicit `vert=True`
    # was dropped because newer matplotlib releases deprecate that argument.
    ax.boxplot(df[col].dropna())
    ax.set_title(f'Box Plot: {col}', fontweight='bold')
    ax.set_ylabel(col)
    ax.grid(True, alpha=0.3)

plt.tight_layout()

# Create the output directory first: savefig raises FileNotFoundError otherwise
figures_dir = Path('../reports/figures')
figures_dir.mkdir(parents=True, exist_ok=True)
plt.savefig(figures_dir / 'boxplots_outliers.png', dpi=300, bbox_inches='tight')
plt.show()

## 4. Bivariate Analysis

In [None]:
# Correlation heatmap of the numeric features (uses `numeric_cols` from the
# univariate-analysis cell, so that cell must have been run first)
correlation = df[numeric_cols].corr()

# Explicit figure/axes instead of the implicit pyplot "current axes"
fig, ax = plt.subplots(figsize=(14, 10))
sns.heatmap(correlation, annot=True, fmt='.2f', cmap='coolwarm', center=0,
            square=True, linewidths=1, cbar_kws={"shrink": 0.8}, ax=ax)
ax.set_title('Feature Correlation Heatmap', fontsize=16, fontweight='bold', pad=20)
fig.tight_layout()

# Create the output directory first: savefig raises FileNotFoundError otherwise
figures_dir = Path('../reports/figures')
figures_dir.mkdir(parents=True, exist_ok=True)
fig.savefig(figures_dir / 'correlation_heatmap.png', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# RFM Analysis - pairwise scatter plots of Recency, Frequency and Monetary
rfm_axis_labels = {
    'Recency': 'Recency (days)',
    'Frequency': 'Frequency (purchases)',
    'Monetary': 'Monetary (total spending)',
}

# (x column, y column, marker colour) for each panel, left to right
rfm_panels = [
    ('Recency', 'Frequency', 'steelblue'),
    ('Frequency', 'Monetary', 'green'),
    ('Recency', 'Monetary', 'coral'),
]

fig, axes = plt.subplots(1, 3, figsize=(18, 5))

# One loop instead of three copy-pasted stanzas keeps the panels consistent
for ax, (x_col, y_col, color) in zip(axes, rfm_panels):
    ax.scatter(df[x_col], df[y_col], alpha=0.5, c=color)
    ax.set_xlabel(rfm_axis_labels[x_col], fontsize=12)
    ax.set_ylabel(rfm_axis_labels[y_col], fontsize=12)
    ax.set_title(f'{x_col} vs {y_col}', fontweight='bold')
    ax.grid(True, alpha=0.3)

plt.tight_layout()

# Create the output directory first: savefig raises FileNotFoundError otherwise
figures_dir = Path('../reports/figures')
figures_dir.mkdir(parents=True, exist_ok=True)
plt.savefig(figures_dir / 'rfm_scatter_plots.png', dpi=300, bbox_inches='tight')
plt.show()

## 5. Categorical Features Analysis

In [None]:
# Categorical features distribution: one bar chart per column, plus a pie chart
# of loyalty-programme membership in the next free panel
cat_cols = ['Gender', 'MaritalStatus', 'Education', 'ChannelPreference', 'PaymentMethod']

# 2x3 grid: five bar charts and one pie chart
fig, axes = plt.subplots(2, 3, figsize=(18, 10))
axes = axes.ravel()

for ax, col in zip(axes, cat_cols):
    value_counts = df[col].value_counts()
    ax.bar(value_counts.index, value_counts.values, alpha=0.7)
    ax.set_title(f'Distribution of {col}', fontweight='bold')
    ax.set_xlabel(col)
    ax.set_ylabel('Count')
    ax.tick_params(axis='x', rotation=45)
    ax.grid(True, alpha=0.3, axis='y')

# Loyalty member distribution goes in the first panel after the bar charts;
# deriving the index keeps it correct if `cat_cols` is edited
loyalty_ax = axes[len(cat_cols)]
loyalty_counts = df['LoyaltyMember'].value_counts()
loyalty_ax.pie(loyalty_counts.values, labels=loyalty_counts.index,
               autopct='%1.1f%%', startangle=90)
loyalty_ax.set_title('Loyalty Program Members', fontweight='bold')

plt.tight_layout()

# Create the output directory first: savefig raises FileNotFoundError otherwise
figures_dir = Path('../reports/figures')
figures_dir.mkdir(parents=True, exist_ok=True)
plt.savefig(figures_dir / 'categorical_distributions.png', dpi=300, bbox_inches='tight')
plt.show()

## 6. Business Insights from EDA

In [None]:
# Plain-text report of headline metrics computed from the customer table
_banner = "=" * 80
_n_customers = len(df)


def _share(count):
    """Return `count` expressed as a percentage of all rows in `df`."""
    return count / _n_customers * 100


print(_banner)
print("KEY BUSINESS INSIGHTS")
print(_banner)

# Customer value: mean and median of total spending
avg_monetary = df['Monetary'].mean()
median_monetary = df['Monetary'].median()
print(f"\nAverage Customer Value: ${avg_monetary:.2f}")
print(f"Median Customer Value: ${median_monetary:.2f}")

# High-value segment: customers at or above the 80th percentile of spending
threshold_80 = df['Monetary'].quantile(0.80)
high_value_customers = df[df['Monetary'] >= threshold_80]
_high_value_spend = high_value_customers['Monetary']
_high_value_total = _high_value_spend.sum()
print(f"\nHigh-Value Customers (top 20%): {len(high_value_customers)} ({_share(len(high_value_customers)):.1f}%)")
print(f"  Average Spending: ${_high_value_spend.mean():.2f}")
print(f"  Total Revenue Contribution: ${_high_value_total:.2f}")
print(f"  Revenue Contribution %: {_high_value_total/df['Monetary'].sum()*100:.1f}%")

# Purchase frequency: overall mean and the share with more than ten purchases
avg_frequency = df['Frequency'].mean()
_frequent_buyers = df[df['Frequency'] > 10]
print(f"\nAverage Purchase Frequency: {avg_frequency:.1f} purchases")
print(f"Customers with >10 purchases: {len(_frequent_buyers)} ({_share(len(_frequent_buyers)):.1f}%)")

# Recency: Recency below 30 counts as recent, above 180 as dormant
recent_customers = df[df['Recency'] < 30]
dormant_customers = df[df['Recency'] > 180]
print(f"\nRecent Customers (< 30 days): {len(recent_customers)} ({_share(len(recent_customers)):.1f}%)")
print(f"Dormant Customers (> 180 days): {len(dormant_customers)} ({_share(len(dormant_customers)):.1f}%)")

# Tenure: mean in days, also shown in years (divided by 365)
avg_tenure = df['TenureDays'].mean()
print(f"\nAverage Customer Tenure: {avg_tenure:.0f} days ({avg_tenure/365:.1f} years)")

# Engagement: mean EmailOpenRate, multiplied by 100 for display
# NOTE(review): assumes EmailOpenRate is stored as a 0-1 fraction - confirm in the loader
avg_email_open = df['EmailOpenRate'].mean()
print(f"\nAverage Email Open Rate: {avg_email_open*100:.1f}%")

print("\n" + _banner)

## 7. Save Generated Data

In [None]:
# Write the generated table out through the loader's CSV helper
loader.save_csv(df, 'customer_data.csv', subdirectory='synthetic')

# Closing status lines (the path shown is a fixed literal, not read back from the loader)
print(
    "\nData generation and exploration complete!",
    "Data saved to: ../data/synthetic/customer_data.csv",
    sep="\n",
)