# Customer Segmentation Analysis

This notebook performs customer segmentation using:
1. RFM (Recency, Frequency, Monetary) Analysis
2. K-means Clustering
3. Segment Profiling and Visualization

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import sys
sys.path.append('..')

from src.model_utils import ModelUtils
from src.config import PROCESSED_DATA_DIR, CLUSTERING_CONFIG

%matplotlib inline
# The bare 'seaborn' style sheet was renamed to 'seaborn-v0_8' in Matplotlib 3.6
# and the old name was removed in 3.8. Prefer the new name when this Matplotlib
# ships it, and fall back to the legacy name on older installations.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')

## 1. RFM Analysis

In [None]:
# Read the cleaned sales table and parse the order dates into timestamps
df = pd.read_csv(PROCESSED_DATA_DIR / 'cleaned_sales_data.csv')
df['order_date'] = pd.to_datetime(df['order_date'])

# Recency is measured against the most recent order date present in the data
current_date = df['order_date'].max()

# One row per customer:
#   recency   - whole days between the customer's latest order and current_date
#   frequency - number of non-null order_id rows for the customer
#   monetary  - sum of price over the customer's rows
# NOTE(review): 'count' counts rows, not distinct orders; if the table holds one
# row per line item, frequency would be overstated - confirm the data grain.
rfm = df.groupby('customer_id').agg(
    recency=('order_date', lambda dates: (current_date - dates.max()).days),
    frequency=('order_id', 'count'),
    monetary=('price', 'sum'),
)

def _quintile_score(values, labels):
    """Assign each value to one of five quantile bins and label the bins.

    Parameters
    ----------
    values : pandas.Series
        Numeric metric to bucket.
    labels : list
        Five labels, applied from the lowest bin to the highest.

    Returns
    -------
    pandas.Series
        Categorical series of labels aligned with ``values``.

    Notes
    -----
    ``pd.qcut`` raises ``ValueError`` when heavy ties make two quantile edges
    coincide (common for purchase frequency, where many customers share the
    same count). In that case the values are ranked with ``method='first'`` so
    ties are broken by position and the edges become distinct. When the plain
    ``qcut`` succeeds its result is returned unchanged.
    """
    try:
        return pd.qcut(values, q=5, labels=labels)
    except ValueError:
        return pd.qcut(values.rank(method='first'), q=5, labels=labels)


# Create RFM scores
rfm_scores = rfm.copy()
for metric in ['recency', 'frequency', 'monetary']:
    # Low recency (a recent purchase) is good, so its labels run 5 -> 1;
    # for frequency and monetary a larger value earns the larger score.
    score_labels = [5, 4, 3, 2, 1] if metric == 'recency' else [1, 2, 3, 4, 5]
    rfm_scores[f'{metric}_score'] = _quintile_score(rfm[metric], score_labels)

# Calculate RFM Score: the three single-digit scores concatenated as text,
# in recency-frequency-monetary order.
rfm_scores['rfm_score'] = (
    rfm_scores['recency_score'].astype(str)
    + rfm_scores['frequency_score'].astype(str)
    + rfm_scores['monetary_score'].astype(str)
)

# Histogram of each raw RFM metric, one panel per metric, left to right
fig, axes = plt.subplots(1, 3, figsize=(20, 6))
for ax, metric in zip(axes, ['recency', 'frequency', 'monetary']):
    sns.histplot(data=rfm, x=metric, ax=ax)
    ax.set_title(f'Distribution of {metric}')
plt.tight_layout()
plt.show()

## 2. K-means Clustering

In [None]:
# Wrap the sales table in the project's modelling helper
model_utils = ModelUtils(df)

# Run the segmentation with the cluster count taken from the project config.
# NOTE(review): `segments` is used below as a table with 'total_spent',
# 'total_orders' and 'Segment' columns, and `metrics` as a mapping of
# name -> value; confirm against ModelUtils.perform_customer_segmentation.
n_clusters = CLUSTERING_CONFIG['n_clusters']
segments, metrics = model_utils.perform_customer_segmentation(n_clusters=n_clusters)

# Scatter of spend against order count, coloured by assigned segment
cluster_fig, cluster_ax = plt.subplots(figsize=(12, 8))
sns.scatterplot(
    data=segments,
    x='total_spent',
    y='total_orders',
    hue='Segment',
    palette='deep',
    ax=cluster_ax,
)
cluster_ax.set_title('Customer Segments')
plt.show()

# Report every clustering metric returned by the helper, one per line
print("\nClustering Metrics:")
for name, score in metrics.items():
    print(f"{name}: {score}")

## 3. Segment Profiling

In [None]:
# Per-segment summary: mean order count, mean spend, and the number of
# non-null customer_id entries (kept under the 'customer_id' column name).
segment_profiles = (
    segments
    .groupby('Segment')
    .agg(
        total_orders=('total_orders', 'mean'),
        total_spent=('total_spent', 'mean'),
        customer_id=('customer_id', 'count'),
    )
    .round(2)
)

# Each segment's share of all counted customers, as a percentage
segment_counts = segment_profiles['customer_id']
segment_profiles['size_percentage'] = (segment_counts / segment_counts.sum() * 100).round(2)

print("Segment Profiles:")
print(segment_profiles)

# Visualize segment characteristics in two side-by-side panels
fig, axes = plt.subplots(1, 2, figsize=(15, 6))

# Left panel: total_spent per segment (seaborn's barplot shows the mean of
# each group by default, hence "Average" in the title).
sns.barplot(data=segments, x='Segment', y='total_spent', ax=axes[0])
axes[0].set_title('Average Total Spent by Segment')

# Right panel: share of customers in each segment. Draw on axes[1] explicitly
# instead of plt.pie, which targets whatever the "current" axes happens to be
# and would silently move if another plotting call changed it.
segment_sizes = segments['Segment'].value_counts()
axes[1].pie(segment_sizes, labels=segment_sizes.index, autopct='%1.1f%%')
axes[1].set_title('Segment Size Distribution')

plt.tight_layout()
plt.show()