# Advanced Customer Segmentation using Unsupervised Machine Learning
## Part 2: Multiple Clustering Algorithms Implementation

**Objective**: Implement and compare multiple advanced clustering algorithms to identify optimal customer segments.

**Author**: [Your Name]  
**Course**: BMCS2003 Artificial Intelligence  
**Assignment**: Machine Learning (Unsupervised)

**Excellence Features:**
- ✅ Multiple clustering algorithms (6+ methods)
- ✅ Advanced feature engineering with RFM analysis
- ✅ Comprehensive evaluation metrics
- ✅ Ensemble clustering approach
- ✅ Scalable implementation


In [None]:
# Advanced clustering setup and imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import warnings
warnings.filterwarnings('ignore')

# Clustering algorithms
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering, SpectralClustering
from sklearn.mixture import GaussianMixture
from sklearn.cluster import MeanShift, estimate_bandwidth

# Preprocessing and evaluation
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import umap

# Evaluation metrics
from sklearn.metrics import silhouette_score, adjusted_rand_score, calinski_harabasz_score
from sklearn.metrics import davies_bouldin_score, adjusted_mutual_info_score
from yellowbrick.cluster import KElbowVisualizer, SilhouetteVisualizer
from kneed import KneeLocator

# Utilities
import time
from collections import Counter
import joblib

# Reaching this point means every import above resolved.
print(
    "🚀 Advanced clustering libraries imported successfully!",
    "📊 Ready for multi-algorithm customer segmentation analysis!",
    sep="\n",
)


## 1. Data Loading and Advanced Feature Engineering
Load the customer dataset (`shopping_trends.csv`) and derive RFM, behavioral, and demographic features for each customer.


In [None]:
# Load dataset (upload in Colab).
# The upload step only exists inside Google Colab; anywhere else the CSV is
# expected to already be in the working directory, so skip the prompt instead
# of crashing on the missing `google.colab` package.
try:
    from google.colab import files
except ImportError:
    uploaded = {}
else:
    uploaded = files.upload()

# Load and prepare data
df = pd.read_csv('shopping_trends.csv')
print(f"📊 Dataset loaded: {df.shape[0]:,} customers, {df.shape[1]} features")

# Advanced RFM Feature Engineering Class
class AdvancedCustomerFeatureEngineer:
    """
    Advanced feature engineering for customer segmentation.

    Derives RFM-style, behavioral, and demographic columns from the raw
    customer table. Each method works on a copy of its input and returns
    that copy, so the caller's dataframe is never modified.
    """

    def __init__(self):
        # Purchases per year implied by each "Frequency of Purchases" label.
        # 'Every 3 Months' is the same cadence as 'Quarterly'; without it those
        # rows map to NaN and every frequency-based feature becomes NaN too.
        self.frequency_mapping = {
            'Weekly': 52, 'Bi-Weekly': 26, 'Fortnightly': 26,
            'Monthly': 12, 'Quarterly': 4, 'Every 3 Months': 4,
            'Annually': 1
        }
        # Ordinal encoding of clothing sizes.
        self.size_mapping = {'XS': 1, 'S': 2, 'M': 3, 'L': 4, 'XL': 5}

    def create_rfm_features(self, df):
        """Create RFM (Recency, Frequency, Monetary) features.

        Reads 'Review Rating', 'Frequency of Purchases', 'Previous Purchases'
        and 'Purchase Amount (USD)'. Returns a copy of ``df`` with the columns
        Recency_Score, Annual_Frequency, Total_Purchase_Frequency,
        Monetary_Score, Average_Purchase_Value and CLV_Proxy appended.
        Frequency labels missing from ``frequency_mapping`` yield NaN.
        """
        df_rfm = df.copy()
        previous_purchases = df_rfm['Previous Purchases']
        purchase_amount = df_rfm['Purchase Amount (USD)']

        # R - Recency Score (using Review Rating as proxy for recent engagement)
        df_rfm['Recency_Score'] = df_rfm['Review Rating']

        # F - Frequency Score
        df_rfm['Annual_Frequency'] = df_rfm['Frequency of Purchases'].map(self.frequency_mapping)
        df_rfm['Total_Purchase_Frequency'] = previous_purchases * df_rfm['Annual_Frequency']

        # M - Monetary Score
        df_rfm['Monetary_Score'] = purchase_amount
        # +1 keeps the division defined for customers with no previous purchases.
        df_rfm['Average_Purchase_Value'] = purchase_amount / (previous_purchases + 1)

        # Customer Lifetime Value proxy, divided by 100 to scale it down.
        df_rfm['CLV_Proxy'] = (
            purchase_amount * previous_purchases * df_rfm['Annual_Frequency']
        ) / 100

        return df_rfm

    def create_behavioral_features(self, df):
        """Create advanced behavioral features.

        Reads 'Subscription Status', 'Discount Applied', 'Promo Code Used' and
        'Review Rating'. Returns a copy of ``df`` with three 0/1 indicators, a
        weighted Purchase_Behavior_Score (0-6) and an Engagement_Level (0-3).
        """
        df_behavior = df.copy()

        # Binary behavioral indicators: 1 only when the source value is exactly 'Yes'.
        df_behavior['Is_Subscribed'] = (df_behavior['Subscription Status'] == 'Yes').astype(int)
        df_behavior['Uses_Discounts'] = (df_behavior['Discount Applied'] == 'Yes').astype(int)
        df_behavior['Uses_Promos'] = (df_behavior['Promo Code Used'] == 'Yes').astype(int)

        # Purchase behavior score: subscription weighs 3, discounts 2, promos 1.
        df_behavior['Purchase_Behavior_Score'] = (
            df_behavior['Is_Subscribed'] * 3 +
            df_behavior['Uses_Discounts'] * 2 +
            df_behavior['Uses_Promos'] * 1
        )

        # Customer engagement level. np.select takes the first matching
        # condition, so the thresholds must stay ordered from highest to lowest.
        rating = df_behavior['Review Rating']
        conditions = [rating >= 4.5, rating >= 3.5, rating >= 2.5]
        choices = [3, 2, 1]  # High, Medium, Low engagement
        df_behavior['Engagement_Level'] = np.select(conditions, choices, default=0)

        return df_behavior

    def create_demographic_features(self, df):
        """Create demographic and preference features.

        Reads 'Age', 'Size', 'Gender', 'Category' and 'Season'. Returns a copy
        of ``df`` with Age_Group, Size_Numeric, Gender_Numeric and one-hot
        ``Cat_*`` / ``Season_*`` columns appended (in that order).
        """
        df_demo = df.copy()

        # Age groups; pd.cut bins are right-inclusive, e.g. age 25 -> 'Gen_Z'.
        df_demo['Age_Group'] = pd.cut(
            df_demo['Age'],
            bins=[0, 25, 35, 50, 65, 100],
            labels=['Gen_Z', 'Millennial', 'Gen_X', 'Boomer', 'Silent'],
        )

        # Size preference (numerical); sizes outside the mapping fall back to 3 ('M').
        df_demo['Size_Numeric'] = df_demo['Size'].map(self.size_mapping).fillna(3)

        # Gender encoding: 1 for 'Male', 0 for every other value.
        df_demo['Gender_Numeric'] = (df_demo['Gender'] == 'Male').astype(int)

        # Category preferences (one-hot encoding)
        category_dummies = pd.get_dummies(df_demo['Category'], prefix='Cat')
        df_demo = pd.concat([df_demo, category_dummies], axis=1)

        # Season preferences
        season_dummies = pd.get_dummies(df_demo['Season'], prefix='Season')
        df_demo = pd.concat([df_demo, season_dummies], axis=1)

        return df_demo

    def engineer_all_features(self, df):
        """Complete feature engineering pipeline.

        Applies the RFM, behavioral and demographic steps in that order and
        returns the resulting dataframe.
        """
        print("🔧 Starting advanced feature engineering...")

        # Apply all transformations
        df_engineered = self.create_rfm_features(df)
        df_engineered = self.create_behavioral_features(df_engineered)
        df_engineered = self.create_demographic_features(df_engineered)

        print(f"✅ Feature engineering completed! Shape: {df_engineered.shape}")
        return df_engineered

# Run the whole feature-engineering pipeline on the raw customer table.
feature_engineer = AdvancedCustomerFeatureEngineer()
df_engineered = feature_engineer.engineer_all_features(df)

print("\n📈 Final dataset shape: {}".format(df_engineered.shape))
print("🎯 Ready for clustering analysis!")


## 2. Advanced Clustering Preparation
Feature selection and scaling for optimal clustering performance


In [None]:
# Advanced Feature Selection for Clustering
class ClusteringPreprocessor:
    """
    Advanced preprocessing pipeline for clustering analysis.

    Picks the engineered columns used for clustering, median-fills missing
    values and scales the result with one of the configured scalers.
    """

    def __init__(self):
        # Scaler instances selectable through prepare_data(scaler_type=...).
        self.scalers = {
            'standard': StandardScaler(),
            'minmax': MinMaxScaler(),
            'robust': RobustScaler()
        }
        # Column names actually selected by the last select_clustering_features call.
        self.selected_features = None
        # Scaler fitted by the last prepare_data call.
        self.scaler = None

    def select_clustering_features(self, df_engineered):
        """Select optimal features for clustering.

        Builds the candidate list (RFM, behavioral, demographic, ``Cat_*``,
        ``Season_*`` and purchase columns), keeps the candidates present in
        ``df_engineered`` and returns that column subset. The kept names are
        stored in ``self.selected_features`` so the attribute always matches
        the columns of the returned frame.
        """
        # Core RFM features
        rfm_features = [
            'Recency_Score', 'Total_Purchase_Frequency', 'Monetary_Score',
            'Average_Purchase_Value', 'CLV_Proxy'
        ]

        # Behavioral features
        behavioral_features = [
            'Is_Subscribed', 'Uses_Discounts', 'Uses_Promos',
            'Purchase_Behavior_Score', 'Engagement_Level'
        ]

        # Demographic features
        demographic_features = [
            'Age', 'Gender_Numeric', 'Size_Numeric'
        ]

        # One-hot columns are discovered by prefix because their names depend on the data.
        category_features = [col for col in df_engineered.columns if col.startswith('Cat_')]
        season_features = [col for col in df_engineered.columns if col.startswith('Season_')]

        # Additional purchase features
        purchase_features = ['Previous Purchases', 'Annual_Frequency']

        candidate_features = (rfm_features + behavioral_features +
                              demographic_features + category_features +
                              season_features + purchase_features)

        # Keep only candidates that exist, and say so instead of dropping them silently.
        available_columns = set(df_engineered.columns)
        existing_features = [f for f in candidate_features if f in available_columns]
        missing_features = [f for f in candidate_features if f not in available_columns]
        if missing_features:
            print(f"⚠️ Skipping features not found in dataframe: {missing_features}")
        self.selected_features = existing_features

        print(f"🎯 Selected {len(existing_features)} features for clustering:")
        print("📊 Feature Categories:")
        print(f"  - RFM Features: {len(rfm_features)}")
        print(f"  - Behavioral Features: {len(behavioral_features)}")
        print(f"  - Demographic Features: {len(demographic_features)}")
        print(f"  - Category Features: {len(category_features)}")
        print(f"  - Season Features: {len(season_features)}")
        print(f"  - Purchase Features: {len(purchase_features)}")

        return df_engineered[existing_features]

    def prepare_data(self, df_engineered, scaler_type='standard'):
        """Complete data preparation pipeline.

        Args:
            df_engineered: dataframe containing the engineered feature columns.
            scaler_type: key into ``self.scalers`` ('standard', 'minmax' or 'robust').

        Returns:
            ``(X_scaled, X_features)``: the scaled features as a DataFrame and
            the median-filled, unscaled features, with identical columns and index.

        Raises:
            KeyError: if ``scaler_type`` is not a key of ``self.scalers``.
        """
        # Select features
        X_features = self.select_clustering_features(df_engineered)

        # Handle missing values with each column's median
        X_features = X_features.fillna(X_features.median())

        # Scale features
        self.scaler = self.scalers[scaler_type]
        scaled_values = self.scaler.fit_transform(X_features)
        # Reuse the input index so scaled rows stay aligned with their source rows.
        X_scaled = pd.DataFrame(
            scaled_values, columns=X_features.columns, index=X_features.index
        )

        print(f"✅ Data preparation completed!")
        print(f"📊 Final clustering dataset shape: {X_scaled.shape}")
        print(f"🔧 Scaling method: {scaler_type}")

        return X_scaled, X_features

# Build the scaled clustering matrix with the 'standard' scaler and keep the
# unscaled feature frame alongside it.
preprocessor = ClusteringPreprocessor()
X_scaled, X_features = preprocessor.prepare_data(
    df_engineered,
    scaler_type='standard',
)

# Summary statistics of the scaled features, rounded to three decimals.
print("\n📈 CLUSTERING FEATURES SUMMARY:", "="*50, sep="\n")
display(X_scaled.describe().round(3))


## 3. Multiple Clustering Algorithms Implementation
Implementation of six clustering algorithms (K-Means, DBSCAN, Gaussian Mixture, Agglomerative Hierarchical, Spectral, and Mean Shift) for comprehensive analysis


In [None]:
# Advanced Multi-Algorithm Clustering Framework
class AdvancedClusteringFramework:
    """
    Comprehensive clustering framework implementing multiple algorithms
    for customer segmentation analysis.

    Each ``*_clustering`` method fits one algorithm on ``self.X``, stores a
    result dict under a fixed key of ``self.results`` ('kmeans', 'dbscan',
    'gmm', 'hierarchical', 'spectral', 'meanshift') and returns the labels.
    """

    def __init__(self, X_scaled):
        # Feature matrix to cluster (DataFrame or 2-D array).
        self.X = X_scaled
        # algorithm key -> dict with at least 'labels', 'model', 'n_clusters'.
        self.results = {}
        # Not written by any method of this class.
        self.evaluation_metrics = {}
        self.optimal_clusters = {}

    def find_optimal_clusters_kmeans(self, max_k=15):
        """Find optimal number of clusters using multiple methods.

        Fits K-Means for every k in ``2..max_k`` and records inertia and
        silhouette score for each.

        Returns:
            ``(elbow_k, inertias, silhouette_scores)``. ``elbow_k`` is the
            KneeLocator elbow of the inertia curve, or 4 when none is found.
            The best-silhouette k is only printed, not returned.
        """
        print("🔍 Finding optimal number of clusters for K-Means...")

        inertias = []
        silhouette_scores = []
        k_range = range(2, max_k + 1)

        for k in k_range:
            kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
            labels = kmeans.fit_predict(self.X)
            inertias.append(kmeans.inertia_)
            silhouette_scores.append(silhouette_score(self.X, labels))

        # Elbow of the (decreasing, convex) inertia curve; fall back to 4 when
        # KneeLocator reports no elbow.
        knee_locator = KneeLocator(k_range, inertias, curve='convex', direction='decreasing')
        elbow_k = knee_locator.elbow if knee_locator.elbow else 4

        # Best silhouette score (reported for comparison only)
        best_silhouette_k = k_range[np.argmax(silhouette_scores)]

        print(f"📊 Elbow method suggests: {elbow_k} clusters")
        print(f"📈 Best silhouette score at: {best_silhouette_k} clusters")
        print(f"🎯 Selected optimal clusters: {elbow_k}")

        return elbow_k, inertias, silhouette_scores

    def kmeans_clustering(self, n_clusters=None):
        """Implement K-Means clustering.

        When ``n_clusters`` is None the elbow search of
        ``find_optimal_clusters_kmeans`` chooses it. Returns the labels.
        """
        print("🔬 Implementing K-Means Clustering...")

        if n_clusters is None:
            n_clusters, _, _ = self.find_optimal_clusters_kmeans()

        # Fit K-Means
        kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
        labels = kmeans.fit_predict(self.X)

        # Store results
        self.results['kmeans'] = {
            'labels': labels,
            'model': kmeans,
            'n_clusters': n_clusters,
            'centroids': kmeans.cluster_centers_
        }

        print(f"✅ K-Means completed with {n_clusters} clusters")
        return labels

    def dbscan_clustering(self, eps=None, min_samples=None):
        """Implement DBSCAN clustering.

        Args:
            eps: neighbourhood radius; when None it is estimated from the data.
            min_samples: core-point threshold; when None it is
                ``max(5, number of feature columns)``.

        Returns the labels; -1 marks noise points.
        """
        print("🔬 Implementing DBSCAN Clustering...")

        # Auto-tune parameters if not provided
        if eps is None:
            from sklearn.neighbors import NearestNeighbors
            distances, _ = NearestNeighbors(n_neighbors=5).fit(self.X).kneighbors(self.X)
            # eps = 90th percentile of column 1 of the neighbour distances, which
            # is each point's nearest other point when column 0 is the point itself.
            # NOTE(review): this ignores min_samples; the usual k-distance
            # heuristic uses the k-th neighbour (k = min_samples) — confirm intent.
            eps = np.percentile(distances[:, 1], 90)

        if min_samples is None:
            # Based on dimensionality. np.shape works for DataFrames and ndarrays
            # alike, whereas `.columns` exists only on DataFrames.
            min_samples = max(5, np.shape(self.X)[1])

        # Fit DBSCAN
        dbscan = DBSCAN(eps=eps, min_samples=min_samples)
        labels = dbscan.fit_predict(self.X)

        # Noise (-1) is not counted as a cluster.
        n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
        n_noise = int(np.count_nonzero(labels == -1))

        # Store results
        self.results['dbscan'] = {
            'labels': labels,
            'model': dbscan,
            'n_clusters': n_clusters,
            'n_noise': n_noise,
            'eps': eps,
            'min_samples': min_samples
        }

        print(f"✅ DBSCAN completed with {n_clusters} clusters and {n_noise} noise points")
        return labels

    def gaussian_mixture_clustering(self, n_clusters=None):
        """Implement Gaussian Mixture Model clustering.

        When ``n_clusters`` is None, models with 2..11 components are fitted
        and the one with the lowest BIC decides the component count. Returns
        the labels; per-component probabilities are stored with the result.
        """
        print("🔬 Implementing Gaussian Mixture Model...")

        if n_clusters is None:
            # Use BIC to find optimal number of components (lower is better)
            k_range = range(2, 12)
            bic_scores = [
                GaussianMixture(n_components=k, random_state=42).fit(self.X).bic(self.X)
                for k in k_range
            ]
            optimal_k = k_range[np.argmin(bic_scores)]
            print(f"📊 BIC suggests {optimal_k} components")
        else:
            optimal_k = n_clusters

        # Fit GMM
        gmm = GaussianMixture(n_components=optimal_k, random_state=42)
        labels = gmm.fit_predict(self.X)

        # Store results
        self.results['gmm'] = {
            'labels': labels,
            'model': gmm,
            'n_clusters': optimal_k,
            'probabilities': gmm.predict_proba(self.X)
        }

        print(f"✅ GMM completed with {optimal_k} components")
        return labels

    def hierarchical_clustering(self, n_clusters=None, linkage='ward'):
        """Implement Agglomerative Hierarchical clustering.

        ``n_clusters`` defaults to 5 when None. Returns the labels.
        """
        print("🔬 Implementing Hierarchical Clustering...")

        if n_clusters is None:
            n_clusters = 5  # Default

        # Fit Hierarchical clustering
        hierarchical = AgglomerativeClustering(n_clusters=n_clusters, linkage=linkage)
        labels = hierarchical.fit_predict(self.X)

        # Store results
        self.results['hierarchical'] = {
            'labels': labels,
            'model': hierarchical,
            'n_clusters': n_clusters,
            'linkage': linkage
        }

        print(f"✅ Hierarchical clustering completed with {n_clusters} clusters")
        return labels

    def spectral_clustering(self, n_clusters=None):
        """Implement Spectral clustering (RBF affinity, gamma=1.0).

        ``n_clusters`` defaults to 5 when None. Returns the labels.
        """
        print("🔬 Implementing Spectral Clustering...")

        if n_clusters is None:
            n_clusters = 5  # Default

        # Fit Spectral clustering
        spectral = SpectralClustering(n_clusters=n_clusters, random_state=42,
                                      affinity='rbf', gamma=1.0)
        labels = spectral.fit_predict(self.X)

        # Store results
        self.results['spectral'] = {
            'labels': labels,
            'model': spectral,
            'n_clusters': n_clusters
        }

        print(f"✅ Spectral clustering completed with {n_clusters} clusters")
        return labels

    def meanshift_clustering(self):
        """Implement Mean Shift clustering.

        The bandwidth comes from ``estimate_bandwidth`` (quantile 0.2,
        500 samples). Returns the labels.
        """
        print("🔬 Implementing Mean Shift Clustering...")

        # Estimate bandwidth
        bandwidth = estimate_bandwidth(self.X, quantile=0.2, n_samples=500)

        # Fit Mean Shift
        meanshift = MeanShift(bandwidth=bandwidth)
        labels = meanshift.fit_predict(self.X)

        n_clusters = len(set(labels))

        # Store results
        self.results['meanshift'] = {
            'labels': labels,
            'model': meanshift,
            'n_clusters': n_clusters,
            'bandwidth': bandwidth
        }

        print(f"✅ Mean Shift completed with {n_clusters} clusters")
        return labels

    def run_all_algorithms(self, n_clusters_base=5):
        """Run all clustering algorithms.

        ``n_clusters_base`` is passed to the hierarchical and spectral runs
        only; K-Means, DBSCAN, GMM and Mean Shift choose their own settings.
        A failure in one algorithm is reported and the remaining ones still
        run (deliberate best-effort). Returns ``self.results``.
        """
        print("🚀 RUNNING ALL CLUSTERING ALGORITHMS")
        print("="*60)

        # (display name, bound method, positional arguments)
        algorithms = [
            ('K-Means', self.kmeans_clustering, ()),
            ('DBSCAN', self.dbscan_clustering, ()),
            ('Gaussian Mixture', self.gaussian_mixture_clustering, ()),
            ('Hierarchical', self.hierarchical_clustering, (n_clusters_base,)),
            ('Spectral', self.spectral_clustering, (n_clusters_base,)),
            ('Mean Shift', self.meanshift_clustering, ()),
        ]

        for name, run_algorithm, args in algorithms:
            start_time = time.time()
            try:
                run_algorithm(*args)
                end_time = time.time()
                print(f"⏱️ {name} completed in {end_time - start_time:.2f} seconds")
            except Exception as e:
                print(f"❌ {name} failed: {str(e)}")
            print("-" * 40)

        print("🎉 All algorithms completed!")
        return self.results

# Fit every clustering algorithm on the scaled features and collect the results.
print("🔥 ADVANCED CLUSTERING ANALYSIS", "="*50, sep="\n")

clustering_framework = AdvancedClusteringFramework(X_scaled)
results = clustering_framework.run_all_algorithms()


## 4. Comprehensive Clustering Evaluation
Advanced evaluation metrics for clustering quality assessment


In [None]:
# Advanced Clustering Evaluation Framework
class ClusteringEvaluator:
    """
    Comprehensive evaluation framework for clustering algorithms.

    Computes internal validation metrics and cluster-size statistics for each
    entry of ``results`` and turns them into comparison and ranking tables.
    """

    def __init__(self, X, results):
        # Feature matrix the labels refer to (rows selectable with a boolean mask).
        self.X = X
        # algorithm name -> result dict; only the 'labels' entry is read here.
        self.results = results
        # algorithm name -> combined metrics dict, filled by evaluate_all_algorithms.
        self.evaluation_results = {}

    @staticmethod
    def _degenerate_metrics():
        """Worst-case metric values used when the metrics cannot be computed."""
        return {
            'silhouette_score': -1,
            'calinski_harabasz_score': 0,
            'davies_bouldin_score': float('inf')
        }

    def calculate_internal_metrics(self, labels, algorithm_name):
        """Calculate internal validation metrics.

        Noise points (label -1) are excluded before scoring. When fewer than
        two real clusters remain, or scoring raises, the worst-case values
        (silhouette -1, Calinski-Harabasz 0, Davies-Bouldin inf) are returned.

        Args:
            labels: cluster label per row of ``self.X`` (array or list).
            algorithm_name: used only in the warning message.

        Returns:
            dict with 'silhouette_score', 'calinski_harabasz_score' and
            'davies_bouldin_score'.
        """
        # Accept plain lists as well; `labels != -1` needs an array to build a mask.
        labels = np.asarray(labels)
        keep = labels != -1
        labels_filtered = labels[keep]

        # The metrics need at least two clusters besides noise.
        if np.unique(labels_filtered).size < 2:
            return self._degenerate_metrics()

        # Filter out noise points for metrics that don't handle them
        X_filtered = self.X if keep.all() else self.X[keep]

        try:
            return {
                # Silhouette Score (higher is better, range: -1 to 1)
                'silhouette_score': silhouette_score(X_filtered, labels_filtered),
                # Calinski-Harabasz Index (higher is better)
                'calinski_harabasz_score': calinski_harabasz_score(X_filtered, labels_filtered),
                # Davies-Bouldin Index (lower is better)
                'davies_bouldin_score': davies_bouldin_score(X_filtered, labels_filtered)
            }
        except Exception as e:
            # Best-effort: one failing algorithm must not abort the whole comparison.
            print(f"⚠️ Error calculating metrics for {algorithm_name}: {str(e)}")
            return self._degenerate_metrics()

    def calculate_cluster_statistics(self, labels, algorithm_name):
        """Calculate cluster distribution statistics.

        Noise (label -1) is counted separately and excluded from the
        cluster-size statistics.

        Returns:
            dict with 'n_clusters', 'n_noise', 'noise_ratio' and
            avg/std/min/max cluster size, plus 'cluster_balance'
            (std / mean of the cluster sizes; 0 when there are no clusters).
        """
        labels = np.asarray(labels)
        unique_labels, counts = np.unique(labels, return_counts=True)

        is_noise = unique_labels == -1
        cluster_sizes = counts[~is_noise]
        n_noise = int(counts[is_noise].sum())
        n_clusters = int(cluster_sizes.size)

        if n_clusters:
            avg_cluster_size = float(np.mean(cluster_sizes))
            std_cluster_size = float(np.std(cluster_sizes))
            min_cluster_size = int(np.min(cluster_sizes))
            max_cluster_size = int(np.max(cluster_sizes))
        else:
            avg_cluster_size = std_cluster_size = min_cluster_size = max_cluster_size = 0

        return {
            'n_clusters': n_clusters,
            'n_noise': n_noise,
            # Guard against empty label arrays instead of dividing by zero.
            'noise_ratio': n_noise / labels.size if labels.size else 0.0,
            'avg_cluster_size': avg_cluster_size,
            'std_cluster_size': std_cluster_size,
            'min_cluster_size': min_cluster_size,
            'max_cluster_size': max_cluster_size,
            'cluster_balance': std_cluster_size / avg_cluster_size if avg_cluster_size > 0 else 0
        }

    def evaluate_all_algorithms(self):
        """Evaluate all clustering algorithms.

        Fills ``self.evaluation_results`` and returns a DataFrame with one row
        per algorithm holding its name, internal metrics and cluster statistics.
        """
        print("📊 COMPREHENSIVE CLUSTERING EVALUATION")
        print("="*60)

        evaluation_summary = []

        for algorithm_name, result in self.results.items():
            # "\n" (not "\\n"): emit a blank line rather than a literal backslash-n.
            print(f"\n🔍 Evaluating {algorithm_name.upper()}...")

            labels = result['labels']

            # Calculate metrics
            internal_metrics = self.calculate_internal_metrics(labels, algorithm_name)
            cluster_stats = self.calculate_cluster_statistics(labels, algorithm_name)

            # Combine all metrics
            evaluation = {
                'algorithm': algorithm_name,
                **internal_metrics,
                **cluster_stats
            }

            self.evaluation_results[algorithm_name] = evaluation
            evaluation_summary.append(evaluation)

            # Print summary
            print(f"  📈 Silhouette Score: {internal_metrics['silhouette_score']:.3f}")
            print(f"  📊 Calinski-Harabasz: {internal_metrics['calinski_harabasz_score']:.1f}")
            print(f"  📉 Davies-Bouldin: {internal_metrics['davies_bouldin_score']:.3f}")
            print(f"  🎯 Clusters: {cluster_stats['n_clusters']}")
            print(f"  🔇 Noise Points: {cluster_stats['n_noise']}")

        return pd.DataFrame(evaluation_summary)

    def create_comparison_table(self, df_evaluation):
        """Create formatted comparison table.

        Returns a copy of the key metric columns, rounded for display, with
        'noise_ratio' converted to a percentage and the columns renamed.
        """
        # Select key metrics for comparison
        comparison_cols = [
            'algorithm', 'silhouette_score', 'calinski_harabasz_score',
            'davies_bouldin_score', 'n_clusters', 'noise_ratio'
        ]

        df_comparison = df_evaluation[comparison_cols].copy()

        # Round numerical values
        df_comparison['silhouette_score'] = df_comparison['silhouette_score'].round(3)
        df_comparison['calinski_harabasz_score'] = df_comparison['calinski_harabasz_score'].round(1)
        df_comparison['davies_bouldin_score'] = df_comparison['davies_bouldin_score'].round(3)
        df_comparison['noise_ratio'] = (df_comparison['noise_ratio'] * 100).round(1)

        # Rename columns for better display
        df_comparison.columns = [
            'Algorithm', 'Silhouette', 'Calinski-Harabasz',
            'Davies-Bouldin', 'Clusters', 'Noise %'
        ]

        return df_comparison

    def rank_algorithms(self, df_evaluation):
        """Rank algorithms based on multiple criteria.

        Each metric is turned into a rank (1 = best) and the four ranks are
        averaged with equal weight. Returns the algorithms sorted by that
        average; the noise rank contributes to 'overall_rank' but is not part
        of the returned columns.
        """
        df_rank = df_evaluation.copy()

        # Higher is better metrics
        df_rank['sil_rank'] = df_rank['silhouette_score'].rank(ascending=False)
        df_rank['ch_rank'] = df_rank['calinski_harabasz_score'].rank(ascending=False)

        # Lower is better metrics
        df_rank['db_rank'] = df_rank['davies_bouldin_score'].rank(ascending=True)
        df_rank['noise_rank'] = df_rank['noise_ratio'].rank(ascending=True)

        # Combined ranking (equal weights)
        rank_columns = ['sil_rank', 'ch_rank', 'db_rank', 'noise_rank']
        df_rank['overall_rank'] = df_rank[rank_columns].sum(axis=1) / len(rank_columns)

        # Sort by overall ranking
        df_rank = df_rank.sort_values('overall_rank')

        return df_rank[['algorithm', 'overall_rank', 'sil_rank', 'ch_rank', 'db_rank']]

# Run comprehensive evaluation
evaluator = ClusteringEvaluator(X_scaled.values, results)
df_evaluation = evaluator.evaluate_all_algorithms()

# "\n" rather than "\\n": the banners should start with a blank line, not
# print a literal backslash-n.
print("\n🏆 CLUSTERING ALGORITHM COMPARISON")
print("="*60)

# Display comparison table
df_comparison = evaluator.create_comparison_table(df_evaluation)
display(df_comparison)

# Display algorithm rankings
print("\n🥇 ALGORITHM RANKINGS")
print("="*40)
df_rankings = evaluator.rank_algorithms(df_evaluation)
display(df_rankings)


## 5. Advanced Cluster Visualization & Analysis
Interactive visualizations and dimensionality reduction for cluster interpretation


In [None]:
# Advanced Cluster Visualization Framework
class AdvancedClusterVisualizer:
    """
    Comprehensive visualization framework for cluster analysis.

    Projects the scaled features to 2-D with PCA, t-SNE and UMAP, plots the
    cluster labels of several algorithms on those projections, and builds a
    four-panel dashboard of business columns per cluster.
    """

    def __init__(self, X_scaled, X_original, results, df_original):
        # Scaled feature matrix fed to the dimensionality reducers.
        self.X_scaled = X_scaled
        # Unscaled features; stored but not read by the methods of this class.
        self.X_original = X_original
        # algorithm key -> result dict; only the 'labels' entry is read here.
        self.results = results
        # Customer dataframe the cluster labels are attached to for the dashboard.
        self.df_original = df_original

    def apply_dimensionality_reduction(self):
        """Apply multiple dimensionality reduction techniques.

        Returns:
            dict with the 2-D embeddings under 'pca', 'tsne' and 'umap', and
            the fitted PCA estimator under 'pca_model'.
        """
        print("🔍 Applying dimensionality reduction techniques...")

        # PCA
        pca = PCA(n_components=2, random_state=42)
        X_pca = pca.fit_transform(self.X_scaled)

        # t-SNE
        tsne = TSNE(n_components=2, random_state=42, perplexity=30)
        X_tsne = tsne.fit_transform(self.X_scaled)

        # UMAP
        reducer = umap.UMAP(n_components=2, random_state=42)
        X_umap = reducer.fit_transform(self.X_scaled)

        print(f"✅ PCA explained variance: {pca.explained_variance_ratio_.sum():.3f}")

        return {
            'pca': X_pca,
            'tsne': X_tsne,
            'umap': X_umap,
            'pca_model': pca
        }

    def create_cluster_comparison_plot(self, reduction_data, selected_algorithms=None):
        """Create comprehensive cluster comparison visualization.

        Draws one subplot row per algorithm and one column per embedding
        (PCA, t-SNE, UMAP), colouring the points by cluster label.

        Args:
            reduction_data: dict as returned by ``apply_dimensionality_reduction``.
            selected_algorithms: result keys to plot; defaults to
                ['kmeans', 'dbscan', 'gmm', 'hierarchical']. Keys without
                results are ignored.

        Returns:
            None. The figure is shown; nothing is drawn when no requested
            algorithm has results.
        """
        if selected_algorithms is None:
            selected_algorithms = ['kmeans', 'dbscan', 'gmm', 'hierarchical']

        # Filter algorithms that exist in results
        available_algorithms = [alg for alg in selected_algorithms if alg in self.results]

        # A subplot grid needs at least one row; bail out instead of failing there.
        if not available_algorithms:
            print("⚠️ No clustering results available for the selected algorithms.")
            return

        fig = make_subplots(
            rows=len(available_algorithms),
            cols=3,
            subplot_titles=['PCA', 'T-SNE', 'UMAP'] * len(available_algorithms),
            vertical_spacing=0.08,
            horizontal_spacing=0.05
        )

        colors = px.colors.qualitative.Set3
        # (key in reduction_data, subplot column)
        panels = (('pca', 1), ('tsne', 2), ('umap', 3))

        for row, algorithm in enumerate(available_algorithms, start=1):
            labels = self.results[algorithm]['labels']

            for color_index, label in enumerate(np.unique(labels)):
                mask = labels == label
                label_name = f'Cluster {label}' if label != -1 else 'Noise'
                color = colors[color_index % len(colors)]

                for key, col in panels:
                    embedding = reduction_data[key]
                    fig.add_trace(
                        go.Scatter(
                            x=embedding[mask, 0],
                            y=embedding[mask, 1],
                            mode='markers',
                            name=f'{algorithm}_{label_name}',
                            marker=dict(color=color, size=4, opacity=0.7),
                            # Legend entries only for the PCA panel of the first algorithm.
                            showlegend=(row == 1 and col == 1)
                        ),
                        row=row, col=col
                    )

        fig.update_layout(
            height=300 * len(available_algorithms),
            title_text="Cluster Comparison Across Dimensionality Reduction Methods",
            title_x=0.5
        )

        fig.show()

    def _add_count_bars(self, fig, df_analysis, column, clusters, colors, row, col):
        """Add one bar trace per cluster with the value counts of ``column``."""
        counts = df_analysis.groupby(['Cluster', column]).size().reset_index(name='Count')
        for i, cluster in enumerate(clusters):
            cluster_counts = counts[counts['Cluster'] == cluster]
            fig.add_trace(
                go.Bar(
                    x=cluster_counts[column],
                    y=cluster_counts['Count'],
                    name=f'Cluster {cluster}',
                    marker_color=colors[i % len(colors)]
                ),
                row=row, col=col
            )

    def create_business_insights_dashboard(self, best_algorithm='kmeans'):
        """Create business-focused cluster analysis dashboard.

        Shows purchase amount, age, purchase frequency and category per
        cluster for one algorithm's labels.

        Args:
            best_algorithm: result key to use; when it has no results the
                first key of ``self.results`` is used instead.

        Returns:
            A copy of ``df_original`` with a 'Cluster' column, noise rows
            (label -1) removed.
        """
        if best_algorithm not in self.results:
            best_algorithm = list(self.results.keys())[0]

        labels = self.results[best_algorithm]['labels']

        # Add cluster labels to original dataframe
        df_analysis = self.df_original.copy()
        df_analysis['Cluster'] = labels

        # Remove noise points for business analysis
        if -1 in labels:
            df_analysis = df_analysis[df_analysis['Cluster'] != -1]

        # Create business insights dashboard
        fig = make_subplots(
            rows=2, cols=2,
            subplot_titles=('Customer Value by Cluster', 'Age Distribution by Cluster',
                           'Purchase Frequency by Cluster', 'Category Preferences by Cluster'),
            specs=[[{'type': 'box'}, {'type': 'histogram'}],
                   [{'type': 'bar'}, {'type': 'bar'}]]
        )

        unique_clusters = sorted(df_analysis['Cluster'].unique())
        colors = px.colors.qualitative.Set3
        # Rows of each cluster, computed once and reused by the first two panels.
        rows_by_cluster = [
            df_analysis[df_analysis['Cluster'] == cluster] for cluster in unique_clusters
        ]

        # Customer Value Analysis
        for i, (cluster, cluster_data) in enumerate(zip(unique_clusters, rows_by_cluster)):
            fig.add_trace(
                go.Box(
                    y=cluster_data['Purchase Amount (USD)'],
                    name=f'Cluster {cluster}',
                    marker_color=colors[i % len(colors)]
                ),
                row=1, col=1
            )

        # Age Distribution
        for i, (cluster, cluster_data) in enumerate(zip(unique_clusters, rows_by_cluster)):
            fig.add_trace(
                go.Histogram(
                    x=cluster_data['Age'],
                    name=f'Cluster {cluster}',
                    marker_color=colors[i % len(colors)],
                    opacity=0.7,
                    nbinsx=20
                ),
                row=1, col=2
            )

        # Purchase Frequency Analysis
        self._add_count_bars(fig, df_analysis, 'Frequency of Purchases',
                             unique_clusters, colors, row=2, col=1)

        # Category Preferences
        self._add_count_bars(fig, df_analysis, 'Category',
                             unique_clusters, colors, row=2, col=2)

        fig.update_layout(
            height=800,
            title_text=f"Business Insights Dashboard - {best_algorithm.upper()} Clustering",
            title_x=0.5,
            showlegend=True
        )

        fig.show()

        return df_analysis

# Project the scaled features to 2-D and draw the cluster visualizations.
print("🎨 ADVANCED CLUSTER VISUALIZATION", "="*50, sep="\n")

visualizer = AdvancedClusterVisualizer(
    X_scaled=X_scaled,
    X_original=X_features,
    results=results,
    df_original=df_engineered,
)
reduction_data = visualizer.apply_dimensionality_reduction()

# One subplot row per algorithm, one column per embedding.
visualizer.create_cluster_comparison_plot(reduction_data)

# Business dashboard for the K-Means labels. The algorithm is hard-coded here,
# not taken from the rankings computed earlier.
df_with_clusters = visualizer.create_business_insights_dashboard(best_algorithm='kmeans')


## 6. Customer Segment Profiling & Business Insights
Detailed analysis of customer segments with actionable business recommendations


In [None]:
# Advanced Customer Segment Profiling
class CustomerSegmentProfiler:
    """
    Advanced customer segment analysis and business insights generator.

    Wraps a customer DataFrame that already carries a ``Cluster`` column and
    produces, per cluster, a descriptive profile, a marketing archetype with
    recommended strategies, and an aggregate executive summary. Every report
    is printed to stdout; the profiles and recommendations are also returned
    as dictionaries keyed by cluster label.

    Columns read from the DataFrame: ``Cluster``, ``Age``, ``Gender``,
    ``Purchase Amount (USD)``, ``Previous Purchases``, ``Category``,
    ``Season``, ``Frequency of Purchases``, ``Subscription Status`` and
    ``Discount Applied``.
    """

    # Cluster label that is treated as noise and excluded from profiling.
    NOISE_LABEL = -1

    # Assumed revenue uplift per archetype; anything not listed uses the default.
    DEFAULT_UPLIFT = 0.10
    ARCHETYPE_UPLIFT = {
        "Premium Loyal Customers": 0.15,   # 15% uplift
        "Price-Sensitive Shoppers": 0.25,  # 25% uplift through volume
    }

    def __init__(self, df_with_clusters, best_algorithm='kmeans'):
        """
        Args:
            df_with_clusters: DataFrame with one row per customer and a
                ``Cluster`` column holding the assigned segment label.
            best_algorithm: Name of the clustering algorithm that produced the
                labels; only used for display in the executive summary.
        """
        self.df = df_with_clusters
        self.best_algorithm = best_algorithm

    def create_segment_profiles(self):
        """
        Create detailed profiles for each customer segment.

        Prints a human-readable profile per cluster and returns the same
        figures as data. Rows labelled ``NOISE_LABEL`` get no profile, but they
        still count towards the denominator of ``size_percentage``.

        Returns:
            dict mapping cluster label -> dict with keys ``size``,
            ``size_percentage``, ``avg_age``, ``gender_distribution``,
            ``avg_purchase_amount``, ``avg_previous_purchases``,
            ``top_categories``, ``top_seasons``, ``purchase_frequency``,
            ``subscription_rate`` and ``discount_usage``.
        """
        print("👥 CUSTOMER SEGMENT PROFILING")
        print("=" * 50)

        # Loop-invariant: the share of each segment is relative to all rows.
        total_customers = len(self.df)
        segment_profiles = {}

        for cluster in sorted(self.df['Cluster'].unique()):
            if cluster == self.NOISE_LABEL:  # Skip noise
                continue

            cluster_data = self.df[self.df['Cluster'] == cluster]
            cluster_size = len(cluster_data)
            size_percentage = cluster_size / total_customers * 100

            print(f"\n🎯 CLUSTER {cluster} PROFILE")
            print("-" * 30)
            print(f"📊 Size: {cluster_size} customers ({size_percentage:.1f}%)")

            # Demographic Profile
            avg_age = cluster_data['Age'].mean()
            gender_distribution = (
                cluster_data['Gender'].value_counts(normalize=True).to_dict()
            )

            print("\n👤 Demographics:")
            print(f"  • Average Age: {avg_age:.1f} years")
            print(f"  • Gender: {gender_distribution}")

            # Purchase Behavior
            avg_purchase = cluster_data['Purchase Amount (USD)'].mean()
            avg_frequency = cluster_data['Previous Purchases'].mean()

            print("\n💰 Purchase Behavior:")
            print(f"  • Average Purchase: ${avg_purchase:.2f}")
            print(f"  • Average Previous Purchases: {avg_frequency:.1f}")

            # Preferences
            top_categories = list(cluster_data['Category'].value_counts().head(3).index)
            top_seasons = list(cluster_data['Season'].value_counts().head(2).index)
            # mode() is empty when every value is missing; fall back to None
            # instead of failing on the [0] lookup.
            freq_modes = cluster_data['Frequency of Purchases'].mode()
            freq_mode = freq_modes.iloc[0] if not freq_modes.empty else None

            print("\n🛍️ Preferences:")
            print(f"  • Top Categories: {top_categories}")
            print(f"  • Preferred Seasons: {top_seasons}")
            print(f"  • Purchase Frequency: {freq_mode}")

            # Behavioral Indicators
            subscription_rate = (cluster_data['Subscription Status'] == 'Yes').mean()
            discount_usage = (cluster_data['Discount Applied'] == 'Yes').mean()

            print("\n📈 Behavioral Indicators:")
            print(f"  • Subscription Rate: {subscription_rate:.1%}")
            print(f"  • Discount Usage: {discount_usage:.1%}")

            # Store profile data
            segment_profiles[cluster] = {
                'size': cluster_size,
                'size_percentage': size_percentage,
                'avg_age': avg_age,
                'gender_distribution': gender_distribution,
                'avg_purchase_amount': avg_purchase,
                'avg_previous_purchases': avg_frequency,
                'top_categories': top_categories,
                'top_seasons': top_seasons,
                'purchase_frequency': freq_mode,
                'subscription_rate': subscription_rate,
                'discount_usage': discount_usage,
            }

        return segment_profiles

    @staticmethod
    def _classify_archetype(profile):
        """Map one segment profile to an ``(archetype, strategies)`` pair.

        Rules are evaluated top to bottom and the first match wins, so a
        segment that satisfies several rules gets the earliest archetype.
        """
        avg_purchase = profile['avg_purchase_amount']
        subscription_rate = profile['subscription_rate']

        if avg_purchase > 70 and subscription_rate > 0.8:
            return "Premium Loyal Customers", [
                "🌟 VIP loyalty program with exclusive benefits",
                "📧 Personalized premium product recommendations",
                "🎁 Early access to new collections",
                "💎 Premium customer service channel",
            ]
        if profile['discount_usage'] > 0.8 and avg_purchase < 50:
            return "Price-Sensitive Shoppers", [
                "💰 Targeted discount campaigns",
                "🔔 Price drop notifications",
                "📦 Bundle deals and bulk discounts",
                "⏰ Flash sale notifications",
            ]
        if profile['avg_age'] < 30 and 'Clothing' in profile['top_categories']:
            return "Young Fashion Enthusiasts", [
                "📱 Social media marketing campaigns",
                "👗 Trendy and seasonal collections",
                "🤝 Influencer partnerships",
                "🎯 Mobile-first shopping experience",
            ]
        if avg_purchase > 60 and subscription_rate > 0.6:
            return "Regular Value Customers", [
                "🔄 Subscription optimization programs",
                "📈 Upselling complementary products",
                "🎪 Seasonal campaigns aligned with preferences",
                "💳 Flexible payment options",
            ]
        return "Occasional Shoppers", [
            "📬 Re-engagement email campaigns",
            "🎁 Welcome back offers",
            "📊 Preference-based recommendations",
            "🔔 Gentle reminder notifications",
        ]

    def generate_business_recommendations(self, segment_profiles):
        """
        Generate actionable business recommendations for each segment.

        Args:
            segment_profiles: Mapping as returned by
                ``create_segment_profiles``.

        Returns:
            dict mapping cluster label -> dict with keys ``archetype``,
            ``strategies``, ``current_value``, ``potential_value`` and
            ``roi_estimate``. ``current_value`` is average purchase amount
            times segment size; ``roi_estimate`` is the absolute gain from
            applying the archetype's assumed uplift to that value.
        """
        print("\n\n💡 BUSINESS RECOMMENDATIONS")
        print("=" * 50)

        recommendations = {}

        for cluster, profile in segment_profiles.items():
            print(f"\n🎯 CLUSTER {cluster} - MARKETING STRATEGY")
            print("-" * 40)

            # Determine segment archetype
            archetype, strategy = self._classify_archetype(profile)

            print(f"📋 Segment Archetype: {archetype}")
            print(f"📊 Size: {profile['size']} customers ({profile['size_percentage']:.1f}%)")
            print("💡 Recommended Strategies:")
            for strategy_item in strategy:
                print(f"    {strategy_item}")

            # Calculate potential ROI
            current_value = profile['avg_purchase_amount'] * profile['size']
            potential_uplift = self.ARCHETYPE_UPLIFT.get(archetype, self.DEFAULT_UPLIFT)
            potential_value = current_value * (1 + potential_uplift)
            roi_estimate = potential_value - current_value

            print(f"💰 Current Segment Value: ${current_value:,.0f}")
            print(f"🚀 Potential Value: ${potential_value:,.0f}")
            print(f"📈 Estimated ROI: ${roi_estimate:,.0f} ({potential_uplift:.0%} uplift)")

            recommendations[cluster] = {
                'archetype': archetype,
                'strategies': strategy,
                'current_value': current_value,
                'potential_value': potential_value,
                'roi_estimate': roi_estimate,
            }

        return recommendations

    def create_executive_summary(self, segment_profiles, recommendations):
        """
        Create executive summary of segmentation analysis.

        Prints the overall customer count, the largest and highest-value
        segments, and the aggregated current/potential value across all
        recommendations. Returns nothing.

        Args:
            segment_profiles: Mapping as returned by
                ``create_segment_profiles``.
            recommendations: Mapping as returned by
                ``generate_business_recommendations``.
        """
        print("\n\n📊 EXECUTIVE SUMMARY")
        print("=" * 60)

        total_customers = len(self.df)
        total_segments = len(segment_profiles)

        print(f"🎯 Customer Base: {total_customers:,} customers segmented into {total_segments} distinct groups")
        print(f"🔬 Algorithm Used: {self.best_algorithm.upper()}")

        # With no profiles (e.g. every row was noise) there is nothing to rank
        # or sum; max() on an empty mapping would raise, so stop here.
        if not segment_profiles:
            print("\n⚠️ No customer segments available to summarize.")
            return

        # Key insights
        largest_segment = max(segment_profiles.items(), key=lambda x: x[1]['size'])
        highest_value_segment = max(segment_profiles.items(), key=lambda x: x[1]['avg_purchase_amount'])

        print("\n📈 Key Insights:")
        print(f"  • Largest Segment: Cluster {largest_segment[0]} ({largest_segment[1]['size_percentage']:.1f}% of customers)")
        print(f"  • Highest Value Segment: Cluster {highest_value_segment[0]} (${highest_value_segment[1]['avg_purchase_amount']:.2f} avg purchase)")

        # Calculate total ROI potential
        total_current_value = sum(rec['current_value'] for rec in recommendations.values())
        total_potential_value = sum(rec['potential_value'] for rec in recommendations.values())
        total_roi = total_potential_value - total_current_value
        # Avoid dividing by zero when there is no recorded value at all.
        uplift_pct = (total_roi / total_current_value) * 100 if total_current_value else 0.0

        print("\n💰 Business Impact:")
        print(f"  • Current Customer Value: ${total_current_value:,.0f}")
        print(f"  • Potential Customer Value: ${total_potential_value:,.0f}")
        print(f"  • Total ROI Opportunity: ${total_roi:,.0f}")
        print(f"  • Overall Uplift Potential: {uplift_pct:.1f}%")

        print("\n🚀 Next Steps:")
        print("  1. Implement targeted marketing campaigns for each segment")
        print("  2. Develop segment-specific product recommendations")
        print("  3. Create personalized customer journey maps")
        print("  4. Monitor segment performance and adjust strategies")
        print("  5. Regular re-segmentation to track customer evolution")

# Generate comprehensive customer insights
# Notebook driver cell: profiles the clustered customers, derives marketing
# recommendations from those profiles, then prints the executive summary.
# best_algorithm is left at its default ('kmeans'), which is the algorithm name
# shown in the summary output.
profiler = CustomerSegmentProfiler(df_with_clusters)
segment_profiles = profiler.create_segment_profiles()
# The recommendations are keyed by the same cluster labels as the profiles.
recommendations = profiler.generate_business_recommendations(segment_profiles)
profiler.create_executive_summary(segment_profiles, recommendations)
