# End-to-End Clustering Analysis with MAACLI Framework

This notebook implements a comprehensive clustering pipeline with:
- PCA dimensionality reduction
- Correlation analysis
- KMeans clustering with Optuna hyperparameter tuning
- MAACLI framework for interpretability
- Cluster quantile profiling

**MAACLI Framework Components:**
- **M**odel-**A**gnostic: Works with any clustering algorithm
- **A**lgorithm-**A**gnostic: Independent of specific ML algorithms
- **C**luster **L**abel **I**nterpretation: Uses surrogate models for explanation

## 1. Import Required Libraries

In [None]:
# Core data manipulation and numerical libraries
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

# Scikit-learn components
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.metrics import silhouette_score, classification_report
from sklearn.tree import DecisionTreeClassifier

# Hyperparameter optimization
import optuna

# XGBoost for surrogate modeling
try:
    import xgboost as xgb
    XGBOOST_AVAILABLE = True
except ImportError:
    XGBOOST_AVAILABLE = False
    print("XGBoost not available, will use DecisionTreeClassifier as surrogate model")

# Visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns

# Plot styling: start from matplotlib's 'default' style sheet, then set
# seaborn's colour palette to "husl".
plt.style.use('default')
sns.set_palette("husl")

# Confirmation message; it is also reached when the optional XGBoost import
# failed and XGBOOST_AVAILABLE was set to False.
print("All libraries imported successfully!")

## 2. Load and Explore Dataset

In [None]:
# Read data.csv when it exists; otherwise build a reproducible synthetic frame
try:
    df = pd.read_csv("data.csv")
except FileNotFoundError:
    print("Warning: data.csv not found. Using sample data for demonstration.")
    # Fixed seed so the demonstration data is identical on every run.
    # The columns are drawn one after another; the order of the draws matters
    # because they all consume the same global random stream.
    np.random.seed(42)
    n_samples = 1000
    sample_columns = {}
    sample_columns['feature_1'] = np.random.normal(0, 1, n_samples)
    sample_columns['feature_2'] = np.random.normal(2, 1.5, n_samples)
    sample_columns['feature_3'] = np.random.exponential(2, n_samples)
    sample_columns['feature_4'] = np.random.uniform(-5, 5, n_samples)
    sample_columns['feature_5'] = np.random.gamma(2, 2, n_samples)
    sample_columns['categorical_feature'] = np.random.choice(['A', 'B', 'C'], n_samples)
    df = pd.DataFrame(sample_columns)
else:
    print("Dataset loaded successfully!")

# Quick look at what was loaded
print(f"Dataset shape: {df.shape}")
print("\n=== Dataset Preview ===")
print(df.head())

In [None]:
# Dataset information
print("\n=== Dataset Information ===")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() appended a stray "None" line.
df.info()

print("\n=== Dataset Statistics ===")
print(df.describe())

print("\n=== Missing Values Count ===")
# Per-column count of nulls; only columns that actually have gaps are listed.
missing_values = df.isnull().sum()
columns_with_gaps = missing_values[missing_values > 0]
if columns_with_gaps.empty:
    # Avoid printing an empty Series ahead of the "nothing missing" message.
    print("No missing values found.")
else:
    print(columns_with_gaps)

In [None]:
# Split the columns by dtype: numeric ones feed the clustering pipeline,
# object/category ones are only reported here.
numerical_columns = df.select_dtypes(include=[np.number]).columns.tolist()
categorical_columns = df.select_dtypes(include=['object', 'category']).columns.tolist()

for kind_label, found in (
    ("Numerical", numerical_columns),
    ("Categorical", categorical_columns),
):
    print(f"{kind_label} columns ({len(found)}): {found}")

# Clustering needs at least one numeric feature; stop early otherwise.
if not numerical_columns:
    raise ValueError("No numerical columns found for clustering analysis!")

## 3. Data Preprocessing

In [None]:
def preprocess_data(df, numerical_columns):
    """
    Preprocess the dataset for clustering analysis.

    Missing values in the numerical columns are replaced by the column
    median, then those columns are standardized with StandardScaler.
    The input dataframe itself is not modified; all work happens on a copy.

    Args:
        df: Input dataframe
        numerical_columns: List of numerical column names

    Returns:
        X_scaled: Scaled feature matrix
        scaler: Fitted StandardScaler object
        df_processed: Copy of df with the numerical columns imputed
    """
    # Create a copy to avoid modifying original data
    df_processed = df.copy()

    # Handle missing values (simple imputation with median)
    for col in numerical_columns:
        # Count the gaps *before* filling; counting afterwards always gives 0.
        n_missing = int(df_processed[col].isnull().sum())
        if n_missing == 0:
            continue
        median_val = df_processed[col].median()
        # Assign the filled column back instead of calling
        # fillna(inplace=True) on the column selection: that chained in-place
        # form is deprecated and leaves the parent frame untouched under
        # pandas copy-on-write.
        df_processed[col] = df_processed[col].fillna(median_val)
        print(f"Filled {n_missing} missing values in {col} with median: {median_val:.2f}")

    # Extract numerical features
    X = df_processed[numerical_columns].copy()

    # Scale the data
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    print(f"\nPreprocessing completed!")
    print(f"Feature matrix shape: {X_scaled.shape}")
    print(f"Features used: {numerical_columns}")

    return X_scaled, scaler, df_processed

# Apply preprocessing: X_scaled is the standardized matrix of the numerical
# columns, scaler the fitted StandardScaler, df_processed the imputed copy of df
X_scaled, scaler, df_processed = preprocess_data(df, numerical_columns)

## 4. Principal Component Analysis (PCA)

In [None]:
def apply_pca(X_scaled, max_components=10):
    """
    Apply PCA for dimensionality reduction.

    The number of components is capped at max_components, at the number of
    features, and at the number of samples. The explained variance ratio of
    every component is printed together with its running total.

    Args:
        X_scaled: Scaled 2-D feature matrix (rows are samples)
        max_components: Maximum number of components

    Returns:
        X_pca: PCA-transformed features
        pca: Fitted PCA object
    """
    n_samples, n_features = X_scaled.shape
    # PCA cannot extract more components than min(n_samples, n_features), so
    # the sample count bounds the request too (matters for short, wide data).
    n_components = min(max_components, n_features, n_samples)

    # Apply PCA
    pca = PCA(n_components=n_components, random_state=42)
    X_pca = pca.fit_transform(X_scaled)

    # Print explained variance ratios
    print(f"\n=== PCA Results ===")
    print(f"Number of components: {n_components}")
    print(f"Explained variance ratios:")

    cumulative_variance = 0
    for i, ratio in enumerate(pca.explained_variance_ratio_):
        cumulative_variance += ratio
        print(f"  PC{i+1}: {ratio:.4f} (cumulative: {cumulative_variance:.4f})")

    print(f"\nTotal explained variance: {cumulative_variance:.4f}")

    return X_pca, pca

# Apply PCA to the standardized features, keeping at most 10 components
X_pca, pca = apply_pca(X_scaled, max_components=10)

## 5. Correlation Analysis

In [None]:
def analyze_correlations(df, numerical_columns):
    """
    Report pairwise correlations between the numerical columns.

    Prints the correlation matrix rounded to three decimals, then lists every
    pair of distinct features whose absolute correlation exceeds 0.8.

    Args:
        df: Input dataframe
        numerical_columns: List of numerical columns

    Returns:
        correlation_matrix: Result of DataFrame.corr() on those columns
    """
    correlation_matrix = df[numerical_columns].corr()
    names = correlation_matrix.columns

    print("\n=== Correlation Matrix ===")
    print(correlation_matrix.round(3))

    # Visit each unordered pair exactly once (upper triangle, no diagonal)
    strong_pairs = [
        (names[row], names[col], correlation_matrix.iloc[row, col])
        for row in range(len(names))
        for col in range(row + 1, len(names))
        if abs(correlation_matrix.iloc[row, col]) > 0.8
    ]

    if not strong_pairs:
        print("\nNo highly correlated feature pairs found (|r| > 0.8)")
        return correlation_matrix

    print("\n=== Highly Correlated Features (|r| > 0.8) ===")
    for left, right, r_value in strong_pairs:
        print(f"{left} <-> {right}: {r_value:.3f}")

    return correlation_matrix

# Analyze correlations on the imputed (unscaled) numerical columns
correlation_matrix = analyze_correlations(df_processed, numerical_columns)

## 6. KMeans Clustering with Optuna Hyperparameter Tuning

In [None]:
def optimize_kmeans(X, n_trials=50):
    """
    Search KMeans hyperparameters with Optuna, maximizing silhouette score.

    Each trial picks n_clusters (2-12), the init method ('k-means++' or
    'random') and n_init (10-50), fits KMeans on X with random_state=42 and
    scores the resulting labels with silhouette_score.

    Args:
        X: Feature matrix for clustering
        n_trials: Number of optimization trials

    Returns:
        best_params: Best hyperparameters found
        best_score: Best silhouette score achieved
    """
    def objective(trial):
        # Search space; the dict literal requests the parameters in the
        # order n_clusters, init, n_init.
        candidate = {
            'n_clusters': trial.suggest_int('n_clusters', 2, 12),
            'init': trial.suggest_categorical('init', ['k-means++', 'random']),
            'n_init': trial.suggest_int('n_init', 10, 50),
        }
        model = KMeans(random_state=42, **candidate)

        # Silhouette score of the fitted partition is the value to maximize
        return silhouette_score(X, model.fit_predict(X))

    # Seeded TPE sampler so repeated runs explore the same trials
    tpe_sampler = optuna.samplers.TPESampler(seed=42)
    study = optuna.create_study(direction='maximize', sampler=tpe_sampler)
    study.optimize(objective, n_trials=n_trials, show_progress_bar=True)

    best_params, best_score = study.best_params, study.best_value

    print("\n=== Optuna Optimization Results ===")
    print(f"Best silhouette score: {best_score:.4f}")
    print("Best parameters:")
    for name in best_params:
        print(f"  {name}: {best_params[name]}")

    return best_params, best_score

# Optimize KMeans hyperparameters on the PCA-transformed features (50 trials)
print("Starting hyperparameter optimization...")
best_params, best_score = optimize_kmeans(X_pca, n_trials=50)

In [None]:
# Refit KMeans with the tuned hyperparameters (same random_state as the search)
tuned_n_clusters = best_params['n_clusters']
best_kmeans = KMeans(
    random_state=42,
    n_init=best_params['n_init'],
    init=best_params['init'],
    n_clusters=tuned_n_clusters
)
cluster_labels = best_kmeans.fit_predict(X_pca)

# Attach the labels to a copy of the processed dataframe
df_clustered = df_processed.copy()
df_clustered['cluster_label'] = cluster_labels

print("\n=== Clustering Results ===")
print(f"Number of clusters: {tuned_n_clusters}")
print(f"Silhouette score: {best_score:.4f}")

# Samples per cluster, ordered by cluster id
print("\nCluster sizes:")
cluster_counts = pd.Series(cluster_labels).value_counts().sort_index()
total_samples = len(cluster_labels)
for cluster, count in cluster_counts.items():
    print(f"  Cluster {cluster}: {count} samples ({count/total_samples*100:.1f}%)")

## 7. MAACLI Framework Implementation

### Model-Agnostic Algorithm-Agnostic Cluster Label Interpretation

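The MAACLI framework makes the clustering interpretable in three ways:
- **Model-Agnostic**: Works with any clustering algorithm, because only the resulting cluster labels are needed
- **Algorithm-Agnostic**: Uses a surrogate classifier, trained to predict the cluster labels from the original features, for interpretation
- **Local + Global Interpretability**: The surrogate's feature importances explain the clustering as a whole; its per-sample predictions explain individual assignments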

In [None]:
def train_surrogate_model(X, y, use_xgboost=True):
    """
    Fit a surrogate classifier that predicts cluster labels from features.

    This is the core step of the MAACLI framework: the classifier learns the
    cluster assignment from the original features, and its feature
    importances are then used to interpret the clusters.

    Args:
        X: Original feature matrix (not PCA-transformed)
        y: Cluster labels
        use_xgboost: Whether to use XGBoost (if available); otherwise a
            DecisionTreeClassifier is trained

    Returns:
        surrogate_model: Trained surrogate classifier
        feature_importance: Feature importance scores (zeros if the model
            exposes no feature_importances_)
        splits: Tuple (X_train, X_test, y_train, y_test) from the 70/30
            stratified split
    """
    # Hold out 30% of the rows, stratified by cluster label
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=42, stratify=y
    )

    # XGBoost is used only when requested *and* importable
    prefer_xgboost = use_xgboost and XGBOOST_AVAILABLE
    model_type = "XGBoost" if prefer_xgboost else "Decision Tree"
    print(f"\n=== Training {model_type} Surrogate Model ===")

    if prefer_xgboost:
        surrogate_model = xgb.XGBClassifier(
            eval_metric='mlogloss',
            random_state=42
        )
    else:
        surrogate_model = DecisionTreeClassifier(
            min_samples_leaf=10,
            min_samples_split=20,
            max_depth=10,
            random_state=42
        )

    surrogate_model.fit(X_train, y_train)

    # Share of held-out rows whose cluster the surrogate predicts correctly
    y_pred = surrogate_model.predict(X_test)
    accuracy = (y_pred == y_test).mean()
    print(f"\n{model_type} Surrogate Model Performance:")
    print(f"Accuracy: {accuracy:.4f}")

    # Fall back to all-zero importances when the model exposes none
    feature_importance = (
        surrogate_model.feature_importances_
        if hasattr(surrogate_model, 'feature_importances_')
        else np.zeros(X.shape[1])
    )

    return surrogate_model, feature_importance, (X_train, X_test, y_train, y_test)

# Train surrogate model using original scaled features (X_scaled, not X_pca),
# so each importance score lines up with one entry of numerical_columns
surrogate_model, feature_importance, split_data = train_surrogate_model(
    X_scaled, cluster_labels, use_xgboost=True
)

In [None]:
# MAACLI interpretation: rank the features by surrogate-model importance
print("\n=== MAACLI Framework: Feature Importance Analysis ===")
print("This provides Global Interpretability for cluster assignments\n")

# One row per feature, most important first
importance_df = pd.DataFrame(
    {'feature': numerical_columns, 'importance': feature_importance}
)
importance_df = importance_df.sort_values('importance', ascending=False)

print("Feature Importance Ranking:")
for feature_name, score in zip(importance_df['feature'], importance_df['importance']):
    print(f"  {feature_name}: {score:.4f}")

# The three highest-ranked features
top_features = importance_df['feature'].head(3).tolist()
print("\nTop 3 most important features for cluster separation:")
print(f"{top_features}")

print("\n=== MAACLI Interpretation Summary ===")
for summary_line in (
    "✓ Model-Agnostic: Works with KMeans clustering",
    "✓ Algorithm-Agnostic: Uses surrogate classifier for interpretation",
    "✓ Global Interpretability: Feature importance across all clusters",
    "✓ Local Interpretability: Can predict individual cluster assignments",
):
    print(summary_line)

## 8. Cluster Quantile Profiles

In [None]:
def compute_cluster_quantiles(df_clustered, numerical_columns, quantiles=(0.05, 0.25, 0.50, 0.75, 0.95)):
    """
    Compute quantile profiles for each cluster.

    Args:
        df_clustered: Dataframe with a 'cluster_label' column
        numerical_columns: List of numerical columns
        quantiles: Iterable of quantiles (fractions in [0, 1]) to compute

    Returns:
        quantile_profiles: Long-format dataframe with one row per
            (cluster, feature, quantile) and the columns 'cluster',
            'feature', 'quantile' (a label such as 'Q05') and 'value'.
            The columns are present even when there are no rows.
    """
    # Build each label once. round() rather than a bare int(): q * 100 is not
    # exact in floating point (0.29 * 100 == 28.999999999999996), so plain
    # truncation would label the 29th percentile "Q28".
    labelled_quantiles = [(q, f'Q{int(round(q * 100)):02d}') for q in quantiles]

    quantile_data = []

    for cluster in sorted(df_clustered['cluster_label'].unique()):
        cluster_data = df_clustered[df_clustered['cluster_label'] == cluster]

        for feature in numerical_columns:
            feature_values = cluster_data[feature]

            for q, label in labelled_quantiles:
                quantile_data.append({
                    'cluster': cluster,
                    'feature': feature,
                    'quantile': label,
                    'value': feature_values.quantile(q)
                })

    # Explicit columns keep the schema stable for empty input, so callers can
    # still filter or pivot on 'cluster' / 'feature' / 'quantile'.
    quantile_profiles = pd.DataFrame(
        quantile_data, columns=['cluster', 'feature', 'quantile', 'value']
    )
    return quantile_profiles

# Build the long-format quantile table for every cluster and feature
quantile_profiles = compute_cluster_quantiles(df_clustered, numerical_columns)

print("=== Cluster Quantile Profiles ===")
print("\nThis table shows the distribution characteristics of each feature within each cluster")
print("Quantiles: Q05 (5%), Q25 (25%), Q50 (50%/median), Q75 (75%), Q95 (95%)\n")

# One table per cluster: features as rows, quantile labels as columns
cluster_ids = sorted(df_clustered['cluster_label'].unique())
for cluster in cluster_ids:
    print(f"\n--- Cluster {cluster} ---")
    in_cluster = quantile_profiles['cluster'] == cluster
    cluster_profile = quantile_profiles[in_cluster]

    pivot_table = cluster_profile.pivot(values='value', index='feature', columns='quantile')
    print(pivot_table.round(3))

## 9. Results Summary and Data Export

In [None]:
# Recap of every pipeline stage, in the order the stages ran
print("=== COMPLETE CLUSTERING ANALYSIS SUMMARY ===")

print("\nDataset Information:")
print(f"  • Original shape: {df.shape}")
print(f"  • Features used: {len(numerical_columns)} numerical features")
print(f"  • Features: {numerical_columns}")

total_explained = sum(pca.explained_variance_ratio_)
print("\nPCA Results:")
print(f"  • Components: {X_pca.shape[1]}")
print(f"  • Total explained variance: {total_explained:.3f}")

print("\nOptimal Clustering Configuration:")
print(f"  • Number of clusters: {best_params['n_clusters']}")
print(f"  • Initialization: {best_params['init']}")
print(f"  • N_init: {best_params['n_init']}")
print(f"  • Silhouette score: {best_score:.4f}")

print("\nCluster Distribution:")
total_labelled = len(cluster_labels)
for cluster, count in cluster_counts.items():
    print(f"  • Cluster {cluster}: {count} samples ({count/total_labelled*100:.1f}%)")

# split_data is (X_train, X_test, y_train, y_test); the surrogate is scored
# on the held-out part (positions 1 and 3)
held_out_X, held_out_y = split_data[1], split_data[3]
surrogate_accuracy = (held_out_y == surrogate_model.predict(held_out_X)).mean()
print("\nMAACLI Interpretability:")
print(f"  • Surrogate model accuracy: {surrogate_accuracy:.3f}")
print(f"  • Top 3 important features: {top_features}")

# Write the three result files; any failure is reported as a warning and
# does not stop the notebook
try:
    # 1) Input rows enriched with their cluster label
    df_clustered.to_csv('clustered_data.csv', index=False)
    print("\n✓ Clustered dataset saved as 'clustered_data.csv'")

    # 2) Quantile profiles, one row per (cluster, feature), one column per quantile
    quantile_summary = quantile_profiles.pivot_table(
        values='value',
        index=['cluster', 'feature'],
        columns='quantile'
    )
    quantile_summary.to_csv('cluster_quantile_profiles.csv')
    print("✓ Quantile profiles saved as 'cluster_quantile_profiles.csv'")

    # 3) Surrogate-model feature importances
    importance_df.to_csv('feature_importance_maacli.csv', index=False)
    print("✓ Feature importance saved as 'feature_importance_maacli.csv'")

except Exception as save_error:
    print(f"Warning: Could not save files - {save_error}")

# Closing banner listing the pipeline stages covered by this notebook
print("\n=== Analysis Complete ===")
print("This clustering analysis successfully implemented:")
for completed_step in (
    "End-to-end preprocessing pipeline",
    "PCA dimensionality reduction",
    "Correlation analysis",
    "Optuna hyperparameter optimization",
    "MAACLI interpretability framework",
    "Comprehensive cluster profiling",
):
    print(f"  ✓ {completed_step}")