In [4]:
!pip install pandas

Defaulting to user installation because normal site-packages is not writeable
Collecting pandas
  Downloading pandas-2.3.1-cp312-cp312-win_amd64.whl.metadata (19 kB)
Collecting pytz>=2020.1 (from pandas)
  Using cached pytz-2025.2-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas)
  Using cached tzdata-2025.2-py2.py3-none-any.whl.metadata (1.4 kB)
Downloading pandas-2.3.1-cp312-cp312-win_amd64.whl (11.0 MB)
   ---------------------------------------- 0.0/11.0 MB ? eta -:--:--
   ------------------ --------------------- 5.0/11.0 MB 30.2 MB/s eta 0:00:01
   ---------------------------------------- 11.0/11.0 MB 31.2 MB/s eta 0:00:00
Using cached pytz-2025.2-py2.py3-none-any.whl (509 kB)
Using cached tzdata-2025.2-py2.py3-none-any.whl (347 kB)
Installing collected packages: pytz, tzdata, pandas
Successfully installed pandas-2.3.1 pytz-2025.2 tzdata-2025.2



[notice] A new release of pip is available: 25.0.1 -> 25.1.1
[notice] To update, run: C:\Users\kabir\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import librosa
import librosa.display
import soundfile as sf
import os
import warnings
warnings.filterwarnings('ignore')

ModuleNotFoundError: No module named 'pandas'

In [None]:
# Emotion Analysis from Speech Data - Unsupervised Learning
# Capstone Project 2: Building and Comparing ML/DL Models

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
from sklearn.manifold import TSNE
import librosa
import librosa.display
import scipy.stats as stats
import warnings
warnings.filterwarnings('ignore')

# Set random seed for reproducibility.
# This seeds NumPy's global generator only; the KMeans and t-SNE calls further
# down pass their own random_state=42.
np.random.seed(42)

print("Environment setup complete!")

# =============================================================================
# 1. DATA LOADING AND EXPLORATION
# =============================================================================

def load_audio_files(file_paths, extensions=('.wav', '.mp3', '.flac', '.ogg', '.m4a')):
    """Load audio files and extract basic information.

    Parameters
    ----------
    file_paths : str, os.PathLike, or iterable of str
        An iterable of audio file paths, a single audio file path, or a
        directory that is searched recursively for audio files. A bare string
        is expanded first; iterating it directly would treat every character
        as a separate "path".
    extensions : tuple of str, optional
        File suffixes (compared case-insensitively) accepted when
        ``file_paths`` is a directory.

    Returns
    -------
    list of dict
        One record per successfully loaded file with the keys ``file_path``,
        ``audio`` (samples returned by ``librosa.load``), ``sample_rate`` and
        ``duration`` (number of samples divided by the sample rate).
        Files that fail to load are reported on stdout and skipped.
    """
    import os  # local import: not guaranteed to be in scope from the import cells

    if isinstance(file_paths, (str, os.PathLike)):
        root = os.fspath(file_paths)
        if os.path.isdir(root):
            suffixes = tuple(ext.lower() for ext in extensions)
            file_paths = sorted(
                os.path.join(dirpath, name)
                for dirpath, _dirs, names in os.walk(root)
                for name in names
                if name.lower().endswith(suffixes)
            )
        else:
            # Not a directory: treat the string as one file path
            file_paths = [root]

    audio_data = []
    for file_path in file_paths:
        try:
            # sr=None is forwarded to librosa.load unchanged
            y, sr = librosa.load(file_path, sr=None)
            audio_data.append({
                'file_path': file_path,
                'audio': y,
                'sample_rate': sr,
                'duration': len(y) / sr
            })
        except Exception as e:
            # Best effort: report the failure and continue with the next file
            print(f"Error loading {file_path}: {e}")
    return audio_data

# Load every audio file found under the dataset directory.
# load_audio_files() loops over its argument, so hand it an explicit list of
# file paths rather than the directory string itself.
from pathlib import Path

AUDIO_DIR = Path("/Kaggle_Testset/")
AUDIO_EXTENSIONS = {".wav", ".mp3", ".flac", ".ogg", ".m4a"}
audio_paths = sorted(
    str(path) for path in AUDIO_DIR.rglob("*")
    if path.is_file() and path.suffix.lower() in AUDIO_EXTENSIONS
)
audio_files = load_audio_files(audio_paths)
print(f"Loaded {len(audio_files)} of {len(audio_paths)} audio files from {AUDIO_DIR}")

# =============================================================================
# 2. FEATURE EXTRACTION
# =============================================================================

def extract_audio_features(audio_data):
    """Extract summary audio features for emotion analysis.

    Parameters
    ----------
    audio_data : list of dict
        Records shaped like the output of ``load_audio_files``; each must
        provide the keys ``audio``, ``sample_rate``, ``file_path`` and
        ``duration``.

    Returns
    -------
    pandas.DataFrame
        One row per record. Columns are ``<feature>_mean`` / ``<feature>_std``
        summaries (spectral centroid/rolloff/bandwidth, zero-crossing rate,
        13 MFCCs, chroma, mel spectrogram, pitch, RMS, spectral contrast), a
        scalar ``tempo`` and the ``file_path`` / ``duration`` metadata.
        An empty input yields an empty DataFrame.
    """
    def add_mean_std(record, name, values):
        """Store mean and standard deviation of `values` as `<name>_mean` / `<name>_std`."""
        record[f'{name}_mean'] = np.mean(values)
        record[f'{name}_std'] = np.std(values)

    features = []

    for audio_info in audio_data:
        y = audio_info['audio']
        sr = audio_info['sample_rate']

        feature_vector = {}

        # 1. Spectral Features
        add_mean_std(feature_vector, 'spectral_centroid',
                     librosa.feature.spectral_centroid(y=y, sr=sr)[0])
        add_mean_std(feature_vector, 'spectral_rolloff',
                     librosa.feature.spectral_rolloff(y=y, sr=sr)[0])
        add_mean_std(feature_vector, 'spectral_bandwidth',
                     librosa.feature.spectral_bandwidth(y=y, sr=sr)[0])

        # 2. Zero Crossing Rate
        add_mean_std(feature_vector, 'zcr', librosa.feature.zero_crossing_rate(y)[0])

        # 3. MFCCs (Mel-frequency cepstral coefficients), summarised per coefficient
        mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
        for i in range(13):
            add_mean_std(feature_vector, f'mfcc_{i}', mfccs[i])

        # 4. Chroma Features (summarised over the whole matrix)
        add_mean_std(feature_vector, 'chroma', librosa.feature.chroma_stft(y=y, sr=sr))

        # 5. Mel Spectrogram (summarised over the whole matrix)
        add_mean_std(feature_vector, 'mel_spectrogram',
                     librosa.feature.melspectrogram(y=y, sr=sr))

        # 6. Pitch: only strictly positive piptrack entries are summarised
        pitches, _ = librosa.piptrack(y=y, sr=sr)
        positive_pitches = pitches[pitches > 0]
        if len(positive_pitches) > 0:
            add_mean_std(feature_vector, 'pitch', positive_pitches)
        else:
            feature_vector['pitch_mean'] = 0
            feature_vector['pitch_std'] = 0

        # 7. Energy and RMS
        add_mean_std(feature_vector, 'rms', librosa.feature.rms(y=y)[0])

        # 8. Tempo
        # beat_track may return the tempo as a one-element array instead of a
        # float (depends on the librosa version); store a plain scalar so the
        # column stays numeric and survives select_dtypes / scaling downstream.
        tempo, _ = librosa.beat.beat_track(y=y, sr=sr)
        feature_vector['tempo'] = float(np.atleast_1d(tempo)[0])

        # 9. Spectral Contrast (summarised over the whole matrix)
        add_mean_std(feature_vector, 'spectral_contrast',
                     librosa.feature.spectral_contrast(y=y, sr=sr))

        # File metadata carried through for traceability
        feature_vector['file_path'] = audio_info['file_path']
        feature_vector['duration'] = audio_info['duration']

        features.append(feature_vector)

    return pd.DataFrame(features)

# Build the feature table: one row per loaded audio record.
features_df = extract_audio_features(audio_files)
# The column count printed here also includes the 'file_path' and 'duration' metadata columns.
print(f"Extracted {len(features_df.columns)} features from {len(features_df)} audio files")
# =============================================================================
# 3. SIMULATED DATA FOR DEMONSTRATION
# =============================================================================

def simulate_features(n_samples=1000, seed=42):
    """Create a synthetic feature table for demonstrating the pipeline without audio.

    Parameters
    ----------
    n_samples : int, optional
        Number of rows to generate.
    seed : int, optional
        Seed for the private random generator, so the global NumPy random
        state is left untouched.

    Returns
    -------
    pandas.DataFrame
        ``n_samples`` rows with 13 normally distributed summary columns, a
        uniformly distributed ``duration`` and ``mfcc_<i>_mean`` /
        ``mfcc_<i>_std`` for i in 0..12 (40 columns in total).
    """
    rng = np.random.default_rng(seed)

    # (column name, mean, standard deviation) of each normally distributed feature
    normal_specs = [
        ('spectral_centroid_mean', 2000, 500),
        ('spectral_centroid_std', 300, 100),
        ('spectral_rolloff_mean', 4000, 1000),
        ('spectral_rolloff_std', 500, 150),
        ('spectral_bandwidth_mean', 1500, 400),
        ('spectral_bandwidth_std', 200, 50),
        ('zcr_mean', 0.1, 0.03),
        ('zcr_std', 0.02, 0.01),
        ('rms_mean', 0.02, 0.01),
        ('rms_std', 0.005, 0.002),
        ('tempo', 120, 30),
        ('pitch_mean', 150, 50),
        ('pitch_std', 30, 10),
    ]
    columns = {name: rng.normal(loc, scale, n_samples) for name, loc, scale in normal_specs}
    columns['duration'] = rng.uniform(1, 10, n_samples)

    # Add MFCC features
    for i in range(13):
        columns[f'mfcc_{i}_mean'] = rng.normal(0, 1, n_samples)
        columns[f'mfcc_{i}_std'] = rng.normal(0, 0.5, n_samples)

    return pd.DataFrame(columns)


# Opt-in switch: set to True to replace the extracted features with simulated
# ones (e.g. when no audio files are available). Off by default.
USE_SIMULATED_DATA = False

if USE_SIMULATED_DATA:
    features_df = simulate_features()
    print(f"Dataset shape: {features_df.shape}")
    print(f"Features: {list(features_df.columns)}")
# =============================================================================
# 4. EXPLORATORY DATA ANALYSIS
# =============================================================================

def perform_eda(df):
    """Perform exploratory data analysis on the feature table.

    Prints summary statistics and missing-value counts, draws a correlation
    heatmap and histograms of a fixed set of key features.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature table; non-numeric columns (such as ``file_path``) are
        ignored for the correlation matrix and the histograms.

    Returns
    -------
    pandas.DataFrame
        Correlation matrix of the numeric columns.
    """
    print("=== EXPLORATORY DATA ANALYSIS ===")

    # Basic statistics
    print("\n1. Basic Statistics:")
    print(df.describe())

    # Check for missing values
    print(f"\n2. Missing Values:")
    print(df.isnull().sum())

    # Correlation matrix: restrict to numeric columns, because correlating a
    # frame that still contains string columns raises in recent pandas.
    numeric_df = df.select_dtypes(include=[np.number])
    correlation_matrix = numeric_df.corr()

    fig, ax = plt.subplots(figsize=(15, 12))
    sns.heatmap(correlation_matrix, annot=False, cmap='coolwarm', center=0, ax=ax)
    ax.set_title('Feature Correlation Matrix')
    fig.tight_layout()
    plt.show()

    # Distribution of key features (only those actually present)
    key_features = ['spectral_centroid_mean', 'rms_mean', 'tempo', 'pitch_mean', 'zcr_mean', 'duration']
    available = [feature for feature in key_features if feature in numeric_df.columns]
    missing = [feature for feature in key_features if feature not in numeric_df.columns]
    if missing:
        print(f"Skipping histograms for missing features: {missing}")

    if available:
        fig, axes = plt.subplots(2, 3, figsize=(18, 12))
        panels = axes.ravel()
        for ax, feature in zip(panels, available):
            # Drop NaN values so the histogram range is finite
            ax.hist(numeric_df[feature].dropna(), bins=50, alpha=0.7, edgecolor='black')
            ax.set_title(f'Distribution of {feature}')
            ax.set_xlabel(feature)
            ax.set_ylabel('Frequency')
        # Hide panels that have nothing to show
        for ax in panels[len(available):]:
            ax.set_visible(False)

        fig.tight_layout()
        plt.show()

    return correlation_matrix

# Perform EDA on the feature table and keep the returned correlation matrix
correlation_matrix = perform_eda(features_df)

# =============================================================================
# 5. DATA PREPROCESSING
# =============================================================================

def preprocess_data(df, variance_threshold=0.01):
    """Preprocess the feature table for clustering.

    Steps: keep numeric columns, impute missing values with the column mean,
    drop near-constant features, then standardise.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature table; non-numeric columns are ignored.
    variance_threshold : float, optional
        Features whose variance on a common [0, 1] scale does not exceed this
        value are removed.

    Returns
    -------
    X_scaled_df : pandas.DataFrame
        Standardised features (output of ``StandardScaler``).
    X : pandas.DataFrame
        The same selected features before scaling.
    scaler : StandardScaler
        The fitted scaler.

    Raises
    ------
    ValueError
        Raised by scikit-learn when no rows are available or no feature passes
        the variance filter.
    """
    from sklearn.feature_selection import VarianceThreshold

    print("=== DATA PREPROCESSING ===")

    # Select numeric features only
    X = df.select_dtypes(include=[np.number]).copy()

    # Handle missing values. A column that is entirely NaN has no mean to
    # impute with and would stay NaN, so drop it first.
    X = X.dropna(axis=1, how='all')
    X = X.fillna(X.mean())

    # Remove features with very low variance.
    # The comparison is done on min-max normalised values: a fixed threshold
    # applied to raw units would discard small-magnitude features (e.g. the
    # rms_* / zcr_* columns) purely because of their units.
    selector = VarianceThreshold(threshold=variance_threshold)
    selector.fit(MinMaxScaler().fit_transform(X))
    X = X.loc[:, selector.get_support()]

    print(f"Features after variance filtering: {X.shape[1]}")

    # Scale the features
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    X_scaled_df = pd.DataFrame(X_scaled, columns=X.columns)

    print(f"Final dataset shape: {X_scaled_df.shape}")

    return X_scaled_df, X, scaler

# Preprocess data: returns the standardised feature table, the same selected
# features before scaling, and the fitted scaler
X_scaled_df, X_original, scaler = preprocess_data(features_df)

# =============================================================================
# 6. DIMENSIONALITY REDUCTION
# =============================================================================

def perform_dimensionality_reduction(X_scaled, perplexity=30):
    """Perform dimensionality reduction for visualization and clustering.

    Parameters
    ----------
    X_scaled : array-like of shape (n_samples, n_features)
        Standardised feature matrix.
    perplexity : float, optional
        Requested t-SNE perplexity. It is lowered automatically when the data
        set is too small, because t-SNE requires perplexity < n_samples.

    Returns
    -------
    X_pca : numpy.ndarray
        PCA projection keeping 95% of the variance.
    X_tsne : numpy.ndarray
        Two-dimensional t-SNE embedding of ``X_scaled``.
    pca : PCA
        The fitted PCA object.
    """
    print("=== DIMENSIONALITY REDUCTION ===")

    n_samples, n_features = X_scaled.shape

    # PCA
    pca = PCA(n_components=0.95)  # Keep 95% of variance
    X_pca = pca.fit_transform(X_scaled)

    print(f"PCA: Reduced from {n_features} to {X_pca.shape[1]} components")
    print(f"Explained variance ratio: {pca.explained_variance_ratio_[:5]}")

    # t-SNE for visualization (perplexity clamped for small data sets)
    effective_perplexity = min(perplexity, max(1, n_samples - 1))
    if effective_perplexity != perplexity:
        print(f"t-SNE: perplexity reduced from {perplexity} to {effective_perplexity} "
              f"for {n_samples} samples")
    tsne = TSNE(n_components=2, random_state=42, perplexity=effective_perplexity)
    X_tsne = tsne.fit_transform(X_scaled)

    # Plot PCA results
    fig, (ax_variance, ax_pca, ax_tsne) = plt.subplots(1, 3, figsize=(15, 5))

    cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
    # x starts at 1 so the axis really is the number of components
    ax_variance.plot(np.arange(1, len(cumulative_variance) + 1), cumulative_variance)
    ax_variance.set_xlabel('Number of Components')
    ax_variance.set_ylabel('Cumulative Explained Variance')
    ax_variance.set_title('PCA Explained Variance')
    ax_variance.grid(True)

    # PCA may keep a single component; plot zeros on the second axis then
    second_component = X_pca[:, 1] if X_pca.shape[1] > 1 else np.zeros(n_samples)
    ax_pca.scatter(X_pca[:, 0], second_component, alpha=0.6)
    ax_pca.set_xlabel('First Principal Component')
    ax_pca.set_ylabel('Second Principal Component')
    ax_pca.set_title('PCA - First Two Components')

    ax_tsne.scatter(X_tsne[:, 0], X_tsne[:, 1], alpha=0.6)
    ax_tsne.set_xlabel('t-SNE 1')
    ax_tsne.set_ylabel('t-SNE 2')
    ax_tsne.set_title('t-SNE Visualization')

    fig.tight_layout()
    plt.show()

    return X_pca, X_tsne, pca

# Perform dimensionality reduction on the standardised features (passed as a
# plain NumPy array); yields the PCA projection, the t-SNE embedding and the fitted PCA
X_pca, X_tsne, pca = perform_dimensionality_reduction(X_scaled_df.values)

# =============================================================================
# 7. CLUSTERING ALGORITHMS
# =============================================================================

def find_optimal_clusters(X, max_clusters=10):
    """Find the optimal number of clusters using multiple metrics.

    Fits K-means for every k from 2 up to ``max_clusters`` (bounded by the
    number of samples), records inertia, silhouette, Calinski-Harabasz and
    Davies-Bouldin scores, and plots all four curves.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Data to cluster.
    max_clusters : int, optional
        Largest k to try.

    Returns
    -------
    optimal_k_silhouette : int
        The k with the highest silhouette score.
    metrics : dict of list
        Per-k values keyed by ``inertia``, ``silhouette``,
        ``calinski_harabasz`` and ``davies_bouldin``.

    Raises
    ------
    ValueError
        If there are too few samples to evaluate even k=2.
    """
    print("=== FINDING OPTIMAL NUMBER OF CLUSTERS ===")

    # The cluster-validity scores need at most n_samples - 1 clusters
    n_samples = len(X)
    largest_k = min(max_clusters, n_samples - 1)
    if largest_k < 2:
        raise ValueError(
            f"Need at least 3 samples and max_clusters >= 2 to compare cluster counts "
            f"(got {n_samples} samples, max_clusters={max_clusters})"
        )

    K_range = range(2, largest_k + 1)
    metrics = {name: [] for name in ('inertia', 'silhouette', 'calinski_harabasz', 'davies_bouldin')}

    for k in K_range:
        kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
        cluster_labels = kmeans.fit_predict(X)

        metrics['inertia'].append(kmeans.inertia_)
        metrics['silhouette'].append(silhouette_score(X, cluster_labels))
        metrics['calinski_harabasz'].append(calinski_harabasz_score(X, cluster_labels))
        metrics['davies_bouldin'].append(davies_bouldin_score(X, cluster_labels))

    # Plot metrics: (metric key, title, y label, line format) per panel
    panels = [
        ('inertia', 'Elbow Method (Inertia)', 'Inertia', 'bo-'),
        ('silhouette', 'Silhouette Score', 'Silhouette Score', 'ro-'),
        ('calinski_harabasz', 'Calinski-Harabasz Score', 'Calinski-Harabasz Score', 'go-'),
        ('davies_bouldin', 'Davies-Bouldin Score', 'Davies-Bouldin Score', 'mo-'),
    ]
    fig, axes = plt.subplots(2, 2, figsize=(15, 10))
    for ax, (key, title, ylabel, line_format) in zip(axes.ravel(), panels):
        ax.plot(K_range, metrics[key], line_format)
        ax.set_title(title)
        ax.set_xlabel('Number of Clusters')
        ax.set_ylabel(ylabel)
        ax.grid(True)

    fig.tight_layout()
    plt.show()

    # Find optimal k (silhouette / Calinski-Harabasz: maximum; Davies-Bouldin: minimum)
    optimal_k_silhouette = K_range[int(np.argmax(metrics['silhouette']))]
    optimal_k_calinski = K_range[int(np.argmax(metrics['calinski_harabasz']))]
    optimal_k_davies = K_range[int(np.argmin(metrics['davies_bouldin']))]

    print(f"Optimal k (Silhouette): {optimal_k_silhouette}")
    print(f"Optimal k (Calinski-Harabasz): {optimal_k_calinski}")
    print(f"Optimal k (Davies-Bouldin): {optimal_k_davies}")

    return optimal_k_silhouette, metrics

# Find optimal clusters on the PCA projection; optimal_k is the silhouette-based choice
optimal_k, cluster_metrics = find_optimal_clusters(X_pca)

# =============================================================================
# 8. APPLY MULTIPLE CLUSTERING ALGORITHMS
# =============================================================================

def apply_clustering_algorithms(X, X_pca, optimal_k, dbscan_eps=0.5, dbscan_min_samples=5):
    """Apply K-means, agglomerative clustering and DBSCAN to the PCA projection.

    Parameters
    ----------
    X : array-like
        Standardised features. Accepted for interface compatibility; all
        models are fitted on ``X_pca``.
    X_pca : array-like of shape (n_samples, n_components)
        Data the models are fitted on.
    optimal_k : int
        Number of clusters for K-means and agglomerative clustering.
    dbscan_eps, dbscan_min_samples : float, int, optional
        DBSCAN neighbourhood radius and minimum neighbourhood size. The
        defaults reproduce the previously hard-coded values; pass tuned
        values to override them.

    Returns
    -------
    dict
        Label arrays keyed by ``kmeans``, ``agglomerative`` and ``dbscan``
        (DBSCAN marks noise points with -1).
    """
    print("=== APPLYING CLUSTERING ALGORITHMS ===")

    models = {
        # 1. K-Means
        'kmeans': KMeans(n_clusters=optimal_k, random_state=42, n_init=10),
        # 2. Agglomerative Clustering
        'agglomerative': AgglomerativeClustering(n_clusters=optimal_k),
        # 3. DBSCAN
        'dbscan': DBSCAN(eps=dbscan_eps, min_samples=dbscan_min_samples),
    }
    clustering_results = {name: model.fit_predict(X_pca) for name, model in models.items()}

    # Print clustering results
    for method, labels in clustering_results.items():
        # The noise label (-1) is not counted as a cluster
        n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
        n_noise = list(labels).count(-1)
        print(f"{method.upper()}: {n_clusters} clusters, {n_noise} noise points")

        if n_clusters > 1:
            silhouette_avg = silhouette_score(X_pca, labels)
            print(f"  Silhouette Score: {silhouette_avg:.3f}")
        elif method == 'dbscan':
            print("  Fewer than two clusters found; consider adjusting dbscan_eps / dbscan_min_samples")

    return clustering_results

# Apply clustering: the models are fitted on X_pca with optimal_k clusters
clustering_results = apply_clustering_algorithms(X_scaled_df.values, X_pca, optimal_k)

# =============================================================================
# 9. VISUALIZATION OF CLUSTERS
# =============================================================================

def visualize_clusters(X_tsne, clustering_results):
    """Draw one t-SNE scatter panel per clustering method, coloured by label.

    Parameters
    ----------
    X_tsne : numpy.ndarray
        Two-column embedding; column 0 / 1 are used as x / y coordinates.
    clustering_results : dict
        Label arrays keyed by method name.
    """
    print("=== VISUALIZING CLUSTERS ===")

    method_count = len(clustering_results)
    fig, axes = plt.subplots(1, method_count, figsize=(5 * method_count, 5))

    # With a single panel, subplots returns a bare Axes instead of an array
    panels = [axes] if method_count == 1 else axes

    for panel, (method, labels) in zip(panels, clustering_results.items()):
        points = panel.scatter(
            X_tsne[:, 0],
            X_tsne[:, 1],
            c=labels,
            cmap='viridis',
            alpha=0.6,
            s=20,
        )
        panel.set_title(f'{method.upper()} Clustering')
        panel.set_xlabel('t-SNE 1')
        panel.set_ylabel('t-SNE 2')
        plt.colorbar(points, ax=panel)

    plt.tight_layout()
    plt.show()

# Visualize clusters: one panel per method, drawn on the t-SNE embedding
visualize_clusters(X_tsne, clustering_results)

# =============================================================================
# 10. CLUSTER ANALYSIS AND INTERPRETATION
# =============================================================================

def _map_emotion(high_pitch, high_energy, high_tempo):
    """Map above-average pitch/energy/tempo flags to a hypothetical emotion label."""
    if high_pitch and high_energy:
        return "Excitement/Joy" if high_tempo else "Anger/Frustration"
    if not high_pitch:
        return "Determination/Focus" if high_energy else "Sadness/Calm"
    return "Neutral/Mixed"


def analyze_clusters(X_original, clustering_results, feature_names):
    """Analyze and interpret the K-means clusters.

    Prints per-cluster mean values, plots the cluster means of a fixed set of
    key features and prints a hypothetical emotion label per cluster.

    Parameters
    ----------
    X_original : pandas.DataFrame
        Unscaled features, one row per sample, aligned with the labels.
    clustering_results : dict
        Must contain the key ``kmeans`` with one label per row.
    feature_names : sequence
        Accepted for interface compatibility; not used.

    Returns
    -------
    pandas.DataFrame
        Copy of ``X_original`` with an added ``cluster`` column.
    """
    print("=== CLUSTER ANALYSIS ===")

    # Use K-means results for detailed analysis
    labels = np.asarray(clustering_results['kmeans'])

    # Add cluster labels to original features
    analysis_df = X_original.copy()
    analysis_df['cluster'] = labels

    # Cluster statistics
    cluster_stats = analysis_df.groupby('cluster').agg(['mean', 'std'])

    print("Cluster Statistics (Mean values):")
    print(cluster_stats.xs('mean', level=1, axis=1))

    # Mean of each key feature per cluster (only features that are present)
    key_features = ['spectral_centroid_mean', 'rms_mean', 'tempo', 'pitch_mean', 'zcr_mean']
    available = [feature for feature in key_features if feature in analysis_df.columns]
    missing = [feature for feature in key_features if feature not in analysis_df.columns]
    if missing:
        print(f"Skipping missing features: {missing}")

    if available:
        fig, axes = plt.subplots(2, 3, figsize=(15, 10))
        panels = axes.ravel()
        for ax, feature in zip(panels, available):
            cluster_means = analysis_df.groupby('cluster')[feature].mean()
            ax.bar(cluster_means.index, cluster_means.values)
            ax.set_title(f'{feature} by Cluster')
            ax.set_xlabel('Cluster')
            ax.set_ylabel(f'Mean {feature}')
        for ax in panels[len(available):]:
            ax.set_visible(False)

        fig.tight_layout()
        plt.show()

    # Emotional interpretation (hypothetical)
    print("\n=== EMOTIONAL INTERPRETATION ===")
    required = ['pitch_mean', 'rms_mean', 'tempo']
    if any(feature not in analysis_df.columns for feature in required):
        print(f"Cannot interpret clusters: requires the features {required}")
        return analysis_df

    # Overall means are the reference every cluster is compared against
    overall_pitch = analysis_df['pitch_mean'].mean()
    overall_energy = analysis_df['rms_mean'].mean()
    overall_tempo = analysis_df['tempo'].mean()

    # labels is a NumPy array, so use np.unique (which also sorts the ids)
    for cluster_id in np.unique(labels):
        cluster_data = analysis_df[analysis_df['cluster'] == cluster_id]
        n_samples = len(cluster_data)

        # Analyze key characteristics
        high_pitch = cluster_data['pitch_mean'].mean() > overall_pitch
        high_energy = cluster_data['rms_mean'].mean() > overall_energy
        high_tempo = cluster_data['tempo'].mean() > overall_tempo

        print(f"\nCluster {cluster_id} ({n_samples} samples):")
        print(f"  High Pitch: {high_pitch}")
        print(f"  High Energy: {high_energy}")
        print(f"  High Tempo: {high_tempo}")

        # Hypothetical emotion mapping
        emotion = _map_emotion(high_pitch, high_energy, high_tempo)
        print(f"  Likely Emotion: {emotion}")

    return analysis_df

# Analyze clusters using the unscaled features together with the K-means labels
analysis_df = analyze_clusters(X_original, clustering_results, X_original.columns)

# =============================================================================
# 11. MODEL EVALUATION AND COMPARISON
# =============================================================================

def evaluate_clustering_models(X_pca, clustering_results):
    """Evaluate different clustering models with three internal validity scores.

    Parameters
    ----------
    X_pca : array-like of shape (n_samples, n_components)
        Data the labels refer to.
    clustering_results : dict
        Label arrays keyed by method name.

    Returns
    -------
    pandas.DataFrame
        One row per method that could be evaluated, with the columns
        ``silhouette_score``, ``calinski_harabasz_score``,
        ``davies_bouldin_score`` and ``n_clusters`` (noise label -1 is not
        counted). Empty when no method could be evaluated.
    """
    print("=== MODEL EVALUATION ===")

    evaluation_results = {}

    for method, labels in clustering_results.items():
        if len(set(labels)) <= 1:  # Skip if only one cluster
            continue
        try:
            evaluation_results[method] = {
                'silhouette_score': silhouette_score(X_pca, labels),
                'calinski_harabasz_score': calinski_harabasz_score(X_pca, labels),
                'davies_bouldin_score': davies_bouldin_score(X_pca, labels),
                'n_clusters': len(set(labels)) - (1 if -1 in labels else 0)
            }
        except Exception as exc:
            # Best effort: keep evaluating the other methods, but say why this one failed
            print(f"Could not evaluate {method}: {exc}")

    # Create comparison dataframe
    eval_df = pd.DataFrame(evaluation_results).T
    print("\nModel Comparison:")
    print(eval_df)

    if eval_df.empty:
        # Nothing to plot; the score columns do not exist in an empty table
        print("No clustering result could be evaluated.")
        return eval_df

    # Visualize comparison
    fig, axes = plt.subplots(1, 3, figsize=(18, 5))
    score_panels = [
        ('silhouette_score', 'Silhouette Score'),
        ('calinski_harabasz_score', 'Calinski-Harabasz Score'),
        ('davies_bouldin_score', 'Davies-Bouldin Score'),
    ]
    for ax, (column, title) in zip(axes, score_panels):
        eval_df[column].plot(kind='bar', ax=ax, title=title)
        ax.tick_params(axis='x', rotation=45)

    fig.tight_layout()
    plt.show()

    return eval_df

# Evaluate models: one row of validity scores per clustering method
evaluation_df = evaluate_clustering_models(X_pca, clustering_results)

# =============================================================================
# 12. HYPERPARAMETER TUNING
# =============================================================================

def hyperparameter_tuning(X_pca, eps_values=(0.3, 0.5, 0.7, 1.0), min_samples_values=(3, 5, 7, 10)):
    """Perform a silhouette-based grid search for K-means and DBSCAN.

    Parameters
    ----------
    X_pca : array-like of shape (n_samples, n_components)
        Data to cluster.
    eps_values, min_samples_values : sequence, optional
        DBSCAN grid. The defaults reproduce the previously hard-coded grid.

    Returns
    -------
    best_kmeans_params : dict
        ``{'n_clusters': ..., 'init': ...}`` of the best K-means run; empty
        if there were too few samples to try any k.
    best_dbscan_params : dict
        ``{'eps': ..., 'min_samples': ...}`` of the best DBSCAN run; empty if
        no configuration produced more than one label.
    """
    print("=== HYPERPARAMETER TUNING ===")

    # Silhouette needs at most n_samples - 1 clusters
    largest_k = min(10, len(X_pca) - 1)

    # K-means tuning
    best_kmeans_score = -1
    best_kmeans_params = {}

    for n_clusters in range(2, largest_k + 1):
        for init in ['k-means++', 'random']:
            kmeans = KMeans(n_clusters=n_clusters, init=init, random_state=42, n_init=10)
            labels = kmeans.fit_predict(X_pca)
            score = silhouette_score(X_pca, labels)

            if score > best_kmeans_score:
                best_kmeans_score = score
                best_kmeans_params = {'n_clusters': n_clusters, 'init': init}

    if best_kmeans_params:
        print(f"Best K-means parameters: {best_kmeans_params}")
        print(f"Best K-means silhouette score: {best_kmeans_score:.3f}")
    else:
        print("K-means tuning skipped: too few samples to compare cluster counts")

    # DBSCAN tuning
    best_dbscan_score = -1
    best_dbscan_params = {}

    for eps in eps_values:
        for min_samples in min_samples_values:
            dbscan = DBSCAN(eps=eps, min_samples=min_samples)
            labels = dbscan.fit_predict(X_pca)

            # A single label (e.g. everything marked as noise) cannot be scored
            if len(set(labels)) > 1:
                score = silhouette_score(X_pca, labels)

                if score > best_dbscan_score:
                    best_dbscan_score = score
                    best_dbscan_params = {'eps': eps, 'min_samples': min_samples}

    if best_dbscan_params:
        print(f"Best DBSCAN parameters: {best_dbscan_params}")
        print(f"Best DBSCAN silhouette score: {best_dbscan_score:.3f}")
    else:
        # Avoid reporting the -1 placeholder as if it were a real score
        print("No DBSCAN configuration produced more than one label; "
              "try a different eps_values / min_samples_values grid")

    return best_kmeans_params, best_dbscan_params

# Perform hyperparameter tuning on the PCA projection; each result is a dict of the best parameters found
best_kmeans_params, best_dbscan_params = hyperparameter_tuning(X_pca)

# =============================================================================
# 13. FINAL MODEL AND RESULTS
# =============================================================================

def build_final_model(X_pca, best_params):
    """Build the final K-means model with the best parameters and report its scores.

    Parameters
    ----------
    X_pca : array-like of shape (n_samples, n_components)
        Data to cluster.
    best_params : dict or None
        Keyword arguments for ``KMeans``. ``random_state=42`` and
        ``n_init=10`` are used unless the dict overrides them.

    Returns
    -------
    final_kmeans : KMeans
        The fitted model.
    final_labels : numpy.ndarray
        Cluster label per sample.
    """
    print("=== BUILDING FINAL MODEL ===")

    # Merge defaults with the tuned parameters; merging (instead of passing
    # both as keywords) avoids a duplicate-keyword TypeError when best_params
    # already carries random_state or n_init.
    model_params = {'random_state': 42, 'n_init': 10, **(best_params or {})}

    # Build final K-means model
    final_kmeans = KMeans(**model_params)
    final_labels = final_kmeans.fit_predict(X_pca)

    # Model performance
    final_silhouette = silhouette_score(X_pca, final_labels)
    final_calinski = calinski_harabasz_score(X_pca, final_labels)
    final_davies = davies_bouldin_score(X_pca, final_labels)

    print(f"Final Model Performance:")
    print(f"  Silhouette Score: {final_silhouette:.3f}")
    print(f"  Calinski-Harabasz Score: {final_calinski:.3f}")
    print(f"  Davies-Bouldin Score: {final_davies:.3f}")
    print(f"  Number of Clusters: {len(set(final_labels))}")

    return final_kmeans, final_labels

# Build final model: K-means refitted on X_pca with the tuned K-means parameters
final_model, final_labels = build_final_model(X_pca, best_kmeans_params)

# =============================================================================
# 14. EXPORT RESULTS
# =============================================================================

def export_results(features_df, final_labels, output_path='emotion_analysis_results.csv',
                   emotion_mapping=None):
    """Export the feature table with cluster assignments to CSV.

    Parameters
    ----------
    features_df : pandas.DataFrame
        Feature table, one row per sample. It is not modified.
    final_labels : array-like
        One cluster id per row of ``features_df``, in row order.
    output_path : str, optional
        Destination CSV file.
    emotion_mapping : dict, optional
        Maps cluster id to a display name. Defaults to a hypothetical
        five-emotion mapping; ids without an entry become ``Cluster_<id>``.

    Returns
    -------
    pandas.DataFrame
        Copy of ``features_df`` with the added columns
        ``predicted_emotion_cluster`` and ``predicted_emotion``.

    Raises
    ------
    ValueError
        If the number of labels differs from the number of rows.
    """
    print("=== EXPORTING RESULTS ===")

    # Work on a plain array so labels are assigned by position, never by index alignment
    labels = np.asarray(final_labels)
    if len(labels) != len(features_df):
        raise ValueError(
            f"Got {len(labels)} labels for {len(features_df)} rows; they must match"
        )

    # Map clusters to emotion labels (hypothetical)
    if emotion_mapping is None:
        emotion_mapping = {
            0: 'Neutral',
            1: 'Happy/Excited',
            2: 'Sad/Calm',
            3: 'Angry/Frustrated',
            4: 'Surprised/Energetic'
        }

    # Create results dataframe
    results_df = features_df.copy()
    results_df['predicted_emotion_cluster'] = labels
    results_df['predicted_emotion'] = [
        emotion_mapping.get(cluster_id, f'Cluster_{cluster_id}')
        for cluster_id in labels.tolist()
    ]

    # Export to CSV
    results_df.to_csv(output_path, index=False)
    print(f"Results exported to {output_path}")

    # Display sample results (only the preview columns that exist)
    preview_columns = [
        column
        for column in ('spectral_centroid_mean', 'rms_mean', 'tempo', 'pitch_mean',
                       'predicted_emotion_cluster', 'predicted_emotion')
        if column in results_df.columns
    ]
    print("\nSample Results:")
    print(results_df[preview_columns].head(10))

    return results_df

# Export results: writes the CSV (default output path) and returns the labelled feature table
final_results = export_results(features_df, final_labels)

# =============================================================================
# 15. SUMMARY AND RECOMMENDATIONS
# =============================================================================

# Take the best-scoring algorithm from the evaluation table instead of hard-coding it
best_method = evaluation_df['silhouette_score'].idxmax()

print("=== SUMMARY AND RECOMMENDATIONS ===")
print(f"""
Project Summary:
- Processed {len(features_df)} audio samples
- Extracted {len(X_original.columns)} audio features
- Applied PCA for dimensionality reduction ({X_pca.shape[1]} components)
- Tested multiple clustering algorithms (K-means, Agglomerative, DBSCAN)
- Achieved best silhouette score of {evaluation_df['silhouette_score'].max():.3f}
- Identified {len(set(final_labels))} emotion clusters

Key Findings:
1. Spectral features (centroid, rolloff, bandwidth) are important for emotion distinction
2. Energy (RMS) and pitch features correlate with emotional intensity
3. MFCCs capture timbral characteristics useful for emotion clustering
4. Optimal number of clusters appears to be around {best_kmeans_params['n_clusters']}
5. {str(best_method).upper()} achieved the highest silhouette score among the tested algorithms

Recommendations for Real Implementation:
1. Use actual audio files with the load_audio_files() and extract_audio_features() functions
2. Consider additional features like prosodic features, formants, and jitter/shimmer
3. Experiment with ensemble clustering methods
4. Validate clusters with domain expert knowledge
5. Consider semi-supervised approaches if some labels are available

Next Steps:
1. Collect more diverse audio samples
2. Implement deep learning approaches (autoencoders, CNNs)
3. Add temporal modeling for sequential audio analysis
4. Develop real-time emotion detection system
""")

# =============================================================================
# 16. ADVANCED TECHNIQUES (BONUS)
# =============================================================================

def advanced_clustering_techniques(X_pca, X_scaled, n_clusters=None, X_embedded=None):
    """Apply Gaussian mixture, spectral and mini-batch K-means clustering.

    Parameters
    ----------
    X_pca : array-like of shape (n_samples, n_components)
        Data the models are fitted on.
    X_scaled : array-like
        Accepted for interface compatibility; not used.
    n_clusters : int, optional
        Number of clusters/components. Falls back to the notebook-level
        ``optimal_k`` when omitted.
    X_embedded : numpy.ndarray, optional
        Two-column embedding used for the scatter plots. Falls back to the
        notebook-level ``X_tsne`` when omitted.

    Returns
    -------
    dict
        Label arrays keyed by ``gmm``, ``spectral`` and ``mini_kmeans``.
    """
    from sklearn.mixture import GaussianMixture
    from sklearn.cluster import SpectralClustering
    from sklearn.cluster import MiniBatchKMeans

    print("=== ADVANCED CLUSTERING TECHNIQUES ===")

    # Explicit arguments are preferred; the notebook globals remain as
    # fallbacks so existing two-argument calls keep working.
    if n_clusters is None:
        n_clusters = optimal_k
    if X_embedded is None:
        X_embedded = X_tsne

    # (result key, display name, model) in the order they are fitted and plotted
    techniques = [
        # 1. Gaussian Mixture Models
        ('gmm', 'Gaussian Mixture Model',
         GaussianMixture(n_components=n_clusters, random_state=42)),
        # 2. Spectral Clustering
        ('spectral', 'Spectral Clustering',
         SpectralClustering(n_clusters=n_clusters, random_state=42)),
        # 3. Mini-Batch K-Means (for large datasets)
        ('mini_kmeans', 'Mini-Batch K-Means',
         MiniBatchKMeans(n_clusters=n_clusters, random_state=42)),
    ]

    results = {}
    for key, display_name, model in techniques:
        labels = model.fit_predict(X_pca)
        score = silhouette_score(X_pca, labels)
        print(f"{display_name} - Silhouette Score: {score:.3f}")
        results[key] = labels

    # Visualize advanced clustering results
    fig, axes = plt.subplots(1, 3, figsize=(18, 5))
    for ax, (key, display_name, _model) in zip(axes, techniques):
        points = ax.scatter(X_embedded[:, 0], X_embedded[:, 1], c=results[key],
                            cmap='viridis', alpha=0.6)
        ax.set_title(display_name)
        ax.set_xlabel('t-SNE 1')
        ax.set_ylabel('t-SNE 2')
        plt.colorbar(points, ax=ax)

    fig.tight_layout()
    plt.show()

    return results

# Apply advanced clustering techniques (Gaussian mixture, spectral, mini-batch K-means) to X_pca
advanced_results = advanced_clustering_techniques(X_pca, X_scaled_df.values)

# =============================================================================
# 17. ENSEMBLE CLUSTERING
# =============================================================================

def ensemble_clustering(clustering_results, advanced_results):
    """Create an ensemble clustering from multiple algorithms via a consensus matrix.

    Every label set that contains more than one distinct label casts one vote
    per sample pair: the pair's entry in the consensus matrix grows by one
    when both samples carry the same label. The votes are averaged over the
    label sets that actually voted, converted to distances
    (``1 - consensus``) and clustered with average-linkage agglomerative
    clustering.

    Labels are compared by equality only, so any special label value (for
    example a noise marker) is treated like a regular cluster.

    Reads ``optimal_k``, ``X_pca`` and ``X_tsne`` from module scope for the
    number of clusters, the silhouette evaluation and the scatter plot.

    Args:
        clustering_results: dict mapping method name -> cluster labels
            (one label per sample).
        advanced_results: dict of the same form; on duplicate keys its
            entries replace those of ``clustering_results``.

    Returns:
        tuple: ``(ensemble_labels, consensus_matrix)`` where
        ``consensus_matrix`` is an ``(n_samples, n_samples)`` float array
        with values in ``[0, 1]``.

    Raises:
        ValueError: if no results are given, if the label sets differ in
            length, or if every label set contains a single cluster.
    """
    print("=== ENSEMBLE CLUSTERING ===")

    # Combine all clustering results
    all_results = {**clustering_results, **advanced_results}
    if not all_results:
        raise ValueError("No clustering results were provided for the ensemble.")

    n_samples = len(next(iter(all_results.values())))
    consensus_matrix = np.zeros((n_samples, n_samples))

    # Count only the methods that really vote so the normalisation below
    # yields a proper co-association frequency (diagonal == 1).
    n_voters = 0
    for method, labels in all_results.items():
        labels = np.asarray(labels)
        if len(labels) != n_samples:
            raise ValueError(
                f"Clustering result '{method}' has {len(labels)} labels, "
                f"expected {n_samples}."
            )
        if np.unique(labels).size <= 1:  # Skip single cluster results
            continue
        # Broadcasting builds the full pairwise "same label" table at once,
        # replacing the per-pair Python loop.
        consensus_matrix += labels[:, None] == labels[None, :]
        n_voters += 1

    if n_voters == 0:
        raise ValueError(
            "Every clustering result contains a single cluster; "
            "cannot build a consensus."
        )

    # Normalize by the number of methods that contributed votes
    consensus_matrix = consensus_matrix / n_voters

    # Apply clustering to consensus matrix. `metric` replaced `affinity` in
    # scikit-learn 1.2 and `affinity` was removed in 1.4; fall back to the
    # old keyword only when the installed version does not know `metric`.
    agglomerative_kwargs = dict(n_clusters=optimal_k, linkage='average')
    try:
        ensemble_model = AgglomerativeClustering(
            metric='precomputed', **agglomerative_kwargs
        )
    except TypeError:
        ensemble_model = AgglomerativeClustering(
            affinity='precomputed', **agglomerative_kwargs
        )

    # Convert consensus matrix to distance matrix
    distance_matrix = 1 - consensus_matrix
    ensemble_labels = ensemble_model.fit_predict(distance_matrix)

    # Evaluate ensemble clustering
    ensemble_score = silhouette_score(X_pca, ensemble_labels)
    print(f"Ensemble Clustering - Silhouette Score: {ensemble_score:.3f}")

    # Visualize ensemble results
    plt.figure(figsize=(12, 5))

    plt.subplot(1, 2, 1)
    plt.imshow(consensus_matrix, cmap='viridis', aspect='auto')
    plt.title('Consensus Matrix')
    plt.xlabel('Sample Index')
    plt.ylabel('Sample Index')
    plt.colorbar()

    plt.subplot(1, 2, 2)
    scatter = plt.scatter(X_tsne[:, 0], X_tsne[:, 1], c=ensemble_labels, cmap='viridis', alpha=0.6)
    plt.title('Ensemble Clustering')
    plt.xlabel('t-SNE 1')
    plt.ylabel('t-SNE 2')
    plt.colorbar(scatter)

    plt.tight_layout()
    plt.show()

    return ensemble_labels, consensus_matrix

# Create ensemble clustering: merges the base and advanced label sets and
# returns the consensus labels together with the sample-by-sample consensus matrix.
ensemble_labels, consensus_matrix = ensemble_clustering(clustering_results, advanced_results)

# =============================================================================
# 18. CLUSTER VALIDATION AND STABILITY
# =============================================================================

def cluster_stability_analysis(X_pca, n_iterations=10, noise_scale=0.01, random_state=42):
    """Analyze cluster stability across multiple perturbed K-Means runs.

    Each iteration adds zero-mean Gaussian noise to the data, fits K-Means
    with ``optimal_k`` clusters (read from module scope) using the iteration
    index as the K-Means seed, and records the silhouette score on the noisy
    data. The spread of those scores is reported as a stability rating and
    plotted.

    Args:
        X_pca: feature matrix to cluster; must expose ``.shape`` and support
            element-wise addition with a NumPy array.
        n_iterations: number of perturbed runs (must be at least 1).
        noise_scale: standard deviation of the added Gaussian noise.
        random_state: seed for the noise generator so repeated calls give
            the same scores.

    Returns:
        list: one silhouette score per iteration.

    Raises:
        ValueError: if ``n_iterations`` is smaller than 1.
    """
    if n_iterations < 1:
        raise ValueError("n_iterations must be at least 1")

    print("=== CLUSTER STABILITY ANALYSIS ===")

    # A dedicated, seeded generator keeps the perturbations reproducible
    # and independent of the global NumPy random state.
    rng = np.random.default_rng(random_state)

    stability_scores = []

    for i in range(n_iterations):
        # Add small random noise to test stability
        X_noisy = X_pca + rng.normal(0, noise_scale, X_pca.shape)

        kmeans = KMeans(n_clusters=optimal_k, random_state=i, n_init=10)
        labels = kmeans.fit_predict(X_noisy)

        stability_scores.append(silhouette_score(X_noisy, labels))

    mean_stability = np.mean(stability_scores)
    std_stability = np.std(stability_scores)

    # Rate stability by the spread of the silhouette scores.
    if std_stability < 0.05:
        stability_rating = 'High'
    elif std_stability < 0.1:
        stability_rating = 'Medium'
    else:
        stability_rating = 'Low'

    print("Cluster Stability:")
    print(f"  Mean Silhouette Score: {mean_stability:.3f}")
    print(f"  Standard Deviation: {std_stability:.3f}")
    print(f"  Stability Rating: {stability_rating}")

    # Plot stability
    iterations = range(1, n_iterations + 1)
    plt.figure(figsize=(10, 6))
    plt.plot(iterations, stability_scores, 'bo-')
    plt.axhline(y=mean_stability, color='r', linestyle='--', label=f'Mean: {mean_stability:.3f}')
    plt.fill_between(iterations,
                     mean_stability - std_stability,
                     mean_stability + std_stability,
                     alpha=0.2, color='red')
    plt.xlabel('Iteration')
    plt.ylabel('Silhouette Score')
    plt.title('Cluster Stability Analysis')
    plt.legend()
    plt.grid(True)
    plt.show()

    return stability_scores

# Analyze cluster stability (n_iterations is left at its default of 10);
# the result holds one silhouette score per run.
stability_scores = cluster_stability_analysis(X_pca)

# =============================================================================
# 19. FEATURE IMPORTANCE AND INTERPRETATION
# =============================================================================

def analyze_feature_importance(X_original, final_labels):
    """Rank features by how well they let a Random Forest recover the cluster labels.

    The cluster labels are label-encoded and used as classification targets
    for a 100-tree Random Forest fitted on ``X_original``. The forest's
    impurity-based importances are tabulated, printed and shown as a
    horizontal bar chart for the 15 highest-ranked features.

    Args:
        X_original: feature table; its ``.columns`` supply the feature names.
        final_labels: cluster label for each row of ``X_original``.

    Returns:
        pandas.DataFrame: columns ``feature`` and ``importance``, sorted by
        descending importance.
    """
    print("=== FEATURE IMPORTANCE ANALYSIS ===")

    # Local imports: these estimators are only needed by this analysis
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.preprocessing import LabelEncoder

    # Turn the cluster ids into consecutive class indices
    encoded_targets = LabelEncoder().fit_transform(final_labels)

    # Fit the surrogate classifier that explains the clustering
    forest = RandomForestClassifier(n_estimators=100, random_state=42)
    forest.fit(X_original, encoded_targets)

    # Tabulate importances, most influential feature first
    importance_table = pd.DataFrame({
        'feature': X_original.columns,
        'importance': forest.feature_importances_
    })
    feature_importance = importance_table.sort_values('importance', ascending=False)

    leaders = feature_importance.head(15)
    print("Top 15 Most Important Features:")
    print(leaders)

    # Horizontal bar chart with the strongest feature at the top
    bar_positions = range(len(leaders))
    plt.figure(figsize=(12, 8))
    plt.barh(bar_positions, leaders['importance'])
    plt.yticks(bar_positions, leaders['feature'])
    plt.xlabel('Feature Importance')
    plt.title('Top 15 Feature Importance for Emotion Clustering')
    plt.gca().invert_yaxis()
    plt.tight_layout()
    plt.show()

    return feature_importance

# Analyze feature importance; the result is a DataFrame of features sorted
# by descending importance.
feature_importance = analyze_feature_importance(X_original, final_labels)

# =============================================================================
# 20. REAL-TIME EMOTION DETECTION FRAMEWORK
# =============================================================================

def create_emotion_detection_pipeline():
    """Build and return an ``EmotionDetector`` wired to the fitted pipeline objects.

    The detector is constructed from the module-level ``final_model``,
    ``scaler`` and ``pca`` objects, and usage hints are printed.

    Returns:
        EmotionDetector: instance offering ``predict_emotion`` and
        ``predict_batch``.
    """
    print("=== REAL-TIME EMOTION DETECTION PIPELINE ===")

    class EmotionDetector:
        """Chains feature extraction, scaling, PCA and cluster prediction."""

        def __init__(self, model, scaler, pca_model):
            self.model = model
            self.scaler = scaler
            self.pca_model = pca_model
            # Hard-coded cluster id -> emotion name lookup; ids missing here
            # fall back to a generic 'Cluster_<id>' name in predict_emotion.
            self.emotion_mapping = dict(enumerate([
                'Neutral',
                'Happy/Excited',
                'Sad/Calm',
                'Angry/Frustrated',
                'Surprised/Energetic',
            ]))

        def extract_features_single(self, audio_file):
            """Return a feature vector for one audio file.

            Placeholder: ``audio_file`` is not read. The vector is random
            standard-normal noise with one entry per column of the
            module-level ``X_original``, so predictions based on it are not
            meaningful until real audio processing is implemented.
            """
            n_features = len(X_original.columns)
            return np.random.normal(loc=0, scale=1, size=n_features)

        def predict_emotion(self, audio_file):
            """Return ``(emotion, cluster)`` for a single audio file."""
            # Features -> single-row matrix -> scaled -> PCA space
            feature_row = self.extract_features_single(audio_file).reshape(1, -1)
            projected = self.pca_model.transform(self.scaler.transform(feature_row))

            # Cluster assignment, then the human-readable emotion name
            cluster = self.model.predict(projected)[0]
            return self.emotion_mapping.get(cluster, f'Cluster_{cluster}'), cluster

        def predict_batch(self, audio_files):
            """Return one ``{'file', 'emotion', 'cluster'}`` dict per audio file."""
            predictions = ((path, self.predict_emotion(path)) for path in audio_files)
            return [
                {'file': path, 'emotion': emotion, 'cluster': cluster}
                for path, (emotion, cluster) in predictions
            ]

    # Create emotion detector instance
    detector = EmotionDetector(final_model, scaler, pca)

    for message in (
        "Emotion Detection Pipeline Created!",
        "Usage:",
        "  detector.predict_emotion('path/to/audio.wav')",
        "  detector.predict_batch(['audio1.wav', 'audio2.wav'])",
    ):
        print(message)

    return detector

# Create emotion detection pipeline; the detector is built from the
# module-level final_model, scaler and pca objects.
emotion_detector = create_emotion_detection_pipeline()

a