# Phase 3: Clustering Experiments
## Code Summarization and Generation Project

This notebook runs and evaluates three clustering algorithms (implemented in `clustering_models`) and visualizes the best result:
- K-Means
- DBSCAN
- Hierarchical Clustering
- t-SNE Visualization

In [None]:
import sys
sys.path.append('../src')

import yaml
import pandas as pd
import numpy as np
import json
from clustering_models import ClusteringPipeline
from visualization import ClusteringVisualizer
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

## 1. Load Data and Configuration

In [None]:
# Read the project configuration. The encoding is pinned because the platform
# default is not UTF-8 everywhere (e.g. on Windows), which can garble or
# reject non-ASCII characters in the YAML file.
with open('../config/config.yaml', 'r', encoding='utf-8') as f:
    config = yaml.safe_load(f)

# Load the pre-computed feature table. It must contain a 'language' column,
# which is read right below.
df = pd.read_csv('../data/processed/all_features.csv')

print(f"Dataset shape: {df.shape}")
print(f"Languages: {df['language'].value_counts()}")

## 2. Prepare Data for Clustering

In [None]:
# Build the clustering pipeline from the loaded configuration.
pipeline = ClusteringPipeline(config)

# prepare_data hands back the matrix to cluster on and the frame of features
# it was built from.
X_scaled, feature_df = pipeline.prepare_data(df)

# Preview only the first 10 feature names to keep the output short.
first_features = list(feature_df.columns)[:10]
print(f"Scaled feature matrix shape: {X_scaled.shape}")
print(f"Features used: {first_features}...")

## 3. Apply PCA for Dimensionality Reduction

In [None]:
# Never request more components than the data can support: PCA is limited to
# min(n_samples, n_features) components, so a fixed 50 fails on small inputs.
# NOTE(review): assumes apply_pca forwards n_components to scikit-learn's PCA.
n_components = min(50, *X_scaled.shape)
X_pca = pipeline.apply_pca(X_scaled, n_components=n_components)

print(f"Reduced dimensions: {X_pca.shape}")
print(f"Explained variance: {pipeline.pca.explained_variance_ratio_.sum():.4f}")

# Plot cumulative explained variance against the component count. The x values
# start at 1 so each point lines up with "number of components kept"; plotting
# the array alone would start the axis at 0 and shift the curve by one.
cumulative_variance = np.cumsum(pipeline.pca.explained_variance_ratio_)
component_counts = np.arange(1, len(cumulative_variance) + 1)

fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(component_counts, cumulative_variance)
ax.set_xlabel('Number of Components')
ax.set_ylabel('Cumulative Explained Variance')
ax.set_title('PCA Explained Variance')
ax.grid(True)
fig.savefig('../results/visualizations/pca_explained_variance.png', dpi=300)
plt.show()

## 4. K-Means Clustering

In [None]:
# Cluster the PCA-reduced data with K-Means.
kmeans_results = pipeline.kmeans_clustering(X_pca)

# Report the best configuration and its silhouette score.
best_kmeans_params = kmeans_results['best_params']
best_kmeans_score = kmeans_results['best_score']
print(
    "\nK-Means Results:",
    f"Best parameters: {best_kmeans_params}",
    f"Best silhouette score: {best_kmeans_score:.4f}",
    sep="\n",
)

# Tabulate every evaluated configuration.
results_df = pd.DataFrame(kmeans_results['all_results'])
print("\nAll K-Means results:", results_df, sep="\n")

In [None]:
# Create the visualizer (reused by later cells), then draw the K-Means elbow
# curve from the per-configuration results.
visualizer = ClusteringVisualizer(config)
kmeans_sweep = kmeans_results['all_results']
visualizer.plot_elbow_curve(kmeans_sweep)

## 5. DBSCAN Clustering

In [None]:
# Cluster the PCA-reduced data with DBSCAN.
dbscan_results = pipeline.dbscan_clustering(X_pca)

# A falsy result means there is nothing to report for DBSCAN.
if not dbscan_results:
    print("DBSCAN: No valid clustering found")
else:
    best_dbscan_params = dbscan_results['best_params']
    best_dbscan_score = dbscan_results['best_score']
    print(
        "\nDBSCAN Results:",
        f"Best parameters: {best_dbscan_params}",
        f"Best silhouette score: {best_dbscan_score:.4f}",
        sep="\n",
    )

    # Tabulate every evaluated configuration.
    results_df = pd.DataFrame(dbscan_results['all_results'])
    print("\nAll DBSCAN results:", results_df, sep="\n")

## 6. Hierarchical Clustering

In [None]:
# Cluster the PCA-reduced data with hierarchical clustering.
hierarchical_results = pipeline.hierarchical_clustering(X_pca)

# Report the best configuration and its silhouette score.
best_hier_params = hierarchical_results['best_params']
best_hier_score = hierarchical_results['best_score']
print(
    "\nHierarchical Clustering Results:",
    f"Best parameters: {best_hier_params}",
    f"Best silhouette score: {best_hier_score:.4f}",
    sep="\n",
)

# Tabulate every evaluated configuration.
results_df = pd.DataFrame(hierarchical_results['all_results'])
print("\nAll Hierarchical results:", results_df, sep="\n")

## 7. Compare All Methods

In [None]:
# Collect each method's results under its name (insertion order: kmeans,
# dbscan, hierarchical). The dbscan entry is stored as-is, even if falsy.
all_results = dict(
    kmeans=kmeans_results,
    dbscan=dbscan_results,
    hierarchical=hierarchical_results,
)

# Side-by-side silhouette comparison of the methods.
visualizer.plot_silhouette_comparison(all_results)

## 8. Visualize Best Clustering with t-SNE

In [None]:
# Pick the method with the highest silhouette score. Methods that produced no
# result (stored as a falsy value such as None) are excluded up front rather
# than given a -1 placeholder, because -1 is itself a possible silhouette
# score and an empty result must never be selected.
valid_results = {name: result for name, result in all_results.items() if result}
if not valid_results:
    raise ValueError("No clustering method produced a valid result")

# best_method stays a (method_name, results) pair, as later cells index it.
best_method = max(valid_results.items(), key=lambda item: item[1]['best_score'])
best_labels = best_method[1]['best_labels']
languages = df['language'].values

print(f"\nBest method: {best_method[0].upper()}")

# t-SNE visualization
visualizer.plot_tsne(X_pca, best_labels, languages,
                     title=f'{best_method[0].upper()} Clustering')

In [None]:
# Draw the selected clustering with the visualizer's PCA plot, titled after
# the winning method.
pca_plot_title = f'{best_method[0].upper()} Clustering'
visualizer.plot_pca(X_pca, best_labels, languages, title=pca_plot_title)

## 9. Analyze Cluster Composition

In [None]:
# Analyze the clusters of the winning method.
winning_method_name = best_method[0]
analysis = pipeline.analyze_clusters(df, best_labels, winning_method_name)

# Pretty-print the analysis as indented JSON.
print("\nCluster Analysis:")
analysis_json = json.dumps(analysis, indent=2)
print(analysis_json)

# Show how the cluster labels are distributed across languages.
visualizer.plot_cluster_distribution(best_labels, languages)

## 10. Save Results

In [None]:
# Write the results of every method to the metrics file.
metrics_path = '../results/metrics/clustering_results.json'
pipeline.save_results(all_results, metrics_path)

# Attach the winning cluster assignment to the feature table and export it.
# Note: this adds a 'cluster' column to df in place.
clustered_features_path = '../data/processed/features_with_clusters.csv'
df['cluster'] = best_labels
df.to_csv(clustered_features_path, index=False)

print("Results saved successfully!")

## 11. Hypothesis Testing

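Test the hypothesis that statically-typed languages (Java, Rust) cluster together, separately from dynamically-typed languages (Python, JavaScript), by checking which group dominates each cluster.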

In [None]:
# Languages grouped by typing discipline. These lists drive the counts below,
# so the grouping is defined in exactly one place.
static_langs = ['Java', 'Rust']
dynamic_langs = ['Python', 'JavaScript']

# One group must outnumber the other by this factor to count as predominant.
DOMINANCE_RATIO = 1.5

# Coerce to an array so the element-wise comparison below also works when the
# labels are a plain list (list == int yields a single bool, not a mask).
labels = np.asarray(best_labels)

# Check cluster purity, visiting cluster ids in sorted order.
for cluster_id in np.unique(labels):
    if cluster_id == -1:  # Skip noise in DBSCAN
        continue

    cluster_mask = labels == cluster_id
    cluster_langs = df.loc[cluster_mask, 'language'].value_counts()

    print(f"\nCluster {cluster_id}:")
    print(cluster_langs)

    # Count members per typing discipline; languages absent from the cluster
    # contribute 0.
    static_count = sum(cluster_langs.get(lang, 0) for lang in static_langs)
    dynamic_count = sum(cluster_langs.get(lang, 0) for lang in dynamic_langs)

    if static_count > dynamic_count * DOMINANCE_RATIO:
        print("  → Predominantly STATIC-typed languages")
    elif dynamic_count > static_count * DOMINANCE_RATIO:
        print("  → Predominantly DYNAMIC-typed languages")
    else:
        print("  → Mixed typing disciplines")