# 1. Import libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
import seaborn as sns
from collections import Counter
import warnings
import os

# Suppress every warning for the rest of the session, including any raised by the
# libraries imported above.
warnings.filterwarnings('ignore')
# Seed NumPy's global RNG so the np.random draws made later in this file are repeatable.
np.random.seed(42)

# 2. Generate synthetic HAR-like dataset

In [None]:
def load_synthetic_har_data(n_samples=1000, n_features=50):
    """Generate a random stand-in for a HAR (human activity recognition) dataset.

    Features are independent standard-normal draws and labels are uniform random
    integers over the keys of the activity map. The labels are drawn independently
    of the features, so the data carries no real cluster structure: any agreement
    between clusters and labels found later is chance level.

    Both draws use NumPy's global RNG (features first, then labels); call
    ``np.random.seed`` beforehand for reproducible output.

    Args:
        n_samples: Number of rows to generate. Defaults to 1000.
        n_features: Number of feature columns to generate. Defaults to 50.

    Returns:
        A tuple ``(X, y, y_names, feature_names, activity_map)`` where ``X`` is an
        ``(n_samples, n_features)`` float array, ``y`` is an ``(n_samples,)`` integer
        array of activity ids, ``y_names`` is the list of activity names matching
        ``y``, ``feature_names`` is ``["feature_1", ..., "feature_<n_features>"]``
        and ``activity_map`` maps activity id to activity name.
    """
    activity_map = {
        1: "WALKING", 2: "WALKING_UPSTAIRS", 3: "WALKING_DOWNSTAIRS",
        4: "SITTING", 5: "STANDING", 6: "LAYING",
    }

    X = np.random.randn(n_samples, n_features)
    # randint's upper bound is exclusive, so this yields ids 1..len(activity_map).
    y = np.random.randint(1, len(activity_map) + 1, size=n_samples)

    feature_names = [f"feature_{i}" for i in range(1, n_features + 1)]
    y_names = [activity_map[label] for label in y]

    return X, y, y_names, feature_names, activity_map

# Build the dataset once at module level; later cells read X, true_labels and
# activity_map from these names.
X, true_labels, activity_names, feature_names, activity_map = load_synthetic_har_data()

# Quick sanity check of what was generated.
print(f"Dataset shape: {X.shape}")
print(f"Activities: {list(activity_map.values())}")

# 3. Preprocess data

In [None]:
def preprocess_data(X):
    """Return X standardized by a freshly fitted StandardScaler.

    The scaler is fitted on X itself and then discarded, so the same
    transformation cannot later be re-applied to other data or inverted.
    """
    return StandardScaler().fit_transform(X)

# Standardized copy of the features; the K-Means, metric and PCA cells below all
# operate on X_processed rather than on the raw X.
X_processed = preprocess_data(X)

# 4. Run K-Means experiments

In [None]:
def run_kmeans_experiment(X, k_values, n_init=10, max_iter=300, random_state=42):
    """Fit one k-means++ initialised KMeans model for each requested cluster count.

    Args:
        X: Data passed unchanged to ``KMeans.fit``.
        k_values: Iterable of cluster counts to try.
        n_init: Forwarded to ``KMeans`` as ``n_init``.
        max_iter: Forwarded to ``KMeans`` as ``max_iter``.
        random_state: Forwarded to ``KMeans`` as ``random_state``.

    Returns:
        Dict mapping each k to ``{'model': fitted KMeans, 'labels': model.labels_}``.
        A progress line is printed before each fit.
    """
    shared_options = dict(init='k-means++', n_init=n_init, max_iter=max_iter,
                          random_state=random_state)
    fitted_by_k = {}
    for k in k_values:
        print(f"Running k-means with k={k}...")
        estimator = KMeans(n_clusters=k, **shared_options)
        estimator.fit(X)
        fitted_by_k[k] = dict(model=estimator, labels=estimator.labels_)
    return fitted_by_k

# Cluster counts to sweep; 6 is included because activity_map defines six activities.
k_values = [2, 3, 4, 5, 6, 8, 10, 12]
# Fit one model per k on the standardized features (uses the function's default
# n_init, max_iter and random_state).
kmeans_results = run_kmeans_experiment(X_processed, k_values)

# 5. Evaluate clustering quality

In [None]:
def calculate_clustering_quality(X, kmeans_results):
    """Compute internal quality scores for every fitted clustering.

    Args:
        X: The data the models were fitted on; passed to the sklearn score functions.
        kmeans_results: Dict mapping k to a dict with a ``'model'`` entry (read for
            its ``inertia_``) and a ``'labels'`` entry (cluster assignment per row).

    Returns:
        Dict mapping k to ``{'inertia', 'silhouette', 'calinski_harabasz',
        'davies_bouldin'}``. For ``k <= 1`` the three label-based scores are not
        computed; the placeholders 0, 0 and ``inf`` are stored instead.
    """
    scores_by_k = {}
    for k, entry in kmeans_results.items():
        assignments = entry['labels']
        summary = {'inertia': entry['model'].inertia_}
        if k > 1:
            summary['silhouette'] = silhouette_score(X, assignments)
            summary['calinski_harabasz'] = calinski_harabasz_score(X, assignments)
            summary['davies_bouldin'] = davies_bouldin_score(X, assignments)
        else:
            # Fewer than two clusters: store fixed placeholders instead of scoring.
            summary['silhouette'] = 0
            summary['calinski_harabasz'] = 0
            summary['davies_bouldin'] = float('inf')
        scores_by_k[k] = summary
    return scores_by_k

# Score every fitted model against the same standardized data it was trained on.
quality_metrics = calculate_clustering_quality(X_processed, kmeans_results)

# 6. Save metrics to CSV

In [None]:
def save_metrics_to_csv(metrics, path='C:/ML/Labwork2/dataset/kmeans_quality_metrics.csv'):
    """Write the per-k metrics table to a CSV file, one row per k, sorted by k.

    Args:
        metrics: Dict mapping k to a dict of metric name -> value. Each inner dict
            becomes one row, with an extra leading ``k`` column.
        path: Destination file. Defaults to the location this notebook has always
            written to. Missing parent directories are created.

    Returns:
        None. The only effect is the file written at ``path``.
    """
    rows = [{'k': k, **m} for k, m in metrics.items()]
    df = pd.DataFrame(rows).sort_values('k')

    # to_csv does not create directories, so make sure the target folder exists.
    parent = os.path.dirname(path)
    if parent:
        os.makedirs(parent, exist_ok=True)

    df.to_csv(path, index=False)

# Persist the per-k quality metrics as a CSV file.
save_metrics_to_csv(quality_metrics)

# 7. Evaluate clustering vs true labels

In [None]:
def evaluate_clustering(true_labels, kmeans_results, activity_map):
    """Compare each clustering with the true activity labels.

    Args:
        true_labels: Sequence of activity ids, one per sample; every id must be a
            key of ``activity_map``.
        kmeans_results: Dict mapping k to a dict whose ``'labels'`` entry holds the
            cluster assignment of each sample, in the same order as ``true_labels``.
        activity_map: Dict mapping activity id to activity name.

    Returns:
        Dict mapping k to ``{'contingency', 'cluster_homogeneity'}``.
        ``contingency`` is the cluster-by-activity count table from ``pd.crosstab``.
        ``cluster_homogeneity`` maps each cluster id to its most frequent activity
        (``'dominant_activity'``) and the fraction of the cluster's samples that
        belong to that activity (``'homogeneity'``).

    Raises:
        KeyError: If a value in ``true_labels`` is not a key of ``activity_map``.
    """
    # The activity names do not depend on k, so build the column once instead of
    # re-mapping every label on each loop iteration.
    activities = pd.Series([activity_map[label] for label in true_labels], name='Activity')

    evaluation = {}
    for k, result in kmeans_results.items():
        clusters = pd.Series(result['labels'], name='Cluster')
        contingency = pd.crosstab(clusters, activities)
        # Each crosstab row is a cluster with at least one member, so the row sum
        # used as the denominator is never zero.
        cluster_homogeneity = {
            cluster_id: {
                'dominant_activity': cluster_counts.idxmax(),
                'homogeneity': cluster_counts.max() / cluster_counts.sum()
            }
            for cluster_id, cluster_counts in contingency.iterrows()
        }
        evaluation[k] = {'contingency': contingency, 'cluster_homogeneity': cluster_homogeneity}
    return evaluation

# Cross-tabulate the cluster assignments of every k against the true activity labels.
evaluation = evaluate_clustering(true_labels, kmeans_results, activity_map)

# Show contingency for k=6 (a bare expression: it is displayed only when it is the
# last statement of a notebook cell; run as a plain script it has no visible effect).
evaluation[6]['contingency']

# 8. Visualize clusters in 2D (PCA)

In [None]:
# Project the standardized features onto their first two principal components so
# the clusterings can be drawn in a plane.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_processed)

# The keys of activity_map are the same numeric ids stored in true_labels, so the
# labels can colour the points directly; no id -> name -> id round trip is needed.
true_numeric = list(true_labels)

save_dir = "C:/ML/Labwork2"
# savefig does not create missing directories, so make sure the target exists.
os.makedirs(save_dir, exist_ok=True)

for k, result in kmeans_results.items():
    # Clusterings with more than 10 clusters are not plotted.
    if k > 10:
        continue
    cluster_labels = result['labels']

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))

    # Left: K-means Clusters
    ax1.scatter(X_pca[:, 0], X_pca[:, 1], c=cluster_labels, cmap='viridis', alpha=0.6)
    ax1.set_title(f'K-means Clusters (k={k})')

    # Right: True Activities
    ax2.scatter(X_pca[:, 0], X_pca[:, 1], c=true_numeric, cmap='tab10', alpha=0.6)
    ax2.set_title('True Activities')

    # Join directory and file name with a real path separator; plain string
    # concatenation produced "C:/ML/Labwork2clusters_vs_true_k<k>.png", i.e. a file
    # next to the intended folder instead of inside it.
    fig.savefig(os.path.join(save_dir, f"clusters_vs_true_k{k}.png"))
    plt.show()
    plt.close(fig)

# 9. Visualize metrics 

In [None]:
def visualize_metrics(metrics):
    """Plot the four quality metrics against k on a 2x2 grid, save it and show it.

    Args:
        metrics: Dict mapping k to a dict with the keys ``'inertia'``,
            ``'silhouette'``, ``'calinski_harabasz'`` and ``'davies_bouldin'``.

    The figure is written to a fixed PNG path and then displayed with ``plt.show``.
    """
    k_vals = sorted(metrics.keys())

    # (metric key, line format, panel title), listed in the row-major order in
    # which axs.flat visits the 2x2 grid.
    panel_specs = [
        ('inertia', 'bo-', "Elbow (Inertia)"),
        ('silhouette', 'go-', "Silhouette Score"),
        ('calinski_harabasz', 'ro-', "Calinski-Harabasz Index"),
        ('davies_bouldin', 'mo-', "Davies-Bouldin Index"),
    ]
    # Collect every series before any figure is created.
    curves = [[metrics[k][metric_key] for k in k_vals] for metric_key, _, _ in panel_specs]

    fig, axs = plt.subplots(2, 2, figsize=(14, 10))
    for ax, (_, line_format, title), values in zip(axs.flat, panel_specs, curves):
        ax.plot(k_vals, values, line_format)
        ax.set_title(title)
    for ax in axs.flat:
        ax.set_xlabel("Number of Clusters (k)")
        ax.grid(True)

    plt.tight_layout()
    plt.savefig("C:/ML/Labwork2/visualizations/HAR_metrics.png")
    plt.show()

# Draw, save and display the metric-versus-k curves for the sweep above.
visualize_metrics(quality_metrics)

# 10. Summary

In [None]:
# Flatten each metric into its own {k: score} table so the winning k can be picked
# with a plain dictionary lookup.
silhouette_by_k = {k: scores['silhouette'] for k, scores in quality_metrics.items()}
calinski_by_k = {k: scores['calinski_harabasz'] for k, scores in quality_metrics.items()}
davies_by_k = {k: scores['davies_bouldin'] for k, scores in quality_metrics.items()}

# Silhouette and Calinski-Harabasz select the k with the largest score;
# Davies-Bouldin selects the k with the smallest one.
best_k_silhouette = max(silhouette_by_k, key=silhouette_by_k.get)
best_k_calinski = max(calinski_by_k, key=calinski_by_k.get)
best_k_davies = min(davies_by_k, key=davies_by_k.get)

summary_lines = (
    f"Best k (Silhouette): {best_k_silhouette}",
    f"Best k (Calinski-Harabasz): {best_k_calinski}",
    f"Best k (Davies-Bouldin): {best_k_davies}",
    "Ground truth: 6 activities",
)
for summary_line in summary_lines:
    print(summary_line)