In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import json
from pathlib import Path
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score, adjusted_rand_score
# Make sure the output folders for figures and per-sample labels exist
# before anything is written into them.
for out_dir in ('artifacts/figures', 'artifacts/labels'):
    Path(out_dir).mkdir(parents=True, exist_ok=True)

# Input CSV files; the processing loop reads each one from the data/ folder.
datasets = [
    'S07-hw-dataset-01.csv',
    'S07-hw-dataset-02.csv',
    'S07-hw-dataset-03.csv',
]

# Accumulators shared by the rest of the script:
#   results      - created here, not written to in the visible code
#   all_metrics  - per-dataset quality metrics for every clustering method
#   best_configs - per-dataset winning method and its silhouette
results, all_metrics, best_configs = {}, {}, {}
# Per-dataset pipeline: load the CSV, impute and standardise the numeric
# features, tune KMeans / DBSCAN / Ward agglomerative clustering, choose the
# method with the highest silhouette, then save figures, labels and metrics
# under artifacts/.
for idx, ds_name in enumerate(datasets, 1):
    print(f"\n \nДатасет {idx}: {ds_name}\n ")
    df = pd.read_csv(f'data/{ds_name}')
    print(f"Размер: {df.shape}")
    # DataFrame.info() writes its report to stdout itself and returns None;
    # wrapping it in print() only appended a stray "None" line to the log.
    df.info()
    print(f"\nПропуски:\n{df.isnull().sum()}")

    # sample_id is an identifier, not a feature: it is dropped from the
    # feature matrix and kept only for the exported label file.
    X = df.drop('sample_id', axis=1)
    sample_ids = df['sample_id']
    numeric_cols = X.select_dtypes(include=[np.number]).columns
    # Median imputation followed by standardisation of the numeric columns.
    X_clean = SimpleImputer(strategy='median').fit_transform(X[numeric_cols])
    X_scaled = StandardScaler().fit_transform(X_clean)
    print(f"\nДанные после препроцессинга: {X_scaled.shape}")

    # ---- KMeans: sweep k and keep the value with the highest silhouette ----
    print(f"\n--- KMeans ---")
    k_range = range(2, 11)
    kmeans_scores = []
    kmeans_runs = []  # (fitted model, labels) per k, so the winner is not refitted
    for k in k_range:
        km = KMeans(n_clusters=k, random_state=42, n_init=10)
        labels = km.fit_predict(X_scaled)
        sil = silhouette_score(X_scaled, labels)
        kmeans_scores.append(sil)
        kmeans_runs.append((km, labels))
    # np.argmax returns the first maximum, so ties go to the smaller k.
    best_k_pos = int(np.argmax(kmeans_scores))
    best_k = k_range[best_k_pos]
    # The seed is fixed, so a refit would reproduce exactly this model; reuse
    # the sweep's result instead of fitting and scoring a second time.
    kmeans_best, kmeans_labels = kmeans_runs[best_k_pos]
    kmeans_metrics = {
        'silhouette': kmeans_scores[best_k_pos],
        'davies_bouldin': davies_bouldin_score(X_scaled, kmeans_labels),
        'calinski_harabasz': calinski_harabasz_score(X_scaled, kmeans_labels),
        'best_k': int(best_k)
    }
    print(f"Лучший k={best_k}, Silhouette={kmeans_metrics['silhouette']:.3f}")
    plt.figure(figsize=(8, 5))
    plt.plot(k_range, kmeans_scores, 'o-')
    plt.xlabel('k')
    plt.ylabel('Silhouette Score')
    plt.title(f'Dataset {idx}: KMeans - Silhouette vs k')
    plt.grid(True, alpha=0.3)
    plt.savefig(f'artifacts/figures/ds{idx}_kmeans_elbow.png', dpi=150, bbox_inches='tight')
    plt.close()

    # ---- DBSCAN: sweep eps with min_samples fixed at 5 ----
    print(f"\n--- DBSCAN ---")
    eps_range = [0.3, 0.5, 0.7, 1.0, 1.5]
    dbscan_scores = []
    dbscan_runs = []  # (fitted model, labels) per eps
    for eps in eps_range:
        db = DBSCAN(eps=eps, min_samples=5)
        labels = db.fit_predict(X_scaled)
        # Label -1 marks noise; the silhouette is computed on clustered
        # points only and needs at least two clusters to be defined.
        mask = labels != -1
        if len(np.unique(labels[mask])) > 1:
            sil = silhouette_score(X_scaled[mask], labels[mask])
        else:
            sil = -1  # sentinel: fewer than two clusters for this eps
        dbscan_scores.append(sil)
        dbscan_runs.append((db, labels))
    # If every eps got the -1 sentinel, argmax falls back to the first eps.
    best_eps_pos = int(np.argmax(dbscan_scores))
    best_eps = eps_range[best_eps_pos]
    # Same parameters and data give the same fit, so reuse the sweep's result.
    dbscan_best, dbscan_labels = dbscan_runs[best_eps_pos]
    mask = dbscan_labels != -1
    n_noise = np.sum(dbscan_labels == -1)
    dbscan_metrics = {
        'n_noise': int(n_noise),
        'noise_ratio': float(n_noise / len(dbscan_labels)),
        'best_eps': best_eps
    }
    # Quality metrics are recorded only when at least two clusters exist;
    # otherwise the dict carries just the noise statistics and best_eps.
    if len(np.unique(dbscan_labels[mask])) > 1:
        dbscan_metrics['silhouette'] = dbscan_scores[best_eps_pos]
        dbscan_metrics['davies_bouldin'] = davies_bouldin_score(X_scaled[mask], dbscan_labels[mask])
        dbscan_metrics['calinski_harabasz'] = calinski_harabasz_score(X_scaled[mask], dbscan_labels[mask])
    print(f"Лучший eps={best_eps}, Шум={n_noise} ({dbscan_metrics['noise_ratio']:.2%})")
    plt.figure(figsize=(8, 5))
    plt.plot(eps_range, dbscan_scores, 'o-')
    plt.xlabel('eps')
    plt.ylabel('Silhouette Score')
    plt.title(f'Dataset {idx}: DBSCAN - Silhouette vs eps')
    plt.grid(True, alpha=0.3)
    plt.savefig(f'artifacts/figures/ds{idx}_dbscan_eps.png', dpi=150, bbox_inches='tight')
    plt.close()

    # ---- Agglomerative (Ward), using the cluster count chosen for KMeans ----
    print(f"\n--- Agglomerative ---")
    agg = AgglomerativeClustering(n_clusters=best_k, linkage='ward')
    agg_labels = agg.fit_predict(X_scaled)
    agg_metrics = {
        'silhouette': silhouette_score(X_scaled, agg_labels),
        'davies_bouldin': davies_bouldin_score(X_scaled, agg_labels),
        'calinski_harabasz': calinski_harabasz_score(X_scaled, agg_labels)
    }
    print(f"Silhouette={agg_metrics['silhouette']:.3f}")

    # ---- Pick the winner by silhouette (-1 when a method has no score) ----
    # NOTE(review): DBSCAN's silhouette is computed on non-noise points only,
    # while the other two methods are scored on every sample, so the numbers
    # are not strictly comparable when DBSCAN labels many points as noise.
    scores = {
        'kmeans': kmeans_metrics.get('silhouette', -1),
        'dbscan': dbscan_metrics.get('silhouette', -1),
        'agglomerative': agg_metrics.get('silhouette', -1)
    }
    best_method = max(scores, key=scores.get)
    best_labels = {'kmeans': kmeans_labels, 'dbscan': dbscan_labels, 'agglomerative': agg_labels}[best_method]
    print(f"\n>>> Лучший метод: {best_method.upper()} (Silhouette={scores[best_method]:.3f})")

    # 2-D PCA projection of the scaled features, coloured by the winning labels.
    pca = PCA(n_components=2, random_state=42)
    X_pca = pca.fit_transform(X_scaled)
    plt.figure(figsize=(9, 6))
    plt.scatter(X_pca[:, 0], X_pca[:, 1], c=best_labels, cmap='viridis', alpha=0.6, edgecolors='k', s=30)
    plt.colorbar(label='Cluster')
    plt.xlabel(f'PC1 ({pca.explained_variance_ratio_[0]:.1%})')
    plt.ylabel(f'PC2 ({pca.explained_variance_ratio_[1]:.1%})')
    plt.title(f'Dataset {idx}: {best_method.upper()} - PCA')
    plt.grid(True, alpha=0.3)
    plt.savefig(f'artifacts/figures/ds{idx}_best_pca.png', dpi=150, bbox_inches='tight')
    plt.close()

    # Export the winning labels next to their sample ids.
    pd.DataFrame({'sample_id': sample_ids, 'cluster_label': best_labels}).to_csv(
        f'artifacts/labels/labels_hw07_ds{idx}.csv', index=False
    )

    # Record the results for the JSON summaries written after the loop.
    all_metrics[f'dataset_{idx}'] = {
        'kmeans': kmeans_metrics,
        'dbscan': dbscan_metrics,
        'agglomerative': agg_metrics
    }
    best_configs[f'dataset_{idx}'] = {
        'method': best_method,
        'silhouette': float(scores[best_method])
    }
# Stability check: refit KMeans on dataset 1 with several random seeds and
# measure, via the Adjusted Rand Index, how closely each partition agrees
# with the reference run (random_state=42).
print(f"\n\nПроверка устойчивости (Dataset 1)\n")

# Rebuild the scaled feature matrix for dataset 1: after the main loop,
# X_scaled holds the last dataset that was processed.
df = pd.read_csv(f'data/{datasets[0]}')
X = df.drop('sample_id', axis=1)
X_clean = SimpleImputer(strategy='median').fit_transform(X.select_dtypes(include=[np.number]))
X_scaled = StandardScaler().fit_transform(X_clean)

# Check the configuration that was actually selected for dataset 1 rather
# than a hard-coded cluster count; a fixed k=3 would measure the stability
# of a partition the main loop may not have chosen.
stability_k = all_metrics['dataset_1']['kmeans']['best_k']

km_base = KMeans(n_clusters=stability_k, random_state=42, n_init=10)
labels_base = km_base.fit_predict(X_scaled)

ari_scores = []
for seed in [100, 200, 300, 400, 500]:
    km = KMeans(n_clusters=stability_k, random_state=seed, n_init=10)
    labels = km.fit_predict(X_scaled)
    # ARI is invariant to label permutation, so differently numbered but
    # otherwise identical partitions still score 1.0.
    ari = adjusted_rand_score(labels_base, labels)
    ari_scores.append(ari)

print(f"ARI scores: {ari_scores}")
print(f"Средний ARI: {np.mean(ari_scores):.3f} ± {np.std(ari_scores):.3f}")
# Persist the collected results as pretty-printed JSON: first the full
# per-method metrics, then the winning configuration for each dataset.
for target, payload in (
    ('artifacts/metrics_summary.json', all_metrics),
    ('artifacts/best_configs.json', best_configs),
):
    with open(target, 'w') as out:
        json.dump(payload, out, indent=2)
print(f"\n\nВсе артефакты сохранены!\n")


 
Датасет 1: S07-hw-dataset-01.csv
 
Размер: (12000, 9)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12000 entries, 0 to 11999
Data columns (total 9 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   sample_id  12000 non-null  int64  
 1   f01        12000 non-null  float64
 2   f02        12000 non-null  float64
 3   f03        12000 non-null  float64
 4   f04        12000 non-null  float64
 5   f05        12000 non-null  float64
 6   f06        12000 non-null  float64
 7   f07        12000 non-null  float64
 8   f08        12000 non-null  float64
dtypes: float64(8), int64(1)
memory usage: 843.9 KB
None

Пропуски:
sample_id    0
f01          0
f02          0
f03          0
f04          0
f05          0
f06          0
f07          0
f08          0
dtype: int64

Данные после препроцессинга: (12000, 8)

--- KMeans ---
Лучший k=2, Silhouette=0.522

--- DBSCAN ---
Лучший eps=1.5, Шум=0 (0.00%)

--- Agglomerative ---
Silhouette=0.522

>>> Лу