## 1. Mount Google Drive

In [8]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## 2. Setup Environment

In [9]:
# Chuy·ªÉn ƒë·∫øn th∆∞ m·ª•c project
%cd "/content/drive/MyDrive/ƒê·ªì √°n Python/coffee_project"

import warnings
warnings.filterwarnings('ignore')
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=DeprecationWarning)
warnings.filterwarnings('ignore', module='sklearn')

# Import libraries
import sys
import os
sys.path.append(os.getcwd())

from src.models.trainer import ModelTrainer, TrainingConfig
from src.models.evaluator import ClusteringEvaluator
from src.models.tuning import HyperparameterTuner, TuningConfig
import pandas as pd
import logging

# Setup logger
logging.basicConfig(level=logging.INFO, format='%(message)s')

print("‚úÖ Environment setup complete!")

/content/drive/MyDrive/ƒê·ªì √°n Python/coffee_project
‚úÖ Environment setup complete!


## 3. Hyperparameter Tuning (Advanced)
Grid search t·∫•t c·∫£ models ƒë·ªÉ t√¨m best hyperparameters

In [14]:
print("="*80)
print("K-MEANS HYPERPARAMETER TUNING")
print("="*80)

# 1. Thi·∫øt l·∫≠p Grid cho K-Means
# n_clusters: S·ªë l∆∞·ª£ng c·ª•m mu·ªën th·ª≠ (quan tr·ªçng nh·∫•t)
# init: Ph∆∞∆°ng ph√°p kh·ªüi t·∫°o t√¢m c·ª•m
# n_init: S·ªë l·∫ßn ch·∫°y thu·∫≠t to√°n v·ªõi c√°c t√¢m ng·∫´u nhi√™n kh√°c nhau
kmeans_grid = {
    "n_clusters": [3, 4, 5, 6, 7, 8, 9, 10],
    "init": ["k-means++"],
    "n_init": [10, 20],
    "max_iter": [300]
}

print("Grid parameters:")
for key, values in kmeans_grid.items():
    print(f"  {key}: {values}")

total = 1
for v in kmeans_grid.values():
    total *= len(v)
print(f"\nTotal combinations: {total}")

# 2. Kh·ªüi t·∫°o Tuner
evaluator = ClusteringEvaluator()
tuning_config = TuningConfig(
    data_path="data/processed/encoded_data.csv",
    results_path="results/kmeans_tuning.csv",  # ƒê·ªïi t√™n file l∆∞u
    metric_selection="silhouette"              # Metric ch√≠nh ƒë·ªÉ ch·ªçn model
)
tuner = HyperparameterTuner(config=tuning_config, evaluator=evaluator)

# 3. Ch·∫°y Tuning
# L∆∞u √Ω: "kmeans" ph·∫£i kh·ªõp v·ªõi t√™n model trong ModelTrainer
tuner.run_grid_search("kmeans", kmeans_grid)

# 4. L∆∞u k·∫øt qu·∫£ t·ªïng h·ª£p
tuner.save_results()

# 5. L·∫•y k·∫øt qu·∫£ t·ªët nh·∫•t v√† Train l·∫°i
df_results = tuner.get_summary()

if len(df_results) > 0:
    best = df_results.iloc[0]

    print("\n" + "="*80)
    print("üèÜ BEST K-MEANS CONFIG")
    print("="*80)
    # Hi·ªÉn th·ªã c√°c tham s·ªë c·ªßa K-Means
    print(f"  n_clusters             : {int(best['n_clusters'])}")
    print(f"  init                   : {best['init']}")
    print(f"  n_init                 : {best['n_init']}")

    print(f"\n  Silhouette Score       : {best['silhouette']:.4f}")
    print(f"  Calinski-Harabasz      : {best['calinski_harabasz']:.2f}")
    print(f"  Davies-Bouldin Index   : {best['davies_bouldin']:.4f}")

    # Train v√† l∆∞u model t·ªët nh·∫•t
    print("\nüîÑ Training best model...")

    # Chu·∫©n b·ªã params (n_clusters th∆∞·ªùng ƒë∆∞·ª£c t√°ch ri√™ng trong logic c·ªßa trainer)
    best_params = {
        "init": best['init'],
        "n_init": int(best['n_init']),
        "max_iter": int(best['max_iter'])
    }

    config = TrainingConfig(
        data_path="data/processed/encoded_data.csv",
        model_type="kmeans",
        n_clusters=int(best['n_clusters']), # Quan tr·ªçng: KMeans c·∫ßn n_clusters
        model_params=best_params,
        model_path="results/kmeans_best.pkl"
    )

    trainer = ModelTrainer(config=config, evaluator=evaluator)
    trainer.load_data()
    trainer.train_model()
    trainer.save_model()
    trainer.save_labels("results/kmeans_best_labels.csv")

    print("‚úÖ Best K-Means model saved!")
    print("   Model: results/kmeans_best.pkl")
    print("   Labels: results/kmeans_best_labels.csv")

    # Show top 5 configs
    print("\n" + "="*80)
    print("üìä TOP 5 CONFIGS")
    print("="*80)
    # Ch·ªâ ch·ªçn c√°c c·ªôt c√≥ √Ω nghƒ©a v·ªõi K-Means
    cols = ['n_clusters', 'init', 'n_init',
            'silhouette', 'calinski_harabasz', 'davies_bouldin']
    # L·ªçc c√°c c·ªôt th·ª±c s·ª± t·ªìn t·∫°i trong df k·∫øt qu·∫£ (ƒë·ªÅ ph√≤ng l·ªói key)
    cols = [c for c in cols if c in df_results.columns]
    print(df_results[cols].head(5).to_string(index=False))

else:
    print("‚ö†Ô∏è No valid results found")

üìÇ Loading data for tuning from data/processed/encoded_data.csv...
  ‚úì Data loaded: (3685, 58)

üöÄ Starting Grid Search for: KMEANS
  Total combinations to test: 16


K-MEANS HYPERPARAMETER TUNING
Grid parameters:
  n_clusters: [3, 4, 5, 6, 7, 8, 9, 10]
  init: ['k-means++']
  n_init: [10, 20]
  max_iter: [300]

Total combinations: 16


  [1/16] KMEANS_n_clusters3_initk-means++_n_init10_max_iter300: Sil=0.2544, CH=1269.22, DB=1.5338, Clusters=3
  [2/16] KMEANS_n_clusters3_initk-means++_n_init20_max_iter300: Sil=0.2544, CH=1269.22, DB=1.5338, Clusters=3
  [3/16] KMEANS_n_clusters4_initk-means++_n_init10_max_iter300: Sil=0.2355, CH=1030.48, DB=1.8631, Clusters=4
  [4/16] KMEANS_n_clusters4_initk-means++_n_init20_max_iter300: Sil=0.2355, CH=1030.48, DB=1.8631, Clusters=4
  [5/16] KMEANS_n_clusters5_initk-means++_n_init10_max_iter300: Sil=0.2352, CH=894.63, DB=1.7419, Clusters=5
  [6/16] KMEANS_n_clusters5_initk-means++_n_init20_max_iter300: Sil=0.2352, CH=894.63, DB=1.7419, Clusters=5
  [7/16] KMEANS_n_clusters6_initk-means++_n_init10_max_iter300: Sil=0.1937, CH=820.04, DB=1.8492, Clusters=6
  [8/16] KMEANS_n_clusters6_initk-means++_n_init20_max_iter300: Sil=0.1938, CH=820.06, DB=1.8499, Clusters=6
  [9/16] KMEANS_n_clusters7_initk-means++_n_init10_max_iter300: Sil=0.2019, CH=777.00, DB=1.6383, Clusters=7
  [10/16] KMEAN

model_type  n_clusters  silhouette  calinski_harabasz  davies_bouldin
    kmeans           3    0.254433        1269.217570        1.533831
    kmeans           3    0.254433        1269.217570        1.533831
    kmeans           4    0.235535        1030.477913        1.863101

üèÜ BEST K-MEANS CONFIG
  n_clusters             : 3
  init                   : k-means++
  n_init                 : 10

  Silhouette Score       : 0.2544
  Calinski-Harabasz      : 1269.22
  Davies-Bouldin Index   : 1.5338

üîÑ Training best model...


üíæ Labels saved: results/kmeans_best_labels.csv


‚úÖ Best K-Means model saved!
   Model: results/kmeans_best.pkl
   Labels: results/kmeans_best_labels.csv

üìä TOP 5 CONFIGS
 n_clusters      init  n_init  silhouette  calinski_harabasz  davies_bouldin
          3 k-means++      10    0.254433        1269.217570        1.533831
          3 k-means++      20    0.254433        1269.217570        1.533831
          4 k-means++      10    0.235535        1030.477913        1.863101
          4 k-means++      20    0.235535        1030.477913        1.863101
          5 k-means++      10    0.235202         894.634671        1.741909


In [15]:
print("="*80)
print("K-MEANS HYPERPARAMETER TUNING")
print("="*80)

# 1. Thi·∫øt l·∫≠p Grid cho K-Means
# n_clusters: S·ªë l∆∞·ª£ng c·ª•m mu·ªën th·ª≠ (quan tr·ªçng nh·∫•t)
# init: Ph∆∞∆°ng ph√°p kh·ªüi t·∫°o t√¢m c·ª•m
# n_init: S·ªë l·∫ßn ch·∫°y thu·∫≠t to√°n v·ªõi c√°c t√¢m ng·∫´u nhi√™n kh√°c nhau
kmeans_grid = {
    "n_clusters": [3, 4, 5, 6, 7, 8, 9, 10],
    "init": ["k-means++"],
    "n_init": [10, 20],
    "max_iter": [300]
}

print("Grid parameters:")
for key, values in kmeans_grid.items():
    print(f"  {key}: {values}")

total = 1
for v in kmeans_grid.values():
    total *= len(v)
print(f"\nTotal combinations: {total}")

# 2. Kh·ªüi t·∫°o Tuner
evaluator = ClusteringEvaluator()
tuning_config = TuningConfig(
    data_path="data/processed/encoded_data.csv",
    results_path="results/kmeans_composite_tuning.csv",
    metric_selection="composite",
    silhouette_weight=0.4,
    calinski_weight=0.3,
    davies_weight=0.3
)
tuner = HyperparameterTuner(config=tuning_config, evaluator=evaluator)

# 3. Ch·∫°y Tuning
# L∆∞u √Ω: "kmeans" ph·∫£i kh·ªõp v·ªõi t√™n model trong ModelTrainer
tuner.run_grid_search("kmeans", kmeans_grid)

# 4. L∆∞u k·∫øt qu·∫£ t·ªïng h·ª£p
tuner.save_results()

# 5. L·∫•y k·∫øt qu·∫£ t·ªët nh·∫•t v√† Train l·∫°i
df_results = tuner.get_summary()

if len(df_results) > 0:
    best = df_results.iloc[0]

    print("\n" + "="*80)
    print("üèÜ BEST K-MEANS CONFIG")
    print("="*80)
    # Hi·ªÉn th·ªã c√°c tham s·ªë c·ªßa K-Means
    print(f"  n_clusters             : {int(best['n_clusters'])}")
    print(f"  init                   : {best['init']}")
    print(f"  n_init                 : {best['n_init']}")

    print(f"\n  Silhouette Score       : {best['silhouette']:.4f}")
    print(f"  Calinski-Harabasz      : {best['calinski_harabasz']:.2f}")
    print(f"  Davies-Bouldin Index   : {best['davies_bouldin']:.4f}")

    # Train v√† l∆∞u model t·ªët nh·∫•t
    print("\nüîÑ Training best model...")

    # Chu·∫©n b·ªã params (n_clusters th∆∞·ªùng ƒë∆∞·ª£c t√°ch ri√™ng trong logic c·ªßa trainer)
    best_params = {
        "init": best['init'],
        "n_init": int(best['n_init']),
        "max_iter": int(best['max_iter'])
    }

    config = TrainingConfig(
        data_path="data/processed/encoded_data.csv",
        model_type="kmeans",
        n_clusters=int(best['n_clusters']), # Quan tr·ªçng: KMeans c·∫ßn n_clusters
        model_params=best_params,
        model_path="results/kmeans_composite_best.pkl"
    )

    trainer = ModelTrainer(config=config, evaluator=evaluator)
    trainer.load_data()
    trainer.train_model()
    trainer.save_model()
    trainer.save_labels("results/kmeans_best_composite_labels.csv")

    print("‚úÖ Best K-Means model saved!")
    print("   Model: results/kmeans_composite_best.pkl")
    print("   Labels: results/kmeans_best_composite_labels.csv")

    # Show top 5 configs
    print("\n" + "="*80)
    print("üìä TOP 5 CONFIGS")
    print("="*80)
    # Ch·ªâ ch·ªçn c√°c c·ªôt c√≥ √Ω nghƒ©a v·ªõi K-Means
    cols = ['n_clusters', 'init', 'n_init',
            'silhouette', 'calinski_harabasz', 'davies_bouldin']
    # L·ªçc c√°c c·ªôt th·ª±c s·ª± t·ªìn t·∫°i trong df k·∫øt qu·∫£ (ƒë·ªÅ ph√≤ng l·ªói key)
    cols = [c for c in cols if c in df_results.columns]
    print(df_results[cols].head(5).to_string(index=False))

else:
    print("‚ö†Ô∏è No valid results found")

üìÇ Loading data for tuning from data/processed/encoded_data.csv...
  ‚úì Data loaded: (3685, 58)

üöÄ Starting Grid Search for: KMEANS
  Total combinations to test: 16


K-MEANS HYPERPARAMETER TUNING
Grid parameters:
  n_clusters: [3, 4, 5, 6, 7, 8, 9, 10]
  init: ['k-means++']
  n_init: [10, 20]
  max_iter: [300]

Total combinations: 16


  [1/16] KMEANS_n_clusters3_initk-means++_n_init10_max_iter300: Sil=0.2544, CH=1269.22, DB=1.5338, Clusters=3
  [2/16] KMEANS_n_clusters3_initk-means++_n_init20_max_iter300: Sil=0.2544, CH=1269.22, DB=1.5338, Clusters=3
  [3/16] KMEANS_n_clusters4_initk-means++_n_init10_max_iter300: Sil=0.2355, CH=1030.48, DB=1.8631, Clusters=4
  [4/16] KMEANS_n_clusters4_initk-means++_n_init20_max_iter300: Sil=0.2355, CH=1030.48, DB=1.8631, Clusters=4
  [5/16] KMEANS_n_clusters5_initk-means++_n_init10_max_iter300: Sil=0.2352, CH=894.63, DB=1.7419, Clusters=5
  [6/16] KMEANS_n_clusters5_initk-means++_n_init20_max_iter300: Sil=0.2352, CH=894.63, DB=1.7419, Clusters=5
  [7/16] KMEANS_n_clusters6_initk-means++_n_init10_max_iter300: Sil=0.1937, CH=820.04, DB=1.8492, Clusters=6
  [8/16] KMEANS_n_clusters6_initk-means++_n_init20_max_iter300: Sil=0.1938, CH=820.06, DB=1.8499, Clusters=6
  [9/16] KMEANS_n_clusters7_initk-means++_n_init10_max_iter300: Sil=0.2019, CH=777.00, DB=1.6383, Clusters=7
  [10/16] KMEAN

model_type  n_clusters  silhouette  calinski_harabasz  davies_bouldin  composite_score
    kmeans           3    0.254433        1269.217570        1.533831         0.701773
    kmeans           3    0.254433        1269.217570        1.533831         0.701773
    kmeans           7    0.201851         776.998807        1.638289         0.354408

üèÜ BEST K-MEANS CONFIG
  n_clusters             : 3
  init                   : k-means++
  n_init                 : 10

  Silhouette Score       : 0.2544
  Calinski-Harabasz      : 1269.22
  Davies-Bouldin Index   : 1.5338

üîÑ Training best model...


üíæ Labels saved: results/kmeans_best_composite_labels.csv


‚úÖ Best K-Means model saved!
   Model: results/kmeans_composite_best.pkl
   Labels: results/kmeans_best_composite_labels.csv

üìä TOP 5 CONFIGS
 n_clusters      init  n_init  silhouette  calinski_harabasz  davies_bouldin
          3 k-means++      10    0.254433        1269.217570        1.533831
          3 k-means++      20    0.254433        1269.217570        1.533831
          7 k-means++      20    0.201851         776.998807        1.638289
          7 k-means++      10    0.201851         776.998807        1.638289
          5 k-means++      20    0.235202         894.634671        1.741909


In [None]:
print("="*80)
print("DBSCAN HYPERPARAMETER TUNING - COMPOSITE METRIC")
print("="*80)

# 1. Thi·∫øt l·∫≠p Grid cho DBSCAN
# eps: Kho·∫£ng c√°ch t·ªëi ƒëa gi·ªØa 2 ƒëi·ªÉm ƒë·ªÉ ƒë∆∞·ª£c coi l√† c√πng v√πng l√¢n c·∫≠n
# min_samples: S·ªë ƒëi·ªÉm t·ªëi thi·ªÉu ƒë·ªÉ t·∫°o th√†nh 1 core point
# metric: ƒê·ªô ƒëo kho·∫£ng c√°ch
dbscan_grid = {
    "eps": [0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.0],
    "min_samples": [5, 10, 15, 20],
    "metric": ["euclidean", "manhattan"]
}

print("Grid parameters:")
for key, values in dbscan_grid.items():
    print(f"  {key}: {values}")

total = 1
for v in dbscan_grid.values():
    total *= len(v)
print(f"\nTotal combinations: {total}")

# 2. Kh·ªüi t·∫°o Tuner v·ªõi COMPOSITE METRIC
evaluator = ClusteringEvaluator()
tuning_config = TuningConfig(
    data_path="data/processed/encoded_data.csv",
    results_path="results/dbscan_composite_tuning.csv",
    metric_selection="composite",  # S·ª≠ d·ª•ng composite score
    silhouette_weight=0.4,
    calinski_weight=0.3,
    davies_weight=0.3
)
tuner = HyperparameterTuner(config=tuning_config, evaluator=evaluator)

print(f"\nüìä Using Composite Score:")
print(f"   Silhouette weight     : {tuning_config.silhouette_weight}")
print(f"   Calinski-Harabasz wt  : {tuning_config.calinski_weight}")
print(f"   Davies-Bouldin wt     : {tuning_config.davies_weight}")

# 3. Ch·∫°y Tuning
print("\nüîÑ Running DBSCAN grid search...")
print("‚ö†Ô∏è  Note: DBSCAN c√≥ th·ªÉ t·∫°o nhi·ªÅu noise points (cluster -1)")
tuner.run_grid_search("dbscan", dbscan_grid)

# 4. L∆∞u k·∫øt qu·∫£
tuner.save_results()

# 5. L·∫•y k·∫øt qu·∫£ t·ªët nh·∫•t
df_results = tuner.get_summary()

if len(df_results) > 0:
    best = df_results.iloc[0]

    print("\n" + "="*80)
    print("üèÜ BEST DBSCAN CONFIG (by Composite Score)")
    print("="*80)
    print(f"  eps                    : {best['eps']}")
    print(f"  min_samples            : {int(best['min_samples'])}")
    print(f"  metric                 : {best['metric']}")
    print(f"\n  üìà METRICS:")
    print(f"  Silhouette Score       : {best['silhouette']:.4f}")
    print(f"  Calinski-Harabasz      : {best['calinski_harabasz']:.2f}")
    print(f"  Davies-Bouldin Index   : {best['davies_bouldin']:.4f}")
    print(f"  Composite Score        : {best['composite_score']:.4f}")
    print(f"  Number of Clusters     : {int(best['n_clusters'])}")

    # Train v√† l∆∞u model t·ªët nh·∫•t
    print("\nüîÑ Training best model...")
    best_params = {
        "eps": float(best['eps']),
        "min_samples": int(best['min_samples']),
        "metric": best['metric']
    }

    config = TrainingConfig(
        data_path="data/processed/encoded_data.csv",
        model_type="dbscan",
        model_params=best_params,
        model_path="results/dbscan_composite_best.pkl"
    )

    trainer = ModelTrainer(config=config, evaluator=evaluator)
    trainer.load_data()
    trainer.train_model()
    
    # Ki·ªÉm tra s·ªë clusters v√† noise
    labels = trainer.get_cluster_labels()
    n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
    n_noise = list(labels).count(-1)
    noise_pct = n_noise / len(labels) * 100
    
    print(f"\nüìä Cluster Analysis:")
    print(f"  Total clusters (excluding noise): {n_clusters}")
    print(f"  Noise points: {n_noise} ({noise_pct:.1f}%)")
    
    trainer.save_model()
    trainer.save_labels("results/dbscan_composite_best_labels.csv")

    print("\n‚úÖ Best DBSCAN model saved!")
    print("   Model: results/dbscan_composite_best.pkl")
    print("   Labels: results/dbscan_composite_best_labels.csv")


## 5D. DBSCAN Tuning

T√¨m best configuration cho DBSCAN (Density-Based Spatial Clustering)

In [None]:
print("="*80)
print("GMM (Gaussian Mixture Model) - COMPOSITE METRIC")
print("="*80)

# 1. Thi·∫øt l·∫≠p Grid cho GMM
# n_clusters: S·ªë l∆∞·ª£ng Gaussian components
# covariance_type: Lo·∫°i ma tr·∫≠n covariance
#   - 'full': M·ªói component c√≥ ma tr·∫≠n covariance ri√™ng (linh ho·∫°t nh·∫•t)
#   - 'tied': T·∫•t c·∫£ components d√πng chung 1 ma tr·∫≠n covariance
#   - 'diag': Ma tr·∫≠n ƒë∆∞·ªùng ch√©o (assume features ƒë·ªôc l·∫≠p)
#   - 'spherical': Ma tr·∫≠n ƒë∆∞·ªùng ch√©o v·ªõi variance b·∫±ng nhau
# n_init: S·ªë l·∫ßn kh·ªüi t·∫°o ng·∫´u nhi√™n
# max_iter: S·ªë v√≤ng l·∫∑p t·ªëi ƒëa cho EM algorithm
gmm_grid = {
    "n_clusters": [3, 4, 5, 6, 7, 8],
    "covariance_type": ["full", "tied", "diag", "spherical"],
    "n_init": [10],
    "max_iter": [200]
}

print("Grid parameters:")
for key, values in gmm_grid.items():
    print(f"  {key}: {values}")

total = 1
for v in gmm_grid.values():
    total *= len(v)
print(f"\nTotal combinations: {total}")

# 2. Kh·ªüi t·∫°o Tuner v·ªõi COMPOSITE METRIC
evaluator = ClusteringEvaluator()
tuning_config = TuningConfig(
    data_path="data/processed/encoded_data.csv",
    results_path="results/gmm_composite_tuning.csv",
    metric_selection="composite",
    silhouette_weight=0.4,
    calinski_weight=0.3,
    davies_weight=0.3
)
tuner = HyperparameterTuner(config=tuning_config, evaluator=evaluator)

print(f"\nüìä Using Composite Score:")
print(f"   Silhouette weight     : {tuning_config.silhouette_weight}")
print(f"   Calinski-Harabasz wt  : {tuning_config.calinski_weight}")
print(f"   Davies-Bouldin wt     : {tuning_config.davies_weight}")

# 3. Ch·∫°y Tuning
print("\nüîÑ Running GMM grid search...")
print("üí° GMM l√† probabilistic model, m·ªói ƒëi·ªÉm c√≥ x√°c su·∫•t thu·ªôc t·ª´ng cluster")
tuner.run_grid_search("gmm", gmm_grid)

# 4. L∆∞u k·∫øt qu·∫£
tuner.save_results()

# 5. L·∫•y k·∫øt qu·∫£ t·ªët nh·∫•t
df_results = tuner.get_summary()

if len(df_results) > 0:
    best = df_results.iloc[0]

    print("\n" + "="*80)
    print("üèÜ BEST GMM CONFIG (by Composite Score)")
    print("="*80)
    print(f"  n_clusters             : {int(best['n_clusters'])}")
    print(f"  covariance_type        : {best['covariance_type']}")
    print(f"  n_init                 : {int(best['n_init'])}")
    print(f"  max_iter               : {int(best['max_iter'])}")
    print(f"\n  üìà METRICS:")
    print(f"  Silhouette Score       : {best['silhouette']:.4f}")
    print(f"  Calinski-Harabasz      : {best['calinski_harabasz']:.2f}")
    print(f"  Davies-Bouldin Index   : {best['davies_bouldin']:.4f}")
    print(f"  Composite Score        : {best['composite_score']:.4f}")

    # Train v√† l∆∞u model t·ªët nh·∫•t
    print("\nüîÑ Training best model...")
    best_params = {
        "covariance_type": best['covariance_type'],
        "n_init": int(best['n_init']),
        "max_iter": int(best['max_iter'])
    }

    config = TrainingConfig(
        data_path="data/processed/encoded_data.csv",
        model_type="gmm",
        n_clusters=int(best['n_clusters']),
        model_params=best_params,
        model_path="results/gmm_composite_best.pkl"
    )

    trainer = ModelTrainer(config=config, evaluator=evaluator)
    trainer.load_data()
    trainer.train_model()
    trainer.save_model()
    trainer.save_labels("results/gmm_composite_best_labels.csv")

    print("\n‚úÖ Best GMM model saved!")
    print("   Model: results/gmm_composite_best.pkl")
    print("   Labels: results/gmm_composite_best_labels.csv")

    # Show top 5 configs
    print("\n" + "="*80)
    print("üìä TOP 5 CONFIGS (by Composite Score)")
    print("="*80)
    cols = ['n_clusters', 'covariance_type', 'n_init',
            'silhouette', 'calinski_harabasz', 'davies_bouldin', 'composite_score']
    cols = [c for c in cols if c in df_results.columns]
    print(df_results[cols].head(5).to_string(index=False))
    
    # Ph√¢n t√≠ch covariance type performance
    print("\n" + "="*80)
    print("üìä COVARIANCE TYPE COMPARISON")
    print("="*80)
    if 'covariance_type' in df_results.columns:
        cov_analysis = df_results.groupby('covariance_type').agg({
            'silhouette': 'mean',
            'calinski_harabasz': 'mean',
            'davies_bouldin': 'mean',
            'composite_score': 'mean'
        }).round(4)
        print(cov_analysis)
        print("\nüí° Interpretation:")
        print("   - 'full': T·ªët nh·∫•t nh∆∞ng t·ªën b·ªô nh·ªõ (n_features¬≤)")
        print("   - 'tied': C√¢n b·∫±ng gi·ªØa flexibility v√† efficiency")
        print("   - 'diag': Nhanh, ph√π h·ª£p v·ªõi high-dimensional data")
        print("   - 'spherical': Nhanh nh·∫•t nh∆∞ng √≠t linh ho·∫°t")

else:
    print("‚ö†Ô∏è No valid results found")

In [None]:
print("="*80)
print("HDBSCAN - COMPOSITE METRIC")
print("="*80)

# 1. Thi·∫øt l·∫≠p Grid cho HDBSCAN
# min_cluster_size: S·ªë ƒëi·ªÉm t·ªëi thi·ªÉu trong 1 cluster (quan tr·ªçng nh·∫•t!)
# min_samples: S·ªë ƒëi·ªÉm trong neighborhood ƒë·ªÉ t√≠nh core distance
# metric: ƒê·ªô ƒëo kho·∫£ng c√°ch
# cluster_selection_method:
#   - 'eom' (Excess of Mass): Ch·ªçn cluster stable nh·∫•t
#   - 'leaf': Ch·ªçn leaf clusters trong hierarchy tree
hdbscan_grid = {
    "min_cluster_size": [10, 20, 30, 50, 100],
    "min_samples": [10, 20],
    "metric": ["euclidean", "manhattan"],
    "cluster_selection_method": ["eom"]
}

print("Grid parameters:")
for key, values in hdbscan_grid.items():
    print(f"  {key}: {values}")

total = 1
for v in hdbscan_grid.values():
    total *= len(v)
print(f"\nTotal combinations: {total}")

# 2. Kh·ªüi t·∫°o Tuner v·ªõi COMPOSITE METRIC
evaluator = ClusteringEvaluator()
tuning_config = TuningConfig(
    data_path="data/processed/encoded_data.csv",
    results_path="results/hdbscan_composite_tuning.csv",
    metric_selection="composite",
    silhouette_weight=0.4,
    calinski_weight=0.3,
    davies_weight=0.3
)
tuner = HyperparameterTuner(config=tuning_config, evaluator=evaluator)

print(f"\nüìä Using Composite Score:")
print(f"   Silhouette weight     : {tuning_config.silhouette_weight}")
print(f"   Calinski-Harabasz wt  : {tuning_config.calinski_weight}")
print(f"   Davies-Bouldin wt     : {tuning_config.davies_weight}")

# 3. Ch·∫°y Tuning
print("\nüîÑ Running HDBSCAN grid search...")
print("üí° HDBSCAN advantages:")
print("   ‚úÖ T·ª± ƒë·ªông t√¨m s·ªë clusters (kh√¥ng c·∫ßn ch·ªâ ƒë·ªãnh K)")
print("   ‚úÖ Ph√°t hi·ªán clusters v·ªõi m·∫≠t ƒë·ªô kh√°c nhau")
print("   ‚úÖ Robust v·ªõi noise v√† outliers")
tuner.run_grid_search("hdbscan", hdbscan_grid)

# 4. L∆∞u k·∫øt qu·∫£
tuner.save_results()

# 5. L·∫•y k·∫øt qu·∫£ t·ªët nh·∫•t
df_results = tuner.get_summary()

if len(df_results) > 0:
    best = df_results.iloc[0]

    print("\n" + "="*80)
    print("üèÜ BEST HDBSCAN CONFIG (by Composite Score)")
    print("="*80)
    print(f"  min_cluster_size          : {int(best['min_cluster_size'])}")
    print(f"  min_samples               : {int(best['min_samples'])}")
    print(f"  metric                    : {best['metric']}")
    print(f"  cluster_selection_method  : {best['cluster_selection_method']}")
    print(f"\n  üìà METRICS:")
    print(f"  Silhouette Score          : {best['silhouette']:.4f}")
    print(f"  Calinski-Harabasz         : {best['calinski_harabasz']:.2f}")
    print(f"  Davies-Bouldin Index      : {best['davies_bouldin']:.4f}")
    print(f"  Composite Score           : {best['composite_score']:.4f}")
    print(f"  Number of Clusters        : {int(best['n_clusters'])}")

    # Train v√† l∆∞u model t·ªët nh·∫•t
    print("\nüîÑ Training best model...")
    best_params = {
        "min_cluster_size": int(best['min_cluster_size']),
        "min_samples": int(best['min_samples']),
        "metric": best['metric'],
        "cluster_selection_method": best['cluster_selection_method']
    }

    config = TrainingConfig(
        data_path="data/processed/encoded_data.csv",
        model_type="hdbscan",
        model_params=best_params,
        model_path="results/hdbscan_composite_best.pkl"
    )

    trainer = ModelTrainer(config=config, evaluator=evaluator)
    trainer.load_data()
    trainer.train_model()
    
    # Ki·ªÉm tra s·ªë clusters v√† noise
    labels = trainer.get_cluster_labels()
    n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
    n_noise = list(labels).count(-1)
    noise_pct = n_noise / len(labels) * 100
    
    print(f"\nüìä Cluster Analysis:")
    print(f"  Total clusters (excluding noise): {n_clusters}")
    print(f"  Noise points: {n_noise} ({noise_pct:.1f}%)")
    
    # Ph√¢n b·ªë k√≠ch th∆∞·ªõc clusters
    if n_clusters > 0:
        cluster_sizes = pd.Series(labels).value_counts().sort_index()
        cluster_sizes = cluster_sizes[cluster_sizes.index != -1]  # Lo·∫°i noise
        print(f"\n  Cluster size distribution:")
        for cluster_id, size in cluster_sizes.items():
            pct = size / len(labels) * 100
            print(f"    Cluster {cluster_id}: {size} samples ({pct:.1f}%)")
    
    trainer.save_model()
    trainer.save_labels("results/hdbscan_composite_best_labels.csv")

    print("\n‚úÖ Best HDBSCAN model saved!")
    print("   Model: results/hdbscan_composite_best.pkl")
    print("   Labels: results/hdbscan_composite_best_labels.csv")

    # Show top 5 configs
    print("\n" + "="*80)
    print("üìä TOP 5 CONFIGS (by Composite Score)")
    print("="*80)
    cols = ['min_cluster_size', 'min_samples', 'metric', 'n_clusters',
            'silhouette', 'calinski_harabasz', 'davies_bouldin', 'composite_score']
    cols = [c for c in cols if c in df_results.columns]
    print(df_results[cols].head(5).to_string(index=False))
    
    # Ph√¢n t√≠ch min_cluster_size impact
    print("\n" + "="*80)
    print("üìä MIN_CLUSTER_SIZE IMPACT ANALYSIS")
    print("="*80)
    if 'min_cluster_size' in df_results.columns:
        size_analysis = df_results.groupby('min_cluster_size').agg({
            'n_clusters': 'mean',
            'silhouette': 'mean',
            'calinski_harabasz': 'mean',
            'composite_score': 'mean'
        }).round(2)
        print(size_analysis)
        print("\nüí° Interpretation:")
        print("   - min_cluster_size nh·ªè ‚Üí nhi·ªÅu clusters, c√≥ th·ªÉ over-segment")
        print("   - min_cluster_size l·ªõn ‚Üí √≠t clusters, c√≥ th·ªÉ under-segment")
        print("   - Ch·ªçn value c√¢n b·∫±ng gi·ªØa s·ªë clusters v√† quality metrics")

else:
    print("‚ö†Ô∏è No valid results found")
    print("üí° Tip: HDBSCAN c√≥ th·ªÉ kh√¥ng t√¨m ƒë∆∞·ª£c c·∫•u h√¨nh t·ªët n·∫øu:")
    print("   - min_cluster_size qu√° l·ªõn so v·ªõi dataset size")
    print("   - Data kh√¥ng c√≥ c·∫•u tr√∫c hierarchical density r√µ r√†ng")

## 5F. HDBSCAN Tuning

T√¨m best configuration cho HDBSCAN (Hierarchical Density-Based Spatial Clustering)

## 5E. GMM Tuning

T√¨m best configuration cho Gaussian Mixture Model (GMM)