In [1]:
import os
import psycopg2
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
import numpy as np
import optuna
from sklearn.cluster import KMeans
import contextlib, sys, io, logging
from sklearn.mixture import GaussianMixture



from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score

In [2]:
# Connect to database and load data
db_params = {
    "host": os.getenv("LOCAL_HOST"),
    "user": os.getenv("LOCAL_USER"),
    "password": os.getenv("LOCAL_PW"),
    "port": os.getenv("LOCAL_PORT"),
    "dbname": os.getenv("LOCAL_DB")
}

try:
    conn = psycopg2.connect(**db_params)
    print("Database connection successful")
    sql_query = "SELECT * FROM dev.golden_table;"
    df = pd.read_sql_query(sql_query, conn)
    conn.close()
    print("Golden data loaded into DataFrame:")
    print(df.info())
except Exception as e:
    print(f"An error occurred: {e}")

Database connection successful


  df = pd.read_sql_query(sql_query, conn)


Golden data loaded into DataFrame:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23038 entries, 0 to 23037
Data columns (total 22 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   school_name             23038 non-null  object 
 1   school_type             23038 non-null  object 
 2   teachers_fte            22550 non-null  float64
 3   enrollment              22863 non-null  float64
 4   grade_eight_enrollment  21613 non-null  float64
 5   math_counts             22507 non-null  float64
 6   math_high_pct           22507 non-null  float64
 7   math_low_pct            19960 non-null  float64
 8   read_counts             22386 non-null  float64
 9   read_high_pct           22386 non-null  float64
 10  read_low_pct            19907 non-null  float64
 11  pct_hhi_150k_200k       23038 non-null  float64
 12  pct_hhi_220k_plus       23038 non-null  float64
 13  avg_natwalkind          23038 non-null  float64
 14  tot

## 1. Data Overview & Preprocessing



In [3]:
# Basic shape & preview
print(f"DataFrame shape: {df.shape}")
display(df.head())

df_numeric = df.select_dtypes(include=['int64','float64']).copy()
print(f"Numeric subset shape: {df_numeric.shape}")
missing_pct = df_numeric.isna().mean().sort_values(ascending=False)
print("Missing value percentage (top 15):")
display(missing_pct.head(15))

DataFrame shape: (23038, 22)


Unnamed: 0,school_name,school_type,teachers_fte,enrollment,grade_eight_enrollment,math_counts,math_high_pct,math_low_pct,read_counts,read_high_pct,...,pct_hhi_220k_plus,avg_natwalkind,total_10_14,pct_10_14,pct_female_10_14,total_pop,hhi_150k_200k,hhi_220k_plus,schools_in_zip,dup_rank
0,1 LT Charles W. Whitcomb School,1,93.0,1077.0,370.0,71.0,9.0,6.0,71.0,5.0,...,5.03,12.317521,2081,0,0.0,41505,1896,2086,2,1
1,100 Academy of Engineering and Technology MS,1,,147.0,47.0,5.0,49.0,0.0,7.0,49.0,...,0.75,12.120378,3841,0,0.0,47881,961,358,3,1
2,1R ELEMENTARY,1,12.0,191.0,25.0,9.0,79.0,80.0,9.0,59.0,...,1.34,8.287234,2008,0,0.0,25966,605,349,4,1
3,21st Century Charter Sch of Gary,1,96.0,1329.0,102.0,98.0,5.0,0.0,96.0,49.0,...,0.15,8.167196,528,0,0.0,6105,33,9,1,1
4,21st Century Cyber CS,1,72.0,1536.0,202.0,95.0,49.0,20.0,96.0,69.0,...,8.93,8.938298,2838,0,0.0,50510,2702,4511,4,1


Numeric subset shape: (23038, 20)
Missing value percentage (top 15):


read_low_pct              0.135906
math_low_pct              0.133605
grade_eight_enrollment    0.061854
read_counts               0.028301
read_high_pct             0.028301
math_counts               0.023049
math_high_pct             0.023049
teachers_fte              0.021182
enrollment                0.007596
pct_female_10_14          0.004384
schools_in_zip            0.000000
hhi_220k_plus             0.000000
hhi_150k_200k             0.000000
total_pop                 0.000000
pct_hhi_220k_plus         0.000000
dtype: float64

In [4]:
# Handle missing values: simple strategy (median). Could be enhanced later.

imputer = SimpleImputer(strategy='median')
scaled_features = imputer.fit_transform(df_numeric)
scaler = StandardScaler()
X = scaler.fit_transform(scaled_features)
print(f"Feature matrix ready. Shape: {X.shape}")

Feature matrix ready. Shape: (23038, 20)


## 2. Optimization Helpers (Optuna)
We define metric computation and a utility to optionally apply PCA inside each trial to reduce dimensionality (tuned as a hyperparameter).

In [5]:
# Ensure optuna is available (if running in an environment where pip install is allowed)


# Cache original data for reuse
X_full = X  # already scaled


def prepare_features(trial, X_input):
    """Optionally apply PCA controlled by trial hyperparameters."""
    use_pca = trial.suggest_categorical('use_pca', [True, False])
    if use_pca:
        # limit components between 2 and min(50, n_features)
        max_comp = min(50, X_input.shape[1])
        n_components = trial.suggest_int('pca_components', 2, max_comp)
        pca = PCA(n_components=n_components, random_state=42)
        X_red = pca.fit_transform(X_input)
        return X_red, pca
    return X_input, None


def compute_cluster_metrics(X_data, labels):
    # Guard for metrics requiring >1 cluster and fewer than n_samples clusters
    unique_labels = set(labels)
    if len(unique_labels) <= 1 or len(unique_labels) >= len(labels):
        return {
            'silhouette': float('nan'),
            'calinski_harabasz': float('nan'),
            'davies_bouldin': float('nan')
        }
    return {
        'silhouette': silhouette_score(X_data, labels),
        'calinski_harabasz': calinski_harabasz_score(X_data, labels),
        'davies_bouldin': davies_bouldin_score(X_data, labels)
    }


def objective_wrapper(build_model_fn):
    def objective(trial):
        X_trial, pca_obj = prepare_features(trial, X_full)
        model = build_model_fn(trial)
        labels = model.fit_predict(X_trial)
        metrics = compute_cluster_metrics(X_trial, labels)
        # We'll optimize on silhouette (maximize)
        trial.set_user_attr('metrics', metrics)
        if pca_obj is not None:
            trial.set_user_attr('pca_components_actual', getattr(pca_obj, 'n_components_', None))
        return metrics['silhouette']
    return objective

print("Helper functions and metrics ready.")

Helper functions and metrics ready.


## 3. KMeans Optimization
We search hyperparameters: n_clusters, init method, algorithm, optional PCA usage & components.

In [6]:
SUPPRESS_TRIAL_OUTPUT = True  # toggle this to see full trial logs

@contextlib.contextmanager
def silent_stdout(enabled=True):
    if not enabled:
        yield
        return
    new_target = io.StringIO()
    old_stdout = sys.stdout
    try:
        sys.stdout = new_target
        yield
    finally:
        sys.stdout = old_stdout

# Reduce Optuna logging level (shows only WARNING+)
if SUPPRESS_TRIAL_OUTPUT:
    optuna.logging.set_verbosity(optuna.logging.WARNING)
else:
    optuna.logging.set_verbosity(optuna.logging.INFO)

print("Optuna logging suppression active:" , SUPPRESS_TRIAL_OUTPUT)

Optuna logging suppression active: True


In [7]:
# --- KMeans Hyperparameter Optimization (enhanced) ---
import time
from sklearn.cluster import KMeans
import pandas as pd

N_KMEANS_TRIALS = 40  # adjust if you want a faster/slower search
REUSE_EXISTING_KMEANS_STUDY = False  # set True to skip re-optimizing if study_kmeans already present
KMEANS_STUDY_NAME = 'kmeans_clustering'

if REUSE_EXISTING_KMEANS_STUDY and 'study_kmeans' in globals():
    print('[KMeans] Reusing existing Optuna study; skipping optimization.')
else:
    def build_kmeans(trial):
        n_clusters = trial.suggest_int('kmeans_n_clusters', 2, 15)
        init = trial.suggest_categorical('kmeans_init', ['k-means++', 'random'])
        algorithm = trial.suggest_categorical('kmeans_algorithm', ['lloyd', 'elkan'])
        # Tune n_init (sklearn >=1.4 supports int or 'auto')
        n_init = trial.suggest_categorical('kmeans_n_init', [10, 20, 30, 'auto'])
        return KMeans(
            n_clusters=n_clusters,
            init=init,
            algorithm=algorithm,
            n_init=n_init,
            random_state=42
        )

    print(f"[KMeans] Running optimization (silence trials={SUPPRESS_TRIAL_OUTPUT}) ...")
    with silent_stdout(SUPPRESS_TRIAL_OUTPUT):
        study_kmeans = optuna.create_study(direction='maximize', study_name=KMEANS_STUDY_NAME)

        def objective_wrapper_with_runtime(build_model_fn):
            base_objective = objective_wrapper(build_model_fn)
            def _inner(trial):
                start = time.time()
                val = base_objective(trial)
                trial.set_user_attr('runtime_sec', time.time() - start)
                return val
            return _inner

        study_kmeans.optimize(
            objective_wrapper_with_runtime(build_kmeans),
            n_trials=N_KMEANS_TRIALS,
            show_progress_bar=not SUPPRESS_TRIAL_OUTPUT
        )

best_k_params = study_kmeans.best_trial.params
best_k_metrics = study_kmeans.best_trial.user_attrs.get('metrics', {})
print("[KMeans] Best Params:")
print(best_k_params)
print("[KMeans] Best Metrics:")
print(best_k_metrics)
print(f"[KMeans] Best silhouette: {study_kmeans.best_value:.4f}")

# Build results DataFrame (include inertia if available)
kmeans_results = []
for t in study_kmeans.trials:
    row = {**t.params}
    metrics = t.user_attrs.get('metrics', {})
    row.update(metrics)
    row['runtime_sec'] = t.user_attrs.get('runtime_sec')
    # inertia: recompute quickly if silhouette is valid and clusters >1
    try:
        if not np.isnan(metrics.get('silhouette', np.nan)) and 'kmeans_n_clusters' in t.params:
            # Refit minimal model (no PCA) ONLY for inertia if clusters moderate
            km_tmp = KMeans(
                n_clusters=t.params['kmeans_n_clusters'],
                init=t.params['kmeans_init'],
                algorithm=t.params['kmeans_algorithm'],
                n_init=t.params.get('kmeans_n_init','auto'),
                random_state=42
            ).fit(X_full)
            row['inertia'] = km_tmp.inertia_
        else:
            row['inertia'] = np.nan
    except Exception:
        row['inertia'] = np.nan
    kmeans_results.append(row)

kmeans_results_df = pd.DataFrame(kmeans_results)
if not kmeans_results_df.empty:
    # Rank by silhouette then inertia (lower inertia better)
    kmeans_results_df['inertia_rank'] = kmeans_results_df['inertia'].rank(method='min')
    display(kmeans_results_df.sort_values(['silhouette','inertia'], ascending=[False, True]).head(10))
    print('[KMeans] Summary:')
    print(kmeans_results_df[['silhouette','inertia','runtime_sec']].describe().round(3))
else:
    print('[KMeans] No trials recorded.')

[KMeans] Running optimization (silence trials=True) ...
[KMeans] Best Params:
{'use_pca': True, 'pca_components': 2, 'kmeans_n_clusters': 13, 'kmeans_init': 'random', 'kmeans_algorithm': 'elkan', 'kmeans_n_init': 'auto'}
[KMeans] Best Metrics:
{'silhouette': 0.34916301015486256, 'calinski_harabasz': 15715.756812735168, 'davies_bouldin': 0.8432115740169353}
[KMeans] Best silhouette: 0.3492


Unnamed: 0,use_pca,pca_components,kmeans_n_clusters,kmeans_init,kmeans_algorithm,kmeans_n_init,silhouette,calinski_harabasz,davies_bouldin,runtime_sec,inertia,inertia_rank
37,True,2.0,13,random,elkan,auto,0.349163,15715.756813,0.843212,8.965235,179065.321316,19.0
32,True,2.0,13,k-means++,lloyd,auto,0.348427,15578.822891,0.876889,6.668493,177618.839354,14.0
12,True,2.0,15,k-means++,lloyd,auto,0.343977,15217.226408,0.870908,7.614656,168067.687776,2.0
14,True,2.0,14,k-means++,lloyd,auto,0.343066,15548.924897,0.846677,8.490222,171651.770712,5.0
17,True,2.0,14,k-means++,lloyd,auto,0.343066,15548.924897,0.846677,7.56709,171651.770712,5.0
21,True,2.0,14,k-means++,lloyd,auto,0.343066,15548.924897,0.846677,6.618609,171651.770712,5.0
23,True,2.0,14,k-means++,lloyd,auto,0.343066,15548.924897,0.846677,6.626425,171651.770712,5.0
27,True,2.0,14,k-means++,lloyd,auto,0.343066,15548.924897,0.846677,6.279018,171651.770712,5.0
31,True,2.0,14,k-means++,lloyd,auto,0.343066,15548.924897,0.846677,6.154963,171651.770712,5.0
11,True,3.0,15,k-means++,lloyd,30,0.277631,7388.703815,1.01012,8.464087,167106.339981,1.0


[KMeans] Summary:
       silhouette     inertia  runtime_sec
count      40.000      40.000       40.000
mean        0.229  195456.174        7.772
std         0.075   38899.092        1.317
min         0.139  167106.340        6.155
25%         0.153  171651.771        6.744
50%         0.222  181334.923        7.515
75%         0.278  198768.027        8.471
max         0.349  335421.708       11.725


## 4. Gaussian Mixture (GMM) Optimization
We tune: n_components, covariance_type, reg_covar, and optional PCA usage.

In [8]:

print("Running GMM optimization (silence trials=", SUPPRESS_TRIAL_OUTPUT, ") ...")

def build_gmm(trial):
    n_components = trial.suggest_int('gmm_n_components', 2, 15)
    covariance_type = trial.suggest_categorical('gmm_covariance_type', ['full', 'tied', 'diag', 'spherical'])
    reg_covar = trial.suggest_float('gmm_reg_covar', 1e-6, 1e-2, log=True)
    return GaussianMixture(
        n_components=n_components,
        covariance_type=covariance_type,
        reg_covar=reg_covar,
        random_state=42
    )

with silent_stdout(SUPPRESS_TRIAL_OUTPUT):
    study_gmm = optuna.create_study(direction='maximize', study_name='gmm_clustering')
    study_gmm.optimize(objective_wrapper(build_gmm), n_trials=40, show_progress_bar=not SUPPRESS_TRIAL_OUTPUT)

best_g_params = study_gmm.best_trial.params
best_g_metrics = study_gmm.best_trial.user_attrs.get('metrics', {})
print("GMM Best Params:")
print(best_g_params)
print("GMM Best Metrics:")
print(best_g_metrics)
print(f"Best silhouette: {study_gmm.best_value:.4f}")

gmm_results = []
for t in study_gmm.trials:
    row = {**t.params}
    row.update(t.user_attrs.get('metrics', {}))
    gmm_results.append(row)
import pandas as pd
gmm_results_df = pd.DataFrame(gmm_results)
if not gmm_results_df.empty:
    display(gmm_results_df.sort_values('silhouette', ascending=False).head())

Running GMM optimization (silence trials= True ) ...
GMM Best Params:
{'use_pca': True, 'pca_components': 2, 'gmm_n_components': 11, 'gmm_covariance_type': 'spherical', 'gmm_reg_covar': 7.399960393515928e-06}
GMM Best Metrics:
{'silhouette': 0.3448087051671553, 'calinski_harabasz': 14657.689648074242, 'davies_bouldin': 0.9467695484157462}
Best silhouette: 0.3448


Unnamed: 0,use_pca,pca_components,gmm_n_components,gmm_covariance_type,gmm_reg_covar,silhouette,calinski_harabasz,davies_bouldin
36,True,2.0,11,spherical,7e-06,0.344809,14657.689648,0.94677
26,True,2.0,12,spherical,1.2e-05,0.335289,14251.354262,0.927694
32,True,2.0,14,diag,3e-06,0.333015,13243.796537,1.141978
14,True,2.0,13,tied,4e-06,0.324657,14163.022286,0.910668
22,True,2.0,14,tied,3e-06,0.295958,13761.372193,0.964681


## 5. Visualization (2D PCA Projections)
We project the full standardized feature matrix to 2 principal components (outside of optimization) for consistent side-by-side cluster plots.

In [9]:
import plotly.express as px

# Refit best models on (possibly PCA-transformed) feature space chosen by best trial
best_kmeans_params = study_kmeans.best_trial.params
best_gmm_params = study_gmm.best_trial.params

# Build best models again without PCA reduction (for consistent plotting base); PCA only for 2D viz
kmeans_best = KMeans(
    n_clusters=best_kmeans_params['kmeans_n_clusters'],
    init=best_kmeans_params['kmeans_init'],
    algorithm=best_kmeans_params['kmeans_algorithm'],
    n_init='auto',
    random_state=42
).fit(X_full)

gmm_best = GaussianMixture(
    n_components=best_gmm_params['gmm_n_components'],
    covariance_type=best_gmm_params['gmm_covariance_type'],
    reg_covar=best_gmm_params['gmm_reg_covar'],
    random_state=42
).fit(X_full)

labels_kmeans = kmeans_best.predict(X_full)
labels_gmm = gmm_best.predict(X_full)

# PCA for viz only
pca_viz = PCA(n_components=2, random_state=42)
X_2d = pca_viz.fit_transform(X_full)
plot_df = pd.DataFrame({
    'PC1': X_2d[:,0],
    'PC2': X_2d[:,1],
    'KMeans_Cluster': labels_kmeans.astype(str),
    'GMM_Cluster': labels_gmm.astype(str)
})

fig1 = px.scatter(plot_df, x='PC1', y='PC2', color='KMeans_Cluster', title='KMeans Clusters (PCA 2D)')
fig1.show()
fig2 = px.scatter(plot_df, x='PC1', y='PC2', color='GMM_Cluster', title='GMM Clusters (PCA 2D)')
fig2.show()

print("Visualization complete.")





This means that static image generation (e.g. `fig.write_image()`) will not work.

Please upgrade Plotly to version 6.1.1 or greater, or downgrade Kaleido to version 0.2.1.




Visualization complete.


## 6. Cluster Profiling & Comparison
Generate aggregate statistics per cluster for both algorithms and compare metrics side-by-side.

In [10]:
# Attach cluster labels back to original numeric df
profile_df = df_numeric.copy()
profile_df['kmeans_cluster'] = labels_kmeans
profile_df['gmm_cluster'] = labels_gmm

kmeans_profile = profile_df.groupby('kmeans_cluster').agg(['mean','median','count'])
gmm_profile = profile_df.groupby('gmm_cluster').agg(['mean','median','count'])

print("KMeans cluster profile (first 10 features):")
display(kmeans_profile.iloc[:, :30])  # limit columns for display
print("GMM cluster profile (first 10 features):")
display(gmm_profile.iloc[:, :30])

# Consolidate top metrics
comparison_df = pd.DataFrame({
    'model': ['KMeans', 'GMM'],
    'best_silhouette': [study_kmeans.best_value, study_gmm.best_value],
    'best_params': [study_kmeans.best_trial.params, study_gmm.best_trial.params]
})

# Fetch corresponding Calinski-Harabasz and Davies-Bouldin from best trials
comparison_df['calinski_harabasz'] = [
    study_kmeans.best_trial.user_attrs['metrics']['calinski_harabasz'],
    study_gmm.best_trial.user_attrs['metrics']['calinski_harabasz']
]
comparison_df['davies_bouldin'] = [
    study_kmeans.best_trial.user_attrs['metrics']['davies_bouldin'],
    study_gmm.best_trial.user_attrs['metrics']['davies_bouldin']
]

print("Model comparison metrics:")
display(comparison_df)

# Simple relative ranking summary
ranking = comparison_df[['model','best_silhouette','calinski_harabasz','davies_bouldin']].copy()
ranking['silhouette_rank'] = ranking['best_silhouette'].rank(ascending=False)
ranking['ch_rank'] = ranking['calinski_harabasz'].rank(ascending=False)
ranking['db_rank'] = ranking['davies_bouldin'].rank(ascending=True)
ranking['avg_rank'] = ranking[['silhouette_rank','ch_rank','db_rank']].mean(axis=1)
print("Ranking summary:")
display(ranking.sort_values('avg_rank'))

KMeans cluster profile (first 10 features):


Unnamed: 0_level_0,teachers_fte,teachers_fte,teachers_fte,enrollment,enrollment,enrollment,grade_eight_enrollment,grade_eight_enrollment,grade_eight_enrollment,math_counts,...,read_counts,read_high_pct,read_high_pct,read_high_pct,read_low_pct,read_low_pct,read_low_pct,pct_hhi_150k_200k,pct_hhi_150k_200k,pct_hhi_150k_200k
Unnamed: 0_level_1,mean,median,count,mean,median,count,mean,median,count,mean,...,count,mean,median,count,mean,median,count,mean,median,count
kmeans_cluster,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
0,112.359551,70.0,89,2014.288889,1252.0,90,560.444444,463.5,90,503.681319,...,91,66.230769,69.0,91,61.692308,60.0,91,3.504286,3.69,91
1,42.266458,35.5,1276,547.866099,472.5,1292,85.927007,52.0,959,12.732085,...,1225,24.498776,3.0,1225,30.983482,40.0,787,4.602374,4.52,1310
2,39.545641,36.0,975,538.113911,485.0,992,96.557778,73.0,900,26.97667,...,954,39.474843,49.0,954,28.154534,25.0,783,1.820518,1.63,1004
3,28.140751,26.0,2103,386.631308,365.0,2140,61.22878,43.0,2050,12.853839,...,2037,47.139421,49.0,2037,20.414646,21.0,1980,1.7257,1.585,2172
4,45.057644,43.0,1995,664.668955,634.0,2039,205.77695,195.0,2013,73.457644,...,2045,38.890465,49.0,2045,29.738765,30.0,2025,1.865078,1.68,2054
5,24.111957,21.0,2760,333.341859,284.0,2776,73.184739,50.0,2739,14.54076,...,2687,68.560104,74.0,2687,59.569832,60.0,2685,2.302055,2.06,2788
6,52.848522,51.0,1624,733.354878,698.0,1640,222.492629,215.0,1628,52.541514,...,1632,75.862745,79.0,1632,71.27897,70.0,1631,4.908117,4.88,1646
7,13.278412,9.0,2191,159.07754,71.0,2244,16.912543,5.0,1738,3.654169,...,2232,3.362007,3.0,2232,46.539806,50.0,515,1.98227,1.75,2278
8,18.954198,13.0,131,242.664179,141.0,134,37.792,21.0,125,16.557252,...,132,37.848485,49.0,132,41.69697,50.0,99,2.07306,1.41,134
9,17.258914,15.0,3113,220.515729,185.0,3147,39.598781,30.0,3118,14.877994,...,3045,48.444992,49.0,3045,24.533553,21.0,3040,1.566064,1.33,3171


GMM cluster profile (first 10 features):


Unnamed: 0_level_0,teachers_fte,teachers_fte,teachers_fte,enrollment,enrollment,enrollment,grade_eight_enrollment,grade_eight_enrollment,grade_eight_enrollment,math_counts,...,read_counts,read_high_pct,read_high_pct,read_high_pct,read_low_pct,read_low_pct,read_low_pct,pct_hhi_150k_200k,pct_hhi_150k_200k,pct_hhi_150k_200k
Unnamed: 0_level_1,mean,median,count,mean,median,count,mean,median,count,mean,...,count,mean,median,count,mean,median,count,mean,median,count
gmm_cluster,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
0,45.942286,44.0,1750,632.011312,606.0,1768,185.541714,174.0,1750,48.560822,...,1742,74.081515,79.0,1742,69.15977,70.0,1740,4.90363,4.83,1774
1,8.990826,6.0,1308,94.457804,51.0,1339,8.003795,4.0,1054,3.049483,...,1343,3.367833,3.0,1343,53.973783,50.0,267,1.680855,1.54,1357
2,31.075715,30.0,2483,455.165534,434.0,2501,132.742674,122.0,2491,80.807387,...,2514,57.739459,59.0,2514,47.894988,50.0,2514,1.83141,1.69,2518
3,30.809361,28.0,4978,422.304236,387.0,5075,94.689386,59.0,4607,25.759304,...,4900,32.485102,49.0,4900,24.03821,21.0,3978,2.30954,1.98,5150
4,65.560226,61.0,1951,972.070415,929.0,1974,283.534778,294.5,1754,43.104906,...,1911,57.642595,64.0,1911,57.937962,60.0,1757,4.551875,4.58,1979
5,21.559024,20.0,2499,292.369427,260.5,2512,60.845659,49.0,2488,19.564228,...,2446,69.645953,79.0,2446,59.806214,60.0,2446,2.033626,1.89,2521
6,37.204997,34.0,1561,522.61052,475.0,1597,100.81996,67.0,1483,26.73763,...,1547,39.005818,49.0,1547,27.145946,21.0,1295,1.795834,1.62,1613
7,15.83806,14.0,2433,199.534799,169.0,2457,33.08046,27.0,2436,10.865758,...,2385,49.102306,49.0,2385,23.533977,21.0,2384,1.405394,1.27,2473
8,16.920635,13.0,126,210.75969,130.0,129,33.783333,20.5,120,15.777778,...,127,36.944882,49.0,127,40.297872,50.0,94,1.872171,1.18,129
9,51.76178,50.0,3056,820.23372,788.5,3102,254.515063,255.0,3087,58.411043,...,3088,64.443005,64.0,3088,57.925518,60.0,3088,2.384764,2.33,3113


Model comparison metrics:


Unnamed: 0,model,best_silhouette,best_params,calinski_harabasz,davies_bouldin
0,KMeans,0.349163,"{'use_pca': True, 'pca_components': 2, 'kmeans...",15715.756813,0.843212
1,GMM,0.344809,"{'use_pca': True, 'pca_components': 2, 'gmm_n_...",14657.689648,0.94677


Ranking summary:


Unnamed: 0,model,best_silhouette,calinski_harabasz,davies_bouldin,silhouette_rank,ch_rank,db_rank,avg_rank
0,KMeans,0.349163,15715.756813,0.843212,1.0,1.0,1.0,1.0
1,GMM,0.344809,14657.689648,0.94677,2.0,2.0,2.0,2.0


## 7. Notes & Next Steps
Potential enhancements:
- Add DBSCAN / HDBSCAN for density-based perspective.
- Use feature selection or domain-driven grouping before clustering.
- Evaluate stability across bootstrap samples.
- Store cluster assignments back to database for downstream analytics.

## 8. Persist Best Models
Save best KMeans and GMM models, hyperparameters, and metrics into `src/unsupervised/` for reuse.

In [11]:
import json, joblib, pathlib, datetime

# Directory to save models
save_dir = pathlib.Path('src') / 'unsupervised'
save_dir.mkdir(parents=True, exist_ok=True)

# Recreate best models if not already in memory (guard if cell order differs)
if 'kmeans_best' not in globals():
    from sklearn.cluster import KMeans
    kmeans_best = KMeans(
        n_clusters=study_kmeans.best_trial.params['kmeans_n_clusters'],
        init=study_kmeans.best_trial.params['kmeans_init'],
        algorithm=study_kmeans.best_trial.params['kmeans_algorithm'],
        n_init='auto',
        random_state=42
    ).fit(X_full)

# GMM study might not yet be executed; wrap in try
try:
    if 'study_gmm' in globals():
        from sklearn.mixture import GaussianMixture
        if 'gmm_best' not in globals():
            gmm_best = GaussianMixture(
                n_components=study_gmm.best_trial.params['gmm_n_components'],
                covariance_type=study_gmm.best_trial.params['gmm_covariance_type'],
                reg_covar=study_gmm.best_trial.params['gmm_reg_covar'],
                random_state=42
            ).fit(X_full)
    else:
        gmm_best = None
except Exception as e:
    print(f"Could not rebuild GMM best model: {e}")
    gmm_best = None

# Metadata assembly
now_ts = datetime.datetime.utcnow().isoformat() + 'Z'
meta = {
    'timestamp_utc': now_ts,
    'kmeans': {
        'params': study_kmeans.best_trial.params,
        'metrics': study_kmeans.best_trial.user_attrs.get('metrics', {})
    },
}
if 'study_gmm' in globals():
    meta['gmm'] = {
        'params': study_gmm.best_trial.params,
        'metrics': study_gmm.best_trial.user_attrs.get('metrics', {})
    }
else:
    meta['gmm'] = None

# Save models & meta
joblib.dump(kmeans_best, save_dir / 'kmeans_best_model.joblib')
if 'gmm_best' in globals() and gmm_best is not None:
    joblib.dump(gmm_best, save_dir / 'gmm_best_model.joblib')
with open(save_dir / 'unsupervised_models_metadata.json', 'w') as f:
    json.dump(meta, f, indent=2)

print(f"Saved artifacts to: {save_dir.resolve()}")
print("Files:")
for p in save_dir.glob('*model.joblib'):
    print(' -', p.name)
print(' - unsupervised_models_metadata.json')

Saved artifacts to: D:\docs\MADS\696-Milestone 2\src\src\unsupervised
Files:
 - gmm_best_model.joblib
 - kmeans_best_model.joblib
 - unsupervised_models_metadata.json


Metrics to compute later:
- Silhouette Score (higher better)
- Calinski-Harabasz Index (higher better)
- Davies-Bouldin Index (lower better)

We'll define reusable helper functions so both model families share logic.

### Loading Later
```python
import joblib, json
kmeans = joblib.load('src/unsupervised/kmeans_best_model.joblib')
try:
    gmm = joblib.load('src/unsupervised/gmm_best_model.joblib')
except FileNotFoundError:
    gmm = None
with open('src/unsupervised/unsupervised_models_metadata.json') as f:
    meta = json.load(f)
```
The metadata file contains hyperparameters and validation metrics captured at save time.