# Module 4.2 Unsupervised Learning Analysis

Exploratory notebook for clustering and anomaly detection on a synthetic semiconductor process feature space.

Sections:
1. Generate synthetic dataset
2. PCA dimensionality inspection
3. Compare clustering algorithms (KMeans, GMM, DBSCAN)
4. Hybrid clustering + IsolationForest anomaly scoring
5. Metrics summary & guardrail flags
6. Next steps / observations

In [None]:
import importlib
import json
from pathlib import Path

import pandas as pd

# Local import of pipeline utilities.
# The module name starts with a digit, so it is not a valid Python identifier
# and a plain ``from 4_2_unsupervised_pipeline import ...`` statement is a
# SyntaxError. Loading the module by its string name avoids that.
# NOTE(review): this relies on the directory containing
# 4_2_unsupervised_pipeline.py being on sys.path (e.g. the notebook's working
# directory) -- confirm for your environment.
_pipeline_module = importlib.import_module('4_2_unsupervised_pipeline')
generate_synthetic_process = _pipeline_module.generate_synthetic_process
UnsupervisedPipeline = _pipeline_module.UnsupervisedPipeline

# 1. Generate synthetic dataset
raw_df = generate_synthetic_process(n_samples=1200, n_latent=5, n_features=18)
# Preview the first rows (rendered as the cell output in a notebook).
raw_df.head()

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import numpy as np

# 2. PCA dimensionality inspection
# Feature columns are selected purely by name: every column starting with 'f'.
feature_cols = [col for col in raw_df.columns if col.startswith('f')]
X = raw_df[feature_cols].values

# Standardize the features before PCA.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

# A float n_components asks PCA to keep enough components to reach that
# fraction of explained variance (here 95%).
pca = PCA(n_components=0.95, random_state=42)
X_pca = pca.fit_transform(X_scaled)

explained = pca.explained_variance_ratio_
cumulative_variance = np.cumsum(explained)
print('PCA components retained:', pca.n_components_)
print('Cumulative variance:', cumulative_variance)

In [None]:
import importlib

# The pipeline module name starts with a digit, so it cannot be used in a
# regular ``from ... import ...`` statement (SyntaxError); load it by name.
UnsupervisedPipeline = importlib.import_module(
    '4_2_unsupervised_pipeline'
).UnsupervisedPipeline

# 3. Compare clustering algorithms: fit and evaluate each model on the same
# feature columns and collect its metrics under the model name.
models = ['kmeans', 'gmm', 'dbscan']
results = {}
for m in models:
    # Every model uses the same PCA variance target (0.95).
    # NOTE(review): n_clusters is also passed for 'dbscan'; presumably the
    # pipeline ignores it for that model -- confirm against the pipeline code.
    pipe = UnsupervisedPipeline(model=m, n_clusters=6, pca_variance=0.95)
    pipe.fit(raw_df[feature_cols])
    eval_out = pipe.evaluate(raw_df[feature_cols])
    results[m] = eval_out['metrics']

# Print rather than leave a bare expression: the bare string would be shown as
# an escaped repr in a notebook and not shown at all when run as a script.
print(json.dumps(results, indent=2))

In [None]:
# 4. Hybrid clustering + anomaly scoring using the pipeline's 'kmeans_iso' model.
hybrid_kwargs = {'model': 'kmeans_iso', 'n_clusters': 6, 'pca_variance': 0.95}
hybrid = UnsupervisedPipeline(**hybrid_kwargs)

# Fit, predict and evaluate on the same feature columns (a fresh selection is
# taken for each call).
hybrid.fit(raw_df[feature_cols])
clust_pred = hybrid.predict(raw_df[feature_cols])
hybrid_eval = hybrid.evaluate(raw_df[feature_cols])

hybrid_metrics_json = json.dumps(hybrid_eval['metrics'], indent=2)
print('Hybrid metrics:', hybrid_metrics_json)

# Show only the first 20 anomaly flags as a quick sanity check.
anomaly_flag_preview = clust_pred['anomaly_flag'][:20]
print('Anomaly flags sample:', anomaly_flag_preview)

In [None]:
import pandas as pd

# 5. Metrics summary: `results` is keyed by model name, so transposing puts
# each model on its own row (assuming every metrics entry is a mapping of
# metric name -> value).
metrics_by_model = pd.DataFrame(results)
summary_df = metrics_by_model.transpose()
summary_df

## Observations & Next Steps

- Review guardrail warnings to tune parameters.
- Consider adding HDBSCAN to handle variable-density clusters in a future module.
- Track baseline cluster_size_entropy over multiple simulated runs to establish stability band.
