# Week 11 — DBSCAN and Agglomerative Clustering (CKD Dataset)

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    confusion_matrix,
    classification_report,
    silhouette_score
)

from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering


In [None]:
# Load the CKD dataset from disk.
csv_path = "ckd_dataset_v2.csv"  # change if needed
df = pd.read_csv(csv_path)

# Normalise the header: trim surrounding whitespace, then lower-case, so that
# later column lookups such as "target" do not depend on the file's casing.
trimmed_names = df.columns.str.strip()
df.columns = trimmed_names.str.lower()

print("Shape:", df.shape)
# Last expression of the cell: the notebook renders the first rows as a preview.
df.head()


In [None]:
# Build the feature matrix for clustering from the numeric columns only.
#
# Label/output columns are excluded on purpose:
#   * "target" is the label the clusters are compared against further down;
#     clustering on it would leak the answer into the clusters.
#   * "dbscan_cluster" / "agg_cluster" are written back to df by later cells;
#     without this filter, re-running this cell would feed earlier cluster ids
#     back in as features and silently change the results.
non_feature_cols = {"target", "dbscan_cluster", "agg_cluster"}
numeric_cols = [
    col
    for col in df.select_dtypes(include=[np.number]).columns
    if col not in non_feature_cols
]

# Drop columns with no observed values at all: their median is NaN, so the
# median imputation below would leave NaN in place and the clusterers would
# reject the matrix.
X_cluster = df[numeric_cols].dropna(axis=1, how="all")
numeric_cols = X_cluster.columns.tolist()

# Impute remaining gaps with each column's median.
X_cluster = X_cluster.fillna(X_cluster.median())

# Standardise so every feature contributes on a comparable scale to the
# distance computations used by the clustering algorithms.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_cluster)

print("Numeric features:", numeric_cols)

In [None]:
# Density-based clustering on the standardised features.
# scikit-learn assigns the label -1 to points it treats as noise.
dbscan = DBSCAN(eps=1.5, min_samples=5)
dbscan.fit(X_scaled)
db_labels = dbscan.labels_

# Keep the assignment next to the original rows for later inspection/plotting.
df["dbscan_cluster"] = db_labels

# Summarise which labels were produced and how many rows fall under each.
unique_labels = np.unique(db_labels)
label_counts = pd.Series(db_labels).value_counts()
print("DBSCAN labels:", unique_labels)
print("Counts:\n", label_counts)

# When a label column is present, show its row-normalised distribution
# within each DBSCAN cluster.
if "target" in df.columns:
    dbscan_vs_target = pd.crosstab(
        df["dbscan_cluster"],
        df["target"],
        normalize="index",
    )
    print(dbscan_vs_target)


In [None]:
# Use the first two numeric features as plot axes; the agglomerative plot
# further down reuses feat_x / feat_y.
feat_x, feat_y = numeric_cols[0], numeric_cols[1]

# Scatter of the raw (unscaled) feature values, coloured by DBSCAN label.
fig, ax = plt.subplots()
ax.scatter(df[feat_x], df[feat_y], c=df["dbscan_cluster"], alpha=0.7)
ax.set_xlabel(feat_x)
ax.set_ylabel(feat_y)
ax.set_title("DBSCAN clusters (including noise)")
plt.show()

In [None]:
# Hierarchical (agglomerative) clustering into three groups on the
# standardised features, using scikit-learn's default linkage settings.
agg = AgglomerativeClustering(n_clusters=3)
agg.fit(X_scaled)
agg_labels = agg.labels_

# Store the assignment alongside the original rows.
df["agg_cluster"] = agg_labels

# When a label column is present, show its row-normalised distribution
# within each agglomerative cluster.
if "target" in df.columns:
    agg_vs_target = pd.crosstab(
        df["agg_cluster"],
        df["target"],
        normalize="index",
    )
    print(agg_vs_target)

In [None]:
# Scatter of the same two features used for the DBSCAN plot (feat_x / feat_y
# are defined in that earlier cell), coloured by agglomerative cluster id.
fig, ax = plt.subplots()
ax.scatter(df[feat_x], df[feat_y], c=df["agg_cluster"], alpha=0.7)
ax.set_xlabel(feat_x)
ax.set_ylabel(feat_y)
ax.set_title("Agglomerative clusters")
plt.show()

### Notes
- Use the DBSCAN parameters (`eps`, `min_samples`), the resulting cluster counts, and any noise points (label `-1`) in Milestone Two.
- Compare Agglomerative clusters to CKD labels for additional insight.