In [None]:
import os

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from dotenv import load_dotenv
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sqlalchemy import create_engine
from sqlalchemy.engine import URL


# Load connection settings from a local .env file into the process environment.
load_dotenv()

DB_USER = os.getenv("DB_USER")
DB_PASSWORD = os.getenv("DB_PASSWORD")
DB_HOST = os.getenv("DB_HOST")
DB_PORT = os.getenv("DB_PORT")
DB_NAME = os.getenv("DB_NAME")

# Fail early with a readable message instead of silently interpolating the
# literal string "None" into the connection URL when a setting is missing.
_missing_settings = [
    name
    for name, value in (("DB_USER", DB_USER), ("DB_HOST", DB_HOST), ("DB_NAME", DB_NAME))
    if not value
]
if _missing_settings:
    raise RuntimeError(
        "Missing database settings in environment/.env: " + ", ".join(_missing_settings)
    )

# 1 - create engine:
# Build the URL from its components rather than an f-string so that special
# characters in the credentials (e.g. "@", ":", "/") are escaped correctly.
# The .env values are expected to be the raw (not URL-encoded) credentials.
engine = create_engine(
    URL.create(
        drivername="postgresql+psycopg2",
        username=DB_USER,
        password=DB_PASSWORD,
        host=DB_HOST,
        # Port is optional; when unset the driver's default port is used.
        port=int(DB_PORT) if DB_PORT else None,
        database=DB_NAME,
    )
)

In [None]:
# Read every column of the neighborhood price-analysis fact table.
query = """
SELECT *
FROM fact_neighborhood_price_analysis;
"""

# Run the query through the SQLAlchemy engine and keep the result as `df`.
df = pd.read_sql(sql=query, con=engine)

# Preview the first rows as the cell's displayed output.
df.head(n=5)

In [None]:
# Number of rows loaded from the database.
print(df.shape[0])

#### Data preparation for clustering:


In [None]:
# Remove the identifier column: "neighborhood" labels each row and is not a
# feature to cluster on (a missing column still raises KeyError).
# Also drop "cluster" if it is present: a later cell writes the labels back
# onto `df`, so re-running this cell would otherwise feed the previous
# clustering result into the feature matrix.
df_model = df.drop(columns=["neighborhood"]).drop(columns=["cluster"], errors="ignore")

In [None]:
# Count missing values per feature column.
df_model.isna().sum()

In [None]:
# Fill NaNs in the air quality columns with each column's mean.
# Assign the result back instead of calling `fillna(..., inplace=True)` on
# `df_model[col]`: that is chained assignment, which warns on recent pandas and
# leaves `df_model` unchanged once Copy-on-Write is enabled.
air_quality_cols = ["avg_fine_particles", "avg_no2"]
df_model[air_quality_cols] = df_model[air_quality_cols].fillna(
    df_model[air_quality_cols].mean()  # Series of per-column means, matched by column name
)

# verify: both counts should now be zero
df_model[air_quality_cols].isnull().sum()

In [None]:
# SCALE FEATURES:
# Standardize every feature column so that no single column dominates the
# distance computations in KMeans.
scaler = StandardScaler()
scaler.fit(df_model)
X_scaled = scaler.transform(df_model)

In [None]:
# heatmap:
# Pairwise correlation between the feature columns, drawn on an explicit axes.
corr = df_model.corr()

fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(
    data=corr,
    annot=True,
    fmt=".2f",
    cmap="coolwarm",
    linewidths=0.5,
    ax=ax,
)
ax.set_title("Feature Correlation Heatmap")
plt.show()

In [None]:
# Elbow method to choose number of clusters:
# Fit KMeans for k = 1..10 and record the inertia of each fit.
k_values = range(1, 11)

inertia = []
for k in k_values:
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10).fit(X_scaled)
    inertia.append(kmeans.inertia_)

# Inertia versus k; the "elbow" of the curve suggests a reasonable k.
fig, ax = plt.subplots()
ax.plot(k_values, inertia)
ax.set_xlabel("Number of clusters")
ax.set_ylabel("Inertia")
ax.set_title("Elbow Method")
plt.show()

In [None]:
# KMeans
# Fit the final 4-cluster model on the scaled features.
kmeans = KMeans(n_clusters=4, random_state=42, n_init=10)
kmeans.fit(X_scaled)

# One label per row, in the same order as the rows of `df`.
clusters = kmeans.labels_

# Attach the labels to the original frame (which still has "neighborhood").
df["cluster"] = clusters

In [None]:
# quick check: how many rows ended up in each cluster
df.loc[:, "cluster"].value_counts()

In [None]:
# Mean of every remaining column per (cluster, neighborhood) pair.
df.groupby(by=["cluster", "neighborhood"]).mean()

In [None]:
# Show the dtype of every column in `df` (including the added "cluster" column).
df.dtypes


In [None]:
# Mean of each feature per cluster.
# `df_model` has no "cluster" column (the labels were written to `df`), so
# group by the label Series from `df`; it aligns with `df_model` on the shared
# row index.
df_model.groupby(df["cluster"]).mean()

In [None]:
# visualizations:
# Project the scaled features onto their first two principal components.
pca = PCA(n_components=2)
components = pca.fit_transform(X_scaled)

# Scatter of the 2-D projection, colored by the assigned cluster label.
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(components[:, 0], components[:, 1], c=df["cluster"])
ax.set_xlabel("PC1")
ax.set_ylabel("PC2")
ax.set_title("Neighborhood Clusters")
plt.show()