# Unsupervised Machine Learning Analysis
## Goal: Search for Interesting Clusters That Represent Unique Patterns of Electricity Usage

In [14]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler


In [15]:
# Import test and train data
test = pd.read_feather('data/test.feather')
train = pd.read_feather('data/train.feather')

In [16]:
# Combine datasets
df_all = pd.concat([train, test], ignore_index=True)


In [17]:
# Convert YEAR_MONTH to extract time features
df_all['month'] = df_all['YEAR_MONTH'].dt.month
df_all['year'] = df_all['YEAR_MONTH'].dt.year
df_all['month_sin'] = np.sin(2 * np.pi * df_all['month'] / 12)
df_all['month_cos'] = np.cos(2 * np.pi * df_all['month'] / 12)

# Create lag and rolling features
df_all['lag_1'] = df_all['TOTALKWH'].shift(1)
df_all['lag_12'] = df_all['TOTALKWH'].shift(12)
df_all['rolling_mean_3'] = df_all['TOTALKWH'].shift(1).rolling(3).mean()
df_all = df_all.dropna()
features = ['tempAvg', 'winddirAvg', 'population', 'zipcode', 'longitude', 'latitude', 'density', 'month_sin', 'month_cos', 'lag_1', 'lag_12', 'rolling_mean_3']

# Scale Feature
scaler = StandardScaler()
X_scaled = scaler.fit_transform(df_all[features + ['month_sin', 'month_cos']])

In [18]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import numpy as np

# Try different cluster numbers
best_k = None
best_score = -1
best_labels = None

for k in [2, 3, 4, 5, 6]:
    kmeans = KMeans(n_clusters=k)
    labels = kmeans.fit_predict(X_scaled)
    score = silhouette_score(X_scaled, labels)
    print(f'k={k}, silhouette score={score:.4f}')
    
    if score > best_score:
        best_score = score
        best_k = k
        best_labels = labels

print(f'Best number of clusters: {best_k}')
df_all['cluster'] = best_labels

k=2, silhouette score=0.1870
k=3, silhouette score=0.1642
k=4, silhouette score=0.1463
k=5, silhouette score=0.1696
k=6, silhouette score=0.1590
Best number of clusters: 2


As the best number of clusters is 2 according to this tuning exercise, this indicates that KMeans modeling might not be the best clustering method for this task. The data is mostly unimodal or has only one clear separation (e.g., summer vs. winter). KMeans also can't capture temporal continuity or sequence â€” it's static, not dynamic.