In [None]:
# MODELING
# --------------------------------------------------------------------------------------

# Select modeling techniques: Determine which algorithms to try (e.g. regression, neural
# net).
# 
# Generate test design: Depending on your modeling approach, you might need to split the
# data into training, test, and validation sets.
# 
# Build model: As glamorous as this might sound, it may amount to executing just a few
# lines of code like "reg = LinearRegression().fit(X, y)".
# 
# Assess model: Generally, multiple models are competing against each other, and the
# data scientist needs to interpret the model results based on domain knowledge, the
# pre-defined success criteria, and the test design.

In [None]:
# CONCLUSIONS
# --------------------------------------------------------------------------------------

# ...

In [1]:
import os

import numpy as np
import pandas as pd
import plotly.graph_objects as go
import plotly.subplots as sp
from sklearn import metrics
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

import utils

# Notebook configuration and data loading
# --------------------------------------------------------------------------------------

# pandas display: no cap on the number of columns shown, and do not wrap wide
# frames across several lines.
for option, value in (
    ("display.max_columns", None),
    ("display.expand_frame_repr", False),
):
    pd.set_option(option, value)

# Create the folders this notebook writes to (no error if they already exist).
for folder in ("../output/d-modeling", "../data/d-modeling"):
    os.makedirs(folder, exist_ok=True)

# Year used to build every input and output file name below.
YEAR = "2022"

# Load the two CSVs written by the data-preparation stage for that year.
file = f"../data/c-data-preparation/{YEAR}-raw.csv"
raw = pd.read_csv(file)

file = f"../data/c-data-preparation/{YEAR}-preprocessed.csv"
preprocessed = pd.read_csv(file)

In [None]:
# Print the first rows of the `raw` frame loaded from the "-raw.csv" file.
print(raw.head())

In [None]:
# Print the first rows of the `preprocessed` frame loaded from the
# "-preprocessed.csv" file.
print(preprocessed.head())

In [None]:
# Elbow method to determine the optimal number of clusters
# --------------------------------------------------------------------------------------
# Fits one KMeans model per candidate k on the coordinate columns and records
# two quality measures per fit: the model inertia and the silhouette score.

# Features used for clustering.
columns = ["latitude", "longitude"]
X = preprocessed[columns]

# Candidate cluster counts: from kmin up to, but not including, kmax, where
# kmax is the integer part of sqrt(number of rows).
length = len(preprocessed)
kmin = 2
kmax = int(np.sqrt(length))
ks = range(kmin, kmax)

# One entry per candidate k, in the same order as `ks`.
inertia = []
silhouette = []

for i, k in enumerate(ks):
    print(f"[{i + 1}] n_clusters: {k}")

    # fit() returns the fitted estimator, so construction and fitting can be chained.
    model = KMeans(n_clusters=k, random_state=42).fit(X)

    inertia.append(model.inertia_)
    silhouette.append(silhouette_score(X, model.labels_))

In [None]:
# Elbow method to determine the optimal number of clusters (plot)
# --------------------------------------------------------------------------------------
# Draws the two curves collected by the sweep side by side: inertia on the
# left, silhouette score on the right, both against the number of clusters.

titles = ("Inertia vs Number of Clusters", "Silhouette Score vs Number of Clusters")
fig = sp.make_subplots(rows=1, cols=2, subplot_titles=titles)

# (subplot column, y values, label); the label is used both as the trace name
# and as the y-axis title of that subplot.
x_values = list(ks)
panels = (
    (1, inertia, "Inertia"),
    (2, silhouette, "Silhouette Score"),
)

for col, y_values, label in panels:
    trace = go.Scatter(x=x_values, y=y_values, mode="lines+markers", name=label)
    fig.add_trace(trace, row=1, col=col)
    fig.update_xaxes(title_text="Number of Clusters", row=1, col=col)
    fig.update_yaxes(title_text=label, row=1, col=col)

fig.update_layout(title="K-Means Clustering Evaluation: Elbow Method", showlegend=False)

fig.show()

In [None]:
# Build top 3 models according to silhouette score
# --------------------------------------------------------------------------------------
# For each selected k: fit KMeans on the coordinate columns, print the
# silhouette coefficient, and hand the labels to utils.plot_map together with
# an output path and a title.

# Hard-coded cluster counts (per the header above, the three best silhouette
# scores of the sweep). NOTE(review): these are not derived from `silhouette`,
# so re-check them whenever the input data or YEAR changes.
# Stored under their own name on purpose: the evaluation plot cell uses the
# sweep's `ks` range as its x-axis, and overwriting `ks` here would leave that
# cell with 3 x values against the full-length inertia/silhouette lists on re-run.
top_ks = [48, 57, 70]

# The feature matrix does not depend on k, so build it once outside the loop.
X = preprocessed[columns].values

for i, k in enumerate(top_ks):
    header = f"k: {k}"
    print(header)
    print("-" * len(header))
    print(f"Columns: {columns}")

    model = KMeans(n_clusters=k, random_state=42)
    model.fit(X)

    # Cluster index assigned to each row of X, and the resulting silhouette score.
    labels = model.labels_
    score = metrics.silhouette_score(X, labels)

    # Output path (no extension here) and plot title passed to the plotting helper.
    dst = f"../output/d-modeling/{YEAR}-kmeans-{k}-{i}"
    title = (
        f"k: {k}, score: {score:.3f}<br>"
        f"columns: {columns}"
    )

    print(f"Silhouette coefficient: {score:.3f}\n")
    # presumably draws the rows of `raw` coloured by cluster label and writes
    # the result to `dst` -- confirm against utils.plot_map.
    utils.plot_map(raw, labels, dst, title)

In [7]:
# Build final model: regioes
# --------------------------------------------------------------------------------------
# Fits the chosen KMeans configuration on the coordinate columns, reports its
# silhouette coefficient, and exports `raw` with the cluster assignment added.

k = 48
columns = ["latitude", "longitude"]

# Header, underline of matching width, and the feature list, one per line.
header = f"k: {k}"
print(header, "-" * len(header), f"Columns: {columns}", sep="\n")

X = preprocessed[columns].values

# fit() returns the fitted estimator, so construction and fitting can be chained.
model = KMeans(n_clusters=k, random_state=42).fit(X)

# Cluster index assigned to each row of X, and the resulting silhouette score.
labels = model.labels_
score = metrics.silhouette_score(X, labels)

print(f"Silhouette coefficient: {score:.3f}\n")

# Save dataset with cluster column to csv
# --------------------------------------------------------------------------------------

# Labels are attached positionally, so `raw` must have one row per row of
# `preprocessed`. Note that this adds the column to `raw` in place.
raw["cluster"] = labels
raw.to_csv(f"../data/d-modeling/{YEAR}-regioes.csv", index=False)

k: 48
-----
Columns: ['latitude', 'longitude']
Silhouette coefficient: 0.557

