In [None]:
import pandas as pd
from pathlib import Path
import numpy as np

import plotly.express as px
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import Normalizer, MinMaxScaler, StandardScaler, RobustScaler
from sklearn.decomposition import TruncatedSVD

from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.metrics import pairwise_distances, silhouette_score

In [None]:
# Build the workbook location relative to the notebook's working directory,
# then make it absolute so the file being read is unambiguous.
relative_workbook = Path("../data/processed/cadastro_clientes_demanda.xlsx")
processed_data_path = relative_workbook.resolve()

# Read the workbook with pandas' default options and display the frame.
data = pd.read_excel(processed_data_path)
data

In [None]:
# Discard the "Empresa" and "Classe" columns; every other column is kept.
# NOTE: this rebinds `data`, so re-running the cell on the already-reduced
# frame raises KeyError (the columns no longer exist).
data = data.drop(columns=["Empresa", "Classe"])
data

In [None]:
# Columns left out of the feature matrix entirely.
excluded_columns = ["Instalação", "Demanda contratada (kW)"]
# Columns kept in the feature matrix as-is, i.e. not one-hot encoded
# (presumably already binary flags — TODO confirm against the workbook).
passthrough_columns = ["GD_Beneficiária", "GD_Geradora"]

feature_frame = data.drop(columns=excluded_columns)

# Every remaining column other than the pass-through ones gets expanded
# into indicator columns named "<column>_<value>".
dummies_columns = feature_frame.drop(columns=passthrough_columns).columns
X = pd.get_dummies(feature_frame, columns=dummies_columns.tolist())
X

In [None]:
# Magnitude of the Spearman rank correlation between every pair of features.
spearman = X.corr(method="spearman")
cor_matrix = spearman.abs()

# Keep only the cells strictly below the diagonal: the matrix is symmetric,
# so the diagonal and the upper triangle carry no extra information.
below_diagonal = np.tril(np.ones(cor_matrix.shape), k=-1).astype(bool)
tri_lower = cor_matrix.where(below_diagonal)

sns.heatmap(tri_lower, annot=True, fmt="0.1f")
plt.show()

In [None]:
# Earlier alternative selection, kept for reference:
# X_pos = X.drop(["Classificação_-", "Tipo_Alta/Media Tensão ", "Tipo_THS_ Azul", "Operando_THS_ Azul"], axis=1)

# Indicator columns removed before clustering (presumably picked from the
# correlation heatmap of the full matrix — TODO confirm the criterion).
# Several names carry a trailing or inner space; they must match exactly.
dropped_columns = [
    "Classificação_-",
    "Classificação_Livre",
    "Tipo_Livre ",
    "Operando_Livre ",
    "Tipo_THS_ Verde",
    "Operando_THS_ Verde",
    "Horário_Normal",
]
X_pos = X.drop(columns=dropped_columns)

# Signed Spearman correlation of the remaining features (no .abs() here),
# shown for the strictly-lower triangle only.
cor_matrix = X_pos.corr(method="spearman")
below_diagonal = np.tril(np.ones(cor_matrix.shape), k=-1).astype(bool)
tri_lower = cor_matrix.where(below_diagonal)

sns.heatmap(tri_lower, annot=True, fmt="0.1f")
plt.show()


In [None]:
# Silhouette sweep for agglomerative clustering over 2..9 clusters.
cluster_range = range(2, 10)

# The pairwise Hamming distances depend only on X_pos, not on the number of
# clusters, so they are computed once instead of on every iteration.
distance_matrix = pairwise_distances(X_pos.astype(int), metric="hamming")

scores = []
for n_cluster in cluster_range:
    agg_clustering = AgglomerativeClustering(n_clusters=n_cluster)
    y_pred = agg_clustering.fit_predict(X_pos)
    # Score the partition against the precomputed Hamming distances.
    score = silhouette_score(distance_matrix, y_pred, metric="precomputed")
    scores.append(score)

# NOTE: after the loop `y_pred` holds the labels of the last fit
# (n_clusters=9); later cells read `y_pred` until it is reassigned.

plt.plot(list(cluster_range), scores)
plt.xlabel("Number of clusters")
plt.ylabel("Silhouette score (Hamming)")
plt.show()

In [None]:
# Project the encoded features onto two SVD components for plotting.
# NOTE(review): the projection is fitted on `X` (all indicator columns),
# while the labels in `y_pred` were produced from `X_pos` — confirm this
# is intended.
svd = TruncatedSVD(n_components=2, random_state=5)
components = svd.fit_transform(X)

# `y_pred` is whatever labelling was computed most recently in the kernel.
X_reduced = pd.DataFrame(components, columns=["f1", "f2"]).assign(cluster=y_pred)

# Fraction of the variance retained by the two components.
print(svd.explained_variance_ratio_.sum())

sns.scatterplot(data=X_reduced, x="f1", y="f2", hue="cluster")
plt.show()

In [None]:

# Density-based clustering with scikit-learn's default DBSCAN parameters.
clustering = DBSCAN()
y_pred = clustering.fit_predict(X_pos)

# Silhouette of the labelling against Hamming distances. Noise samples
# (label -1) are scored as if they formed one more cluster.
distance_matrix = pairwise_distances(X_pos.astype(int), metric="hamming")
score = silhouette_score(distance_matrix, y_pred, metric="precomputed")

# DBSCAN marks noise samples with the label -1; that label is not a
# cluster, so it is left out of the cluster count and reported separately.
found_labels = set(clustering.labels_)
n_noise = int((clustering.labels_ == -1).sum())
n_clusters = len(found_labels) - (1 if -1 in found_labels else 0)

print(f"clusters {n_clusters}")
print(f"noise points {n_noise}")
print(score)



In [None]:
# Attach the most recent labelling (`y_pred`) to the customer table as a
# "cluster" column; an existing "cluster" column is overwritten.
data = data.assign(cluster=y_pred)

Cluster 0
Classificação: Cativo
Alta/Media Tensao
Alta/Media Tensao
Normal
Verde
Verde
Geradora


In [None]:
# Show the members of every cluster that is actually present in the data.
# Iterating over the observed labels (in ascending order) replaces a fixed
# list of ids 0..8, so the DBSCAN noise label (-1) and any label above 8 are
# shown too, and no empty frame is displayed for a label that does not occur.
# `display` is the rich-output helper that IPython/Jupyter makes available
# in every notebook cell.
for cluster_label, cluster_rows in data.groupby("cluster"):
    print(f"cluster {cluster_label}: {len(cluster_rows)} rows")
    display(cluster_rows)

In [None]:
# One row per installation and one column per cluster label; each cell is
# the contracted demand aggregated with pivot_table's default function
# (mean), and is NaN where the installation has no row in that cluster.
data_pivot = data.pivot_table(
    values="Demanda contratada (kW)",
    index="Instalação",
    columns="cluster",
)

# "total" = number of cluster columns with a value for the installation.
data_pivot = data_pivot.assign(total=data_pivot.count(axis=1))
data_pivot



In [None]:
# Write the per-installation pivot (index included) to the processed-data
# folder, relative to the notebook's working directory.
pivot_output_path = "../data/processed/cliente_clusters_demanda.xlsx"
data_pivot.to_excel(pivot_output_path)

In [None]:
# Write the customer table, including its "cluster" column, without the
# row index.
clusters_output_path = "../data/processed/cadastro_demanda_clusters.xlsx"
data.to_excel(clusters_output_path, index=False)