In [None]:
from pathlib import Path

import pandas as pd
import numpy as np
from sklearn.decomposition import TruncatedSVD, PCA
from sklearn.cluster import KMeans, DBSCAN
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import Normalizer, MaxAbsScaler, MinMaxScaler, RobustScaler, StandardScaler

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

In [None]:
# Read the two processed client workbooks (location is relative to the current working directory).
processed_dir = Path("../data/processed")
data_profile = pd.read_excel(processed_dir / "clientes_perfil.xlsx")
data_features = pd.read_excel(processed_dir / "clientes_features.xlsx")

# Cluster grupo 0 - Clientes Cativos

In [None]:
# Build the wide monthly profile for group 0: one row per client and one column
# per (grupo_fatura, month) combination.
# .copy() makes this an independent frame, so the in-place fill below does not
# write into a slice of `data_profile` (which raises SettingWithCopyWarning).
data_profile_g0 = data_profile.loc[data_profile["grupo"] == 0].copy()

# fill na with row mean
# NOTE(review): positions 2..-2 are assumed to be the monthly value columns — confirm against the sheet layout.
data_profile_g0.iloc[:, 2:-1] = data_profile_g0.iloc[:, 2:-1].apply(
    lambda row: row.fillna(row.mean()), axis=1
)

data_profile_g0 = pd.pivot_table(
    data_profile_g0,
    values=list(range(1, 13)),
    index="cliente",
    columns="grupo_fatura",
)
# Flatten the (month, grupo_fatura) column MultiIndex into "<grupo_fatura>_<MM>" names.
data_profile_g0.columns = [f"{x[1]}_{x[0]:02}" for x in data_profile_g0.columns.values]
data_profile_g0

In [None]:
# Sort the group-0 columns alphabetically and drop every column whose name contains "FER".
new_columns = [col for col in sorted(data_profile_g0.columns.values) if "FER" not in col]
# .copy() yields an independent frame, so the column assignments made on it in
# later cells do not trigger pandas' SettingWithCopyWarning.
data_profile_g0 = data_profile_g0[new_columns].copy()
data_profile_g0

In [None]:
# Elbow analysis for group 0: scale the profile matrix, then collect the K-Means
# inertia for each candidate number of clusters.
# X0 is the same object as data_profile_g0 (no copy is made).
X0 = data_profile_g0

# Alternative scalers that can be swapped in:
# Normalizer(), MaxAbsScaler(), MinMaxScaler(), StandardScaler()
normalizer = RobustScaler()
X_normalized = normalizer.fit_transform(X0)

k_range = list(range(2, 11))
inertias = [
    KMeans(n_clusters=n_clusters, random_state=42).fit(X_normalized).inertia_
    for n_clusters in k_range
]

plt.figure(figsize=(5, 4))
plt.plot(k_range, inertias)
plt.xlabel("Numero Clusters")
plt.ylabel("Inercias")
plt.show()

In [None]:
# Cluster group 0 with K-Means on the full scaled matrix; the 2-component PCA
# projection is only used to draw the clusters.
reduction = PCA(n_components=2, random_state=42)
X_reduced = reduction.fit_transform(X_normalized)
print(f"Explained Variance Ratio: {reduction.explained_variance_ratio_}")

k = 4  # based on elbow plot
kmeans = KMeans(n_clusters=k, random_state=42)
labels = kmeans.fit_predict(X_normalized)

# Store the labels and the per-client mean over every column whose name contains
# "Consumo" / "Demanda".
data_profile_g0["cluster_km"] = labels
data_profile_g0["mean_consumo"] = data_profile_g0.loc[:, data_profile_g0.columns.str.contains("Consumo")].mean(axis=1)
data_profile_g0["mean_demanda"] = data_profile_g0.loc[:, data_profile_g0.columns.str.contains("Demanda")].mean(axis=1)

df = pd.DataFrame(X_reduced)
df["cluster"] = labels
print("Clusters count")
print(df["cluster"].value_counts())

# Plot reduction
# Column 0 holds the first PCA component and column 1 the second; x/y are chosen
# so that the "Feature 1" / "Feature 2" labels match the plotted columns.
fig1, ax1 = plt.subplots(figsize=(7, 5))
sns.scatterplot(data=df, x=0, y=1, hue="cluster", palette="Set1", ax=ax1)
ax1.set_xlabel("Feature 1")
ax1.set_ylabel("Feature 2")

# plot original data
fig2, ax2 = plt.subplots(figsize=(7, 5))
sns.scatterplot(data=data_profile_g0, x="mean_consumo", y="mean_demanda", hue="cluster_km", palette="Set1", ax=ax2)
ax2.set_xlabel("Media mensal do Consumo de Energia (KWh)")
ax2.set_ylabel("Media mensal da Demanda Contratada (KW)")
plt.show()

### DBSCAN

In [None]:
# DBSCAN on group 0, with the same scaling and 2-component PCA projection used
# for the K-Means plots.
# Alternative scalers that can be swapped in:
# Normalizer(), MaxAbsScaler(), MinMaxScaler(), StandardScaler()
normalizer = RobustScaler()

# X0 is the same object as data_profile_g0, which at this point also carries the
# columns derived after K-Means (and, on a re-run, the DBSCAN labels). Drop them
# so that only the original profile features are scaled and clustered.
derived_columns = ["cluster_km", "mean_consumo", "mean_demanda", "cluster_dbscan"]
X_normalized = normalizer.fit_transform(X0.drop(columns=derived_columns, errors="ignore"))

reduction = PCA(n_components=2, random_state=42)
X_reduced = reduction.fit_transform(X_normalized)
print(f"Explained Variance Ratio: {reduction.explained_variance_ratio_}")

# NOTE(review): eps / min_samples may need re-tuning now that the derived columns
# are no longer part of the feature matrix.
clusterizer = DBSCAN(eps=10, min_samples=10)
labels = clusterizer.fit_predict(X_normalized)
data_profile_g0["cluster_dbscan"] = labels

df = pd.DataFrame(X_reduced)
df["cluster"] = labels
print("Clusters count")
print(df["cluster"].value_counts())

# Plot reduction
# Column 0 holds the first PCA component and column 1 the second; x/y are chosen
# so that the "Feature 1" / "Feature 2" labels match the plotted columns.
fig1, ax1 = plt.subplots(figsize=(10, 6))
sns.scatterplot(data=df, x=0, y=1, hue="cluster", palette="Set1", ax=ax1)
ax1.set_xlabel("Feature 1")
ax1.set_ylabel("Feature 2")

# plot original data
# Read the derived columns from data_profile_g0 directly instead of relying on
# X0 being an alias of it.
fig2, ax2 = plt.subplots(figsize=(10, 6))
sns.scatterplot(data=data_profile_g0, x="mean_consumo", y="mean_demanda", hue="cluster_dbscan", palette="Set1", ax=ax2)
plt.show()

# Cluster grupo 1 - Clientes Cativos com Geração

In [None]:
# Build the wide monthly profile for group 1: one row per client and one column
# per (grupo_fatura, month) combination.
# .copy() makes this an independent frame, so the in-place fill below does not
# write into a slice of `data_profile` (which raises SettingWithCopyWarning).
data_profile_g1 = data_profile.loc[data_profile["grupo"] == 1].copy()

# fill na with row mean
# NOTE(review): positions 2..-2 are assumed to be the monthly value columns — confirm against the sheet layout.
data_profile_g1.iloc[:, 2:-1] = data_profile_g1.iloc[:, 2:-1].apply(
    lambda row: row.fillna(row.mean()), axis=1
)

data_profile_g1 = pd.pivot_table(
    data_profile_g1,
    values=list(range(1, 13)),
    index="cliente",
    columns="grupo_fatura",
)
# Flatten the (month, grupo_fatura) column MultiIndex into "<grupo_fatura>_<MM>" names.
data_profile_g1.columns = [f"{x[1]}_{x[0]:02}" for x in data_profile_g1.columns.values]
data_profile_g1

In [None]:
# Sort the group-1 columns alphabetically and drop every column whose name
# contains one of the excluded tokens.
excluded_tokens = ("FER", "Geração", "EnergiaCompensada")
new_columns = [
    col
    for col in sorted(data_profile_g1.columns.values)
    if not any(token in col for token in excluded_tokens)
]
# .copy() yields an independent frame, so the column assignments made on it in
# later cells do not trigger pandas' SettingWithCopyWarning.
data_profile_g1 = data_profile_g1[new_columns].copy()
data_profile_g1

In [None]:
# Elbow analysis for group 1: scale the profile matrix, then collect the K-Means
# inertia for each candidate number of clusters.
# X1 is the same object as data_profile_g1 (no copy is made).
X1 = data_profile_g1

# Alternative scalers that can be swapped in:
# Normalizer(), MaxAbsScaler(), MinMaxScaler(), StandardScaler()
normalizer = RobustScaler()
X_normalized = normalizer.fit_transform(X1)

k_range = list(range(2, 11))
inertias = [
    KMeans(n_clusters=n_clusters, random_state=42).fit(X_normalized).inertia_
    for n_clusters in k_range
]

plt.figure(figsize=(5, 4))
plt.plot(k_range, inertias)
plt.xlabel("Numero Clusters")
plt.ylabel("Inercias")
plt.show()

In [None]:
# Cluster group 1 with K-Means on the full scaled matrix; the 2-component PCA
# projection is only used to draw the clusters.
reduction = PCA(n_components=2, random_state=42)
X_reduced = reduction.fit_transform(X_normalized)
print(f"Explained Variance Ratio: {reduction.explained_variance_ratio_}")

k = 4  # based on elbow plot
kmeans = KMeans(n_clusters=k, random_state=42)
labels = kmeans.fit_predict(X_normalized)

# Store the labels and the per-client mean over every column whose name contains
# "Consumo" / "Demanda" / "Injetada".
data_profile_g1["cluster_km"] = labels
data_profile_g1["mean_consumo"] = data_profile_g1.loc[:, data_profile_g1.columns.str.contains("Consumo")].mean(axis=1)
data_profile_g1["mean_demanda"] = data_profile_g1.loc[:, data_profile_g1.columns.str.contains("Demanda")].mean(axis=1)
data_profile_g1["mean_injetada"] = data_profile_g1.loc[:, data_profile_g1.columns.str.contains("Injetada")].mean(axis=1)

df = pd.DataFrame(X_reduced)
df["cluster"] = labels
print("Clusters count")
print(df["cluster"].value_counts())

# Plot reduction
# Column 0 holds the first PCA component and column 1 the second; x/y are chosen
# so that the "Feature 1" / "Feature 2" labels match the plotted columns.
fig1, ax1 = plt.subplots(figsize=(10, 6))
sns.scatterplot(data=df, x=0, y=1, hue="cluster", palette="Set1", ax=ax1)
ax1.set_xlabel("Feature 1")
ax1.set_ylabel("Feature 2")

# plot original data
fig2, ax2 = plt.subplots(figsize=(7, 5))
sns.scatterplot(data=data_profile_g1, x="mean_consumo", y="mean_injetada", hue="cluster_km", palette="Set1", ax=ax2)
ax2.set_xlabel("Media mensal do Consumo de Energia (KWh)")
ax2.set_ylabel("Media mensal de Energia Injetada (KWh)")
plt.show()

# Cluster grupo 2 - Clientes Em Transição

In [None]:
# Wide monthly profile for group 2: one row per client, one column per
# (grupo_fatura, month) combination. No row-mean imputation is applied to this
# group; cells still missing after the pivot are filled with zero.
data_profile_g2 = data_profile.loc[data_profile["grupo"] == 2].pivot_table(
    values=list(range(1, 13)),
    index="cliente",
    columns="grupo_fatura",
)
# Flatten the (month, grupo_fatura) column MultiIndex into "<grupo_fatura>_<MM>" names.
data_profile_g2.columns = [
    f"{fatura}_{month:02}" for month, fatura in data_profile_g2.columns.values
]
data_profile_g2 = data_profile_g2.fillna(0)
data_profile_g2

In [None]:
# Sort the group-2 columns alphabetically and drop every column whose name
# contains one of the excluded tokens.
excluded_tokens = ("FER", "Geração", "EnergiaCompensada")
new_columns = [
    col
    for col in sorted(data_profile_g2.columns.values)
    if not any(token in col for token in excluded_tokens)
]
# .copy() yields an independent frame, so the column assignments made on it in
# later cells do not trigger pandas' SettingWithCopyWarning.
data_profile_g2 = data_profile_g2[new_columns].copy()
data_profile_g2

In [None]:
# Elbow analysis for group 2: standardise the profile matrix, then collect the
# K-Means inertia for each candidate number of clusters.
# X2 is the same object as data_profile_g2 (no copy is made).
X2 = data_profile_g2

# Alternative scalers that can be swapped in:
# Normalizer(), MaxAbsScaler(), MinMaxScaler(), RobustScaler()
normalizer = StandardScaler()
X_normalized = normalizer.fit_transform(X2)

k_range = list(range(2, 11))
inertias = [
    KMeans(n_clusters=n_clusters, random_state=42).fit(X_normalized).inertia_
    for n_clusters in k_range
]

plt.plot(k_range, inertias)
plt.xlabel("Numero Clusters")
plt.ylabel("Inercias")
plt.show()

In [None]:
# Cluster group 2 with K-Means on the full scaled matrix; the 2-component PCA
# projection is only used to draw the clusters.
reduction = PCA(n_components=2, random_state=42)
X_reduced = reduction.fit_transform(X_normalized)
print(f"Explained Variance Ratio: {reduction.explained_variance_ratio_}")

k = 5  # based on elbow plot
kmeans = KMeans(n_clusters=k, random_state=42)
labels = kmeans.fit_predict(X_normalized)

data_profile_g2["cluster_km"] = labels
# Per-client mean over every column whose name contains the given token
# (matching is case-sensitive).
mean_specs = (
    ("TE", "mean_TE"),
    ("TUSD", "mean_TUSD"),
    ("Consumo", "mean_consumo"),
    ("Demanda", "mean_demanda"),
    ("Injetada", "mean_injetada"),
)
for token, mean_name in mean_specs:
    data_profile_g2[mean_name] = data_profile_g2.loc[:, data_profile_g2.columns.str.contains(token)].mean(axis=1)

df = pd.DataFrame(X_reduced)
df["cluster"] = labels
print("Clusters count")
print(df["cluster"].value_counts())

# Plot reduction
# Column 0 holds the first PCA component and column 1 the second; x/y are chosen
# so that the "Feature 1" / "Feature 2" labels match the plotted columns.
fig3, ax1 = plt.subplots(figsize=(10, 6))
sns.scatterplot(data=df, x=0, y=1, hue="cluster", palette="Set1", ax=ax1)
ax1.set_xlabel("Feature 1")
ax1.set_ylabel("Feature 2")

# plot original data
fig2, ax2 = plt.subplots(figsize=(10, 6))
sns.scatterplot(data=data_profile_g2, x="mean_consumo", y="mean_demanda", hue="cluster_km", palette="Set1", ax=ax2)
plt.show()

# Cluster grupo 3 - Clientes Mercado Livre

In [None]:
# Build the wide monthly profile for group 3: one row per client and one column
# per (grupo_fatura, month) combination.
# .copy() makes this an independent frame, so the in-place fill below does not
# write into a slice of `data_profile` (which raises SettingWithCopyWarning).
data_profile_g3 = data_profile.loc[data_profile["grupo"] == 3].copy()

# fill na with row mean
# NOTE(review): positions 2..-2 are assumed to be the monthly value columns — confirm against the sheet layout.
data_profile_g3.iloc[:, 2:-1] = data_profile_g3.iloc[:, 2:-1].apply(
    lambda row: row.fillna(row.mean()), axis=1
)

data_profile_g3 = pd.pivot_table(
    data_profile_g3,
    values=list(range(1, 13)),
    index="cliente",
    columns="grupo_fatura",
)
# Flatten the (month, grupo_fatura) column MultiIndex into "<grupo_fatura>_<MM>" names.
data_profile_g3.columns = [f"{x[1]}_{x[0]:02}" for x in data_profile_g3.columns.values]
data_profile_g3

In [None]:
# Sort the group-3 columns alphabetically and drop every column whose name contains "FER".
# Unlike groups 1 and 2, the "Geração" / "EnergiaCompensada" filters are not applied here.
new_columns = [col for col in sorted(data_profile_g3.columns.values) if "FER" not in col]
# .copy() yields an independent frame, so the column assignments made on it in
# later cells do not trigger pandas' SettingWithCopyWarning.
data_profile_g3 = data_profile_g3[new_columns].copy()
data_profile_g3

In [None]:
# Elbow analysis for group 3: standardise the profile matrix, then collect the
# K-Means inertia for each candidate number of clusters.
# X3 is the same object as data_profile_g3 (no copy is made).
X3 = data_profile_g3

# Alternative scalers that can be swapped in:
# Normalizer(), MaxAbsScaler(), MinMaxScaler(), RobustScaler()
normalizer = StandardScaler()
X_normalized = normalizer.fit_transform(X3)

k_range = list(range(2, 11))
inertias = [
    KMeans(n_clusters=n_clusters, random_state=42).fit(X_normalized).inertia_
    for n_clusters in k_range
]

plt.figure(figsize=(5, 4))
plt.plot(k_range, inertias)
plt.xlabel("Numero Clusters")
plt.ylabel("Inercias")
plt.show()

In [None]:
# Cluster group 3 with K-Means on the full scaled matrix; the 2-component PCA
# projection is only used to draw the clusters.
reduction = PCA(n_components=2, random_state=42)
X_reduced = reduction.fit_transform(X_normalized)
print(f"Explained Variance Ratio: {reduction.explained_variance_ratio_}")

k = 5  # based on elbow plot
kmeans = KMeans(n_clusters=k, random_state=42)
labels = kmeans.fit_predict(X_normalized)

# Store the labels and the per-client mean over every column whose name contains
# "TE" / "TUSD" (matching is case-sensitive).
data_profile_g3["cluster_km"] = labels
data_profile_g3["mean_TE"] = data_profile_g3.loc[:, data_profile_g3.columns.str.contains("TE")].mean(axis=1)
data_profile_g3["mean_TUSD"] = data_profile_g3.loc[:, data_profile_g3.columns.str.contains("TUSD")].mean(axis=1)

df = pd.DataFrame(X_reduced)
df["cluster"] = labels
print("Clusters count")
print(df["cluster"].value_counts())

# Plot reduction
# Column 0 holds the first PCA component and column 1 the second; x/y are chosen
# so that the "Feature 1" / "Feature 2" labels match the plotted columns.
fig3, ax1 = plt.subplots(figsize=(10, 6))
sns.scatterplot(data=df, x=0, y=1, hue="cluster", palette="Set1", ax=ax1)
ax1.set_xlabel("Feature 1")
ax1.set_ylabel("Feature 2")

# plot original data
# NOTE(review): x is mean_TUSD and y is mean_TE; confirm the axis captions below
# describe those two quantities as intended.
fig2, ax2 = plt.subplots(figsize=(7, 5))
sns.scatterplot(data=data_profile_g3, x="mean_TUSD", y="mean_TE", hue="cluster_km", palette="Set1", ax=ax2)
ax2.set_ylabel("Media mensal do Consumo Uso Sis.Dist. (KWh)")
ax2.set_xlabel("Media mensal do Contrato Usu Sis.Dist. (KWh)")
plt.show()

In [None]:
# Display the group-0 frame as it stands at this point in the notebook.
data_profile_g0

In [None]:
# Stack the four per-group frames (each indexed by client), attach every client's
# K-Means label to the long profile table and export one row per
# (cliente, cluster_km) pair.
# Each group was clustered with its own K-Means fit, so the same cluster_km value
# in two different groups does not denote the same cluster.
group_frames = [data_profile_g0, data_profile_g1, data_profile_g2, data_profile_g3]
data_profile_all = pd.concat(group_frames)

cluster_lookup = data_profile_all[["cluster_km"]]
teste = data_profile.merge(cluster_lookup, left_on="cliente", right_index=True)

client_clusters = teste[["cliente", "cluster_km"]].drop_duplicates()
client_clusters.to_csv("../data/processed/clientes_subclusters.csv", index=False)