In [None]:
import pandas as pd
from pathlib import Path
import numpy as np

import plotly.express as px
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Absolute path of the processed workbook read by this notebook.
processed_data_path = (
    Path("..") / "data" / "processed" / "consumo_tarifas_meses.xlsx"
).resolve()

# Load the workbook and show the resulting frame as the cell output.
data = pd.read_excel(io=processed_data_path)
data

In [None]:
# Remove negatives: flip the sign of values below zero; everything else is kept as is.
data["Valores"] = data["Valores"].mask(data["Valores"] < 0, -data["Valores"])
# Normalise the "Meses" column: keep the text before the first "." and store it as int.
data["Meses"] = [int(str(month).split(".")[0]) for month in data["Meses"]]

In [None]:
# Stringify each billing-item label and remove every space from it.
data["Item Faturamento BW"] = [
    str(label).replace(" ", "") for label in data["Item Faturamento BW"]
]
# Print the distinct labels in sorted order, numbered from 01.
for position, item in enumerate(np.sort(data["Item Faturamento BW"].unique()), start=1):
    print(f"{position:02d} - {item}")


In [None]:
# Stringify each invoice sub-group label and remove every space from it.
data["SubAgrupador de Fatu"] = [
    str(label).replace(" ", "") for label in data["SubAgrupador de Fatu"]
]
# Print the distinct labels in sorted order, numbered from 01.
for position, item in enumerate(np.sort(data["SubAgrupador de Fatu"].unique()), start=1):
    print(f"{position:02d} - {item}")


In [None]:
# Stringify each "grupo" label and remove every space from it.
data["grupo"] = [str(label).replace(" ", "") for label in data["grupo"]]
# Print the distinct labels in sorted order, numbered from 01.
for position, item in enumerate(np.sort(data["grupo"].unique()), start=1):
    print(f"{position:02d} - {item}")

In [None]:
# Pivot the long table into one row per (company, installation, month) and
# one column per billing item. pivot_table's default aggregation (mean) is
# applied when several rows share the same key.
index = [
    'Empresa',
    'Nº instalação',
    "Meses"
    ]

columns = "Item Faturamento BW"
value = "Valores"

data_piv = pd.pivot_table(data, values=value, index=index, columns=columns)

data_piv = data_piv.reset_index()

# Fold "EnergAtvInj.mUCoPT-FP-TE" into "EnergAtvInj.mUCoPT-FPTE"
# (presumably two spellings of the same item - TODO confirm).
# fill_value=0 keeps the value when only one of the two columns is filled;
# a plain "+" would turn those rows into NaN. Rows where both are missing
# stay NaN.
data_piv["EnergAtvInj.mUCoPT-FPTE"] = data_piv["EnergAtvInj.mUCoPT-FPTE"].add(
    data_piv["EnergAtvInj.mUCoPT-FP-TE"], fill_value=0
)
data_piv = data_piv.drop(columns="EnergAtvInj.mUCoPT-FP-TE")
data_piv



# Extração de características

In [None]:
# Build one feature row per (installation, invoice sub-group).
keys = ["Nº instalação", "SubAgrupador de Fatu"]

# Rows are collected in a list and turned into a DataFrame once at the end;
# concatenating inside the loop is quadratic in the number of groups.
feature_rows = []
for (cliente, grupo), grouped_df in data.groupby(by=keys):
    # Total of "Valores" per month for this installation/sub-group. The two
    # outer keys are constant inside grouped_df, so grouping by "Meses" alone
    # yields the same groups; selecting "Valores" avoids summing other columns.
    monthly_total = grouped_df.groupby("Meses")["Valores"].sum()

    max_value = float(round(monthly_total.max(), 4))
    min_value = float(round(monthly_total.min(), 4))

    feature_rows.append({
        "cliente": cliente,
        "grupo": grupo,
        # Number of distinct months with data, and first/last month seen.
        "count": grouped_df["Meses"].unique().shape[0],
        "initial": grouped_df["Meses"].min(),
        "last": grouped_df["Meses"].max(),
        "mean": float(round(monthly_total.mean(), 4)),
        "max": max_value,
        "min": min_value,
        "total": float(round(monthly_total.sum(), 4)),
        # Computed from the rounded extremes, as before.
        "amplitude": max_value - min_value,
        # NaN when the group has a single month (sample standard deviation).
        "std": float(round(monthly_total.std(), 4)),
    })

data_feat = pd.DataFrame(feature_rows)



In [None]:
# Statistics computed per (client, invoice group); each becomes one column per group.
values = ["count", "initial", "last", "mean", "max", "min", "total", "amplitude", "std"]

# One row per client, columns indexed by (statistic, invoice group).
data_feat_pivot = data_feat.pivot_table(values=values, index="cliente", columns="grupo")
# Flatten the two-level column index into "<statistic>_<group>" names.
data_feat_pivot.columns = [
    f"{statistic}_{invoice_group}".strip()
    for statistic, invoice_group in data_feat_pivot.columns
]
# Bring "cliente" back as a column and replace missing combinations with 0.
data_feat_pivot = data_feat_pivot.reset_index().fillna(0)

In [None]:
# Display the column names of the per-client feature table.
data_feat_pivot.columns

In [None]:
# One boxplot per monthly-mean feature, side by side.
# Keys are columns of data_feat_pivot, values are the y-axis labels.
# NOTE(review): the units given here for mean_TE / mean_TUSD differ from the
# ones used in the parallel-coordinates labels further down - confirm which
# pair is correct.
boxplot_labels = {
    "mean_Consumo": "Média mensal do Consumo de Energia (KWh)",
    "mean_Demanda": "Média mensal da Demanda Contratada (KW)",
    "mean_TE": "Média mensal do Consumo Uso Sist.Dist. (KWh)",
    "mean_TUSD": "Média mensal da Demanda Uso Sist.Dist. (KW)",
    "mean_FER": "Média mensal da Energia Reativa (KWh)",
    "mean_Injetada": "Média mensal da Energia Injetada (KWh)",
}

fig, axes = plt.subplots(1, len(boxplot_labels), figsize=(14, 5))
for ax, (column, label) in zip(axes, boxplot_labels.items()):
    sns.boxplot(data_feat_pivot, y=column, ax=ax)
    ax.set_ylabel(label)
    if column == "mean_TE":
        # Plain (non-scientific) tick labels on this panel only, as before.
        ax.ticklabel_format(style='plain', axis='y')

fig.tight_layout()
plt.show()

In [None]:
# Distribution of the clients' mean monthly consumption.
fig, ax = plt.subplots(figsize=(8, 5))
sns.histplot(data_feat_pivot, x="mean_Consumo", ax=ax)
ax.set_xlabel("Média mensal do Consumo de Energia (KWh)")
ax.set_ylabel("Quantidade de clientes")  # typo fixed (was "Quantiade")
ax.set_title("Distribuição do consumo mensal dos clientes")
plt.show()

In [None]:
# Pairwise scatter matrix of the monthly-mean features.
pairplot_vars = [
    f"mean_{invoice_group}"
    for invoice_group in ("Consumo", "Demanda", "TE", "TUSD", "FER", "Injetada")
]
sns.pairplot(data_feat_pivot[pairplot_vars], height=1.5, aspect=1.9)
plt.show()

### Agrupamentos
0 = Clientes cativos que não geram energia (sem geração ou compensação de energia).

1 = Clientes cativos que geram energia ativamente (finalizaram o ano de 2024 injetando energia).

2 = Clientes que migraram para o Mercado Livre durante 2024 (finalizaram 2024 com TUSD).

3 = Clientes do Mercado Livre (sem consumo, demanda contratada e geração).

-1 = Clientes que não se enquadram em nenhum dos grupos acima (outliers).

In [None]:
# Rule-based classification of each client. The rules are evaluated in
# priority order (3, 2, 1, 0): a client receives the label of the first rule
# it satisfies and -1 when it satisfies none.

# Group 3 - free market: no consumption, no contracted demand, no injection.
is_free_market = (
    data_feat_pivot["mean_Consumo"].eq(0)
    & data_feat_pivot["mean_Demanda"].eq(0)
    & data_feat_pivot["mean_Injetada"].eq(0)
)

# Group 2 - captive -> free: has consumption records, TUSD and TE reach month 12.
is_in_transition = (
    data_feat_pivot["count_Consumo"].ne(0)
    & data_feat_pivot["last_TUSD"].eq(12)
    & data_feat_pivot["last_TE"].eq(12)
)

# Group 1 - captive with generation: injects energy and still does in month 12.
is_generator = (
    data_feat_pivot["mean_Injetada"].ne(0)
    & data_feat_pivot["last_Injetada"].eq(12)
)

# Group 0 - captive without generation: consumption and demand over the
# whole year (months 1 to 12), nothing injected, compensated or generated,
# and no TUSD.
is_captive = (
    data_feat_pivot["mean_Consumo"].ne(0)
    & data_feat_pivot["mean_Demanda"].ne(0)
    & data_feat_pivot["mean_Injetada"].eq(0)
    & data_feat_pivot["mean_TUSD"].eq(0)
    & data_feat_pivot["mean_EnergiaCompensada"].eq(0)
    & data_feat_pivot["mean_Geração"].eq(0)
    & data_feat_pivot["initial_Consumo"].eq(1)
    & data_feat_pivot["last_Consumo"].eq(12)
)

# np.select takes the first matching condition, which reproduces the
# "only if still unassigned" behaviour of sequential assignments.
data_feat_pivot["grupo"] = np.select(
    [is_free_market, is_in_transition, is_generator, is_captive],
    [3, 2, 1, 0],
    default=-1,
)

# Display names of the groups, used in plot labels and file names.
group_dict = {
    -1: "Outliers",
    0: "Cativos",
    1: "Cativos Geradores",
    2: "Em Transição",
    3: "Mercado Livre"
}

data_feat_pivot["grupo"].value_counts()

In [None]:
# Same scatter matrix as before, now coloured by the rule-based group.
pairplot_vars = [
    "mean_Consumo",
    "mean_Demanda",
    "mean_TE",
    "mean_TUSD",
    "mean_FER",
    "mean_Injetada",
    "grupo"
]
# Every entry except the trailing "grupo" is plotted; "grupo" only drives the hue.
plotted_vars = pairplot_vars[:-1]
sns.pairplot(
    data_feat_pivot[pairplot_vars],
    vars=plotted_vars,
    hue="grupo",
    palette="Set1",
    height=1.5,
    aspect=1.9,
)
plt.show()

In [None]:
# Pie chart with the share of clients in each group.
group_count = data_feat_pivot["grupo"].value_counts()

# Fixed colour per group id so the colours stay stable across plots.
group_color = {
    0: "#01108b",
    1: "#39c9e6",
    2: "#f9a907",
    3: "#820808",
    -1: "#999999",
}

total = group_count.sum()
percentages = group_count / total * 100
labels = [
    f"{group_dict[idx]}: {val} ({pct:.1f}%)"
    for idx, val, pct in zip(group_count.index, group_count.values, percentages)
]

fig, ax = plt.subplots()
ax.pie(
    group_count,
    labels=labels,
    # One offset per slice actually present: a fixed 5-element list raises
    # when some group has no clients.
    explode=[0.01] * len(group_count),
    colors=[group_color[group_id] for group_id in group_count.index],
)
plt.show()

In [None]:
# Colours in the same order as the entries of group_count (inspection only).
color = [group_color[x] for x in group_count.keys()]
color

In [None]:
# Display every colour defined in group_color, in dictionary order.
list(group_color.values())

In [None]:
# Save the per-client feature table (including the "grupo" column) without the index.
data_feat_pivot.to_excel(Path("../data/processed/clientes_features.xlsx"), index=False)

In [None]:
# Columns used by the parallel-coordinates plots: the means plus the
# client id and the group label (case-insensitive name match).
mask_parallel = data_feat_pivot.columns.str.contains("mean|cliente|grupo", regex=True, case=False)

# Clients left unclassified (grupo == -1) are excluded from these plots.
data_parallel = data_feat_pivot.loc[data_feat_pivot["grupo"].ne(-1), mask_parallel]

# Axes of the plot: every "mean"/"grupo" column except the two removed below.
dimension_mask = data_feat_pivot.columns.str.contains("mean|grupo", regex=True, case=False)
parallel_dimensions = data_feat_pivot.columns[dimension_mask].tolist()
for excluded_column in ("mean_EnergiaCompensada", "mean_Geração"):
    parallel_dimensions.remove(excluded_column)


In [None]:
# Axis titles shown in place of the raw column names.
axis_labels = {
    "mean_Consumo": "Consumo Energia (KWh)",
    "mean_Demanda": "Demanda Contratada (KW)",
    "mean_EnergiaCompensada": "Energia Compensada (KWh)",
    "mean_FER": "Consumo Reativo (KWh)",
    "mean_Geração": "Geração Contratada (KW)",
    "mean_Injetada": "Energia Injetada (KWh)",
    "mean_TE": "Uso Sis.Distrib. Contratado (KW)",
    "mean_TUSD": "Consumo Uso Sis.Distrib. (KWh)",
}

# Uncoloured version: the last dimension is left out of the axes.
fig = px.parallel_coordinates(
    data_parallel,
    dimensions=parallel_dimensions[:-1],
    labels=axis_labels,
)

# Larger font for readability.
fig.update_layout(font={"size": 16})
fig.show()
fig.write_html("../plots/perfil_clientes_coordenadas_paralelas_0.html")


In [None]:

# Axis titles shown in place of the raw column names.
axis_labels = {
    "mean_Consumo": "Consumo Energia (KWh)",
    "mean_Demanda": "Demanda Contratada (KW)",
    "mean_EnergiaCompensada": "Energia Compensada (KWh)",
    "mean_FER": "Consumo Reativo (KWh)",
    "mean_Geração": "Geração Contratada (KW)",
    "mean_Injetada": "Energia Injetada (KWh)",
    "mean_TE": "Uso Sis.Distrib. Contratado (KW)",
    "mean_TUSD": "Consumo Uso Sis.Distrib. (KWh)",
}

# Coloured version: all dimensions are drawn and lines are coloured by "grupo".
fig = px.parallel_coordinates(
    data_parallel,
    dimensions=parallel_dimensions,
    color="grupo",
    color_continuous_scale=px.colors.sequential.Jet,
    labels=axis_labels,
)

# Larger font for readability.
fig.update_layout(font={"size": 16})
fig.show()
fig.write_html("../plots/perfil_clientes_coordenadas_paralelas.html")


# Perfil dos clientes

In [None]:
# Monthly profile: one row per (installation, invoice sub-group) and one
# column per month holding the mean of "Valores" (pivot_table default).
keys = ["Nº instalação", "SubAgrupador de Fatu"]

# A single pivot replaces the per-group pivot + concat loop: it avoids the
# quadratic concatenation and always yields the month columns in sorted
# order, instead of the order in which months first appear.
data_profile = (
    data.pivot_table(
        values="Valores",
        columns="Meses",
        index=["SubAgrupador de Fatu", "Nº instalação"],
    )
    .reset_index()
    # Same row order as iterating data.groupby(keys).
    .sort_values(keys)
    .reset_index(drop=True)
    .rename(columns={
        "Nº instalação": "cliente",
        "SubAgrupador de Fatu": "grupo_fatura",
    })
)

In [None]:
# Attach each client's rule-based group to its monthly profile rows.
# Any "grupo" column left by a previous run is dropped first; otherwise
# re-running the cell would create "grupo_x"/"grupo_y" instead of "grupo".
data_profile = pd.merge(
    data_profile.drop(columns="grupo", errors="ignore"),
    data_feat_pivot[["grupo", "cliente"]],
    on="cliente",
    how="left",
)
data_profile

In [None]:
# Save the monthly profile table without the index.
data_profile.to_excel(Path("../data/processed/clientes_perfil.xlsx"), index=False)

In [None]:
# Display title for each invoice group, used as plot titles below.
# NOTE(review): the units for "TE" / "TUSD" differ from the y-axis labels used
# in the boxplot cell near the top of the notebook - confirm which is correct.
rename_dict = {
    "Consumo": "Consumo Energia (KWh)",
    "Demanda": "Demanda Contratada (KW)",
    "EnergiaCompensada": "Energia Compensada (KWh)",
    "FER": "Consumo Reativo (KWh)",
    "Geração": "Geração Contratada (KW)",
    "Injetada": "Energia Injetada (KWh)",
    "TE": "Uso Sis.Distrib. Contratado (KW)",
    "TUSD": "Consumo Uso Sis.Distrib. (KWh)",
}

def plot_group_profile(data: pd.DataFrame, group, invoice_groups=None):
    """Plot the monthly profile of every client of one group.

    One panel is drawn per invoice group: each client is a light-gray line
    and the column-wise mean is a blue line. A flat gray line is drawn when
    the group has no rows for that invoice group.

    Parameters
    ----------
    data : pd.DataFrame
        Profile table with the columns "grupo_fatura", "cliente" and "grupo";
        every remaining column is plotted as one x position.
    group
        Value of the "grupo" column to select.
    invoice_groups : iterable, optional
        Values of "grupo_fatura" to plot, one panel each. Defaults to every
        value present in ``data``.
    """
    if invoice_groups is None:
        invoice_groups = data["grupo_fatura"].unique()

    df = data.loc[
        (data["grupo"] == group) &
        (data["grupo_fatura"].isin(invoice_groups))
        ].copy()

    # squeeze=False always returns a 2-D array of axes, so indexing also
    # works when a single invoice group is requested.
    fig, axes = plt.subplots(
        len(invoice_groups),
        1,
        figsize=(5, len(invoice_groups) * 2),
        squeeze=False,
    )
    ax = axes.ravel()

    for i, invoice_group in enumerate(invoice_groups):
        df_grouped = df[df["grupo_fatura"] == invoice_group]
        df_tmp = df_grouped.drop(["grupo_fatura", "cliente", "grupo"], axis=1)

        if not df_tmp.empty:
            df_tmp.T.plot(ax=ax[i], color="lightgray", linewidth=1, legend=False)
            df_tmp.mean().plot(ax=ax[i], color="blue", linewidth=2, legend=False)
        else:
            ax[i].plot(np.zeros(shape=(12)), color="lightgray")

        # Twelve tick positions labelled 1..12.
        ax[i].set_xticks(range(12))
        ax[i].set_xticklabels(range(1, 13))
        # Fall back to the raw name for invoice groups without a display title.
        ax[i].set_title(rename_dict.get(invoice_group, invoice_group))
    fig.suptitle(f"Grupo {group}")
    fig.tight_layout(rect=[0, 0, 1, 0.99])
    plt.show()


In [None]:
# Invoice groups to draw, one panel each, for the clients labelled -1.
invoice_group = [
    'Consumo',
    'Demanda',
    'TE',
    'TUSD',
    'FER',
    'Injetada',
    ]
plot_group_profile(data_profile, group=-1, invoice_groups=invoice_group)

# Comportamento das variáveis por perfil

In [None]:
# Long format: one row per (invoice group, client group, client, month).
month_columns = list(range(1, 13))
data_profile_melted = data_profile.melt(
    id_vars=["grupo_fatura", "grupo", "cliente"],
    value_vars=month_columns,
    var_name="meses",
    value_name="valores",
)
# Months without a value are dropped.
data_profile_melted = data_profile_melted.dropna(subset=["valores"])

In [None]:
# Display the long-format profile table.
data_profile_melted

In [None]:
# Long format: one row per (invoice group, client group, client, month).
# Rebuilt here so this cell can be re-run on its own.
data_profile_melted = pd.melt(data_profile,
                              id_vars=["grupo_fatura", "grupo", "cliente"],
                              value_vars=list(range(1,13)),
                              var_name="meses",
                              value_name="valores")
data_profile_melted = data_profile_melted.dropna(subset=["valores"], axis=0)

# Display names of the client groups (used in titles and file names).
group_dict = {
    -1: "Outliers",
    0: "Cativos",
    1: "Cativos Geradores",
    2: "Em Transição",
    3: "Mercado Livre"
}

all_months = list(range(1,13))

# One figure per (invoice group, client group): monthly boxplot on top and
# the number of records per month below. Figures are saved, not shown.
keys = ["grupo_fatura", "grupo"]
for (invoice_group, group), df_grouped in data_profile_melted.groupby(keys):

    # Make "meses" an ordered categorical over all twelve months so that
    # months without data still get a slot on the shared x axis. assign()
    # returns a new frame, which avoids a SettingWithCopyWarning on the
    # groupby slice.
    df_grouped = df_grouped.assign(
        meses=pd.Categorical(df_grouped["meses"],
                             categories=all_months,
                             ordered=True)
    )

    fig, axes = plt.subplots(nrows=2,
                             ncols=1,
                             figsize=(6, 3.5),
                             sharex=True,
                             gridspec_kw={'height_ratios': [3, 1]})

    # Top panel: distribution of the values per month.
    sns.boxplot(data=df_grouped, x="meses", y="valores", ax=axes[0])
    axes[0].set_title(f"{group_dict[group]} - {rename_dict[invoice_group]}")

    # The x axis is shared, so ticks and labels are hidden on the top panel.
    axes[0].tick_params(axis='x', which='both', bottom=False, top=False, labelbottom=False)

    # Bottom panel: number of records per month, with 0 for missing months.
    month_counts = df_grouped["meses"].value_counts().reindex(all_months, fill_value=0).reset_index()
    month_counts.columns = ["meses", "Count"]
    barplot = sns.barplot(data=month_counts, x="meses", y="Count", ax=axes[1])

    # Remove the frame around both panels.
    for spine in ["top", "right", "left", "bottom"]:
        axes[0].spines[spine].set_visible(False)
        axes[1].spines[spine].set_visible(False)

    # Write each count above its bar as a whole number.
    for p in barplot.patches:
        axes[1].annotate(f'{p.get_height():.0f}',
                         (p.get_x() + p.get_width() / 2., p.get_height()),
                         ha='center', va='bottom', fontsize=10,)

    fig.tight_layout()
    fig.savefig(f"../plots/variaveis_g{group}_{group_dict[group]}_{invoice_group}.png", transparent=False)
    # Close this specific figure so figures do not pile up in memory.
    plt.close(fig)

### Outliers

In [None]:
from sklearn.decomposition import TruncatedSVD, PCA
from sklearn.cluster import KMeans, DBSCAN
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import Normalizer, MaxAbsScaler, MinMaxScaler, RobustScaler, StandardScaler

In [None]:
# Features inspected for outliers.
interest_columns = ["mean_Consumo", "mean_Injetada", "mean_Demanda"]
# NOTE(review): this keeps every client whose group is not 1
# ("Cativos Geradores" in group_dict), so clients labelled -1 are included -
# confirm that "!= 1" is intended rather than "!= -1" or "== 1".
# .copy() makes the subset an independent frame: flag columns are added to it
# in later cells, which on a slice triggers SettingWithCopyWarning.
outlier_subset = data_feat_pivot.loc[data_feat_pivot["grupo"]!=1, interest_columns].copy()
outlier_subset

In [None]:
# Histogram of each feature of interest, stacked vertically.
col_count = len(interest_columns)
fig, ax = plt.subplots(nrows=col_count, ncols=1, figsize=(col_count * 5, 6))
for panel, column in zip(ax, interest_columns):
    sns.histplot(outlier_subset, x=column, ax=panel)
fig.subplots_adjust(hspace=0.4)
plt.show()

In [None]:
# Boxplot of each feature of interest, side by side.
col_count = len(interest_columns)
fig, ax = plt.subplots(nrows=1, ncols=col_count, figsize=(12, 5))
for panel, column in zip(ax, interest_columns):
    sns.boxplot(outlier_subset, y=column, ax=panel)
fig.subplots_adjust(hspace=0.4)
plt.show()



IQR

In [None]:
# Pairwise scatter plots of the three features of interest.
feature_pairs = [
    ("mean_Consumo", "mean_Injetada"),
    ("mean_Consumo", "mean_Demanda"),
    ("mean_Injetada", "mean_Demanda"),
]
fig, ax = plt.subplots(nrows=1, ncols=3, figsize=(16, 4))
for panel, (x_column, y_column) in zip(ax, feature_pairs):
    sns.scatterplot(outlier_subset, x=x_column, y=y_column, ax=panel)
fig.subplots_adjust(wspace=0.4)
plt.show()

In [None]:
# IQR rule on the features of interest only. Restricting to interest_columns
# keeps the result the same when the cell is re-run after flag columns
# ("out_iqr", "out_dbscan", "outlier") have been added to outlier_subset.
iqr_features = outlier_subset[interest_columns]
q1 = iqr_features.quantile(0.25)
q3 = iqr_features.quantile(0.75)
iqr = q3 - q1

lower_bound = q1 - 1.5 * iqr
upper_bound = q3 + 1.5 * iqr

# NOTE(review): .all(axis=1) flags a client only when EVERY feature is outside
# its bounds; confirm this is intended rather than .any(axis=1).
outlier_mask = ((iqr_features < lower_bound) | (iqr_features > upper_bound)).all(axis=1)
# -1 marks an outlier, 0 a regular point.
outlier_subset["out_iqr"] = 0
outlier_subset.loc[outlier_mask, "out_iqr"] = -1
print(outlier_subset["out_iqr"].value_counts())

fig, ax = plt.subplots(1, 3, figsize=(16, 4))
sns.scatterplot(outlier_subset, x="mean_Consumo", y="mean_Injetada", hue="out_iqr", palette="Set1", ax=ax[0])
sns.scatterplot(outlier_subset, x="mean_Consumo", y="mean_Demanda", hue="out_iqr", palette="Set1", ax=ax[1])
sns.scatterplot(outlier_subset, x="mean_Injetada", y="mean_Demanda", hue="out_iqr", palette="Set1", ax=ax[2])
fig.subplots_adjust(wspace=0.4)
plt.show()



In [None]:
# Density-based outlier detection on the features of interest. Only
# interest_columns are passed to DBSCAN: using the whole frame would feed the
# previously computed flag columns (e.g. "out_iqr") in as features.
X_data = outlier_subset[interest_columns].fillna(0)
clusterizer = DBSCAN(eps=10000,min_samples=10)
clusters = clusterizer.fit_predict(X_data)
# DBSCAN labels noise points as -1; other values are cluster ids.
outlier_subset["out_dbscan"] = clusters
print(outlier_subset["out_dbscan"].value_counts())
fig, ax = plt.subplots(1, 3, figsize=(16, 4))
sns.scatterplot(outlier_subset, x="mean_Consumo", y="mean_Injetada", hue="out_dbscan", palette="Set1", ax=ax[0])
sns.scatterplot(outlier_subset, x="mean_Consumo", y="mean_Demanda", hue="out_dbscan", palette="Set1", ax=ax[1])
sns.scatterplot(outlier_subset, x="mean_Injetada", y="mean_Demanda", hue="out_dbscan", palette="Set1", ax=ax[2])
fig.subplots_adjust(wspace=0.4)
plt.show()


In [None]:
# Final flag: -1 when either detector marked the client with -1, else 0.
flagged_by_any = outlier_subset["out_dbscan"].eq(-1) | outlier_subset["out_iqr"].eq(-1)
outlier_subset["outlier"] = np.where(flagged_by_any, -1, 0)
print(outlier_subset["outlier"].value_counts())

feature_pairs = [
    ("mean_Consumo", "mean_Injetada"),
    ("mean_Consumo", "mean_Demanda"),
    ("mean_Injetada", "mean_Demanda"),
]
fig, ax = plt.subplots(nrows=1, ncols=3, figsize=(16, 4))
for panel, (x_column, y_column) in zip(ax, feature_pairs):
    sns.scatterplot(outlier_subset, x=x_column, y=y_column, hue="outlier", palette="Set1", ax=panel)
fig.subplots_adjust(wspace=0.4)
plt.show()

# Clusterização

In [None]:
# Row labels of the clients that were not flagged as outliers.
indexes_mask = outlier_subset.index[outlier_subset["outlier"] != -1]

# Clustering features: every column whose name mentions "Consumo" or "Injetada".
feature_names = data_feat_pivot.columns
is_consumo_or_injetada = feature_names.str.contains("Consumo") | feature_names.str.contains("Injetada")
X = data_feat_pivot.loc[indexes_mask, is_consumo_or_injetada].fillna(0)

# Leave out the "last_*" and "count_*" statistics.
is_last_or_count = X.columns.str.contains("last_") | X.columns.str.contains("count_")
X = X.loc[:, ~is_last_or_count]
X

In [None]:
# Scale the features with RobustScaler; the other scalers imported above
# (Normalizer, MaxAbsScaler, MinMaxScaler, StandardScaler) can be swapped in here.
normalizer = RobustScaler()
# A later cell stores the labels in X["cluster"]; that column is dropped so a
# re-run of this cell does not scale and cluster on the labels themselves.
X_normalized = normalizer.fit_transform(X.drop(columns="cluster", errors="ignore"))

# Elbow method: KMeans inertia for k = 2..10.
inertias = []
k_range = list(range(2, 11))
for k in k_range:
    kmeans = KMeans(n_clusters=k, random_state=42)
    kmeans.fit(X_normalized)
    inertias.append(kmeans.inertia_)

fig, ax = plt.subplots()
ax.plot(k_range, inertias)
ax.set_ylabel("Inercias")
ax.set_xlabel("Numero Clusters")
plt.show()

In [None]:
# 2-D PCA projection of the scaled features, used only for visualisation.
reduction = PCA(n_components=2, random_state=42)
X_reduced = reduction.fit_transform(X_normalized)
print(f"Explained Variance Ratio: {reduction.explained_variance_ratio_}")

k = 3  # chosen from the elbow plot
kmeans = KMeans(n_clusters=k, random_state=42)
labels = kmeans.fit_predict(X_normalized)
# Store the labels next to the original (unscaled) features.
X["cluster"] = labels

df = pd.DataFrame(X_reduced)
df["cluster"] = labels
print("Clusters count")
print(df["cluster"].value_counts())

# PCA projection: first component on x, second on y, so that the
# "Feature 1" / "Feature 2" axis labels match the plotted columns
# (the axes were swapped before).
fig1, ax1 = plt.subplots(figsize=(10, 6))
sns.scatterplot(df, x=0, y=1, hue="cluster", palette="Set1", ax=ax1)
ax1.set_xlabel('Feature 1')
ax1.set_ylabel('Feature 2')

# Same clusters on two of the original features.
fig2, ax2 = plt.subplots(figsize=(10, 6))
sns.scatterplot(X, x="mean_Consumo", y="mean_Injetada", hue="cluster", palette="Set1", ax=ax2)
plt.show()

In [None]:
# Mean of every feature per cluster, drawn as grouped bars with a legend.
# Plotting each cluster with a separate plt.bar call stacks the bars on the
# same x positions, so taller bars hide the others; groupby also handles any
# number of clusters instead of exactly three.
cluster_means = X.groupby("cluster").mean()

fig, ax = plt.subplots(figsize=(10, 5))
cluster_means.T.plot.bar(ax=ax)
ax.tick_params(axis="x", rotation=90)
ax.legend(title="cluster")
plt.show()

In [None]:
# Column means of the rows assigned to cluster 0.
X.loc[X["cluster"]==0].mean()

In [None]:
# Summary statistics of cluster 1, transposed so each feature is a row.
X.loc[X["cluster"]==1].describe().T

In [None]:
# Column means of the rows assigned to cluster 1.
X.loc[X["cluster"]==1].mean()

In [None]:
# Clients with no injected energy, non-zero total consumption and consumption recorded up to month 12.
data_feat_pivot.loc[(data_feat_pivot["total_Injetada"]==0) & (data_feat_pivot["total_Consumo"]!=0) & (data_feat_pivot["last_Consumo"]==12)]

In [None]:
# Attach the cluster labels to the feature table by row index.
# A "cluster" column left by a previous run is dropped first; without this a
# re-run creates "cluster_x"/"cluster_y" and the fillna below raises KeyError.
data_feat_pivot = pd.merge(
    data_feat_pivot.drop(columns="cluster", errors="ignore"),
    X[["cluster"]],
    how="left",
    left_index=True,
    right_index=True,
)
# Rows that were not clustered (absent from X) get -1.
data_feat_pivot["cluster"] = data_feat_pivot["cluster"].fillna(-1)
data_feat_pivot

In [None]:
# Feature rows of the clients assigned to cluster 0.
data_feat_pivot.loc[data_feat_pivot["cluster"]==0]

In [None]:
# Distribution of mean_TUSD per cluster (-1 = rows that were not clustered).
sns.boxplot(data_feat_pivot, x="cluster", y="mean_TUSD")

In [None]:
# Distribution of mean_Injetada per cluster.
sns.boxplot(X,x="cluster", y="mean_Injetada")

In [None]:
# Distribution of mean_Consumo per cluster.
sns.boxplot(X,x="cluster", y="mean_Consumo")

In [None]:
# Clustering features of the rows assigned to cluster 0.
X.loc[X["cluster"]==0]

In [None]:
# Scale the features with StandardScaler; the other scalers imported above
# (Normalizer, MaxAbsScaler, MinMaxScaler, RobustScaler) can be swapped in here.
normalizer = StandardScaler()
# X already carries the labels of the previous clustering in "cluster";
# they must not be used as an input feature.
X_normalized = normalizer.fit_transform(X.drop(columns="cluster", errors="ignore"))

# 2-D PCA projection, used only for visualisation.
reduction = PCA(n_components=2, random_state=42)
X_reduced = reduction.fit_transform(X_normalized)
print(f"Explained Variance Ratio: {reduction.explained_variance_ratio_}")

clusterizer = DBSCAN(eps=3, min_samples=10)
labels = clusterizer.fit_predict(X_normalized)
# Overwrites the previous labels; DBSCAN marks noise points with -1.
X["cluster"] = labels

df = pd.DataFrame(X_reduced)
df["cluster"] = labels
print("Clusters count")
print(df["cluster"].value_counts())

# PCA projection: first component on x, second on y, matching the axis labels.
fig1, ax1 = plt.subplots(figsize=(10, 6))
sns.scatterplot(df, x=0, y=1, hue="cluster", palette="Set1", ax=ax1)
ax1.set_xlabel('Feature 1')
ax1.set_ylabel('Feature 2')

# Clusters on the original scale. X only holds "Consumo"/"Injetada" columns,
# so mean_Demanda is read from data_feat_pivot for the same rows.
original_view = data_feat_pivot.loc[X.index, ["mean_Consumo", "mean_Demanda"]].assign(cluster=labels)
fig2, ax2 = plt.subplots(figsize=(10, 6))
sns.scatterplot(original_view, x="mean_Consumo", y="mean_Demanda", hue="cluster", palette="Set1", ax=ax2)
plt.show()

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans

# Elbow method on the clustering features for k = 1..10.
# The "cluster" label column stored in X by earlier cells is excluded so the
# labels are not clustered on.
# NOTE(review): the features are used unscaled here, unlike the earlier elbow
# cell - confirm this is intended.
elbow_features = X.drop(columns="cluster", errors="ignore")

inertias = []
for k in range(1, 11):
    kmeans = KMeans(n_clusters=k, random_state=0)
    kmeans.fit(elbow_features)
    inertias.append(kmeans.inertia_)

fig, ax = plt.subplots(figsize=(6, 4))
ax.plot(range(1, 11), inertias, 'o-')
ax.set_xlabel('Number of Clusters (k)')
ax.set_ylabel('Inertia')
ax.set_title('Elbow Method for Optimal k')
plt.show()

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
import scipy.cluster.hierarchy as shc
from sklearn.cluster import DBSCAN

from sklearn.preprocessing import RobustScaler

scaler = RobustScaler()

# Scale only the features: the "cluster" label column stored in X by earlier
# cells is excluded so previous labels do not influence this clustering.
X_scaled = scaler.fit_transform(X.drop(columns="cluster", errors="ignore"))

dbscan = DBSCAN(eps=2, min_samples=5)
db_clusters = dbscan.fit_predict(X_scaled)

# Copy of X whose "cluster" column holds the labels of this DBSCAN run
# (noise points are labelled -1).
x_cluster = X.copy()
x_cluster["cluster"] = db_clusters
var_list = X.columns


In [None]:
# Number of rows per cluster label in x_cluster.
x_cluster["cluster"].value_counts()

In [None]:

# One boxen plot per cluster showing the distribution of every feature.
for cltr in sorted(x_cluster["cluster"].unique()):
    # The constant "cluster" column is dropped so only features are drawn.
    df_temp = x_cluster.loc[x_cluster["cluster"] == cltr].drop(columns="cluster")
    fig, ax = plt.subplots(figsize=(10, 4))
    sns.boxenplot(df_temp, ax=ax)
    # Without a title the figures cannot be told apart.
    ax.set_title(f"Cluster {cltr} (n={len(df_temp)})")
    ax.tick_params(axis="x", rotation=90)
    plt.show()
    plt.close(fig)


In [None]:

# NOTE(review): the original cell was an unfinished draft
# ("for clusters in clusters" followed by a bare "sns.boxplot") that raised
# a SyntaxError and stopped Restart & Run All. It is completed here as one
# boxplot per feature, split by cluster - confirm this is the intended plot.
for feature in x_cluster.columns.drop("cluster"):
    fig, ax = plt.subplots(figsize=(10, 4))
    sns.boxplot(x_cluster, x="cluster", y=feature, ax=ax)
    ax.set_title(feature)
    plt.show()
    plt.close(fig)


In [None]:
# Interactive histogram of "Valores", coloured by billing item.
fig = px.histogram(data, x="Valores", color="Item Faturamento BW")
fig.show()

In [None]:
# One histogram of "Valores" per billing item, in order of first appearance.
for item in data["Item Faturamento BW"].unique():
    data_filtered = data[data["Item Faturamento BW"].eq(item)]
    fig = plt.figure(figsize=(10, 3))
    ax = sns.histplot(data_filtered, x="Valores")
    ax.set_title(item)
    fig.tight_layout()
    plt.show()

    

In [None]:
# For every billing item: monthly boxplot of "Valores" on top and the number
# of records per month underneath, sharing the x axis.
for item in data["Item Faturamento BW"].unique():
    data_filtered = data[data["Item Faturamento BW"].eq(item)]

    fig, (box_ax, count_ax) = plt.subplots(
        nrows=2,
        ncols=1,
        figsize=(8, 6),
        sharex=True,
        gridspec_kw={'height_ratios': [3, 1]},
    )

    # Top panel: distribution per month; x ticks are hidden because the axis is shared.
    sns.boxplot(data=data_filtered, x="Meses", y="Valores", ax=box_ax)
    box_ax.set_title(f"{item}")
    box_ax.tick_params(axis='x', which='both', bottom=False, top=False, labelbottom=False)

    # Bottom panel: how many records each month has.
    month_counts = data_filtered["Meses"].value_counts().reset_index()
    month_counts.columns = ["Meses", "Count"]
    barplot = sns.barplot(data=month_counts, x="Meses", y="Count", ax=count_ax)

    # Hide the frame of both panels.
    for panel in (box_ax, count_ax):
        for side in ("top", "right", "left", "bottom"):
            panel.spines[side].set_visible(False)

    # Write each bar's height just above it.
    for bar in barplot.patches:
        bar_top = bar.get_height()
        bar_center = bar.get_x() + bar.get_width() / 2.
        count_ax.annotate(f'{bar_top}', (bar_center, bar_top),
                          ha='center', va='bottom', fontsize=10)

    fig.tight_layout()
    plt.show()

In [None]:
# Display the `data` DataFrame.
data

In [None]:
# Number of non-null "Valores" per (billing item, month).
item_occ = data.groupby(["Item Faturamento BW", "Meses"], as_index=False)["Valores"].count()
# Months become strings (e.g. for use as category labels in a plot).
item_occ = item_occ.astype({"Meses": str})
item_occ


In [None]:
# Line chart of the record count per month, one line per billing item.
px.line(item_occ,x="Meses", y="Valores", color="Item Faturamento BW")

In [None]:
# Display the `data` DataFrame.
data

In [None]:
# Display the `data` DataFrame.
data

In [None]:
# Peek at the first group of the four-column grouping: print its key values
# one per line and leave its rows in `df` for the next cell.
group_columns = ["Empresa", "Classe de cálculo", "SubGrupo de Tensão", "Município"]
for group_key, df in data.groupby(group_columns):
    for key_value in group_key:
        print(key_value)
    # Only the first group is inspected.
    break


In [None]:
# Display the rows of the group captured by the loop in the previous cell.
df