In [None]:
import zipfile
from pathlib import Path

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns

In [None]:
# Location of the processed billing dataset, resolved against the current working directory.
processed_data_path = Path("../data/processed/CPFL-RS-Informações industrial grupo A.xlsx").resolve()

# The file name ends in .xlsx. A genuine Excel workbook is a ZIP container that
# pd.read_csv cannot parse, so choose the reader from the file's actual format:
# a real workbook goes through read_excel, while a delimited text file that merely
# carries this name is still read with read_csv exactly as before.
# (A missing file is not a ZIP either, so read_csv raises FileNotFoundError as before.)
if zipfile.is_zipfile(processed_data_path):
    data = pd.read_excel(processed_data_path)
else:
    data = pd.read_csv(processed_data_path)

# Preview only the first rows instead of rendering the whole table.
data.head()

In [None]:
# Fix the format of the "Meses" column: keep only the text before the first "."
# of each value's string form and store it as an integer.
def _month_to_int(raw_month):
    """Return the integer written before the first "." in ``str(raw_month)``."""
    whole_part, _, _ = str(raw_month).partition(".")
    return int(whole_part)


data["Meses"] = data["Meses"].map(_month_to_int)

In [None]:
# Remove every space character from the billing-item labels. Values are converted
# with str() first, so a NaN entry ends up as the literal text "nan".
data["Item Faturamento BW"] = data["Item Faturamento BW"].map(
    lambda label: str(label).replace(" ", "")
)

# Print the distinct labels in sorted order, prefixed with a two-digit running number.
distinct_items = np.sort(data["Item Faturamento BW"].unique())
for position, label in enumerate(distinct_items, start=1):
    print(f"{position:02d} - {label}")


In [None]:
# Rows whose billing-item label is the literal text "nan".
data[data["Item Faturamento BW"].eq("nan")]

In [None]:
# Reshape the long billing table: one row per combination of the `index` columns,
# one column per distinct value of "Item Faturamento BW", filled from "Valores".
index = [
    'Empresa',
    'Nº instalação',
    'Classe de cálculo',
    'SubGrupo de Tensão',
    'Município',
    "Meses",
]

columns = "Item Faturamento BW"
value = "Valores"

# NOTE(review): pivot_table combines duplicate (index, item) entries with its default
# aggregation, the mean — confirm that averaging rather than summing is intended.
data_piv = pd.pivot_table(data, values=value, index=index, columns=columns).reset_index()

# Items that do not occur for a row come out of the pivot as NaN. Plain "+" would turn
# the whole total into NaN whenever one component is missing, discarding the component
# that is present. Series.add(..., fill_value=0) treats a missing component as zero and
# still yields NaN when both components are missing.

# Presumably two spellings of the same item; fold the second into the first — TODO confirm.
data_piv["EnergAtvInj.mUCoPT-FPTE"] = data_piv["EnergAtvInj.mUCoPT-FPTE"].add(
    data_piv["EnergAtvInj.mUCoPT-FP-TE"], fill_value=0
)
data_piv["ConsumoAtivo"] = data_piv["ConsPontaBandVerde-TE"].add(
    data_piv["ConsumoFPontaBandVerde-TE"], fill_value=0
)
data_piv["ConsumoReativo"] = data_piv["ConsumoReativoExcForaPonta"].add(
    data_piv["ConsumoReativoExcPonta"], fill_value=0
)
# The sign of the injected-energy total is flipped.
data_piv["InjetadaAtiva"] = data_piv["EnergiaAtvInjetadaFpontaTE"].add(
    data_piv["EnergiaAtvInjetadaPontaTE"], fill_value=0
) * -1

# The second spelling has been merged above and is no longer needed.
data_piv = data_piv.drop("EnergAtvInj.mUCoPT-FP-TE", axis=1)
data_piv



In [None]:
# Demand-related billing items (the numbers appear to come from the sorted item listing):
#   05 - DemandaDifContrato[kW]-TUSD
#   06 - DemandaFPonta[kW]-TUSD
#   07 - DemandaPonta[kW]-TUSD
#   08 - Demanda[kW]-TUSD

# Rows with a strictly positive "Demanda[kW]-TUSD" value.
data_piv.loc[data_piv["Demanda[kW]-TUSD"].gt(0)]

In [None]:
# Scatter of "ConsumoAtivo" against "ConsumoReativo" over the rows of data_piv.
sns.scatterplot(data=data_piv, x="ConsumoAtivo", y="ConsumoReativo", ax=plt.gca())
plt.show()

In [None]:
# Scatter of "ConsPontaBandVerde-TE" against "EnergiaAtvInjetadaFpontaTE" over the rows of data_piv.
sns.scatterplot(
    data=data_piv,
    x="ConsPontaBandVerde-TE",
    y="EnergiaAtvInjetadaFpontaTE",
    ax=plt.gca(),
)
plt.show()

In [None]:
# Scatter of "UsoSist.Distr." against the derived "InjetadaAtiva" column of data_piv.
sns.scatterplot(data=data_piv, x="UsoSist.Distr.", y="InjetadaAtiva", ax=plt.gca())
plt.show()

In [None]:
# Scatter of the derived "ConsumoAtivo" against the derived "InjetadaAtiva" column of data_piv.
sns.scatterplot(data=data_piv, x="ConsumoAtivo", y="InjetadaAtiva", ax=plt.gca())
plt.show()

In [None]:
# Per-installation totals of every remaining column of data_piv.
# The descriptive columns are removed *before* aggregating. Summing them first would,
# if they hold text, concatenate the values group by group (or fail on mixed types)
# only for the result to be thrown away, and it would also add up "Meses".
data_client = (
    data_piv
    .drop(columns=["Empresa", "Classe de cálculo", "SubGrupo de Tensão", "Município", "Meses"])
    .groupby(["Nº instalação"])
    .sum()
)
data_client


In [None]:
# Per-installation totals: "ConsumoAtivo" against "ConsumoReativo", both axes logarithmic.
ax = sns.scatterplot(data=data_client, x="ConsumoAtivo", y="ConsumoReativo")
ax.set_xscale("log")
ax.set_yscale("log")
plt.show()

In [None]:
# Per-installation totals: "ConsumoAtivo" against "InjetadaAtiva" on linear axes
# (the log-scale call used in the previous plot is left disabled here).
ax = sns.scatterplot(data=data_client, x="ConsumoAtivo", y="InjetadaAtiva")
plt.show()

# Clustering

In [None]:
# Show the column labels of the pivoted table.
data_piv.columns

In [None]:
# Prepare the feature matrix for clustering: drop the descriptive columns of data_piv
# and replace missing values with 0.
X = (
    data_piv
    .drop(['Empresa', 'Nº instalação', 'Classe de cálculo', 'SubGrupo de Tensão', 'Município', 'Meses'], axis=1)
    # The KMeans cell further down stores its labels in data_piv["cluster"]. Discard that
    # column when it exists, so re-running this cell after clustering never feeds the
    # labels back in as a feature.
    .drop(columns=["cluster"], errors="ignore")
    .fillna(0)
)

In [None]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD, PCA
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import Normalizer, MaxAbsScaler, MinMaxScaler, RobustScaler, StandardScaler
import matplotlib.pyplot as plt

In [None]:
# Total explained-variance ratio of TruncatedSVD on X for 2 to 9 components.
component_counts = list(range(2, 10))
evr = []
for i in component_counts:
    svd = TruncatedSVD(n_components=i, random_state=42)
    X_reduced = svd.fit_transform(X)
    evr.append(svd.explained_variance_ratio_.sum())

# Plot against the real component counts: plt.plot(evr) on its own would number the
# x-axis 0..7, which reads as the wrong number of components.
plt.plot(component_counts, evr, marker="o")
plt.xlabel("Number of components")
plt.ylabel("Explained variance ratio (sum)")
plt.title("TruncatedSVD")
plt.show()

In [None]:
# Total explained-variance ratio of PCA on X for 2 to 9 components.
component_counts = list(range(2, 10))
evr = []
for i in component_counts:
    pca = PCA(n_components=i, random_state=42)
    X_reduced = pca.fit_transform(X)
    evr.append(pca.explained_variance_ratio_.sum())

# Plot against the real component counts: plt.plot(evr) on its own would number the
# x-axis 0..7, which reads as the wrong number of components.
plt.plot(component_counts, evr, marker="o")
plt.xlabel("Number of components")
plt.ylabel("Explained variance ratio (sum)")
plt.title("PCA")
plt.show()

In [None]:
# Rescale the rows of X with Normalizer. Other scalers tried in this spot:
# MaxAbsScaler, MinMaxScaler, StandardScaler, RobustScaler.
normalizer = Normalizer()
X_normalized = normalizer.fit_transform(X)

# KMeans clustering (approximate cosine similarity via L2-normalized vectors).
# The labels are stored on data_piv for later inspection.
k = 5
kmeans = KMeans(n_clusters=k, random_state=42)
labels = kmeans.fit(X_normalized).labels_
data_piv["cluster"] = labels

# Two SVD components of the normalized data, used here as plotting coordinates.
svd = TruncatedSVD(n_components=2, random_state=42)
X_reduced = svd.fit_transform(X_normalized)
df = pd.DataFrame(X_reduced).assign(clusters=labels)
print(df["clusters"].value_counts())

# Plot the results: the two components, colored by cluster label.
plt.figure(figsize=(10, 6))
sns.scatterplot(data=df, x=0, y=1, hue="clusters", palette=sns.color_palette())
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')

In [None]:
# Rows of data_piv whose cluster label is 1.
data_piv[data_piv["cluster"].eq(1)]

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans

# Finding optimal number of clusters using the Elbow Method: KMeans inertia for 1..10 clusters.
# NOTE(review): this fits on the raw X, whereas the clustering cell fits KMeans on
# X_normalized — confirm which scale the elbow is meant to be read on.
# The models are built inside a comprehension so the sweep does not overwrite the
# notebook-level `k` and `kmeans` that produced data_piv["cluster"].
candidate_ks = list(range(1, 11))
inertias = [
    KMeans(n_clusters=n_clusters, random_state=0).fit(X).inertia_
    for n_clusters in candidate_ks
]

plt.figure(figsize=(6, 4))
plt.plot(candidate_ks, inertias, 'o-')
plt.xlabel('Number of Clusters (k)')
plt.ylabel('Inertia')
plt.title('Elbow Method for Optimal k')
plt.show()

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
import scipy.cluster.hierarchy as shc
from sklearn.cluster import DBSCAN

from sklearn.preprocessing import RobustScaler

# Scale the features with RobustScaler, then cluster the scaled rows with DBSCAN.
# (KMeans and AgglomerativeClustering runs on the unscaled X were earlier experiments
# in this cell and are disabled.)
scaler = RobustScaler()
X_scaled = scaler.fit_transform(X)

dbscan = DBSCAN(eps=2, min_samples=5)
db_clusters = dbscan.fit(X_scaled).labels_

# Keep the feature names, and build a copy of the unscaled features with the DBSCAN
# label attached as an extra "cluster" column.
var_list = X.columns
x_cluster = X.assign(cluster=db_clusters)


In [None]:
# Number of rows assigned to each DBSCAN label.
x_cluster["cluster"].value_counts()

In [None]:

# One boxen plot of the feature distributions per DBSCAN label, in ascending label order.
for cltr in sorted(x_cluster["cluster"].unique()):
    # Leave the label column out of the plot: it is constant within a cluster and is
    # not one of the features being compared.
    df_temp = x_cluster.loc[x_cluster["cluster"] == cltr].drop(columns="cluster")
    plt.figure(figsize=(10, 4))
    ax = sns.boxenplot(data=df_temp)
    # Title each figure so the plots can be told apart when scrolling the output.
    ax.set_title(f"Cluster {cltr} ({len(df_temp)} rows)")
    plt.show()


In [None]:

# NOTE(review): this cell was an unfinished draft ("for clusters in clusters" followed
# by a bare "sns.boxplot") that raised a SyntaxError and stopped Run All. It is completed
# here as one box plot of the features per DBSCAN label — confirm this matches the intent.
for cluster_id in sorted(x_cluster["cluster"].unique()):
    cluster_rows = x_cluster.loc[x_cluster["cluster"] == cluster_id].drop(columns="cluster")
    plt.figure(figsize=(10, 4))
    ax = sns.boxplot(data=cluster_rows)
    ax.set_title(f"Cluster {cluster_id} ({len(cluster_rows)} rows)")
    plt.show()


In [None]:
# Interactive histogram of "Valores" over the whole table, colored by billing item.
fig = px.histogram(data_frame=data, x="Valores", color="Item Faturamento BW")
fig.show()

In [None]:
# One histogram of "Valores" per distinct billing item, each drawn in its own figure
# and titled with the item label.
for item in data["Item Faturamento BW"].unique():
    data_filtered = data[data["Item Faturamento BW"].eq(item)]
    plt.figure(figsize=(10, 3))
    ax = sns.histplot(data=data_filtered, x="Valores")
    ax.set(title=item)
    plt.tight_layout()
    plt.show()

    

In [None]:
# For every billing item: a box plot of "Valores" per month on top, and a bar chart with
# the number of records per month underneath, sharing the x-axis.
for item in data["Item Faturamento BW"].unique():
    data_filtered = data.loc[data["Item Faturamento BW"] == item]

    # Two stacked panels (2 rows, 1 column); the box plot gets three quarters of the height.
    fig, axes = plt.subplots(nrows=2, ncols=1, figsize=(8, 6), sharex=True, gridspec_kw={'height_ratios': [3, 1]})

    # Box plot (first panel)
    sns.boxplot(data=data_filtered, x="Meses", y="Valores", ax=axes[0])
    axes[0].set_title(f"{item}")

    # Hide the x-axis ticks and labels of the box plot; the bar chart below carries them.
    axes[0].tick_params(axis='x', which='both', bottom=False, top=False, labelbottom=False)

    # Bar chart (second panel): number of records per month, ordered by month explicitly
    # instead of relying on the plotting library to sort the categories.
    month_counts = (
        data_filtered["Meses"]
        .value_counts()
        .sort_index()
        .rename_axis("Meses")
        .reset_index(name="Count")
    )
    barplot = sns.barplot(data=month_counts, x="Meses", y="Count", ax=axes[1])

    # Remove spines (contours) from both panels
    for spine in ["top", "right", "left", "bottom"]:
        axes[0].spines[spine].set_visible(False)
        axes[1].spines[spine].set_visible(False)

    # Label each bar with its count. Bar heights are floats, so format them without
    # decimals ("12" rather than "12.0").
    for p in barplot.patches:
        axes[1].annotate(f'{p.get_height():.0f}',
                         (p.get_x() + p.get_width() / 2., p.get_height()),
                         ha='center', va='bottom', fontsize=10,)

    # Adjust layout and show the plot
    plt.tight_layout()
    plt.show()

In [None]:
# Display the loaded table for inspection.
data

In [None]:
# Count the non-null "Valores" entries for each (billing item, month) pair, and turn
# "Meses" into text in the resulting table.
item_occ = (
    data.groupby(["Item Faturamento BW", "Meses"])["Valores"]
    .count()
    .reset_index()
    .assign(Meses=lambda counts: counts["Meses"].astype(str))
)
item_occ


In [None]:
# Line chart of the per-month record counts, one line per billing item.
px.line(
    data_frame=item_occ,
    x="Meses",
    y="Valores",
    color="Item Faturamento BW",
)

In [None]:
# Display the loaded table once more for inspection.
data