# ⚙️ Pré-processamento dos Dados

In [2]:
# Importar bibliotecas necessárias
# !pip install pandas scikit_learn numpy
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
import numpy as np

In [3]:
# Abrir o ficheiro
df = pd.read_parquet('datasets/consumo_eredes_customizado.parquet')

ImportError: Unable to find a usable engine; tried using: 'pyarrow', 'fastparquet'.
A suitable version of pyarrow or fastparquet is required for parquet support.
Trying to import the above resulted in these errors:
 - Missing optional dependency 'pyarrow'. pyarrow is required for parquet support. Use pip or conda to install pyarrow.
 - Missing optional dependency 'fastparquet'. fastparquet is required for parquet support. Use pip or conda to install fastparquet.

# MinMaxScaler [0, 1]
- Valores baixos de consumo (ex: 839.17) → ficam perto de 0
- Valores mais altos de consumo (ex: 11175.02) → ficam mais perto de 1

In [3]:
# Instanciar o scaler
scaler = MinMaxScaler()

# Aplicar no consumo energético
df['EnergyNormalized'] = scaler.fit_transform(df[['ActiveEnergy(kWh)']])

# StandardScaler [-♾️, +♾️]
- positivos -> acima da media
- negativos -> abaixo da media

In [4]:
scaler = StandardScaler()
df['EnergyStandardized'] = scaler.fit_transform(df[['ActiveEnergy(kWh)']])

In [5]:
df.tail()

Unnamed: 0,DateTime,Date,Hour,ZipCode,ActiveEnergy(kWh),Day,Month,Year,IsWeekend,TimeOfDay,DayOfTheWeek,Season,Temperature,PopulationDensity,EnergyNormalized,EnergyStandardized
3719403,2023-09-30 23:00:00,2023-09-30,23,1685,3588.766159,30,9,2023,1,Noite,Sábado,Outono,21.8,5455.23,0.023778,-0.562608
3719404,2023-09-30 23:00:00,2023-09-30,23,3740,1849.719996,30,9,2023,1,Noite,Sábado,Outono,22.4,2788.79,0.012252,-0.713159
3719405,2023-09-30 23:00:00,2023-09-30,23,3750,12433.094481,30,9,2023,1,Noite,Sábado,Outono,22.4,2788.79,0.082397,0.203054
3719406,2023-09-30 23:00:00,2023-09-30,23,3680,2393.029205,30,9,2023,1,Noite,Sábado,Outono,20.2,196.31,0.015853,-0.666124
3719407,2023-09-30 23:00:00,2023-09-30,23,8970,720.20691,30,9,2023,1,Noite,Sábado,Outono,22.5,333.82,0.004766,-0.810942


# Criação de conjuntos de treino, validação e teste
- train (nov 2022 - jun 2023) -> 70%
- validation (jul 2023 - ago 2023) -> 15%
- train (set 2023) -> 15%

In [6]:
# Divisão com base no tempo

# Definir os limites
train_df = df[df['DateTime'] < '2023-07-01']
val_df   = df[(df['DateTime'] >= '2023-07-01') & (df['DateTime'] < '2023-09-01')]
test_df  = df[df['DateTime'] >= '2023-09-01']

In [7]:
# Separar features e target
features = ['DayOfTheWeek', 'DateTime', 'ZipCode']
target = 'EnergyNormalized' # usar este em vez do ActiveEnergy(kWh), porque ha melhor performance no modelo e modelos sensiveis a escala

X_train, y_train = train_df[features], train_df[target]
X_val, y_val     = val_df[features], val_df[target]
X_test, y_test   = test_df[features], test_df[target]

# Criação de janelas de tempo (`windowing`) para LSTM/CNN

In [8]:
def create_windows(data, window_size):
    X, y = [], []
    for i in range(len(data) - window_size):
        X.append(data[i:i+window_size])
        y.append(data[i+window_size])
    return np.array(X), np.array(y)

window_size = 24  # Define o número de passos anteriores (ex: 24 horas)

# Só usamos a coluna normalizada
train_series = train_df['EnergyNormalized'].values
val_series   = val_df['EnergyNormalized'].values
test_series  = test_df['EnergyNormalized'].values

# Criar janelas
X_train_win, y_train_win = create_windows(train_series, window_size)
X_val_win, y_val_win = create_windows(val_series, window_size)
X_test_win, y_test_win = create_windows(test_series, window_size)

# Reshape para LSTM/CNN: (amostras, passos temporais, 1)
X_train_win = X_train_win.reshape((X_train_win.shape[0], X_train_win.shape[1], 1))
X_val_win   = X_val_win.reshape((X_val_win.shape[0], X_val_win.shape[1], 1))
X_test_win  = X_test_win.reshape((X_test_win.shape[0], X_test_win.shape[1], 1))

print("Formatos finais:")
print("X_train:", X_train_win.shape, "| y_train:", y_train_win.shape)
print("X_val  :", X_val_win.shape, "| y_val  :", y_val_win.shape)
print("X_test :", X_test_win.shape, "| y_test :", y_test_win.shape)

Formatos finais:
X_train: (2694883, 24, 1) | y_train: (2694883,)
X_val  : (690403, 24, 1) | y_val  : (690403,)
X_test : (334050, 24, 1) | y_test : (334050,)


In [9]:
df.head()

Unnamed: 0,DateTime,Date,Hour,ZipCode,ActiveEnergy(kWh),Day,Month,Year,IsWeekend,TimeOfDay,DayOfTheWeek,Season,Temperature,PopulationDensity,EnergyNormalized,EnergyStandardized
0,2022-11-01,2022-11-01,0,1000,9328.306723,1,11,2022,0,Noite,Terça,Outono,15.5,5455.23,0.061819,-0.065731
1,2022-11-01,2022-11-01,0,3045,4293.076725,1,11,2022,0,Noite,Terça,Outono,11.4,440.88,0.028446,-0.501635
2,2022-11-01,2022-11-01,0,3050,6608.606545,1,11,2022,0,Noite,Terça,Outono,11.4,440.88,0.043793,-0.301178
3,2022-11-01,2022-11-01,0,3060,16832.631994,1,11,2022,0,Noite,Terça,Outono,11.4,440.88,0.111557,0.583926
4,2022-11-01,2022-11-01,0,3070,5894.381217,1,11,2022,0,Noite,Terça,Outono,11.4,440.88,0.03906,-0.363009


In [10]:
df.to_parquet("datasets/consumo_eredes_normalizado.parquet", index=False)
print("✅ Ficheiro combinado guardado em: datasets/consumo_eredes_normalizado.parquet")

✅ Ficheiro combinado guardado em: datasets/consumo_eredes_normalizado.parquet
