# ⚙️ Pré-processamento dos Dados

In [20]:
# Importar bibliotecas necessárias
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
import numpy as np

In [21]:
# Abrir o ficheiro com ; como separador
df = pd.read_csv('datasets/consumo_eredes_limpo.csv', sep=',')

# MinMaxScaler [0, 1]
- Valores baixos de consumo (ex: 839.17) → ficam perto de 0
- Valores mais altos de consumo (ex: 11175.02) → ficam mais perto de 1

In [22]:
# Instanciar o scaler
scaler = MinMaxScaler()

# Aplicar no consumo energético
df['Energy_Normalized'] = scaler.fit_transform(df[['Active Energy (kWh)']])

# StandardScaler [-♾️, +♾️]
- positivos -> acima da media
- negativos -> abaixo da media

In [23]:
scaler = StandardScaler()
df['Energy_Standardized'] = scaler.fit_transform(df[['Active Energy (kWh)']])

In [24]:
df.tail()

Unnamed: 0,Date,Hour,Zip Code,Active Energy (kWh),Day of the Week,Datetime,Day,Month,Year,Is_Weekend,Is_Night,Season,Energy_Normalized,Energy_Standardized
3727419,2023-09-30,23,4560,9738.156651,Sábado,2023-09-30 23:00:00,30,9,2023,1,0,Outono,0.064536,-0.028398
3727420,2023-09-30,23,2845,7462.400572,Sábado,2023-09-30 23:00:00,30,9,2023,1,0,Outono,0.049452,-0.225466
3727421,2023-09-30,23,3025,3889.286949,Sábado,2023-09-30 23:00:00,30,9,2023,1,0,Outono,0.02577,-0.534878
3727422,2023-09-30,23,5340,2527.463359,Sábado,2023-09-30 23:00:00,30,9,2023,1,0,Outono,0.016744,-0.652805
3727423,2023-09-30,23,7780,34065.882374,Sábado,2023-09-30 23:00:00,30,9,2023,1,0,Outono,0.225777,2.078251


# Criação de conjuntos de treino, validação e teste
- train (nov 2022 - jun 2023) -> 70%
- validation (jul 2023 - ago 2023) -> 15%
- train (set 2023) -> 15%

In [25]:
# Divisão com base no tempo

# Definir os limites
train_df = df[df['Datetime'] < '2023-07-01']
val_df   = df[(df['Datetime'] >= '2023-07-01') & (df['Datetime'] < '2023-09-01')]
test_df  = df[df['Datetime'] >= '2023-09-01']

In [26]:
# Separar features e target
features = ['Day of the Week', 'Datetime', 'Zip Code']
target = 'Energy_Normalized' # usar este em vez do Active Energy (kWh), porque ha melhor performance no modelo e modelos sensiveis a escala

X_train, y_train = train_df[features], train_df[target]
X_val, y_val     = val_df[features], val_df[target]
X_test, y_test   = test_df[features], test_df[target]

# Criação de janelas de tempo (`windowing`) para LSTM/CNN

In [27]:
def create_windows(data, window_size):
    X, y = [], []
    for i in range(len(data) - window_size):
        X.append(data[i:i+window_size])
        y.append(data[i+window_size])
    return np.array(X), np.array(y)

window_size = 24  # Define o número de passos anteriores (ex: 24 horas)

# Só usamos a coluna normalizada
train_series = train_df['Energy_Normalized'].values
val_series   = val_df['Energy_Normalized'].values
test_series  = test_df['Energy_Normalized'].values

# Criar janelas
X_train_win, y_train_win = create_windows(train_series, window_size)
X_val_win, y_val_win     = create_windows(val_series, window_size)
X_test_win, y_test_win   = create_windows(test_series, window_size)

# Reshape para LSTM/CNN: (amostras, passos temporais, 1)
X_train_win = X_train_win.reshape((X_train_win.shape[0], X_train_win.shape[1], 1))
X_val_win   = X_val_win.reshape((X_val_win.shape[0], X_val_win.shape[1], 1))
X_test_win  = X_test_win.reshape((X_test_win.shape[0], X_test_win.shape[1], 1))

print("Formatos finais:")
print("X_train:", X_train_win.shape, "| y_train:", y_train_win.shape)
print("X_val  :", X_val_win.shape, "| y_val  :", y_val_win.shape)
print("X_test :", X_test_win.shape, "| y_test :", y_test_win.shape)

Formatos finais:
X_train: (2700691, 24, 1) | y_train: (2700691,)
X_val  : (691891, 24, 1) | y_val  : (691891,)
X_test : (334770, 24, 1) | y_test : (334770,)


In [28]:
df.head()

Unnamed: 0,Date,Hour,Zip Code,Active Energy (kWh),Day of the Week,Datetime,Day,Month,Year,Is_Weekend,Is_Night,Season,Energy_Normalized,Energy_Standardized
0,2022-11-01,0,5350,797.992437,Terça,2022-11-01 00:00:00,1,11,2022,0,1,Outono,0.005281,-0.802567
1,2022-11-01,0,3780,10290.2904,Terça,2022-11-01 00:00:00,1,11,2022,0,1,Outono,0.068195,0.019414
2,2022-11-01,0,7595,422.903665,Terça,2022-11-01 00:00:00,1,11,2022,0,1,Outono,0.002795,-0.835048
3,2022-11-01,0,2950,19649.659297,Terça,2022-11-01 00:00:00,1,11,2022,0,1,Outono,0.130228,0.829885
4,2022-11-01,0,7320,974.728282,Terça,2022-11-01 00:00:00,1,11,2022,0,1,Outono,0.006453,-0.787263


In [29]:
df.to_csv('datasets/consumo_eredes_limpo1.csv', index=False)