In [1]:
# Preparação dos dados
import os

import numpy as np
import pandas as pd

%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import cross_val_score

sns.set_theme()

In [2]:
## Importar dataset teste

data_path = '../data/' if os.path.exists(
    '../data/') else 'https://raw.githubusercontent.com/kreativermario/Projeto-DECD/master/data/'

test_path = data_path + 'treated/prepared/numeric/no-dates/dataset-numeric-high-tensions-test-no-dates.csv'

test_df = pd.read_csv(test_path)

In [3]:
## Importar dataset treino
train_path = data_path + 'treated/prepared/numeric/no-dates/dataset-numeric-high-tensions-train-no-dates.csv'

train_df = pd.read_csv(train_path)

In [4]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4641 entries, 0 to 4640
Data columns (total 5 columns):
 #   Column                              Non-Null Count  Dtype
---  ------                              --------------  -----
 0   energia_ativa_alta_tensao_kwh       4641 non-null   int64
 1   cpes_domestico_alta_tensao          4641 non-null   int64
 2   cpes_nao_domestico_alta_tensao      4641 non-null   int64
 3   cpes_outros_alta_tensao             4641 non-null   int64
 4   densidade_populacional_pessoas_km2  4641 non-null   int64
dtypes: int64(5)
memory usage: 181.4 KB


In [5]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 273 entries, 0 to 272
Data columns (total 5 columns):
 #   Column                              Non-Null Count  Dtype
---  ------                              --------------  -----
 0   energia_ativa_alta_tensao_kwh       273 non-null    int64
 1   cpes_domestico_alta_tensao          273 non-null    int64
 2   cpes_nao_domestico_alta_tensao      273 non-null    int64
 3   cpes_outros_alta_tensao             273 non-null    int64
 4   densidade_populacional_pessoas_km2  273 non-null    int64
dtypes: int64(5)
memory usage: 10.8 KB


# Algoritmos de Aprendizagem supervisionada

In [6]:
# Define regressors
regressors = {
    'Linear Regression': LinearRegression(),
    'Ridge': Ridge(alpha=1.0, solver='auto'),  # Adjust alpha if needed
    'Lasso': Lasso(alpha=1.0),  # Adjust alpha if needed
    'ElasticNet': ElasticNet(alpha=1.0, l1_ratio=0.5),  # Adjust alpha and l1_ratio if needed
    'k-NN': KNeighborsRegressor(n_neighbors=5),
    'Decision Tree': DecisionTreeRegressor(),
    'Random Forest': RandomForestRegressor(n_estimators=100),
    'SVM': SVR(kernel='linear', max_iter=10000),
    'MLP': MLPRegressor(hidden_layer_sizes=(8,4), max_iter=20000),
}

In [7]:
cv_scores = pd.Series({
    name: np.mean(cross_val_score(regressor, train_df.drop(columns=['energia_ativa_alta_tensao_kwh']), train_df['energia_ativa_alta_tensao_kwh'], cv=5))
    for name, regressor in regressors.items()
})

cv_scores

Linear Regression    0.525342
Ridge                0.525608
Lasso                0.525342
ElasticNet           0.546765
k-NN                -0.053549
Decision Tree        0.107463
Random Forest        0.313042
SVM                  0.469171
MLP                  0.559093
dtype: float64

In [8]:
best = cv_scores.sort_values(ascending=False).index[0]
best_regressor = regressors[best]
best_regressor.fit(train_df.drop(columns=['energia_ativa_alta_tensao_kwh']), train_df['energia_ativa_alta_tensao_kwh'])

print(f'### {best} ###')

y_pred = best_regressor.predict(test_df.drop(columns=['energia_ativa_alta_tensao_kwh']))

print('r2: {}'.format(r2_score(test_df['energia_ativa_alta_tensao_kwh'], y_pred)))
print('mse: {}'.format(mean_squared_error(test_df['energia_ativa_alta_tensao_kwh'], y_pred)))
print('mae: {}'.format(mean_absolute_error(test_df['energia_ativa_alta_tensao_kwh'], y_pred)))

### MLP ###
r2: -12.467247207610427
mse: 102325493122337.47
mae: 6526089.422055636


## Ordene os exemplos do conjunto de teste por ordem decrescente do erro da previsão do regressor e verifique se existe algum padrão relevante. 

In [9]:
df_error = test_df.copy()
df_error['error'] = np.abs(df_error['energia_ativa_alta_tensao_kwh'] - y_pred)

df_error.sort_values('error', ascending=False).head(20)

Unnamed: 0,energia_ativa_alta_tensao_kwh,cpes_domestico_alta_tensao,cpes_nao_domestico_alta_tensao,cpes_outros_alta_tensao,densidade_populacional_pessoas_km2,error
109,25962882,2,1226,0,5466,78462800.0
102,12592033,0,581,0,649,33681960.0
186,16932236,1,519,0,5753,32930190.0
253,7515787,0,465,0,1821,31655110.0
216,8451642,1,486,0,1215,31431000.0
200,9633627,1,494,0,639,29929860.0
108,9662301,0,490,0,231,28854320.0
251,10364095,1,482,0,667,28312250.0
39,5880938,0,380,0,308,24209980.0
111,7406105,1,366,0,1215,23147620.0


# Aprendizagem supervisionada com datasets normalizados

In [10]:
normalized_train_path = data_path + 'treated/prepared/numeric/normalized/train/dataset-numeric-high-tensions-2223-no-dates-zscore.csv'

normalized_train_df = pd.read_csv(normalized_train_path)

In [11]:
normalized_test_path = data_path + 'treated/prepared/numeric/normalized/test/dataset-numeric-high-tensions-2024-no-dates-zscore.csv'

normalized_test_df = pd.read_csv(normalized_test_path)

In [12]:
# Define regressors
normalized_regressors = {
    'Linear Regression': LinearRegression(),
    'Ridge': Ridge(alpha=1e-3, solver='auto'),
    'k-NN': KNeighborsRegressor(n_neighbors=5),
    'Decision Tree': DecisionTreeRegressor(),
    'Random Forest': RandomForestRegressor(n_estimators=100),
    'SVM': SVR(kernel='linear', max_iter=100000),
    'MLP': MLPRegressor(hidden_layer_sizes=(8,4), max_iter=20000)
}

In [13]:
cv_scores_normalized = pd.Series({
    name: np.mean(cross_val_score(regressor, normalized_train_df.drop(columns=['energia_ativa_alta_tensao_kwh']), normalized_train_df['energia_ativa_alta_tensao_kwh'], cv=5))
    for name, regressor in normalized_regressors.items()
})

cv_scores_normalized

Linear Regression    0.525342
Ridge                0.525342
k-NN                 0.180841
Decision Tree        0.229094
Random Forest        0.317098
SVM                  0.513216
MLP                  0.361249
dtype: float64

In [14]:
best_normalized = cv_scores.sort_values(ascending=False).index[0]
best_regressor_normalized = regressors[best_normalized]
best_regressor_normalized.fit(normalized_train_df.drop(columns=['energia_ativa_alta_tensao_kwh']), normalized_train_df['energia_ativa_alta_tensao_kwh'])

print(f'### {best_normalized} ###')

y_pred = best_regressor_normalized.predict(normalized_test_df.drop(columns=['energia_ativa_alta_tensao_kwh']))

print('r2: {}'.format(r2_score(normalized_test_df['energia_ativa_alta_tensao_kwh'], y_pred)))
print('mse: {}'.format(mean_squared_error(normalized_test_df['energia_ativa_alta_tensao_kwh'], y_pred)))
print('mae: {}'.format(mean_absolute_error(normalized_test_df['energia_ativa_alta_tensao_kwh'], y_pred)))

### MLP ###
r2: 0.8070667745138976
mse: 0.1929332254861024
mae: 0.23876231925457972


In [15]:
df_error_normalized = normalized_test_df.copy()
df_error_normalized['error'] = np.abs(df_error_normalized['energia_ativa_alta_tensao_kwh'] - y_pred)

df_error_normalized.sort_values('error', ascending=False).head(20)

Unnamed: 0,energia_ativa_alta_tensao_kwh,cpes_domestico_alta_tensao,cpes_nao_domestico_alta_tensao,cpes_outros_alta_tensao,densidade_populacional_pessoas_km2,error
186,5.58809,3.887966,3.450074,0.0,6.352767,3.169996
102,4.013536,-0.194398,3.956197,0.0,0.390687,2.518237
108,2.950678,-0.194398,3.213339,0.0,-0.097587,1.873522
158,2.871325,-0.194398,1.213337,0.0,0.116179,1.770836
251,3.205277,3.887966,3.148033,0.0,0.411713,1.439724
49,3.178482,-0.194398,2.315379,0.0,0.883633,1.42321
109,8.864259,7.97033,9.221507,0.0,6.017517,1.309189
204,2.421974,-0.194398,1.115378,0.0,0.210796,1.241312
200,2.940275,3.887966,3.245992,0.0,0.379005,1.150703
60,0.649553,-0.194398,0.960276,0.0,2.197767,1.0936
