In [1]:
# Preparação dos dados
import os

import numpy as np
import pandas as pd

%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import cross_val_score

sns.set_theme()

In [2]:
## Importar dataset teste

data_path = '../data/' if os.path.exists(
    '../data/') else 'https://raw.githubusercontent.com/kreativermario/Projeto-DECD/master/data/'

test_path = data_path + 'treated/prepared/numeric/no-dates/dataset-numeric-high-tensions-test-no-dates.csv'

test_df = pd.read_csv(test_path)

In [3]:
## Importar dataset treino

data_path = '../data/' if os.path.exists(
    '../data/') else 'https://raw.githubusercontent.com/kreativermario/Projeto-DECD/master/data/'

train_path = data_path + 'treated/prepared/numeric/no-dates/dataset-numeric-high-tensions-train-no-dates.csv'

train_df = pd.read_csv(train_path)

In [4]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4641 entries, 0 to 4640
Data columns (total 5 columns):
 #   Column                              Non-Null Count  Dtype
---  ------                              --------------  -----
 0   energia_ativa_alta_tensao_kwh       4641 non-null   int64
 1   cpes_domestico_alta_tensao          4641 non-null   int64
 2   cpes_nao_domestico_alta_tensao      4641 non-null   int64
 3   cpes_outros_alta_tensao             4641 non-null   int64
 4   densidade_populacional_pessoas_km2  4641 non-null   int64
dtypes: int64(5)
memory usage: 181.4 KB


In [5]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 273 entries, 0 to 272
Data columns (total 5 columns):
 #   Column                              Non-Null Count  Dtype
---  ------                              --------------  -----
 0   energia_ativa_alta_tensao_kwh       273 non-null    int64
 1   cpes_domestico_alta_tensao          273 non-null    int64
 2   cpes_nao_domestico_alta_tensao      273 non-null    int64
 3   cpes_outros_alta_tensao             273 non-null    int64
 4   densidade_populacional_pessoas_km2  273 non-null    int64
dtypes: int64(5)
memory usage: 10.8 KB


# Algoritmos de Aprendizagem supervisionada

In [6]:
regressors = {
    'Linear Regression': LinearRegression(),
    'k-NN': KNeighborsRegressor(n_neighbors=5),
    'Decision Tree': DecisionTreeRegressor(),
    'Random Forest': RandomForestRegressor(n_estimators=100),
    'SVM': SVR(kernel='linear', max_iter=10000),
    'MLP': MLPRegressor(hidden_layer_sizes=(8,4), max_iter=20000),
}

In [7]:
cv_scores = pd.Series({
    name: np.mean(cross_val_score(regressor, train_df.drop(columns=['energia_ativa_alta_tensao_kwh']), train_df['energia_ativa_alta_tensao_kwh'], cv=5))
    for name, regressor in regressors.items()
})

cv_scores

Linear Regression    0.525342
k-NN                -0.053549
Decision Tree        0.099493
Random Forest        0.323920
SVM                  0.469171
MLP                  0.555647
dtype: float64

In [8]:
best = cv_scores.sort_values(ascending=False).index[0]
best_regressor = regressors[best]
best_regressor.fit(train_df.drop(columns=['energia_ativa_alta_tensao_kwh']), train_df['energia_ativa_alta_tensao_kwh'])

print(f'### {best} ###')

y_pred = best_regressor.predict(test_df.drop(columns=['energia_ativa_alta_tensao_kwh']))

print('r2: {}'.format(r2_score(test_df['energia_ativa_alta_tensao_kwh'], y_pred)))
print('mse: {}'.format(mean_squared_error(test_df['energia_ativa_alta_tensao_kwh'], y_pred)))
print('mae: {}'.format(mean_absolute_error(test_df['energia_ativa_alta_tensao_kwh'], y_pred)))

### MLP ###
r2: -12.502062336036436
mse: 102590021955096.67
mae: 6536019.218060667


## Ordene os exemplos do conjunto de teste por ordem decrescente do erro da previsão do regressor e verifique se existe algum padrão relevante. 

In [9]:
df_error = test_df.copy()
df_error['error'] = np.abs(df_error['energia_ativa_alta_tensao_kwh'] - y_pred)

df_error.sort_values('error', ascending=False).head(20)

Unnamed: 0,energia_ativa_alta_tensao_kwh,cpes_domestico_alta_tensao,cpes_nao_domestico_alta_tensao,cpes_outros_alta_tensao,densidade_populacional_pessoas_km2,error
109,25962882,2,1226,0,5466,78570540.0
102,12592033,0,581,0,649,33657380.0
186,16932236,1,519,0,5753,33123240.0
253,7515787,0,465,0,1821,31691240.0
216,8451642,1,486,0,1215,31437320.0
200,9633627,1,494,0,639,29911250.0
108,9662301,0,490,0,231,28821370.0
251,10364095,1,482,0,667,28296010.0
39,5880938,0,380,0,308,24191290.0
111,7406105,1,366,0,1215,23165980.0


# Aprendizagem supervisionada com datasets normalizados

In [10]:
normalized_train_path = data_path + 'treated/prepared/numeric/normalized/train/dataset-numeric-high-tensions-2223-no-dates-zscore.csv'

normalized_train_df = pd.read_csv(normalized_train_path)

In [11]:
normalized_test_path = data_path + 'treated/prepared/numeric/normalized/test/dataset-numeric-high-tensions-2024-no-dates-zscore.csv'

normalized_test_df = pd.read_csv(normalized_test_path)

In [12]:
# Define regressors
normalized_regressors = {
    'Linear Regression': LinearRegression(),
    'Ridge': Ridge(alpha=1e-3, solver='auto'),
    'k-NN': KNeighborsRegressor(n_neighbors=5),
    'Decision Tree': DecisionTreeRegressor(),
    'Random Forest': RandomForestRegressor(n_estimators=100),
    'SVM': SVR(kernel='linear', max_iter=100000),
    'MLP': MLPRegressor(hidden_layer_sizes=(8,4), max_iter=20000)
}

In [13]:
cv_scores_normalized = pd.Series({
    name: np.mean(cross_val_score(regressor, normalized_train_df.drop(columns=['energia_ativa_alta_tensao_kwh']), normalized_train_df['energia_ativa_alta_tensao_kwh'], cv=5))
    for name, regressor in normalized_regressors.items()
})

cv_scores_normalized

Linear Regression    0.525342
Ridge                0.525342
k-NN                 0.180841
Decision Tree        0.067987
Random Forest        0.350053
SVM                  0.513216
MLP                  0.371212
dtype: float64

In [14]:
best_normalized = cv_scores.sort_values(ascending=False).index[0]
best_regressor_normalized = regressors[best_normalized]
best_regressor_normalized.fit(normalized_train_df.drop(columns=['energia_ativa_alta_tensao_kwh']), normalized_train_df['energia_ativa_alta_tensao_kwh'])

print(f'### {best_normalized} ###')

y_pred = best_regressor_normalized.predict(normalized_test_df.drop(columns=['energia_ativa_alta_tensao_kwh']))

print('r2: {}'.format(r2_score(normalized_test_df['energia_ativa_alta_tensao_kwh'], y_pred)))
print('mse: {}'.format(mean_squared_error(normalized_test_df['energia_ativa_alta_tensao_kwh'], y_pred)))
print('mae: {}'.format(mean_absolute_error(normalized_test_df['energia_ativa_alta_tensao_kwh'], y_pred)))

### MLP ###
r2: 0.7565297621758812
mse: 0.24347023782411883
mae: 0.274430773462974


In [15]:
df_error_normalized = normalized_test_df.copy()
df_error_normalized['error'] = np.abs(df_error_normalized['energia_ativa_alta_tensao_kwh'] - y_pred)

df_error_normalized.sort_values('error', ascending=False).head(20)

Unnamed: 0,energia_ativa_alta_tensao_kwh,cpes_domestico_alta_tensao,cpes_nao_domestico_alta_tensao,cpes_outros_alta_tensao,densidade_populacional_pessoas_km2,error
186,5.58809,3.887966,3.450074,0.0,6.352767,3.976028
117,2.070647,-0.194398,1.548032,0.0,1.588009,2.240852
102,4.013536,-0.194398,3.956197,0.0,0.390687,2.159603
108,2.950678,-0.194398,3.213339,0.0,-0.097587,2.034659
200,2.940275,3.887966,3.245992,0.0,0.379005,1.577505
158,2.871325,-0.194398,1.213337,0.0,0.116179,1.47587
248,0.65969,3.887966,1.588848,0.0,0.138373,1.36284
216,2.511471,3.887966,3.180686,0.0,1.051842,1.356595
167,0.506736,-0.194398,0.952113,0.0,0.266866,1.337691
98,0.133902,-0.194398,0.576602,0.0,1.104407,1.240941
