In [1]:
# Preparação dos dados
import os

import pandas as pd

%matplotlib inline
import numpy as np
import seaborn as sns
import pandas as pd
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import cross_val_score

sns.set_theme()

In [2]:
## Importar dataset teste

data_path = '../data/' if os.path.exists(
    '../data/') else 'https://raw.githubusercontent.com/kreativermario/Projeto-DECD/master/data/'

test_path = data_path + 'treated/prepared/numeric/no-dates/dataset-numeric-high-tensions-test-no-dates.csv'

test_df = pd.read_csv(test_path)

In [3]:
## Importar dataset treino
train_path = data_path + 'treated/prepared/numeric/no-dates/dataset-numeric-high-tensions-train-no-dates.csv'

train_df = pd.read_csv(train_path)

In [4]:
train_df.describe()

Unnamed: 0,energia_ativa_alta_tensao_kwh,cpes_domestico_alta_tensao,cpes_nao_domestico_alta_tensao,cpes_outros_alta_tensao,densidade_populacional_pessoas_km2
count,4641.0,4641.0,4641.0,4641.0,4641.0
mean,7185369.0,0.039647,95.380952,0.408533,314.542125
std,14565940.0,0.20694,121.117813,4.304804,856.169267
min,0.0,0.0,0.0,0.0,4.0
25%,457054.0,0.0,28.0,0.0,25.0
50%,1955336.0,0.0,51.0,0.0,66.0
75%,6360625.0,0.0,119.0,0.0,177.0
max,127947000.0,2.0,1224.0,75.0,7310.0


In [5]:
test_df.describe()

Unnamed: 0,energia_ativa_alta_tensao_kwh,cpes_domestico_alta_tensao,cpes_nao_domestico_alta_tensao,cpes_outros_alta_tensao,densidade_populacional_pessoas_km2
count,546.0,546.0,546.0,546.0,546.0
mean,4366219.0,0.047619,96.485348,0.0,314.542125
std,11086000.0,0.245181,122.640374,0.0,856.862054
min,0.0,0.0,0.0,0.0,4.0
25%,213598.2,0.0,29.0,0.0,25.0
50%,846397.0,0.0,52.0,0.0,66.0
75%,3183322.0,0.0,119.0,0.0,177.0
max,111888700.0,2.0,1229.0,0.0,7310.0


# Algoritmos de Aprendizagem supervisionada

## Dataset não normalizado

In [6]:
# Define regressors
regressors = {
    'Linear Regression': LinearRegression(),
    'Ridge': Ridge(alpha=1.0, solver='auto'),
    'Lasso': Lasso(alpha=1.0), 
    'ElasticNet': ElasticNet(alpha=1.0, l1_ratio=0.5),
    'k-NN': KNeighborsRegressor(n_neighbors=5),
    'Decision Tree': DecisionTreeRegressor(),
    'Random Forest': RandomForestRegressor(n_estimators=100),
    'SVM': SVR(kernel='linear', max_iter=10000),
    'MLP': MLPRegressor(hidden_layer_sizes=(8,4), max_iter=20000),
}

In [7]:
cv_scores = pd.Series({
    name: np.mean(cross_val_score(regressor, train_df.drop(columns=['energia_ativa_alta_tensao_kwh']), train_df['energia_ativa_alta_tensao_kwh'], cv=5))
    for name, regressor in regressors.items()
})

cv_scores

Linear Regression    0.525189
Ridge                0.525455
Lasso                0.525189
ElasticNet           0.546666
k-NN                -0.054153
Decision Tree        0.297696
Random Forest        0.310358
SVM                  0.468917
MLP                  0.549180
dtype: float64

In [8]:
best = cv_scores.sort_values(ascending=False).index[0]
best_regressor = regressors[best]
best_regressor.fit(train_df.drop(columns=['energia_ativa_alta_tensao_kwh']), train_df['energia_ativa_alta_tensao_kwh'])

print(f'### {best} ###')

y_pred = best_regressor.predict(test_df.drop(columns=['energia_ativa_alta_tensao_kwh']))

print('r2: {}'.format(r2_score(test_df['energia_ativa_alta_tensao_kwh'], y_pred)))
print('mse: {}'.format(mean_squared_error(test_df['energia_ativa_alta_tensao_kwh'], y_pred)))
print('mae: {}'.format(mean_absolute_error(test_df['energia_ativa_alta_tensao_kwh'], y_pred)))

### MLP ###
r2: 0.13912706288630095
mse: 105606958847795.28
mae: 5736596.207003404


### Ordenar os exemplos do conjunto de teste por ordem decrescente do erro da previsão do regressor e verificar se existe algum padrão relevante. 


In [9]:
df_error = test_df.copy()
df_error['error'] = np.abs(df_error['energia_ativa_alta_tensao_kwh'] - y_pred)

df_error.sort_values('error', ascending=False).head(20)

Unnamed: 0,energia_ativa_alta_tensao_kwh,cpes_domestico_alta_tensao,cpes_nao_domestico_alta_tensao,cpes_outros_alta_tensao,densidade_populacional_pessoas_km2,error
219,21772335,2,1229,0,5466,82849230.0
424,79685358,0,203,0,533,62996790.0
430,68130804,0,70,0,71,62527550.0
234,86976336,0,286,0,1674,61948600.0
180,69370033,0,151,0,157,57341220.0
414,71168543,0,167,0,1770,55210230.0
166,53710759,0,81,0,246,46962670.0
373,12514870,1,521,0,5753,37605900.0
205,11613559,0,581,0,649,34585380.0
433,7667556,1,488,0,1215,32329810.0


## Aprendizagem supervisionada com datasets normalizados

In [10]:
normalized_train_path = data_path + 'treated/prepared/numeric/normalized/train/dataset-numeric-high-tensions-2223-no-dates-zscore.csv'

normalized_train_df = pd.read_csv(normalized_train_path)

In [11]:
normalized_test_path = data_path + 'treated/prepared/numeric/normalized/test/dataset-numeric-high-tensions-2024-no-dates-zscore.csv'

normalized_test_df = pd.read_csv(normalized_test_path)

In [12]:
# Define regressors
normalized_regressors = {
    'Linear Regression': LinearRegression(),
    'Ridge': Ridge(alpha=1e-3, solver='auto'),
    'k-NN': KNeighborsRegressor(n_neighbors=5),
    'Decision Tree': DecisionTreeRegressor(),
    'Random Forest': RandomForestRegressor(n_estimators=100),
    'SVM': SVR(kernel='linear', max_iter=100000),
    'MLP': MLPRegressor(hidden_layer_sizes=(8,4), max_iter=20000)
}

In [13]:
cv_scores_normalized = pd.Series({
    name: np.mean(cross_val_score(regressor, normalized_train_df.drop(columns=['energia_ativa_alta_tensao_kwh']), normalized_train_df['energia_ativa_alta_tensao_kwh'], cv=5))
    for name, regressor in normalized_regressors.items()
})

cv_scores_normalized

Linear Regression    0.525189
Ridge                0.525189
k-NN                 0.180905
Decision Tree        0.232800
Random Forest        0.339819
SVM                  0.512977
MLP                  0.200165
dtype: float64

In [14]:
best_normalized = cv_scores.sort_values(ascending=False).index[0]
best_regressor_normalized = regressors[best_normalized]
best_regressor_normalized.fit(normalized_train_df.drop(columns=['energia_ativa_alta_tensao_kwh']), normalized_train_df['energia_ativa_alta_tensao_kwh'])

print(f'### {best_normalized} ###')

y_pred = best_regressor_normalized.predict(normalized_test_df.drop(columns=['energia_ativa_alta_tensao_kwh']))

print('r2: {}'.format(r2_score(normalized_test_df['energia_ativa_alta_tensao_kwh'], y_pred)))
print('mse: {}'.format(mean_squared_error(normalized_test_df['energia_ativa_alta_tensao_kwh'], y_pred)))
print('mae: {}'.format(mean_absolute_error(normalized_test_df['energia_ativa_alta_tensao_kwh'], y_pred)))

### MLP ###
r2: 0.35663419536690255
mse: 0.6433658046330973
mae: 0.354728854584174


In [15]:
df_error_normalized = normalized_test_df.copy()
df_error_normalized['error'] = np.abs(df_error_normalized['energia_ativa_alta_tensao_kwh'] - y_pred)

df_error_normalized.sort_values('error', ascending=False).head(20)

Unnamed: 0,energia_ativa_alta_tensao_kwh,cpes_domestico_alta_tensao,cpes_nao_domestico_alta_tensao,cpes_outros_alta_tensao,densidade_populacional_pessoas_km2,error
430,5.757087,-0.194398,-0.216157,0.0,-0.284486,5.999678
219,1.571539,7.97033,9.242904,0.0,6.017517,5.818823
424,6.800308,-0.194398,0.869309,0.0,0.255185,5.778237
180,5.868973,-0.194398,0.444916,0.0,-0.184028,5.494895
234,7.458586,-0.194398,1.546705,0.0,1.588009,5.211779
414,6.031354,-0.194398,0.575498,0.0,1.700148,4.900615
166,4.45515,-0.194398,-0.126382,0.0,-0.080065,4.629214
70,4.234446,-0.194398,1.293701,0.0,0.123187,3.192927
496,4.328631,3.887966,1.587512,0.0,0.138373,2.903693
502,4.816107,3.887966,3.146339,0.0,0.411713,2.782874


### Ordenar os exemplos do conjunto de teste por ordem decrescente do erro da previsão do regressor e verificar se existe algum padrão relevante. 


In [16]:
df_error = normalized_test_df.copy()
df_error['error'] = np.abs(df_error['energia_ativa_alta_tensao_kwh'] - y_pred)

df_error.sort_values('error', ascending=False).head(20)

Unnamed: 0,energia_ativa_alta_tensao_kwh,cpes_domestico_alta_tensao,cpes_nao_domestico_alta_tensao,cpes_outros_alta_tensao,densidade_populacional_pessoas_km2,error
430,5.757087,-0.194398,-0.216157,0.0,-0.284486,5.999678
219,1.571539,7.97033,9.242904,0.0,6.017517,5.818823
424,6.800308,-0.194398,0.869309,0.0,0.255185,5.778237
180,5.868973,-0.194398,0.444916,0.0,-0.184028,5.494895
234,7.458586,-0.194398,1.546705,0.0,1.588009,5.211779
414,6.031354,-0.194398,0.575498,0.0,1.700148,4.900615
166,4.45515,-0.194398,-0.126382,0.0,-0.080065,4.629214
70,4.234446,-0.194398,1.293701,0.0,0.123187,3.192927
496,4.328631,3.887966,1.587512,0.0,0.138373,2.903693
502,4.816107,3.887966,3.146339,0.0,0.411713,2.782874
