In [1]:
# Preparação dos dados
import os

import pandas as pd

%matplotlib inline
import numpy as np
import seaborn as sns
import pandas as pd
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import cross_val_score

sns.set_theme()

In [2]:
## Importar dataset teste

data_path = '../data/' if os.path.exists(
    '../data/') else 'https://raw.githubusercontent.com/kreativermario/Projeto-DECD/master/data/'

test_path = data_path + 'treated/prepared/numeric/no-dates/dataset-numeric-high-tensions-test-no-dates.csv'

test_df = pd.read_csv(test_path)

In [3]:
## Importar dataset treino
train_path = data_path + 'treated/prepared/numeric/no-dates/dataset-numeric-high-tensions-train-no-dates.csv'

train_df = pd.read_csv(train_path)

In [4]:
train_df.describe()

Unnamed: 0,energia_ativa_alta_tensao_kwh,cpes_domestico_alta_tensao,cpes_nao_domestico_alta_tensao,cpes_outros_alta_tensao,densidade_populacional_pessoas_km2
count,4641.0,4641.0,4641.0,4641.0,4641.0
mean,7185369.0,0.039647,95.380952,0.408533,314.542125
std,14565940.0,0.20694,121.117813,4.304804,856.169267
min,0.0,0.0,0.0,0.0,4.0
25%,457054.0,0.0,28.0,0.0,25.0
50%,1955336.0,0.0,51.0,0.0,66.0
75%,6360625.0,0.0,119.0,0.0,177.0
max,127947000.0,2.0,1224.0,75.0,7310.0


In [5]:
test_df.describe()

Unnamed: 0,energia_ativa_alta_tensao_kwh,cpes_domestico_alta_tensao,cpes_nao_domestico_alta_tensao,cpes_outros_alta_tensao,densidade_populacional_pessoas_km2
count,546.0,546.0,546.0,546.0,546.0
mean,4366219.0,0.047619,96.485348,0.0,314.542125
std,11086000.0,0.245181,122.640374,0.0,856.862054
min,0.0,0.0,0.0,0.0,4.0
25%,213598.2,0.0,29.0,0.0,25.0
50%,846397.0,0.0,52.0,0.0,66.0
75%,3183322.0,0.0,119.0,0.0,177.0
max,111888700.0,2.0,1229.0,0.0,7310.0


# Algoritmos de Aprendizagem supervisionada

## Dataset não normalizado

In [17]:
# Define regressors
regressors = {
    'Linear Regression': LinearRegression(),
    'Ridge': Ridge(alpha=1.0, solver='auto'),
    'Lasso': Lasso(alpha=1.0), 
    'ElasticNet': ElasticNet(alpha=1.0, l1_ratio=0.5),
    'k-NN': KNeighborsRegressor(n_neighbors=5),
    'Decision Tree': DecisionTreeRegressor(),
    'Random Forest': RandomForestRegressor(n_estimators=100),
    'SVM': SVR(kernel='linear', max_iter=10000),
    'MLP': MLPRegressor(hidden_layer_sizes=(8,4), max_iter=20000),
}

In [18]:
cv_scores = pd.Series({
    name: np.mean(cross_val_score(regressor, train_df.drop(columns=['energia_ativa_alta_tensao_kwh']), train_df['energia_ativa_alta_tensao_kwh'], cv=5))
    for name, regressor in regressors.items()
})

cv_scores

Linear Regression    0.525189
Ridge                0.525455
Lasso                0.525189
ElasticNet           0.546666
k-NN                -0.095929
Decision Tree       -0.039095
Random Forest        0.309895
SVM                  0.468917
MLP                  0.549212
dtype: float64

In [19]:
best = cv_scores.sort_values(ascending=False).index[0]
best_regressor = regressors[best]
best_regressor.fit(train_df.drop(columns=['energia_ativa_alta_tensao_kwh']), train_df['energia_ativa_alta_tensao_kwh'])

print(f'### {best} ###')

y_pred = best_regressor.predict(test_df.drop(columns=['energia_ativa_alta_tensao_kwh']))

print('r2: {}'.format(r2_score(test_df['energia_ativa_alta_tensao_kwh'], y_pred)))
print('mse: {}'.format(mean_squared_error(test_df['energia_ativa_alta_tensao_kwh'], y_pred)))
print('mae: {}'.format(mean_absolute_error(test_df['energia_ativa_alta_tensao_kwh'], y_pred)))

### MLP ###
r2: 0.15486207564530774
mse: 103676677649176.89
mae: 5179072.321153707


### Ordenar os exemplos do conjunto de teste por ordem decrescente do erro da previsão do regressor e verificar se existe algum padrão relevante. 


In [9]:
df_error = test_df.copy()
df_error['error'] = np.abs(df_error['energia_ativa_alta_tensao_kwh'] - y_pred)

df_error.sort_values('error', ascending=False).head(20)

Unnamed: 0,energia_ativa_alta_tensao_kwh,cpes_domestico_alta_tensao,cpes_nao_domestico_alta_tensao,cpes_outros_alta_tensao,densidade_populacional_pessoas_km2,error
219,21772335,2,1229,0,5466,88154310.0
430,68130804,0,70,0,71,63477730.0
424,79685358,0,203,0,533,63204270.0
234,86976336,0,286,0,1674,61721770.0
180,69370033,0,151,0,157,57832390.0
414,71168543,0,167,0,1770,55665360.0
166,53710759,0,81,0,246,47856110.0
373,12514870,1,521,0,5753,38948330.0
205,11613559,0,581,0,649,36529730.0
433,7667556,1,488,0,1215,33639370.0


## Aprendizagem supervisionada com datasets normalizados

In [10]:
normalized_train_path = data_path + 'treated/prepared/numeric/normalized/train/dataset-numeric-high-tensions-2223-no-dates-decimal.csv'

normalized_train_df = pd.read_csv(normalized_train_path)

In [11]:
normalized_test_path = data_path + 'treated/prepared/numeric/normalized/test/dataset-numeric-high-tensions-2024-no-dates-decimal.csv'

normalized_test_df = pd.read_csv(normalized_test_path)

In [12]:
# Define regressors
normalized_regressors = {
    'Linear Regression': LinearRegression(),
    'Ridge': Ridge(alpha=1e-3, solver='auto'),
    'Lasso': Lasso(alpha=1.0), 
    'ElasticNet': ElasticNet(alpha=1.0, l1_ratio=0.5),
    'k-NN': KNeighborsRegressor(n_neighbors=5),
    'Decision Tree': DecisionTreeRegressor(),
    'Random Forest': RandomForestRegressor(n_estimators=100),
    'SVM': SVR(kernel='linear', max_iter=100000),
    'MLP': MLPRegressor(hidden_layer_sizes=(8,4), max_iter=20000)
}

In [13]:
cv_scores_normalized = pd.Series({
    name: np.mean(cross_val_score(regressor, normalized_train_df.drop(columns=['energia_ativa_alta_tensao_kwh']), normalized_train_df['energia_ativa_alta_tensao_kwh'], cv=5))
    for name, regressor in normalized_regressors.items()
})

cv_scores_normalized

Linear Regression     0.525189
Ridge                 0.525189
Lasso                 0.091344
ElasticNet            0.430484
k-NN                 -0.062361
Decision Tree         0.142370
Random Forest         0.333740
SVM                 -19.881272
MLP                 -79.400330
dtype: float64

In [14]:
best_normalized = cv_scores_normalized.sort_values(ascending=False).index[0]
best_regressor_normalized = normalized_regressors[best_normalized]
best_regressor_normalized.fit(normalized_train_df.drop(columns=['energia_ativa_alta_tensao_kwh']), normalized_train_df['energia_ativa_alta_tensao_kwh'])

print(f'### {best_normalized} ###')

y_pred = best_regressor_normalized.predict(normalized_test_df.drop(columns=['energia_ativa_alta_tensao_kwh']))

print('r2: {}'.format(r2_score(normalized_test_df['energia_ativa_alta_tensao_kwh'], y_pred)))
print('mse: {}'.format(mean_squared_error(normalized_test_df['energia_ativa_alta_tensao_kwh'], y_pred)))
print('mae: {}'.format(mean_absolute_error(normalized_test_df['energia_ativa_alta_tensao_kwh'], y_pred)))

### Ridge ###
r2: 0.14049102425946935
mse: 0.00010543963588246996
mae: 0.0050770199726011276


In [15]:
df_error_normalized = normalized_test_df.copy()
df_error_normalized['error'] = np.abs(df_error_normalized['energia_ativa_alta_tensao_kwh'] - y_pred)

df_error_normalized.sort_values('error', ascending=False).head(20)

Unnamed: 0,energia_ativa_alta_tensao_kwh,cpes_domestico_alta_tensao,cpes_nao_domestico_alta_tensao,cpes_outros_alta_tensao,densidade_populacional_pessoas_km2,error
219,0.021772,2,1229,0,5466,0.087884
430,0.068131,0,70,0,71,0.063475
424,0.079685,0,203,0,533,0.063166
234,0.086976,0,286,0,1674,0.061655
180,0.06937,0,151,0,157,0.057809
414,0.071169,0,167,0,1770,0.055626
166,0.053711,0,81,0,246,0.047849
373,0.012515,1,521,0,5753,0.038806
205,0.011614,0,581,0,649,0.03666
433,0.007668,1,488,0,1215,0.033455


### Ordenar os exemplos do conjunto de teste por ordem decrescente do erro da previsão do regressor e verificar se existe algum padrão relevante. 


In [16]:
df_error = normalized_test_df.copy()
df_error['error'] = np.abs(df_error['energia_ativa_alta_tensao_kwh'] - y_pred)

df_error.sort_values('error', ascending=False).head(20)

Unnamed: 0,energia_ativa_alta_tensao_kwh,cpes_domestico_alta_tensao,cpes_nao_domestico_alta_tensao,cpes_outros_alta_tensao,densidade_populacional_pessoas_km2,error
219,0.021772,2,1229,0,5466,0.087884
430,0.068131,0,70,0,71,0.063475
424,0.079685,0,203,0,533,0.063166
234,0.086976,0,286,0,1674,0.061655
180,0.06937,0,151,0,157,0.057809
414,0.071169,0,167,0,1770,0.055626
166,0.053711,0,81,0,246,0.047849
373,0.012515,1,521,0,5753,0.038806
205,0.011614,0,581,0,649,0.03666
433,0.007668,1,488,0,1215,0.033455
