In [1]:
# Preparação dos dados
import os

%matplotlib inline
import numpy as np
import seaborn as sns
import pandas as pd
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import cross_val_score

sns.set_theme()

In [2]:
# Load the test split.
# Use the local data directory when it exists; otherwise fall back to reading
# the same files from the raw GitHub URL of the project repository.
if os.path.exists('../data/'):
    data_path = '../data/'
else:
    data_path = 'https://raw.githubusercontent.com/kreativermario/Projeto-DECD/master/data/'

test_path = f'{data_path}treated/prepared/numeric/no-dates/dataset-numeric-low-tensions-test-no-dates.csv'
test_df = pd.read_csv(test_path)

In [3]:
# Load the training split; `data_path` was resolved when the test split was loaded.
train_path = f'{data_path}treated/prepared/numeric/no-dates/dataset-numeric-low-tensions-train-no-dates.csv'
train_df = pd.read_csv(train_path)

In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric
# columns in the training split, as a quick sanity check of ranges and scale.
train_df.describe()

Unnamed: 0,energia_ativa_baixa_tensao_kwh,cpes_domestico_baixa_tensao,cpes_iluminacao_publica_baixa_tensao,cpes_nao_domestico_baixa_tensao,cpes_outros_baixa_tensao,cpes_mobilidade_eletrica_nao_baixa_tensao,cpes_mobilidade_eletrica_sim_baixa_tensao,densidade_populacional_pessoas_km2
count,4641.0,4641.0,4641.0,4641.0,4641.0,4641.0,4641.0,4641.0
mean,6652473.0,18795.518854,214.017453,4239.124111,129.467787,26681.424477,9.026287,314.542125
std,11855350.0,29390.997527,194.044633,6867.617704,206.654356,45554.097416,23.831036,856.169267
min,205994.0,1042.0,7.0,174.0,1.0,1144.0,0.0,4.0
25%,1310791.0,4581.0,91.0,1101.0,21.0,5940.0,2.0,25.0
50%,2685664.0,8936.0,141.0,1989.0,54.0,11104.0,2.0,66.0
75%,7426747.0,21405.0,280.0,4892.0,157.0,29350.0,6.0,177.0
max,163381900.0,311824.0,1184.0,79956.0,2186.0,528158.0,368.0,7310.0


In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric
# columns in the test split, for comparison with the training split.
test_df.describe()

Unnamed: 0,energia_ativa_baixa_tensao_kwh,cpes_domestico_baixa_tensao,cpes_iluminacao_publica_baixa_tensao,cpes_nao_domestico_baixa_tensao,cpes_outros_baixa_tensao,cpes_mobilidade_eletrica_nao_baixa_tensao,cpes_mobilidade_eletrica_sim_baixa_tensao,densidade_populacional_pessoas_km2
count,546.0,546.0,546.0,546.0,546.0,546.0,546.0,546.0
mean,5381401.0,18918.045788,215.278388,4274.082418,119.930403,33234.115385,14.593407,314.542125
std,11063680.0,29635.004603,195.630206,6925.56973,197.548003,50925.09626,35.708189,856.862054
min,100068.0,1042.0,7.0,182.0,1.0,1892.0,0.0,4.0
25%,882598.5,4572.75,91.0,1112.0,18.0,8386.0,2.0,25.0
50%,1962897.0,8928.5,142.0,2001.5,48.0,15531.0,4.0,66.0
75%,5025964.0,21518.25,281.0,4927.25,141.75,39917.0,10.0,177.0
max,164135300.0,312095.0,1185.0,79984.0,2055.0,530262.0,388.0,7310.0


# Algoritmos de Aprendizagem supervisionada

## Dataset não normalizado

In [6]:
# Candidate regressors evaluated on the raw (non-normalised) features.
# Every estimator that draws random numbers internally is given a fixed
# `random_state`, so a Restart & Run All reproduces the same cross-validation
# ranking and the same test metrics instead of a different result per run.
# NOTE(review): the recorded cross-validation output shows coordinate-descent
# warnings (Lasso / ElasticNet solver) on this unscaled data; consider raising
# `max_iter` for those two models if the warnings persist.
regressors = {
    'Linear Regression': LinearRegression(),
    'Ridge': Ridge(alpha=1.0, solver='auto'),
    'Lasso': Lasso(alpha=1.0),
    'ElasticNet': ElasticNet(alpha=1.0, l1_ratio=0.5),
    'k-NN': KNeighborsRegressor(n_neighbors=5),
    # Tree growth permutes candidate features, so it must be seeded too.
    'Decision Tree': DecisionTreeRegressor(random_state=42),
    'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42),
    # Iteration cap kept exactly as in the original configuration.
    'SVM': SVR(kernel='linear', max_iter=1000000000),
    # Seed fixes the weight initialisation and batch shuffling.
    'MLP': MLPRegressor(hidden_layer_sizes=(8,4), max_iter=200000, random_state=42)
}

In [7]:
# Mean 5-fold cross-validation score of every candidate on the training split.
# No `scoring` argument is passed, so each estimator's own `score` method is
# used (R^2 for scikit-learn regressors).
cv_scores = pd.Series({
    model_name: cross_val_score(
        estimator,
        train_df.drop(columns=['energia_ativa_baixa_tensao_kwh']),
        train_df['energia_ativa_baixa_tensao_kwh'],
        cv=5,
    ).mean()
    for model_name, estimator in regressors.items()
})

cv_scores

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


Linear Regression    0.944744
Ridge                0.944744
Lasso                0.944744
ElasticNet           0.944751
k-NN                 0.929297
Decision Tree        0.845837
Random Forest        0.936781
SVM                  0.951433
MLP                  0.941973
dtype: float64

In [8]:
# Select the model with the highest mean CV score, refit it on the whole
# training split and evaluate it once on the test split.
# `idxmax` returns the first label that holds the maximum, so the selection is
# deterministic when scores tie and no full sort is needed.
best = cv_scores.idxmax()
best_regressor = regressors[best]

# Split features / target once instead of rebuilding the frames for each call.
target_col = 'energia_ativa_baixa_tensao_kwh'
X_train, y_train = train_df.drop(columns=[target_col]), train_df[target_col]
X_test, y_test = test_df.drop(columns=[target_col]), test_df[target_col]

best_regressor.fit(X_train, y_train)

print(f'### {best} ###')

# `y_pred` is read by the error-analysis cell that follows.
y_pred = best_regressor.predict(X_test)

print(f'r2: {r2_score(y_test, y_pred)}')
print(f'mse: {mean_squared_error(y_test, y_pred)}')
print(f'mae: {mean_absolute_error(y_test, y_pred)}')

### SVM ###
r2: 0.6787181817616937
mse: 39254473085991.016
mae: 2798339.5838791123


### Ordenar os exemplos do conjunto de teste por ordem decrescente do erro da previsão do regressor e verificar se existe algum padrão relevante. 


In [9]:
# Attach the absolute prediction error to a copy of the test split and show
# the 20 rows the selected regressor predicted worst.
df_error = test_df.assign(
    error=(test_df['energia_ativa_baixa_tensao_kwh'] - y_pred).abs()
)

df_error.sort_values('error', ascending=False).head(20)

Unnamed: 0,energia_ativa_baixa_tensao_kwh,cpes_domestico_baixa_tensao,cpes_iluminacao_publica_baixa_tensao,cpes_nao_domestico_baixa_tensao,cpes_outros_baixa_tensao,cpes_mobilidade_eletrica_nao_baixa_tensao,cpes_mobilidade_eletrica_sim_baixa_tensao,densidade_populacional_pessoas_km2,error
219,49701737,312095,1185,79939,2050,530262,388,5466,77582400.0
433,21242743,173756,938,40704,473,236910,104,1215,40355820.0
218,164135303,312074,1185,79984,2055,516364,382,5466,37062990.0
373,25851440,139500,707,34819,997,288380,92,5753,30853880.0
372,81748684,139499,707,34810,1000,286406,82,5753,25112110.0
507,20742447,138873,1181,22422,1005,245714,120,1821,24324390.0
31,11622919,96690,531,18709,572,171990,84,2544,21982870.0
223,11978360,89988,563,19734,526,168622,122,1215,21372360.0
99,13249506,86590,797,18398,937,144366,110,1071,20121950.0
47,8554055,81435,256,16812,334,122264,54,7310,19502380.0


## Dataset normalizado

In [10]:
# Load the z-score-normalised training split.
trainn_path = f'{data_path}treated/prepared/numeric/normalized/train/dataset-numeric-low-tensions-2223-no-dates-zscore.csv'
train_n_df = pd.read_csv(trainn_path)

# Load the z-score-normalised test split.
testnn_path = f'{data_path}treated/prepared/numeric/normalized/test/dataset-numeric-low-tensions-2024-no-dates-zscore.csv'
test_n_df = pd.read_csv(testnn_path)

In [11]:
# Separate, unfitted estimator instances for the z-score-normalised data.
# Every estimator that draws random numbers internally is given a fixed
# `random_state`, so a Restart & Run All reproduces the same cross-validation
# ranking and the same test metrics instead of a different result per run.
normalized_regressors = {
    'Linear Regression': LinearRegression(),
    'Ridge': Ridge(alpha=1.0, solver='auto'),
    'Lasso': Lasso(alpha=1.0),
    'ElasticNet': ElasticNet(alpha=1.0, l1_ratio=0.5),
    'k-NN': KNeighborsRegressor(n_neighbors=5),
    # Tree growth permutes candidate features, so it must be seeded too.
    'Decision Tree': DecisionTreeRegressor(random_state=42),
    'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42),
    # Iteration cap kept exactly as in the original configuration.
    'SVM': SVR(kernel='linear', max_iter=1000000000),
    # Seed fixes the weight initialisation and batch shuffling.
    'MLP': MLPRegressor(hidden_layer_sizes=(8,4), max_iter=200000, random_state=42)
}

In [12]:
# Mean 5-fold cross-validation score of every candidate on the normalised
# training split; with no `scoring` argument each estimator's own `score`
# method is used (R^2 for scikit-learn regressors).
# NOTE(review): the recorded output of this cell lists six models although
# `normalized_regressors` defines nine -- it looks stale; re-run to confirm.
cv_scores_n = pd.Series({
    model_name: cross_val_score(
        estimator,
        train_n_df.drop(columns=['energia_ativa_baixa_tensao_kwh']),
        train_n_df['energia_ativa_baixa_tensao_kwh'],
        cv=5,
    ).mean()
    for model_name, estimator in normalized_regressors.items()
})

cv_scores_n

Linear Regression    0.944744
k-NN                 0.907763
Decision Tree        0.911607
Random Forest        0.941423
SVM                  0.947686
MLP                  0.954123
dtype: float64

In [13]:
# Select the model with the highest mean CV score on the normalised data,
# refit it on the whole normalised training split and evaluate it on the
# normalised test split.
# `idxmax` returns the first label that holds the maximum, so the selection is
# deterministic when scores tie and no full sort is needed.
# NOTE: `best` and `y_pred` are rebound here (they were first assigned in the
# non-normalised section). Re-running the earlier error-analysis cell after
# this one would compare normalised predictions against raw targets.
best = cv_scores_n.idxmax()
best_regressor_normalized = normalized_regressors[best]

# Split features / target once instead of rebuilding the frames for each call.
target_col_n = 'energia_ativa_baixa_tensao_kwh'
X_train_n, y_train_n = train_n_df.drop(columns=[target_col_n]), train_n_df[target_col_n]
X_test_n, y_test_n = test_n_df.drop(columns=[target_col_n]), test_n_df[target_col_n]

best_regressor_normalized.fit(X_train_n, y_train_n)

print(f'### {best} ###')

# `y_pred` is read by the error-analysis cell that follows.
y_pred = best_regressor_normalized.predict(X_test_n)

# The target is normalised, so mse / mae are in normalised units, not kWh.
print(f'r2: {r2_score(y_test_n, y_pred)}')
print(f'mse: {mean_squared_error(y_test_n, y_pred)}')
print(f'mae: {mean_absolute_error(y_test_n, y_pred)}')

### MLP ###
r2: 0.711042891842121
mse: 0.28895710815787895
mae: 0.25459544545560076


### Ordenar os exemplos do conjunto de teste por ordem decrescente do erro da previsão do regressor e verificar se existe algum padrão relevante. 


In [14]:
# Attach the absolute prediction error to a copy of the normalised test split
# and show the 20 rows the selected regressor predicted worst.
df_error = test_n_df.assign(
    error=(test_n_df['energia_ativa_baixa_tensao_kwh'] - y_pred).abs()
)

df_error.sort_values('error', ascending=False).head(20)

Unnamed: 0,energia_ativa_baixa_tensao_kwh,cpes_domestico_baixa_tensao,cpes_iluminacao_publica_baixa_tensao,cpes_nao_domestico_baixa_tensao,cpes_outros_baixa_tensao,cpes_mobilidade_eletrica_nao_baixa_tensao,cpes_mobilidade_eletrica_sim_baixa_tensao,densidade_populacional_pessoas_km2,error
219,4.009605,9.901999,4.961457,10.935462,9.779089,9.768929,10.466759,6.017517,6.470255
218,14.362264,9.90129,4.961457,10.941965,9.804422,9.495768,10.298576,6.017517,3.940453
433,1.434955,5.229624,3.697713,5.265029,1.788899,4.003186,2.506108,1.051842,2.955818
507,1.389694,4.051457,4.940992,2.622825,4.484385,4.176226,2.954595,1.759722,2.579557
373,1.851898,4.072634,2.515831,4.4145,4.443851,5.014813,2.169743,6.352767,2.532791
372,6.908851,4.0726,2.515831,4.413199,4.459051,4.976015,1.889438,6.352767,2.531754
99,0.711817,2.285608,2.976304,2.041257,4.139849,2.184263,2.674291,0.883633,1.768501
31,0.564662,2.626734,1.615349,2.086204,2.290502,2.727204,1.945499,2.604273,1.705804
120,4.34791,2.986571,2.080939,2.214254,2.118234,3.170614,5.309154,2.197767,1.662984
223,0.596818,2.400375,1.779073,2.234342,2.057434,2.661007,3.010656,1.051842,1.662974
