In [12]:
# Preparação dos dados
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.base import clone
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor

%matplotlib inline

# Activate seaborn's default plotting theme.
sns.set_theme()

In [13]:
# Load the test dataset.
# Read from the local ../data/ directory when it exists; otherwise fall back
# to the copy hosted in the project's GitHub repository.
if os.path.exists('../data/'):
    data_path = '../data/'
else:
    data_path = 'https://raw.githubusercontent.com/kreativermario/Projeto-DECD/master/data/'

test_path = data_path + 'treated/prepared/numeric/no-dates/dataset-numeric-low-tensions-test-no-dates.csv'
test_df = pd.read_csv(test_path)

In [14]:
# Load the training dataset.
# Read from the local ../data/ directory when it exists; otherwise fall back
# to the copy hosted in the project's GitHub repository.
if os.path.exists('../data/'):
    data_path = '../data/'
else:
    data_path = 'https://raw.githubusercontent.com/kreativermario/Projeto-DECD/master/data/'

train_path = data_path + 'treated/prepared/numeric/no-dates/dataset-numeric-low-tensions-train-no-dates.csv'
train_df = pd.read_csv(train_path)

In [15]:
# Load the normalized training dataset (a z-score file, judging by its name).
# Read from the local ../data/ directory when it exists; otherwise fall back
# to the copy hosted in the project's GitHub repository.
if os.path.exists('../data/'):
    data_path = '../data/'
else:
    data_path = 'https://raw.githubusercontent.com/kreativermario/Projeto-DECD/master/data/'

trainn_path = data_path + 'treated/prepared/numeric/normalized/train/dataset-numeric-low-tensions-2223-no-dates-zscore.csv'
train_n_df = pd.read_csv(trainn_path)

In [16]:
# Summarize the training frame: column names, non-null counts, dtypes and memory usage.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4641 entries, 0 to 4640
Data columns (total 8 columns):
 #   Column                                     Non-Null Count  Dtype
---  ------                                     --------------  -----
 0   energia_ativa_baixa_tensao_kwh             4641 non-null   int64
 1   cpes_domestico_baixa_tensao                4641 non-null   int64
 2   cpes_iluminacao_publica_baixa_tensao       4641 non-null   int64
 3   cpes_nao_domestico_baixa_tensao            4641 non-null   int64
 4   cpes_outros_baixa_tensao                   4641 non-null   int64
 5   cpes_mobilidade_eletrica_nao_baixa_tensao  4641 non-null   int64
 6   cpes_mobilidade_eletrica_sim_baixa_tensao  4641 non-null   int64
 7   densidade_populacional_pessoas_km2         4641 non-null   int64
dtypes: int64(8)
memory usage: 290.2 KB


In [17]:
# Summarize the test frame: column names, non-null counts, dtypes and memory usage.
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 273 entries, 0 to 272
Data columns (total 8 columns):
 #   Column                                     Non-Null Count  Dtype
---  ------                                     --------------  -----
 0   energia_ativa_baixa_tensao_kwh             273 non-null    int64
 1   cpes_domestico_baixa_tensao                273 non-null    int64
 2   cpes_iluminacao_publica_baixa_tensao       273 non-null    int64
 3   cpes_nao_domestico_baixa_tensao            273 non-null    int64
 4   cpes_outros_baixa_tensao                   273 non-null    int64
 5   cpes_mobilidade_eletrica_nao_baixa_tensao  273 non-null    int64
 6   cpes_mobilidade_eletrica_sim_baixa_tensao  273 non-null    int64
 7   densidade_populacional_pessoas_km2         273 non-null    int64
dtypes: int64(8)
memory usage: 17.2 KB


# Algoritmos de aprendizagem supervisionada

In [18]:
# Candidate regressors, keyed by the display name used in the score tables.
# Estimators with internal randomness (tree, forest, MLP) get a fixed seed so
# that Restart & Run All reproduces the same scores and the same "best" model.
SEED = 42

regressors = {
    'Linear Regression': LinearRegression(),
    'k-NN': KNeighborsRegressor(n_neighbors=5),
    'Decision Tree': DecisionTreeRegressor(random_state=SEED),
    'Random Forest': RandomForestRegressor(n_estimators=100, random_state=SEED),
    # Very large iteration cap so the solver is effectively never cut off early.
    'SVM': SVR(kernel='linear', max_iter=1000000000),
    'MLP': MLPRegressor(hidden_layer_sizes=(8,4), max_iter=200000, random_state=SEED)
}

In [19]:
# Cross-validate every candidate on `train_df` with cv=5 and keep the mean of
# the per-fold scores (cross_val_score's default scoring for the estimator).
target_col = 'energia_ativa_baixa_tensao_kwh'

cv_scores = pd.Series({
    label: cross_val_score(
        model,
        train_df.drop(columns=[target_col]),
        train_df[target_col],
        cv=5,
    ).mean()
    for label, model in regressors.items()
})

cv_scores



Linear Regression    0.944806
k-NN                 0.929239
Decision Tree        0.915376
Random Forest        0.925721
SVM                  0.951478
MLP                  0.942540
dtype: float64

In [20]:
# Same cross-validation as for `train_df`, but on the normalized frame
# `train_n_df`: cv=5, mean of the per-fold scores for each candidate.
target_col = 'energia_ativa_baixa_tensao_kwh'

cv_scores_n = pd.Series({
    label: cross_val_score(
        model,
        train_n_df.drop(columns=[target_col]),
        train_n_df[target_col],
        cv=5,
    ).mean()
    for label, model in regressors.items()
})

cv_scores_n

Linear Regression    0.944806
k-NN                 0.907827
Decision Tree        0.912084
Random Forest        0.925931
SVM                  0.947769
MLP                  0.940966
dtype: float64

In [21]:
# Pick the candidate with the highest mean cross-validation score, refit it on
# the whole of `train_df` and evaluate it once on `test_df`.
target_col = 'energia_ativa_baixa_tensao_kwh'

best = cv_scores.idxmax()
# Fit a fresh clone so the unfitted prototype stored in `regressors` is not
# mutated; re-running this cell then always starts from the same state.
best_regressor = clone(regressors[best])
best_regressor.fit(train_df.drop(columns=[target_col]), train_df[target_col])

print(f'### {best} ###')

y_test = test_df[target_col]
y_pred = best_regressor.predict(test_df.drop(columns=[target_col]))

print('r2: {}'.format(r2_score(y_test, y_pred)))
print('mse: {}'.format(mean_squared_error(y_test, y_pred)))
print('mae: {}'.format(mean_absolute_error(y_test, y_pred)))

### SVM ###
r2: -0.034118553650866135
mse: 38536523510394.74
mae: 3151518.233807645




## Ordene os exemplos do conjunto de teste por ordem decrescente do erro da previsão do regressor e verifique se existe algum padrão relevante. 


In [22]:
# Attach the absolute prediction error to a copy of the test set and show the
# worst-predicted rows first.
df_error = test_df.assign(
    error=lambda frame: (frame['energia_ativa_baixa_tensao_kwh'] - y_pred).abs()
)

df_error.sort_values(by='error', ascending=False).head(20)

Unnamed: 0,energia_ativa_baixa_tensao_kwh,cpes_domestico_baixa_tensao,cpes_iluminacao_publica_baixa_tensao,cpes_nao_domestico_baixa_tensao,cpes_outros_baixa_tensao,cpes_mobilidade_eletrica_nao_baixa_tensao,cpes_mobilidade_eletrica_sim_baixa_tensao,densidade_populacional_pessoas_km2,error
109,66782149,312074,1185,79984,2055,516364,382,5466,60254090.0
216,28444768,173710,938,40674,473,232392,88,1215,32987670.0
186,35501295,139499,707,34810,1000,286406,82,5753,21085830.0
15,15524804,96716,531,18712,566,170750,84,2544,18007680.0
111,15732945,89990,563,19726,521,163242,112,1215,17433090.0
253,28444852,138805,1181,22422,1009,241612,108,1821,16486970.0
23,11610976,81468,256,16830,335,119334,46,7310,16321990.0
49,18417460,86556,796,18393,938,141856,108,1071,14876240.0
155,15554551,84723,451,17502,398,126978,176,3755,14148820.0
212,10566241,59235,506,15639,422,131020,72,533,13998640.0
