In [1]:
# Preparação dos dados
import os

%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

sns.set_theme()

In [2]:
## Load the test dataset

# Use the local data directory when it exists; otherwise read the files
# from the project's GitHub repository.
if os.path.exists('../data/'):
    data_path = '../data/'
else:
    data_path = 'https://raw.githubusercontent.com/kreativermario/Projeto-DECD/master/data/'

test_path = data_path + 'treated/prepared/categoric/no-dates/test/dataset-categoric-high-tension-test.csv'

test_df = pd.read_csv(test_path)

In [3]:
## Load the training dataset
# The file lives under the same data root (`data_path`) as the test set.
train_path = f'{data_path}treated/prepared/categoric/no-dates/train/dataset-categoric-high-tensions-train.csv'

train_df = pd.read_csv(train_path)

In [4]:
# Display summary statistics (count, mean, std, min, quartiles, max) of the training set.
train_df.describe()

Unnamed: 0,ano,mes,cpes_domestico_alta_tensao,cpes_nao_domestico_alta_tensao,cpes_outros_alta_tensao,densidade_populacional_pessoas_km2,num_industrias_extrativas,num_industrias_transformadoras
count,4641.0,4641.0,4641.0,4641.0,4641.0,4641.0,4641.0,4641.0
mean,2022.705882,7.529412,0.039647,95.380952,0.408533,314.542125,3.549451,243.047619
std,0.455694,3.397902,0.20694,121.117813,4.304804,856.169267,7.986558,381.966373
min,2022.0,1.0,0.0,0.0,0.0,4.0,0.0,8.0
25%,2022.0,5.0,0.0,28.0,0.0,25.0,0.0,43.0
50%,2023.0,8.0,0.0,51.0,0.0,66.0,1.0,91.0
75%,2023.0,10.0,0.0,119.0,0.0,177.0,4.0,246.0
max,2023.0,12.0,2.0,1224.0,75.0,7310.0,87.0,2531.0


In [5]:
# Display summary statistics (count, mean, std, min, quartiles, max) of the test set.
test_df.describe()

Unnamed: 0,ano,mes,cpes_domestico_alta_tensao,cpes_nao_domestico_alta_tensao,cpes_outros_alta_tensao,densidade_populacional_pessoas_km2,num_industrias_extrativas,num_industrias_transformadoras
count,546.0,546.0,546.0,546.0,546.0,546.0,546.0,546.0
mean,2024.0,1.5,0.047619,96.485348,0.0,314.542125,3.549451,243.047619
std,0.0,0.500459,0.245181,122.640374,0.0,856.862054,7.99302,382.275449
min,2024.0,1.0,0.0,0.0,0.0,4.0,0.0,8.0
25%,2024.0,1.0,0.0,29.0,0.0,25.0,0.0,43.0
50%,2024.0,1.5,0.0,52.0,0.0,66.0,1.0,91.0
75%,2024.0,2.0,0.0,119.0,0.0,177.0,4.0,246.0
max,2024.0,2.0,2.0,1229.0,0.0,7310.0,87.0,2531.0


## Dataset não normalizado

In [6]:
# Fixed seed for the estimators that use randomness, so that the
# cross-validation table and the final report are reproducible after a
# Restart & Run All.
RANDOM_STATE = 42

# Candidate classifiers, keyed by the display name used in the result tables.
classifiers = {
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=RANDOM_STATE),
    # k-NN takes no seed: KNeighborsClassifier has no random_state parameter.
    'k-NN': KNeighborsClassifier(n_neighbors=5),
    'Decision Tree': DecisionTreeClassifier(random_state=RANDOM_STATE),
}

In [7]:
# Cross-validate every candidate classifier on the training set and collect
# the mean macro-F1 and mean accuracy over the 5 folds.

# The feature matrix and target do not change between classifiers, so build
# them once instead of re-dropping the column on every call.
cv_features = train_df.drop(columns=['energia_ativa_alta_tensao_kwh'])
cv_target = train_df['energia_ativa_alta_tensao_kwh']

f1_scores = []
accuracy_scores = []
algorithm_names = []

for name, clf in classifiers.items():
    # A single cross_validate call fits each fold once and evaluates both
    # metrics on the same fitted model, instead of running the whole
    # cross-validation twice (once per metric).
    cv_scores = cross_validate(clf, cv_features, cv_target,
                               cv=5, scoring=['f1_macro', 'accuracy'])

    f1_scores.append(np.mean(cv_scores['test_f1_macro']))
    accuracy_scores.append(np.mean(cv_scores['test_accuracy']))
    algorithm_names.append(name)

# One row per classifier, in the iteration order of `classifiers`.
cv_results_df = pd.DataFrame({
    'Algorithm': algorithm_names,
    'F1-score': f1_scores,
    'Accuracy': accuracy_scores
})

cv_results_df




Unnamed: 0,Algorithm,F1-score,Accuracy
0,Random Forest,0.406659,0.737773
1,k-NN,0.374073,0.70437
2,Decision Tree,0.387394,0.709556


### Ordenar os exemplos do conjunto de teste por ordem decrescente do erro da previsão do regressor e verificar se existe algum padrão relevante. 


In [8]:
# Pick the best classifier according to each cross-validation metric.
best_f1_index = cv_results_df['F1-score'].idxmax()

best_accuracy_index = cv_results_df['Accuracy'].idxmax()

# Read the name from the same row of the results table rather than indexing
# into list(classifiers.keys()) by position: the lookup then stays correct
# even if the table is reordered or `classifiers` is redefined afterwards.
best_f1_algorithm = cv_results_df.loc[best_f1_index, 'Algorithm']
best_accuracy_algorithm = cv_results_df.loc[best_accuracy_index, 'Algorithm']

print("Melhor algoritmo baseado no F1-score:", best_f1_algorithm)
print("Melhor algoritmo baseado na Accuracy:", best_accuracy_algorithm)


Melhor algoritmo baseado no F1-score: Random Forest
Melhor algoritmo baseado na Accuracy: Random Forest


In [16]:
# Train the classifier with the best cross-validated F1-score on the full
# training set, then evaluate it on the held-out test set.
target_column = 'energia_ativa_alta_tensao_kwh'

train_features = train_df.drop(columns=[target_column])
train_target = train_df[target_column]
test_features = test_df.drop(columns=[target_column])
test_target = test_df[target_column]

best_algorithm = classifiers[best_f1_algorithm]
best_algorithm.fit(train_features, train_target)

predictions = best_algorithm.predict(test_features)

# Per-class precision / recall / F1 / support as a dict, so it can be shown
# as a table.
report_dict = classification_report(
    test_target,
    predictions,
    zero_division='warn',
    output_dict=True,
)

# One row per class (plus the aggregate rows), rounded to 4 decimals.
report_df = pd.DataFrame(report_dict).transpose().round(4)

report_df

Unnamed: 0,precision,recall,f1-score,support
0.00 - 3349844.95,0.9944,0.8421,0.9119,418.0
103745511.62 - 113247611.62,0.5,1.0,0.6667,1.0
18073778.11 - 27864901.18,0.5,0.75,0.6,12.0
27864901.18 - 37334467.99,0.3846,0.7143,0.5,7.0
3349844.95 - 9260857.23,0.4457,0.6029,0.5125,68.0
37334467.99 - 46622152.52,0.1818,1.0,0.3077,2.0
46622152.52 - 54159764.39,1.0,0.6667,0.8,3.0
54159764.39 - 62774847.02,0.0,0.0,0.0,1.0
62774847.02 - 72884466.55,0.4,0.6667,0.5,3.0
72884466.55 - 82179750.18,0.5,1.0,0.6667,1.0
