In [20]:
import pickle

# Load the pickled dataset. The context manager guarantees the file handle is
# closed even if unpickling fails.
# NOTE: pickle.load can execute arbitrary code - only load files from a trusted source.
with open('data.pickle', 'rb') as data_file:
    data_dict = pickle.load(data_file)

In [21]:
# =============== LOAD THE DATA FROM THE DATASET ===================
import numpy as np
import pandas as pd
import pprint

# Flatten every sample into a 1-D feature vector and build the DataFrame in a
# single call. Concatenating one row at a time re-copies the whole frame on
# every iteration, which is quadratic in the number of samples.
feature_rows = [np.asarray(lista).reshape(-1) for lista in data_dict['data']]
df_data = pd.DataFrame(feature_rows)

# Class label of each sample, kept as a one-column frame.
classes_list = np.asarray(data_dict['class'])
df_classes = pd.DataFrame(classes_list)

# Put the labels in the first column, ahead of the numeric features.
df_data.insert(loc=0, column='class', value=df_classes)

# Name the feature columns 'distance_<i>', one per column after 'class'
# (yields distance_0 .. distance_5 when each sample has six values).
new_cols = [f'distance_{i}' for i in range(df_data.shape[1] - 1)]
df_data.columns = ['class'] + new_cols
df_data

Unnamed: 0,class,distance_0,distance_1,distance_2,distance_3,distance_4,distance_5
0,R,126.889716,7.615773,185.218790,22.472205,63.387696,127.035428
1,R,123.794184,12.206556,190.213038,21.470911,65.764732,132.034087
2,R,129.313572,12.806248,189.129585,25.000000,67.468511,133.135270
3,R,124.679589,13.038405,185.607112,26.076810,65.000000,127.003937
4,R,123.199026,16.278821,184.610401,27.294688,65.520989,126.015872
...,...,...,...,...,...,...,...
15292,W,226.400088,104.785495,93.520051,198.640379,77.162167,102.552426
15293,W,219.440197,99.247166,99.924972,200.384630,75.153177,112.378824
15294,W,161.245155,58.463664,76.557168,152.846328,46.872167,69.339743
15295,W,160.605106,67.052218,72.069411,146.894520,42.059482,63.285069


In [22]:
# ============= CLASS BALANCE =================
# Count how many samples each class has (labels are sorted before counting).
labels_sorted = df_data.loc[:, 'class'].sort_values()
labels_sorted.value_counts()

A        600
L        600
Y        600
X        600
W        600
V        600
U        600
T        600
S        600
R        600
B        600
Z        600
K        600
J        600
I        600
H        600
G        600
F        600
E        600
O        596
P        593
D        587
Q        582
C        569
Space    504
M        289
N        177
Name: class, dtype: int64

In [33]:
# Balance the classes
from imblearn.over_sampling import SMOTE

# Split the features from the labels
df_atributos = df_data.drop(columns=['class'])
df_classes = df_data['class']

# Oversample the minority classes with SMOTE (Synthetic Minority Over-sampling
# Technique). The sampler is seeded so the synthetic samples - and everything
# trained on them afterwards - are the same on every run.
resampler = SMOTE(random_state=42)
df_atributos, df_classes = resampler.fit_resample(df_atributos, df_classes)

# Show the class counts after resampling
df_classes.value_counts()

R        600
J        600
P        600
Y        600
L        600
K        600
B        600
E        600
X        600
Q        600
V        600
D        600
C        600
M        600
U        600
Space    600
H        600
O        600
F        600
A        600
S        600
T        600
Z        600
G        600
N        600
I        600
W        600
Name: class, dtype: int64

In [34]:
# ========= DATA NORMALIZATION ========================
from sklearn import preprocessing
normalizer = preprocessing.MinMaxScaler()

# Fit the scaler on the feature columns only; df_atributos is expected to
# contain no 'class' column at this point.
# NOTE: df_atributos is overwritten further down, so re-running this cell on
# its own refits (and re-saves) the scaler on already-normalized values.
normalization_model = normalizer.fit(df_atributos)

# Persist the fitted scaler; the context manager closes the file handle.
from pickle import dump
with open('normalization.model', 'wb') as scaler_file:
    dump(normalization_model, scaler_file)

# Apply the already-fitted scaler. Refitting here (fit_transform) is redundant
# because the scaler was fitted on this same data just above.
dt_num_normalized = normalization_model.transform(df_atributos)

df_atributos = pd.DataFrame(dt_num_normalized, columns=df_atributos.columns)

# Re-attach the labels to the normalized features (joined on the row index)
df_data_normalized = df_atributos.join(df_classes, how = 'left')

df_atributos

Unnamed: 0,distance_0,distance_1,distance_2,distance_3,distance_4,distance_5
0,0.512313,0.033397,0.792533,0.083380,0.106319,0.444002
1,0.499449,0.053529,0.814068,0.078575,0.115385,0.469491
2,0.522385,0.056159,0.809396,0.095512,0.121883,0.475106
3,0.503129,0.057177,0.794208,0.100680,0.112468,0.443841
4,0.496976,0.071387,0.789910,0.106525,0.114455,0.438803
...,...,...,...,...,...,...
16195,0.925121,0.092074,0.055907,0.144063,0.855200,0.841530
16196,0.810423,0.110192,0.049880,0.077338,0.813913,0.660416
16197,0.736295,0.090358,0.026788,0.036291,0.681031,0.506149
16198,0.717107,0.102187,0.052119,0.086288,0.592038,0.512516


In [36]:
########################
# - HYPERPARAMETERS
# Baseline Random Forest: build it with a fixed seed and print its default
# parameters as a reference for the search below.

from sklearn.ensemble import RandomForestClassifier
from pprint import pprint

rf = RandomForestClassifier(random_state=42)
rf_default_params = rf.get_params()

pprint(rf_default_params)

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'sqrt',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 42,
 'verbose': 0,
 'warm_start': False}


In [38]:
import numpy as np

# Candidate values for each hyperparameter. Numeric ranges are evenly spaced
# and truncated to plain Python integers.
n_estimators = np.linspace(10, 300, num=10).astype(int).tolist()
criterion = ['gini', 'entropy']
min_samples_split = np.linspace(2, 10, num=3).astype(int).tolist()
max_depth = np.linspace(10, 100, num=10).astype(int).tolist()
max_features = ['sqrt', 'log2']

# Assemble the search grid (keys are RandomForestClassifier parameter names)
rf_grid = dict(
    n_estimators=n_estimators,
    criterion=criterion,
    min_samples_split=min_samples_split,
    max_depth=max_depth,
    max_features=max_features,
    random_state=[42],
)

pprint(rf_grid)

{'criterion': ['gini', 'entropy'],
 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100],
 'max_features': ['sqrt', 'log2'],
 'min_samples_split': [2, 6, 10],
 'n_estimators': [10, 42, 74, 106, 138, 171, 203, 235, 267, 300],
 'random_state': [42]}


In [39]:
# Run the randomized hyperparameter search: sample candidates from rf_grid,
# then train and cross-validate each one.
# NOTE(review): the features were oversampled and scaled on the full dataset
# before this search, so validation folds may not be fully independent of the
# training folds - scores could be optimistic; confirm.
from sklearn.model_selection import RandomizedSearchCV

search_options = {
    'estimator': rf,
    'param_distributions': rf_grid,
    'n_iter': 200,
    'cv': 10,
    'verbose': 1,
    'random_state': 42,
    'n_jobs': -1,
}
rf_hyperparameters = RandomizedSearchCV(**search_options)

# Labels and features are taken from the normalized frame
df_classes = df_data_normalized['class']
df_atributos = df_data_normalized.drop(columns=['class'])

rf_hyperparameters.fit(df_atributos, df_classes)
pprint(rf_hyperparameters.best_params_)


Fitting 10 folds for each of 200 candidates, totalling 2000 fits
{'criterion': 'entropy',
 'max_depth': 20,
 'max_features': 'log2',
 'min_samples_split': 2,
 'n_estimators': 74,
 'random_state': 42}


In [40]:
# Display the feature matrix that was passed to the hyperparameter search.
df_atributos

Unnamed: 0,distance_0,distance_1,distance_2,distance_3,distance_4,distance_5
0,0.512313,0.033397,0.792533,0.083380,0.106319,0.444002
1,0.499449,0.053529,0.814068,0.078575,0.115385,0.469491
2,0.522385,0.056159,0.809396,0.095512,0.121883,0.475106
3,0.503129,0.057177,0.794208,0.100680,0.112468,0.443841
4,0.496976,0.071387,0.789910,0.106525,0.114455,0.438803
...,...,...,...,...,...,...
16195,0.925121,0.092074,0.055907,0.144063,0.855200,0.841530
16196,0.810423,0.110192,0.049880,0.077338,0.813913,0.660416
16197,0.736295,0.090358,0.026788,0.036291,0.681031,0.506149
16198,0.717107,0.102187,0.052119,0.086288,0.592038,0.512516


In [41]:
# ============== TRAINING =================
# Train the final model with the best parameters found by the search.
best_params = rf_hyperparameters.best_params_
rf = RandomForestClassifier(**best_params)

# fit() is called on the full feature matrix and labels; the returned fitted
# estimator is stored as `fertility_rf`.
fertility_rf = rf.fit(df_atributos, df_classes)
pprint(fertility_rf)

RandomForestClassifier(criterion='entropy', max_depth=20, max_features='log2',
                       n_estimators=74, random_state=42)


In [42]:
from sklearn.model_selection import cross_validate

# 10-fold cross-validation reporting macro-averaged precision and recall.
scoring = ['precision_macro', 'recall_macro']
scores_cross = cross_validate(rf, df_atributos, df_classes, cv=10, scoring = scoring)

# Per-fold scores. Labels now match the metric actually printed: precision is
# neither sensitivity nor specificity, and recall is sensitivity.
print('Precisão (macro) por fold:', scores_cross['test_precision_macro'])
print('Sensibilidade/recall (macro) por fold:', scores_cross['test_recall_macro'])

# Final metrics: mean over the folds
print('Precisão (macro) média:', scores_cross['test_precision_macro'].mean())
print('Sensibilidade/recall (macro) média:', scores_cross['test_recall_macro'].mean())

Matriz de sensibilidades: [0.99454399 0.99220222 0.99281655 0.99342491 0.99572927 0.99266638
 0.9908017  0.99634689 0.99332641 0.99591643]
Matriz de especificidades: [0.99444444 0.99197531 0.99259259 0.99320988 0.99567901 0.99259259
 0.99074074 0.9962963  0.99320988 0.99567901]
Especificade: 0.9937774758376149
Sensibilidades: 0.9936419753086421


In [43]:
import pickle

# Persist the trained model under the 'model' key. The context manager closes
# the file even if pickling raises.
with open('model.p', 'wb') as f:
    pickle.dump({'model': fertility_rf}, f)

In [44]:
# Map the normalized features back through the fitted scaler's inverse
# transform and wrap the result in a frame with the same column names.
denormalized_values = normalization_model.inverse_transform(df_atributos)
df_data_denormalized = pd.DataFrame(
    denormalized_values,
    columns=df_atributos.columns,
)
df_data_denormalized

Unnamed: 0,distance_0,distance_1,distance_2,distance_3,distance_4,distance_5
0,126.889716,7.615773,185.218790,22.472205,63.387696,127.035428
1,123.794184,12.206556,190.213038,21.470911,65.764732,132.034087
2,129.313572,12.806248,189.129585,25.000000,67.468511,133.135270
3,124.679589,13.038405,185.607112,26.076810,65.000000,127.003937
4,123.199026,16.278821,184.610401,27.294688,65.520989,126.015872
...,...,...,...,...,...,...
16195,226.228722,20.996085,14.380284,35.116052,259.746139,204.994600
16196,198.627568,25.127649,12.982409,21.213203,248.920366,169.476458
16197,180.789359,20.604810,7.626921,12.660549,214.078511,139.223121
16198,176.171845,23.302245,13.501583,23.077976,190.744377,140.471854
