In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import MinMaxScaler

# Models
from sklearn.ensemble import RandomForestClassifier

from sklearn.tree import DecisionTreeClassifier

from keras.models import Sequential
from keras.layers import Dense

from sklearn.linear_model import LogisticRegression

from xgboost import XGBClassifier

from sklearn.svm import SVC

from sklearn.ensemble import HistGradientBoostingClassifier

# Módulos para a função de validação cruzada (visando balanceamento)
from imblearn.over_sampling import SMOTE
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
from statistics import median, mean

In [4]:
treated_train = pd.read_csv("/content/Treated_numeric_columns_phl(20%)_pipe_Train.csv")
treated_test = pd.read_csv("/content/Treated_numeric_columns_phl(20%)_pipe_Test.csv")

In [5]:
# Tratar no pipeline
treated_train.drop(['Unnamed: 0'], axis=1, inplace=True)
treated_test.drop(['Unnamed: 0'], axis=1, inplace=True)

Divisão de variáveis independentes e dependente nos dados de treino

In [6]:
X_Train = treated_train.drop(['P_HABITABLE'], axis=1)
y_Train = treated_train['P_HABITABLE']

Divisão de variáveis independentes e dependente nos dados de teste

In [7]:
X_Test = treated_test.drop(['P_HABITABLE'], axis=1)
y_Test = treated_test['P_HABITABLE']

In [8]:
scaler = MinMaxScaler()

scaler.fit(X_Train)

X_train_normalized = scaler.transform(X_Train)
X_test_normalized = scaler.transform(X_Test)

# Convertendo os dados normalizados em DataFrame para que seja possível utilizá-los na função de cross validation
X_Train_normalized_df = pd.DataFrame(X_train_normalized, columns=X_Train.columns)
X_Test_normalized_df = pd.DataFrame(X_test_normalized, columns=X_Train.columns)

In [9]:
# Inserir as instancias abaixo em uma lista para utilizar no for que executará a função abaixo
model_rfc = RandomForestClassifier()
model_dtc = DecisionTreeClassifier()
model_lr = LogisticRegression()
model_xgb = XGBClassifier()
model_svc = SVC()
model_hgb = HistGradientBoostingClassifier()

models = [model_rfc, model_dtc, model_lr, model_xgb, model_svc, model_hgb]

In [10]:
# Criando a função de validação cruzada
def validacao_cruzada(modelo, X, y, oversampling=False):

  Kfold = KFold(n_splits=5)

  # Lista de acuracia para cada split (5)
  acuracias_split = []

  for idx, (idx_treino, idx_validacao) in enumerate(Kfold.split(X, y)):
    # X_split_treino = X.iloc[idx_treino, :]
    # y_split_treino = y.iloc[idx_treino, :]
    X_split_treino = X.iloc[idx_treino, :]
    y_split_treino = y.iloc[idx_treino]

    # Realizando Oversampling (apenas no split de treino)
    if oversampling:
      smote = SMOTE(random_state=20)
      X_split_treino, y_split_treino = smote.fit_resample(X_split_treino, y_split_treino)

  # Treinando o modelo com os dados (DE TREINO) balanceados
    modelo.fit(X_split_treino, y_split_treino.values.flatten())

    # X_split_validacao = X.iloc[idx_validacao, :]
    # y_split_validacao = y.iloc[idx_validacao, :]
    X_split_validacao = X.iloc[idx_validacao, :]
    y_split_validacao = y.iloc[idx_validacao]

    # Realizando a validação SEM oversampling
    # Amostra do mundo real, ou seja, com dados desbalanceados
    predict_validacao = modelo.predict(X_split_validacao)

    acuracia_split = accuracy_score(y_split_validacao, predict_validacao)

    acuracias_split.append(acuracia_split)

    print(f"Acuracia do split {idx}: {acuracia_split}")

  return acuracias_split

Inserir a linha abaixo em um for, como estava sendo feito da última vez

In [12]:
sem_smote = []
com_smote = []
for i in models:
  var_temp_sem_smote = 0.0
  var_temp_com_smote = 0.0

  print(i)
  var_temp_sem_smote = mean(validacao_cruzada(i, X_Train_normalized_df, y_Train, oversampling=False)) * 100
  print(f'/n{i} - Com Oversampling/n')
  var_temp_com_smote = mean(validacao_cruzada(i, X_Train_normalized_df, y_Train, oversampling=True)) * 100

  sem_smote.append(var_temp_sem_smote)
  com_smote.append(var_temp_com_smote)


RandomForestClassifier()
Acuracia do split 0: 0.9929453262786596
Acuracia do split 1: 0.9964726631393298
Acuracia do split 2: 1.0
Acuracia do split 3: 0.9929328621908127
Acuracia do split 4: 0.9964664310954063
/nRandomForestClassifier() - Com Oversampling/n
Acuracia do split 0: 1.0
Acuracia do split 1: 0.9964726631393298
Acuracia do split 2: 0.9982363315696648
Acuracia do split 3: 0.9964664310954063
Acuracia do split 4: 0.9964664310954063
DecisionTreeClassifier()
Acuracia do split 0: 0.9964726631393298
Acuracia do split 1: 0.9982363315696648
Acuracia do split 2: 0.9964726631393298
Acuracia do split 3: 0.9964664310954063
Acuracia do split 4: 1.0
/nDecisionTreeClassifier() - Com Oversampling/n
Acuracia do split 0: 0.9982363315696648
Acuracia do split 1: 0.9964726631393298
Acuracia do split 2: 0.9982363315696648
Acuracia do split 3: 0.9964664310954063
Acuracia do split 4: 1.0
LogisticRegression()
Acuracia do split 0: 0.9841269841269841
Acuracia do split 1: 0.9964726631393298
Acuracia do s

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Acuracia do split 0: 0.9894179894179894


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Acuracia do split 1: 0.9894179894179894


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Acuracia do split 2: 0.9805996472663139


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Acuracia do split 3: 0.9876325088339223


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Acuracia do split 4: 0.9840989399293286
XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=None, n_jobs=None,
              num_parallel_tree=None, objective='multi:softprob', ...)
Acuracia do split 0: 0.9964726631393298
Acuracia do split 1: 0.9982363315696648
Acuracia do split 2: 0.9964726631393298
Acuracia do split 3: 0.9982332155477032
Acuracia do split 4: 1.0
/nXGBCla

In [16]:
# os valores estão armazenados em ordem, ou seja, segue a seguinte ordem: rfc, dtc, lr, xgb, svc, hgb
sem_smote

[99.57634565408416,
 99.75296177887462,
 98.9762621446956,
 99.78829746792056,
 98.552670119219,
 98.3045101301874]

In [17]:
# os valores estão armazenados em ordem, ou seja, segue a seguinte ordem: rfc, dtc, lr, xgb, svc, hgb
com_smote

[99.75283713799614,
 99.78823514748132,
 98.62334149731086,
 99.85878188469472,
 98.97632446513484,
 99.82350851608803]

Numa primeira análise, todos os modelos com a aplicação do smote performaram melhor com os dados de treino, sendo assim, pretendo aplicar o teste de hiperparametros nos seguintes modelos: HGB e XGB com smote

Obs.: O teste de hiperparametro, a principio, será realizado no notebook principal, dentro do pipeline (https://colab.research.google.com/drive/1nuiTpVcbHg80Iz2bCf4Ofxr3IoEDrPwK#scrollTo=w0znTQ0fk4ap)

Obs. 2: Se for o caso, posso selecionar 2 modelos sem smote que melhor performaram para testar, mas este passo é opcional.