# IMPORTAÇÃO DOS MÓDULOS E CONFIGURAÇÃO


In [1]:
import numpy as np
import pandas as pd
# from google.colab import files
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
from imblearn.pipeline import Pipeline
import sys
import os

# Adiciona o diretório pai (onde está dropout_rate_project) ao sys.path
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..'))) # Isso permite que o notebook enxergue a pasta dropout_rate_project.

from dropout_rate_project.py_files import data_prepared as dp
from dropout_rate_project.py_files import adjusted_models_util as amu
from dropout_rate_project.py_files import define_models_util as dmu


In [2]:
def model_result_saved(auc, f1, recall, precision, accuracy,
                             y_test, y_pred, model_test, data_filter, concat_inputs,
                             reference_year, drop_year, features_train, target_train,
                             features_test, target_test, best_model, best_threshold, best_threshold_tpr):
    """Write a plain-text report for one test run and print where it was saved.

    The report is written to
    ``test_result_{model_test}_{reference_year}_{drop_year}_{data_filter}_{concat_inputs}.txt``
    (a relative path, so it lands in the current working directory) and
    contains, in this order: the confusion matrix of ``y_test`` vs ``y_pred``,
    the run configuration, the ``.shape`` of the four train/test objects, the
    parameters returned by ``best_model.get_params()``, both thresholds and
    the five metrics formatted with four decimals.

    Args:
        auc, f1, recall, precision, accuracy: Metric values (formatted ``:.4f``).
        y_test, y_pred: True and predicted labels fed to ``confusion_matrix``.
        model_test: Label of the evaluated model (used in file name and report).
        data_filter, concat_inputs: Data-preparation flags of the run.
        reference_year, drop_year: Year settings of the run.
        features_train, target_train, features_test, target_test: Objects
            exposing ``.shape``.
        best_model: Estimator exposing ``get_params()``.
        best_threshold, best_threshold_tpr: Thresholds recorded in the report.
    """
    report_path = f"test_result_{model_test}_{reference_year}_{drop_year}_{data_filter}_{concat_inputs}.txt"

    with open(report_path, "w", encoding="utf-8") as out:
        # Confusion matrix (computed before anything is written)
        cm = confusion_matrix(y_test, y_pred)
        print("Matriz de Confusão:", file=out)
        print(cm, end="\n\n", file=out)

        # Run configuration
        print(f"Modelo testado: {model_test}", file=out)
        print(f"Data filter aplicado: {data_filter}", file=out)
        print(f"Concatenação de inputs: {concat_inputs}", file=out)
        print(f"Ano de referência e anos removidos: {reference_year, drop_year}", file=out)

        # Dataset dimensions, followed by one blank line
        datasets = (
            ("features_train", features_train),
            ("target_train", target_train),
            ("features_test", features_test),
            ("target_test", target_test),
        )
        for label, dataset in datasets:
            print(f"Dimensões - {label}: {dataset.shape}", file=out)
        print(file=out)

        # Model hyper-parameters and thresholds
        print("Parâmetros do melhor modelo:", file=out)
        print(best_model.get_params(), end="\n\n", file=out)
        print(f"Melhor threshold: {best_threshold}", file=out)
        print(f"Melhor threshold TPR: {best_threshold_tpr}", end="\n\n", file=out)

        # Metrics
        metrics = (
            ("AUC", auc),
            ("F1", f1),
            ("Recall", recall),
            ("Precision", precision),
            ("Accuracy", accuracy),
        )
        for metric_name, metric_value in metrics:
            print(f"{metric_name}: {metric_value:.4f}", file=out)

    print(f"Relatório salvo em: {report_path}")



def prepared_data_test(dados, data_filter=False, concat_inputs=False):
    """Load the dataset and optionally filter it and merge its categorical columns.

    Args:
        dados: Path or file-like object accepted by ``pandas.read_csv``.
        data_filter: When true, the frame is replaced by the result of
            ``dp.filter_dataframe_by_column(df, 'TO', 'SUB')``.
        concat_inputs: When true, the columns RF-CAT, SEXO, TC, NC, TO, ME and
            TURNO are cast to ``str`` and joined with ``"-"`` into a single
            column named ``RFCAT_SEXO_TC_NC_TO_ME_TURNO``; the returned frame
            then keeps only IDA-MÉD, that combined column, ANO and AE.

    Returns:
        The prepared ``pandas.DataFrame``.
    """
    categorical_columns = ["RF-CAT", "SEXO", "TC", "NC", "TO", "ME", "TURNO"]
    combined_column = "RFCAT_SEXO_TC_NC_TO_ME_TURNO"

    # Load the data
    df = pd.read_csv(dados)

    # Apply the filter, if requested
    if data_filter:
        df = dp.filter_dataframe_by_column(df, 'TO', 'SUB')

    # Merge the categorical columns into one, if requested
    if concat_inputs:
        combined = df[categorical_columns].astype(str).agg("-".join, axis=1)
        # assign() builds a new frame, so a frame handed back by the filter
        # helper is never mutated in place.
        df = df.assign(**{combined_column: combined})
        df = df[["IDA-MÉD", combined_column, "ANO", "AE"]]

    return df


def test_evaluated(df, reference_year, drop_year, random_state,
                                 data_filter, concat_inputs,
                                 base_model, model_test, optimal_threshold, best_threshold_tpr, show_dataset_information=False):
    """Train ``base_model`` on the training split, evaluate it and save a report.

    Steps: split ``df`` with ``dp.get_df_train_df_test``, seed the estimator,
    build the pipeline through ``amu.create_model_pipeline``, score it with
    ``amu.evaluate_model`` at ``optimal_threshold`` and hand everything to
    ``model_result_saved``.

    Args:
        df: Prepared dataset.
        reference_year, drop_year: Forwarded to ``dp.get_df_train_df_test`` and
            recorded in the report.
        random_state: Seed applied to ``base_model`` when the estimator exposes
            a ``random_state`` parameter. Note that ``base_model`` is modified
            in place.
        data_filter, concat_inputs: Only recorded in the report / file name.
        base_model: Estimator following the scikit-learn ``get_params`` /
            ``set_params`` API.
        model_test: Label of the model, used in the report and file name.
        optimal_threshold: Decision threshold passed to ``amu.evaluate_model``.
        best_threshold_tpr: Only recorded in the report.
        show_dataset_information: When true, calls
            ``dp.show_dataset_information(df_train, df_test)``.
    """
    # Train/test split
    df_train, df_test, features_train, target_train, features_test, target_test = dp.get_df_train_df_test(
        df, reference_year, drop_year)

    # Seed the estimator based on what it actually supports instead of on the
    # free-text model label, and go through set_params rather than assigning
    # an attribute that the estimator may not define.
    if "random_state" in base_model.get_params(deep=False):
        base_model.set_params(random_state=random_state)

    # Build the pipeline around the model (per the original notes, this helper
    # also trains it — it is defined outside this file)
    pipeline_best_model = amu.create_model_pipeline(base_model, features_train, target_train)

    # Evaluate on the test split
    auc, f1, recall, precision, accuracy, y_test, y_pred = amu.evaluate_model(
        pipeline_best_model, features_test, target_test, optimal_threshold)

    if show_dataset_information:
        dp.show_dataset_information(df_train, df_test)

    # Persist the results
    model_result_saved(auc, f1, recall, precision, accuracy,
                       y_test, y_pred, model_test, data_filter, concat_inputs,
                       reference_year, drop_year, features_train, target_train,
                       features_test, target_test, base_model, optimal_threshold, best_threshold_tpr)


In [3]:
# Test run: Logistic Regression | reference_year=2018 | drop_year=[] | data_filter=False
dados = '../data/selected_data.csv'
reference_year, drop_year = 2018, []
data_filter, concat_inputs = False, False
random_state = 0

df = prepared_data_test(dados, data_filter=data_filter, concat_inputs=concat_inputs)

model_test = "Logistic_Regression"
base_model = LogisticRegression(
    penalty='l1',
    solver='saga',
    C=0.4,
    max_iter=300,
    fit_intercept=True,
    class_weight='balanced',
    random_state=random_state,
)
optimal_threshold, best_threshold_tpr = 0.6, 0.5

test_evaluated(
    df, reference_year, drop_year, random_state, data_filter, concat_inputs,
    base_model, model_test, optimal_threshold, best_threshold_tpr,
)

df_train shape: (497, 9)
features_train shape: (497, 8)
target_train shape: (497,)
Relatório salvo em: test_result_Logistic_Regression_2018_[]_False_False.txt




In [4]:
# Test run: Logistic Regression | reference_year=2018 | drop_year=[] | data_filter=True
dados = '../data/selected_data.csv'
reference_year, drop_year = 2018, []
data_filter, concat_inputs = True, False
random_state = 0

df = prepared_data_test(dados, data_filter=data_filter, concat_inputs=concat_inputs)

model_test = "Logistic_Regression"
base_model = LogisticRegression(
    penalty='l1',
    solver='saga',
    C=0.7,
    max_iter=100,
    fit_intercept=True,
    class_weight='balanced',
    random_state=random_state,
)
optimal_threshold, best_threshold_tpr = 0.54, 0.5

test_evaluated(
    df, reference_year, drop_year, random_state, data_filter, concat_inputs,
    base_model, model_test, optimal_threshold, best_threshold_tpr,
)


df_train shape: (289, 9)
features_train shape: (289, 8)
target_train shape: (289,)
Relatório salvo em: test_result_Logistic_Regression_2018_[]_True_False.txt


In [5]:
# Test run: Logistic Regression | reference_year=2019 | drop_year=[] | data_filter=False
dados = '../data/selected_data.csv'
reference_year, drop_year = 2019, []
data_filter, concat_inputs = False, False
random_state = 0

df = prepared_data_test(dados, data_filter=data_filter, concat_inputs=concat_inputs)

model_test = "Logistic_Regression"
base_model = LogisticRegression(
    penalty='l1',
    solver='saga',
    C=0.1,
    max_iter=200,
    fit_intercept=True,
    class_weight='balanced',
    random_state=random_state,
)
optimal_threshold, best_threshold_tpr = 0.65, 0.5

test_evaluated(
    df, reference_year, drop_year, random_state, data_filter, concat_inputs,
    base_model, model_test, optimal_threshold, best_threshold_tpr,
)

df_train shape: (967, 9)
features_train shape: (967, 8)
target_train shape: (967,)
Relatório salvo em: test_result_Logistic_Regression_2019_[]_False_False.txt




In [6]:
# Test run: Logistic Regression | reference_year=2019 | drop_year=[] | data_filter=True
dados = '../data/selected_data.csv'
reference_year, drop_year = 2019, []
data_filter, concat_inputs = True, False
random_state = 0

df = prepared_data_test(dados, data_filter=data_filter, concat_inputs=concat_inputs)

model_test = "Logistic_Regression"
base_model = LogisticRegression(
    penalty='l1',
    solver='saga',
    C=0.1,
    max_iter=100,
    fit_intercept=True,
    class_weight='balanced',
    random_state=random_state,
)
optimal_threshold, best_threshold_tpr = 0.58, 0.5

test_evaluated(
    df, reference_year, drop_year, random_state, data_filter, concat_inputs,
    base_model, model_test, optimal_threshold, best_threshold_tpr,
)

df_train shape: (582, 9)
features_train shape: (582, 8)
target_train shape: (582,)
Relatório salvo em: test_result_Logistic_Regression_2019_[]_True_False.txt


In [7]:
# Test run: Logistic Regression | reference_year=2020 | drop_year=[] | data_filter=False
dados = '../data/selected_data.csv'
reference_year, drop_year = 2020, []
data_filter, concat_inputs = False, False
random_state = 0

df = prepared_data_test(dados, data_filter=data_filter, concat_inputs=concat_inputs)

model_test = "Logistic_Regression"
base_model = LogisticRegression(
    penalty='elasticnet',
    solver='saga',
    l1_ratio=0.5,
    C=0.01,
    max_iter=100,
    fit_intercept=True,
    class_weight='balanced',
    random_state=random_state,
)
optimal_threshold, best_threshold_tpr = 0.5, 0.5

test_evaluated(
    df, reference_year, drop_year, random_state, data_filter, concat_inputs,
    base_model, model_test, optimal_threshold, best_threshold_tpr,
)

df_train shape: (1550, 9)
features_train shape: (1550, 8)
target_train shape: (1550,)
Relatório salvo em: test_result_Logistic_Regression_2020_[]_False_False.txt


In [8]:
# Test run: Logistic Regression | reference_year=2020 | drop_year=[] | data_filter=True
dados = '../data/selected_data.csv'
reference_year, drop_year = 2020, []
data_filter, concat_inputs = True, False
random_state = 0

df = prepared_data_test(dados, data_filter=data_filter, concat_inputs=concat_inputs)

model_test = "Logistic_Regression"
base_model = LogisticRegression(
    penalty='l1',
    solver='saga',
    C=0.1,
    max_iter=100,
    fit_intercept=True,
    class_weight='balanced',
    random_state=random_state,
)
optimal_threshold, best_threshold_tpr = 0.57, 0.5

test_evaluated(
    df, reference_year, drop_year, random_state, data_filter, concat_inputs,
    base_model, model_test, optimal_threshold, best_threshold_tpr,
)

df_train shape: (902, 9)
features_train shape: (902, 8)
target_train shape: (902,)
Relatório salvo em: test_result_Logistic_Regression_2020_[]_True_False.txt


In [9]:
# Test run: Logistic Regression | reference_year=2021 | drop_year=[2020] | data_filter=False
dados = '../data/selected_data.csv'
reference_year, drop_year = 2021, [2020]
data_filter, concat_inputs = False, False
random_state = 0

df = prepared_data_test(dados, data_filter=data_filter, concat_inputs=concat_inputs)

model_test = "Logistic_Regression"
base_model = LogisticRegression(
    penalty='elasticnet',
    solver='saga',
    l1_ratio=0.5,
    C=0.01,
    max_iter=100,
    fit_intercept=True,
    class_weight='balanced',
    random_state=random_state,
)
optimal_threshold, best_threshold_tpr = 0.5, 0.5

test_evaluated(
    df, reference_year, drop_year, random_state, data_filter, concat_inputs,
    base_model, model_test, optimal_threshold, best_threshold_tpr,
)

df_train shape: (1550, 9)
features_train shape: (1550, 8)
target_train shape: (1550,)
Relatório salvo em: test_result_Logistic_Regression_2021_[2020]_False_False.txt


In [10]:
# Test run: Logistic Regression | reference_year=2021 | drop_year=[2020] | data_filter=True
dados = '../data/selected_data.csv'
reference_year, drop_year = 2021, [2020]
data_filter, concat_inputs = True, False
random_state = 0

df = prepared_data_test(dados, data_filter=data_filter, concat_inputs=concat_inputs)

model_test = "Logistic_Regression"
base_model = LogisticRegression(
    penalty='l1',
    solver='saga',
    C=0.1,
    max_iter=100,
    fit_intercept=True,
    class_weight='balanced',
    random_state=random_state,
)
optimal_threshold, best_threshold_tpr = 0.57, 0.5

test_evaluated(
    df, reference_year, drop_year, random_state, data_filter, concat_inputs,
    base_model, model_test, optimal_threshold, best_threshold_tpr,
)

df_train shape: (902, 9)
features_train shape: (902, 8)
target_train shape: (902,)
Relatório salvo em: test_result_Logistic_Regression_2021_[2020]_True_False.txt


In [11]:
# Test run: Logistic Regression | reference_year=2021 | drop_year=[] | data_filter=False
dados = '../data/selected_data.csv'
reference_year, drop_year = 2021, []
data_filter, concat_inputs = False, False
random_state = 0

df = prepared_data_test(dados, data_filter=data_filter, concat_inputs=concat_inputs)

model_test = "Logistic_Regression"
base_model = LogisticRegression(
    penalty='l2',
    solver='liblinear',
    C=1,
    max_iter=100,
    fit_intercept=True,
    class_weight='balanced',
    random_state=random_state,
)
optimal_threshold, best_threshold_tpr = 0.5, 0.5

test_evaluated(
    df, reference_year, drop_year, random_state, data_filter, concat_inputs,
    base_model, model_test, optimal_threshold, best_threshold_tpr,
)

df_train shape: (2636, 9)
features_train shape: (2636, 8)
target_train shape: (2636,)
Relatório salvo em: test_result_Logistic_Regression_2021_[]_False_False.txt


In [12]:
# Test run: Logistic Regression | reference_year=2021 | drop_year=[] | data_filter=True
dados = '../data/selected_data.csv'
reference_year, drop_year = 2021, []
data_filter, concat_inputs = True, False
random_state = 0

df = prepared_data_test(dados, data_filter=data_filter, concat_inputs=concat_inputs)

model_test = "Logistic_Regression"
base_model = LogisticRegression(
    penalty='l1',
    solver='saga',
    C=0.1,
    max_iter=100,
    fit_intercept=True,
    class_weight='balanced',
    random_state=random_state,
)
optimal_threshold, best_threshold_tpr = 0.5, 0.5

test_evaluated(
    df, reference_year, drop_year, random_state, data_filter, concat_inputs,
    base_model, model_test, optimal_threshold, best_threshold_tpr,
)

df_train shape: (1175, 9)
features_train shape: (1175, 8)
target_train shape: (1175,)
Relatório salvo em: test_result_Logistic_Regression_2021_[]_True_False.txt


In [13]:
# Test run: Logistic Regression | reference_year=2022 | drop_year=[2020] | data_filter=False
dados = '../data/selected_data.csv'
reference_year, drop_year = 2022, [2020]
data_filter, concat_inputs = False, False
random_state = 0

df = prepared_data_test(dados, data_filter=data_filter, concat_inputs=concat_inputs)

model_test = "Logistic_Regression"
base_model = LogisticRegression(
    penalty='l1',
    solver='saga',
    C=0.8,
    max_iter=100,
    fit_intercept=True,
    class_weight='balanced',
    random_state=random_state,
)
optimal_threshold, best_threshold_tpr = 0.7, 0.5

test_evaluated(
    df, reference_year, drop_year, random_state, data_filter, concat_inputs,
    base_model, model_test, optimal_threshold, best_threshold_tpr,
)

df_train shape: (2899, 9)
features_train shape: (2899, 8)
target_train shape: (2899,)
Relatório salvo em: test_result_Logistic_Regression_2022_[2020]_False_False.txt


In [14]:
# Test run: Logistic Regression | reference_year=2022 | drop_year=[2020] | data_filter=True
dados = '../data/selected_data.csv'
reference_year, drop_year = 2022, [2020]
data_filter, concat_inputs = True, False
random_state = 0

df = prepared_data_test(dados, data_filter=data_filter, concat_inputs=concat_inputs)

model_test = "Logistic_Regression"
base_model = LogisticRegression(
    penalty='l1',
    solver='saga',
    C=0.3,
    max_iter=100,
    fit_intercept=True,
    class_weight='balanced',
    random_state=random_state,
)
optimal_threshold, best_threshold_tpr = 0.71, 0.5

test_evaluated(
    df, reference_year, drop_year, random_state, data_filter, concat_inputs,
    base_model, model_test, optimal_threshold, best_threshold_tpr,
)

df_train shape: (1436, 9)
features_train shape: (1436, 8)
target_train shape: (1436,)
Relatório salvo em: test_result_Logistic_Regression_2022_[2020]_True_False.txt


In [15]:
# Test run: Logistic Regression | reference_year=2022 | drop_year=[] | data_filter=False
dados = '../data/selected_data.csv'
reference_year, drop_year = 2022, []
data_filter, concat_inputs = False, False
random_state = 0

df = prepared_data_test(dados, data_filter=data_filter, concat_inputs=concat_inputs)

model_test = "Logistic_Regression"
base_model = LogisticRegression(
    penalty='l1',
    solver='saga',
    C=0.1,
    max_iter=100,
    fit_intercept=True,
    class_weight='balanced',
    random_state=random_state,
)
optimal_threshold, best_threshold_tpr = 0.50, 0.5

test_evaluated(
    df, reference_year, drop_year, random_state, data_filter, concat_inputs,
    base_model, model_test, optimal_threshold, best_threshold_tpr,
)

df_train shape: (3985, 9)
features_train shape: (3985, 8)
target_train shape: (3985,)
Relatório salvo em: test_result_Logistic_Regression_2022_[]_False_False.txt


In [16]:
# Test run: Logistic Regression | reference_year=2022 | drop_year=[] | data_filter=True
dados = '../data/selected_data.csv'
reference_year, drop_year = 2022, []
data_filter, concat_inputs = True, False
random_state = 0

df = prepared_data_test(dados, data_filter=data_filter, concat_inputs=concat_inputs)

model_test = "Logistic_Regression"
base_model = LogisticRegression(
    penalty='elasticnet',
    solver='saga',
    l1_ratio=0.1,
    C=0.4,
    max_iter=100,
    fit_intercept=True,
    class_weight='balanced',
    random_state=random_state,
)
optimal_threshold, best_threshold_tpr = 0.71, 0.5

test_evaluated(
    df, reference_year, drop_year, random_state, data_filter, concat_inputs,
    base_model, model_test, optimal_threshold, best_threshold_tpr,
)

df_train shape: (1709, 9)
features_train shape: (1709, 8)
target_train shape: (1709,)
Relatório salvo em: test_result_Logistic_Regression_2022_[]_True_False.txt


In [17]:
# Test run: Logistic Regression | reference_year=2023 | drop_year=[2020] | data_filter=False
dados = '../data/selected_data.csv'
reference_year, drop_year = 2023, [2020]
data_filter, concat_inputs = False, False
random_state = 0

df = prepared_data_test(dados, data_filter=data_filter, concat_inputs=concat_inputs)

model_test = "Logistic_Regression"
base_model = LogisticRegression(
    penalty='l2',
    solver='sag',
    C=0.4,
    max_iter=100,
    fit_intercept=True,
    class_weight='balanced',
    random_state=random_state,
)
optimal_threshold, best_threshold_tpr = 0.7, 0.5

test_evaluated(
    df, reference_year, drop_year, random_state, data_filter, concat_inputs,
    base_model, model_test, optimal_threshold, best_threshold_tpr,
)

df_train shape: (3806, 9)
features_train shape: (3806, 8)
target_train shape: (3806,)
Relatório salvo em: test_result_Logistic_Regression_2023_[2020]_False_False.txt




In [18]:
# Test run: Logistic Regression | reference_year=2023 | drop_year=[2020] | data_filter=True
dados = '../data/selected_data.csv'
reference_year, drop_year = 2023, [2020]
data_filter, concat_inputs = True, False
random_state = 0

df = prepared_data_test(dados, data_filter=data_filter, concat_inputs=concat_inputs)

model_test = "Logistic_Regression"
base_model = LogisticRegression(
    penalty='l1',
    solver='saga',
    C=0.5,
    max_iter=100,
    fit_intercept=True,
    class_weight='balanced',
    random_state=random_state,
)
optimal_threshold, best_threshold_tpr = 0.61, 0.5

test_evaluated(
    df, reference_year, drop_year, random_state, data_filter, concat_inputs,
    base_model, model_test, optimal_threshold, best_threshold_tpr,
)

df_train shape: (2039, 9)
features_train shape: (2039, 8)
target_train shape: (2039,)
Relatório salvo em: test_result_Logistic_Regression_2023_[2020]_True_False.txt


In [19]:
# Test run: Logistic Regression | reference_year=2023 | drop_year=[] | data_filter=False
dados = '../data/selected_data.csv'
reference_year, drop_year = 2023, []
data_filter, concat_inputs = False, False
random_state = 0

df = prepared_data_test(dados, data_filter=data_filter, concat_inputs=concat_inputs)

model_test = "Logistic_Regression"
base_model = LogisticRegression(
    penalty='l1',
    solver='saga',
    C=0.4,
    max_iter=100,
    fit_intercept=True,
    class_weight='balanced',
    random_state=random_state,
)
optimal_threshold, best_threshold_tpr = 0.5, 0.5

test_evaluated(
    df, reference_year, drop_year, random_state, data_filter, concat_inputs,
    base_model, model_test, optimal_threshold, best_threshold_tpr,
)

df_train shape: (4892, 9)
features_train shape: (4892, 8)
target_train shape: (4892,)
Relatório salvo em: test_result_Logistic_Regression_2023_[]_False_False.txt


In [20]:
# Test run: Logistic Regression | reference_year=2023 | drop_year=[] | data_filter=True
dados = '../data/selected_data.csv'
reference_year, drop_year = 2023, []
data_filter, concat_inputs = True, False
random_state = 0

df = prepared_data_test(dados, data_filter=data_filter, concat_inputs=concat_inputs)

model_test = "Logistic_Regression"
base_model = LogisticRegression(
    penalty='l1',
    solver='saga',
    C=0.05,
    max_iter=100,
    fit_intercept=True,
    class_weight='balanced',
    random_state=random_state,
)
optimal_threshold, best_threshold_tpr = 0.57, 0.5

test_evaluated(
    df, reference_year, drop_year, random_state, data_filter, concat_inputs,
    base_model, model_test, optimal_threshold, best_threshold_tpr,
)

df_train shape: (2312, 9)
features_train shape: (2312, 8)
target_train shape: (2312,)
Relatório salvo em: test_result_Logistic_Regression_2023_[]_True_False.txt


In [21]:
# Test run: Decision Tree | reference_year=2018 | drop_year=[] | data_filter=False
dados = '../data/selected_data.csv'
reference_year, drop_year = 2018, []
data_filter, concat_inputs = False, False
random_state = 0

df = prepared_data_test(dados, data_filter=data_filter, concat_inputs=concat_inputs)

model_test = "Decision_Tree"
base_model = DecisionTreeClassifier(
    criterion='entropy',
    max_depth=10,
    max_features='log2',
    min_samples_leaf=3,
    min_samples_split=10,
    class_weight='balanced',
    random_state=random_state,
)
optimal_threshold, best_threshold_tpr = 0.48, 0.5

test_evaluated(
    df, reference_year, drop_year, random_state, data_filter, concat_inputs,
    base_model, model_test, optimal_threshold, best_threshold_tpr,
)

df_train shape: (497, 9)
features_train shape: (497, 8)
target_train shape: (497,)
Relatório salvo em: test_result_Decision_Tree_2018_[]_False_False.txt


In [22]:
# Test run: Decision Tree | reference_year=2018 | drop_year=[] | data_filter=True
dados = '../data/selected_data.csv'
reference_year, drop_year = 2018, []
data_filter, concat_inputs = True, False
random_state = 0

df = prepared_data_test(dados, data_filter=data_filter, concat_inputs=concat_inputs)

model_test = "Decision_Tree"
base_model = DecisionTreeClassifier(
    criterion='gini',
    max_depth=3,
    max_features=None,
    min_samples_leaf=5,
    min_samples_split=2,
    class_weight='balanced',
    random_state=random_state,
)
optimal_threshold, best_threshold_tpr = 0.6, 0.5

test_evaluated(
    df, reference_year, drop_year, random_state, data_filter, concat_inputs,
    base_model, model_test, optimal_threshold, best_threshold_tpr,
)

df_train shape: (289, 9)
features_train shape: (289, 8)
target_train shape: (289,)
Relatório salvo em: test_result_Decision_Tree_2018_[]_True_False.txt


In [23]:
# Test run: Decision Tree | reference_year=2019 | drop_year=[] | data_filter=False
dados = '../data/selected_data.csv'
reference_year, drop_year = 2019, []
data_filter, concat_inputs = False, False
random_state = 0

df = prepared_data_test(dados, data_filter=data_filter, concat_inputs=concat_inputs)

model_test = "Decision_Tree"
base_model = DecisionTreeClassifier(
    criterion='entropy',
    max_depth=10,
    max_features='sqrt',
    min_samples_leaf=4,
    min_samples_split=2,
    class_weight='balanced',
    random_state=random_state,
)
optimal_threshold, best_threshold_tpr = 0.85, 0.5

test_evaluated(
    df, reference_year, drop_year, random_state, data_filter, concat_inputs,
    base_model, model_test, optimal_threshold, best_threshold_tpr,
)

df_train shape: (967, 9)
features_train shape: (967, 8)
target_train shape: (967,)
Relatório salvo em: test_result_Decision_Tree_2019_[]_False_False.txt


In [24]:
# Test run: Decision Tree | reference_year=2019 | drop_year=[] | data_filter=True
dados = '../data/selected_data.csv'
reference_year, drop_year = 2019, []
data_filter, concat_inputs = True, False
random_state = 0

df = prepared_data_test(dados, data_filter=data_filter, concat_inputs=concat_inputs)

model_test = "Decision_Tree"
base_model = DecisionTreeClassifier(
    criterion='entropy',
    max_depth=5,
    max_features=None,
    min_samples_leaf=4,
    min_samples_split=3,
    class_weight='balanced',
    random_state=random_state,
)
optimal_threshold, best_threshold_tpr = 0.58, 0.5

test_evaluated(
    df, reference_year, drop_year, random_state, data_filter, concat_inputs,
    base_model, model_test, optimal_threshold, best_threshold_tpr,
)

df_train shape: (582, 9)
features_train shape: (582, 8)
target_train shape: (582,)
Relatório salvo em: test_result_Decision_Tree_2019_[]_True_False.txt


In [25]:
# Test run: Decision Tree | reference_year=2020 | drop_year=[] | data_filter=False
dados = '../data/selected_data.csv'
reference_year, drop_year = 2020, []
data_filter, concat_inputs = False, False
random_state = 0

df = prepared_data_test(dados, data_filter=data_filter, concat_inputs=concat_inputs)

model_test = "Decision_Tree"
base_model = DecisionTreeClassifier(
    criterion='entropy',
    max_depth=5,
    max_features=None,
    min_samples_leaf=4,
    min_samples_split=2,
    class_weight='balanced',
    random_state=random_state,
)
optimal_threshold, best_threshold_tpr = 0.5, 0.5

test_evaluated(
    df, reference_year, drop_year, random_state, data_filter, concat_inputs,
    base_model, model_test, optimal_threshold, best_threshold_tpr,
)

df_train shape: (1550, 9)
features_train shape: (1550, 8)
target_train shape: (1550,)
Relatório salvo em: test_result_Decision_Tree_2020_[]_False_False.txt


In [26]:
# Test run: Decision Tree | reference_year=2020 | drop_year=[] | data_filter=True
dados = '../data/selected_data.csv'
reference_year, drop_year = 2020, []
data_filter, concat_inputs = True, False
random_state = 0

df = prepared_data_test(dados, data_filter=data_filter, concat_inputs=concat_inputs)

model_test = "Decision_Tree"
base_model = DecisionTreeClassifier(
    criterion='entropy',
    max_depth=3,
    max_features='sqrt',
    min_samples_leaf=1,
    min_samples_split=2,
    class_weight='balanced',
    random_state=random_state,
)
optimal_threshold, best_threshold_tpr = 0.68, 0.5

test_evaluated(
    df, reference_year, drop_year, random_state, data_filter, concat_inputs,
    base_model, model_test, optimal_threshold, best_threshold_tpr,
)

df_train shape: (902, 9)
features_train shape: (902, 8)
target_train shape: (902,)
Relatório salvo em: test_result_Decision_Tree_2020_[]_True_False.txt


In [27]:
# Test run: Decision Tree | reference_year=2021 | drop_year=[2020] | data_filter=False
dados = '../data/selected_data.csv'
reference_year, drop_year = 2021, [2020]
data_filter, concat_inputs = False, False
random_state = 0

df = prepared_data_test(dados, data_filter=data_filter, concat_inputs=concat_inputs)

model_test = "Decision_Tree"
base_model = DecisionTreeClassifier(
    criterion='entropy',
    max_depth=5,
    max_features=None,
    min_samples_leaf=4,
    min_samples_split=3,
    class_weight='balanced',
    random_state=random_state,
)
optimal_threshold, best_threshold_tpr = 0.5, 0.5

test_evaluated(
    df, reference_year, drop_year, random_state, data_filter, concat_inputs,
    base_model, model_test, optimal_threshold, best_threshold_tpr,
)

df_train shape: (1550, 9)
features_train shape: (1550, 8)
target_train shape: (1550,)
Relatório salvo em: test_result_Decision_Tree_2021_[2020]_False_False.txt


In [28]:
# Test run: Decision Tree | reference_year=2021 | drop_year=[2020] | data_filter=True
dados = '../data/selected_data.csv'
reference_year, drop_year = 2021, [2020]
data_filter, concat_inputs = True, False
random_state = 0

df = prepared_data_test(dados, data_filter=data_filter, concat_inputs=concat_inputs)

model_test = "Decision_Tree"
base_model = DecisionTreeClassifier(
    criterion='entropy',
    max_depth=3,
    max_features='sqrt',
    min_samples_leaf=1,
    min_samples_split=2,
    class_weight='balanced',
    random_state=random_state,
)
optimal_threshold, best_threshold_tpr = 0.68, 0.5

test_evaluated(
    df, reference_year, drop_year, random_state, data_filter, concat_inputs,
    base_model, model_test, optimal_threshold, best_threshold_tpr,
)

df_train shape: (902, 9)
features_train shape: (902, 8)
target_train shape: (902,)
Relatório salvo em: test_result_Decision_Tree_2021_[2020]_True_False.txt


In [29]:
# Test run: Decision Tree | reference_year=2021 | drop_year=[] | data_filter=False
dados = '../data/selected_data.csv'
reference_year, drop_year = 2021, []
data_filter, concat_inputs = False, False
random_state = 0

df = prepared_data_test(dados, data_filter=data_filter, concat_inputs=concat_inputs)

model_test = "Decision_Tree"
base_model = DecisionTreeClassifier(
    criterion='entropy',
    max_depth=10,
    max_features='log2',
    min_samples_leaf=4,
    min_samples_split=10,
    class_weight='balanced',
    random_state=random_state,
)
optimal_threshold, best_threshold_tpr = 0.5, 0.5

test_evaluated(
    df, reference_year, drop_year, random_state, data_filter, concat_inputs,
    base_model, model_test, optimal_threshold, best_threshold_tpr,
)

df_train shape: (2636, 9)
features_train shape: (2636, 8)
target_train shape: (2636,)
Relatório salvo em: test_result_Decision_Tree_2021_[]_False_False.txt


In [30]:
# Test run: Decision Tree | reference_year=2021 | drop_year=[] | data_filter=True
dados = '../data/selected_data.csv'
reference_year, drop_year = 2021, []
data_filter, concat_inputs = True, False
random_state = 0

df = prepared_data_test(dados, data_filter=data_filter, concat_inputs=concat_inputs)

model_test = "Decision_Tree"
base_model = DecisionTreeClassifier(
    criterion='gini',
    max_depth=3,
    max_features='sqrt',
    min_samples_leaf=1,
    min_samples_split=2,
    class_weight='balanced',
    random_state=random_state,
)
optimal_threshold, best_threshold_tpr = 0.5, 0.5

test_evaluated(
    df, reference_year, drop_year, random_state, data_filter, concat_inputs,
    base_model, model_test, optimal_threshold, best_threshold_tpr,
)

df_train shape: (1175, 9)
features_train shape: (1175, 8)
target_train shape: (1175,)
Relatório salvo em: test_result_Decision_Tree_2021_[]_True_False.txt


In [31]:
# Test run: Decision Tree | reference_year=2022 | drop_year=[2020] | data_filter=False
dados = '../data/selected_data.csv'
reference_year, drop_year = 2022, [2020]
data_filter, concat_inputs = False, False
random_state = 0

df = prepared_data_test(dados, data_filter=data_filter, concat_inputs=concat_inputs)

model_test = "Decision_Tree"
base_model = DecisionTreeClassifier(
    criterion='gini',
    max_depth=5,
    max_features=None,
    min_samples_leaf=4,
    min_samples_split=2,
    class_weight='balanced',
    random_state=random_state,
)
optimal_threshold, best_threshold_tpr = 0.54, 0.5

test_evaluated(
    df, reference_year, drop_year, random_state, data_filter, concat_inputs,
    base_model, model_test, optimal_threshold, best_threshold_tpr,
)

df_train shape: (2899, 9)
features_train shape: (2899, 8)
target_train shape: (2899,)
Relatório salvo em: test_result_Decision_Tree_2022_[2020]_False_False.txt


In [32]:
# Test run: Decision Tree | reference_year=2022 | drop_year=[2020] | data_filter=True
dados = '../data/selected_data.csv'
reference_year, drop_year = 2022, [2020]
data_filter, concat_inputs = True, False
random_state = 0

df = prepared_data_test(dados, data_filter=data_filter, concat_inputs=concat_inputs)

model_test = "Decision_Tree"
base_model = DecisionTreeClassifier(
    criterion='gini',
    max_depth=3,
    max_features=None,
    min_samples_leaf=1,
    min_samples_split=2,
    class_weight='balanced',
    random_state=random_state,
)
optimal_threshold, best_threshold_tpr = 0.5, 0.5

test_evaluated(
    df, reference_year, drop_year, random_state, data_filter, concat_inputs,
    base_model, model_test, optimal_threshold, best_threshold_tpr,
)

df_train shape: (1436, 9)
features_train shape: (1436, 8)
target_train shape: (1436,)
Relatório salvo em: test_result_Decision_Tree_2022_[2020]_True_False.txt


In [33]:
# Decision Tree run: reference_year=2022, drop_year=[], data_filter=False.
random_state = 0
dados = '../data/selected_data.csv'
reference_year, drop_year = 2022, []
data_filter, concat_inputs = False, False

df = prepared_data_test(dados, data_filter, concat_inputs)

# Estimator with the hyperparameters fixed for this run; it shares
# random_state with the test_evaluated call below.
model_test = "Decision_Tree"
base_model = DecisionTreeClassifier(
    criterion='gini',
    max_depth=3,
    max_features=None,
    min_samples_leaf=3,
    min_samples_split=2,
    class_weight='balanced',
    random_state=random_state,
)

# Threshold values forwarded unchanged to test_evaluated.
optimal_threshold, best_threshold_tpr = 0.5, 0.5

test_evaluated(
    df, reference_year, drop_year, random_state,
    data_filter, concat_inputs,
    base_model, model_test, optimal_threshold, best_threshold_tpr,
)

df_train shape: (3985, 9)
features_train shape: (3985, 8)
target_train shape: (3985,)
Relatório salvo em: test_result_Decision_Tree_2022_[]_False_False.txt


In [34]:
# Decision Tree run: reference_year=2022, drop_year=[], data_filter=True.
random_state = 0
dados = '../data/selected_data.csv'
reference_year, drop_year = 2022, []
data_filter, concat_inputs = True, False

df = prepared_data_test(dados, data_filter, concat_inputs)

# Estimator with the hyperparameters fixed for this run; it shares
# random_state with the test_evaluated call below.
model_test = "Decision_Tree"
base_model = DecisionTreeClassifier(
    criterion='entropy',
    max_depth=3,
    max_features=None,
    min_samples_leaf=1,
    min_samples_split=2,
    class_weight='balanced',
    random_state=random_state,
)

# Threshold values forwarded unchanged to test_evaluated.
optimal_threshold, best_threshold_tpr = 0.5, 0.5

test_evaluated(
    df, reference_year, drop_year, random_state,
    data_filter, concat_inputs,
    base_model, model_test, optimal_threshold, best_threshold_tpr,
)

df_train shape: (1709, 9)
features_train shape: (1709, 8)
target_train shape: (1709,)
Relatório salvo em: test_result_Decision_Tree_2022_[]_True_False.txt


In [35]:
# Decision Tree run: reference_year=2023, drop_year=[2020], data_filter=False.
random_state = 0
dados = '../data/selected_data.csv'
reference_year, drop_year = 2023, [2020]
data_filter, concat_inputs = False, False

df = prepared_data_test(dados, data_filter, concat_inputs)

# Estimator with the hyperparameters fixed for this run; it shares
# random_state with the test_evaluated call below.
model_test = "Decision_Tree"
base_model = DecisionTreeClassifier(
    criterion='gini',
    max_depth=10,
    max_features='log2',
    min_samples_leaf=3,
    min_samples_split=10,
    class_weight='balanced',
    random_state=random_state,
)

# Threshold values forwarded unchanged to test_evaluated.
optimal_threshold, best_threshold_tpr = 0.59, 0.5

test_evaluated(
    df, reference_year, drop_year, random_state,
    data_filter, concat_inputs,
    base_model, model_test, optimal_threshold, best_threshold_tpr,
)

df_train shape: (3806, 9)
features_train shape: (3806, 8)
target_train shape: (3806,)
Relatório salvo em: test_result_Decision_Tree_2023_[2020]_False_False.txt


In [36]:
# Decision Tree run: reference_year=2023, drop_year=[2020], data_filter=True.
random_state = 0
dados = '../data/selected_data.csv'
reference_year, drop_year = 2023, [2020]
data_filter, concat_inputs = True, False

df = prepared_data_test(dados, data_filter, concat_inputs)

# Estimator with the hyperparameters fixed for this run; it shares
# random_state with the test_evaluated call below.
model_test = "Decision_Tree"
base_model = DecisionTreeClassifier(
    criterion='entropy',
    max_depth=5,
    max_features=None,
    min_samples_leaf=5,
    min_samples_split=2,
    class_weight='balanced',
    random_state=random_state,
)

# Threshold values forwarded unchanged to test_evaluated.
optimal_threshold, best_threshold_tpr = 0.58, 0.5

test_evaluated(
    df, reference_year, drop_year, random_state,
    data_filter, concat_inputs,
    base_model, model_test, optimal_threshold, best_threshold_tpr,
)

df_train shape: (2039, 9)
features_train shape: (2039, 8)
target_train shape: (2039,)
Relatório salvo em: test_result_Decision_Tree_2023_[2020]_True_False.txt


In [37]:
# Decision Tree run: reference_year=2023, drop_year=[], data_filter=False.
random_state = 0
dados = '../data/selected_data.csv'
reference_year, drop_year = 2023, []
data_filter, concat_inputs = False, False

df = prepared_data_test(dados, data_filter, concat_inputs)

# Estimator with the hyperparameters fixed for this run; it shares
# random_state with the test_evaluated call below.
model_test = "Decision_Tree"
base_model = DecisionTreeClassifier(
    criterion='gini',
    max_depth=5,
    max_features=None,
    min_samples_leaf=5,
    min_samples_split=2,
    class_weight='balanced',
    random_state=random_state,
)

# Threshold values forwarded unchanged to test_evaluated.
optimal_threshold, best_threshold_tpr = 0.5, 0.5

test_evaluated(
    df, reference_year, drop_year, random_state,
    data_filter, concat_inputs,
    base_model, model_test, optimal_threshold, best_threshold_tpr,
)

df_train shape: (4892, 9)
features_train shape: (4892, 8)
target_train shape: (4892,)
Relatório salvo em: test_result_Decision_Tree_2023_[]_False_False.txt


In [38]:
# Decision Tree run: reference_year=2023, drop_year=[], data_filter=True.
random_state = 0
dados = '../data/selected_data.csv'
reference_year, drop_year = 2023, []
data_filter, concat_inputs = True, False

df = prepared_data_test(dados, data_filter, concat_inputs)

# Estimator with the hyperparameters fixed for this run; it shares
# random_state with the test_evaluated call below.
model_test = "Decision_Tree"
base_model = DecisionTreeClassifier(
    criterion='entropy',
    max_depth=5,
    max_features=None,
    min_samples_leaf=4,
    min_samples_split=10,
    class_weight='balanced',
    random_state=random_state,
)

# Threshold values forwarded unchanged to test_evaluated.
optimal_threshold, best_threshold_tpr = 0.72, 0.5

test_evaluated(
    df, reference_year, drop_year, random_state,
    data_filter, concat_inputs,
    base_model, model_test, optimal_threshold, best_threshold_tpr,
)

df_train shape: (2312, 9)
features_train shape: (2312, 8)
target_train shape: (2312,)
Relatório salvo em: test_result_Decision_Tree_2023_[]_True_False.txt


In [39]:
# Random Forest run: reference_year=2018, drop_year=[], data_filter=False.
random_state = 0
dados = '../data/selected_data.csv'
reference_year, drop_year = 2018, []
data_filter, concat_inputs = False, False

df = prepared_data_test(dados, data_filter, concat_inputs)

# Estimator with the hyperparameters fixed for this run; it shares
# random_state with the test_evaluated call below.
model_test = "Random_Forest"
base_model = RandomForestClassifier(
    n_estimators=300,
    bootstrap=True,
    criterion='gini',
    max_depth=3,
    max_features=None,
    min_samples_leaf=1,
    min_samples_split=2,
    class_weight='balanced',
    random_state=random_state,
)

# Threshold values forwarded unchanged to test_evaluated.
optimal_threshold, best_threshold_tpr = 0.53, 0.5

test_evaluated(
    df, reference_year, drop_year, random_state,
    data_filter, concat_inputs,
    base_model, model_test, optimal_threshold, best_threshold_tpr,
)

df_train shape: (497, 9)
features_train shape: (497, 8)
target_train shape: (497,)
Relatório salvo em: test_result_Random_Forest_2018_[]_False_False.txt


In [40]:
# Random Forest run: reference_year=2018, drop_year=[], data_filter=True.
random_state = 0
dados = '../data/selected_data.csv'
reference_year, drop_year = 2018, []
data_filter, concat_inputs = True, False

df = prepared_data_test(dados, data_filter, concat_inputs)

# Estimator with the hyperparameters fixed for this run; it shares
# random_state with the test_evaluated call below.
model_test = "Random_Forest"
base_model = RandomForestClassifier(
    n_estimators=300,
    bootstrap=True,
    criterion='gini',
    max_depth=3,
    max_features=None,
    min_samples_leaf=5,
    min_samples_split=2,
    class_weight='balanced',
    random_state=random_state,
)

# Threshold values forwarded unchanged to test_evaluated.
optimal_threshold, best_threshold_tpr = 0.63, 0.5

test_evaluated(
    df, reference_year, drop_year, random_state,
    data_filter, concat_inputs,
    base_model, model_test, optimal_threshold, best_threshold_tpr,
)

df_train shape: (289, 9)
features_train shape: (289, 8)
target_train shape: (289,)
Relatório salvo em: test_result_Random_Forest_2018_[]_True_False.txt


In [41]:
# Random Forest run: reference_year=2019, drop_year=[], data_filter=False.
random_state = 0
dados = '../data/selected_data.csv'
reference_year, drop_year = 2019, []
data_filter, concat_inputs = False, False

df = prepared_data_test(dados, data_filter, concat_inputs)

# Estimator with the hyperparameters fixed for this run; it shares
# random_state with the test_evaluated call below.
model_test = "Random_Forest"
base_model = RandomForestClassifier(
    n_estimators=100,
    bootstrap=True,
    criterion='entropy',
    max_depth=3,
    max_features=None,
    min_samples_leaf=3,
    min_samples_split=10,
    class_weight='balanced',
    random_state=random_state,
)

# Threshold values forwarded unchanged to test_evaluated.
optimal_threshold, best_threshold_tpr = 0.42, 0.5

test_evaluated(
    df, reference_year, drop_year, random_state,
    data_filter, concat_inputs,
    base_model, model_test, optimal_threshold, best_threshold_tpr,
)

df_train shape: (967, 9)
features_train shape: (967, 8)
target_train shape: (967,)
Relatório salvo em: test_result_Random_Forest_2019_[]_False_False.txt


In [42]:
# Random Forest run: reference_year=2019, drop_year=[], data_filter=True.
random_state = 0
dados = '../data/selected_data.csv'
reference_year, drop_year = 2019, []
data_filter, concat_inputs = True, False

df = prepared_data_test(dados, data_filter, concat_inputs)

# Estimator with the hyperparameters fixed for this run; it shares
# random_state with the test_evaluated call below.
model_test = "Random_Forest"
base_model = RandomForestClassifier(
    n_estimators=100,
    bootstrap=True,
    criterion='entropy',
    max_depth=3,
    max_features=None,
    min_samples_leaf=3,
    min_samples_split=10,
    class_weight='balanced',
    random_state=random_state,
)

# Threshold values forwarded unchanged to test_evaluated.
optimal_threshold, best_threshold_tpr = 0.45, 0.5

test_evaluated(
    df, reference_year, drop_year, random_state,
    data_filter, concat_inputs,
    base_model, model_test, optimal_threshold, best_threshold_tpr,
)

df_train shape: (582, 9)
features_train shape: (582, 8)
target_train shape: (582,)
Relatório salvo em: test_result_Random_Forest_2019_[]_True_False.txt


In [43]:
# Random Forest run: reference_year=2020, drop_year=[], data_filter=False.
random_state = 0
dados = '../data/selected_data.csv'
reference_year, drop_year = 2020, []
data_filter, concat_inputs = False, False

df = prepared_data_test(dados, data_filter, concat_inputs)

# Estimator with the hyperparameters fixed for this run; it shares
# random_state with the test_evaluated call below.
model_test = "Random_Forest"
base_model = RandomForestClassifier(
    n_estimators=100,
    bootstrap=True,
    criterion='entropy',
    max_depth=3,
    max_features=None,
    min_samples_leaf=5,
    min_samples_split=2,
    class_weight='balanced',
    random_state=random_state,
)

# Threshold values forwarded unchanged to test_evaluated.
optimal_threshold, best_threshold_tpr = 0.5, 0.5

test_evaluated(
    df, reference_year, drop_year, random_state,
    data_filter, concat_inputs,
    base_model, model_test, optimal_threshold, best_threshold_tpr,
)

df_train shape: (1550, 9)
features_train shape: (1550, 8)
target_train shape: (1550,)
Relatório salvo em: test_result_Random_Forest_2020_[]_False_False.txt


In [44]:
# Random Forest run: reference_year=2020, drop_year=[], data_filter=True.
random_state = 0
dados = '../data/selected_data.csv'
reference_year, drop_year = 2020, []
data_filter, concat_inputs = True, False

df = prepared_data_test(dados, data_filter, concat_inputs)

# Estimator with the hyperparameters fixed for this run; it shares
# random_state with the test_evaluated call below.
model_test = "Random_Forest"
base_model = RandomForestClassifier(
    n_estimators=100,
    bootstrap=True,
    criterion='gini',
    max_depth=10,
    max_features=None,
    min_samples_leaf=5,
    min_samples_split=2,
    class_weight='balanced',
    random_state=random_state,
)

# Threshold values forwarded unchanged to test_evaluated.
optimal_threshold, best_threshold_tpr = 0.74, 0.5

test_evaluated(
    df, reference_year, drop_year, random_state,
    data_filter, concat_inputs,
    base_model, model_test, optimal_threshold, best_threshold_tpr,
)

df_train shape: (902, 9)
features_train shape: (902, 8)
target_train shape: (902,)
Relatório salvo em: test_result_Random_Forest_2020_[]_True_False.txt


In [45]:
# Random Forest run: reference_year=2021, drop_year=[2020], data_filter=False.
random_state = 0
dados = '../data/selected_data.csv'
reference_year, drop_year = 2021, [2020]
data_filter, concat_inputs = False, False

df = prepared_data_test(dados, data_filter, concat_inputs)

# Estimator with the hyperparameters fixed for this run; it shares
# random_state with the test_evaluated call below.
model_test = "Random_Forest"
base_model = RandomForestClassifier(
    n_estimators=100,
    bootstrap=True,
    criterion='entropy',
    max_depth=3,
    max_features=None,
    min_samples_leaf=5,
    min_samples_split=2,
    class_weight='balanced',
    random_state=random_state,
)

# Threshold values forwarded unchanged to test_evaluated.
optimal_threshold, best_threshold_tpr = 0.5, 0.5

test_evaluated(
    df, reference_year, drop_year, random_state,
    data_filter, concat_inputs,
    base_model, model_test, optimal_threshold, best_threshold_tpr,
)

df_train shape: (1550, 9)
features_train shape: (1550, 8)
target_train shape: (1550,)
Relatório salvo em: test_result_Random_Forest_2021_[2020]_False_False.txt


In [46]:
# Random Forest run: reference_year=2021, drop_year=[2020], data_filter=True.
random_state = 0
dados = '../data/selected_data.csv'
reference_year, drop_year = 2021, [2020]
data_filter, concat_inputs = True, False

df = prepared_data_test(dados, data_filter, concat_inputs)

# Estimator with the hyperparameters fixed for this run; it shares
# random_state with the test_evaluated call below.
model_test = "Random_Forest"
base_model = RandomForestClassifier(
    n_estimators=100,
    bootstrap=True,
    criterion='gini',
    max_depth=10,
    max_features=None,
    min_samples_leaf=5,
    min_samples_split=2,
    class_weight='balanced',
    random_state=random_state,
)

# Threshold values forwarded unchanged to test_evaluated.
optimal_threshold, best_threshold_tpr = 0.74, 0.5

test_evaluated(
    df, reference_year, drop_year, random_state,
    data_filter, concat_inputs,
    base_model, model_test, optimal_threshold, best_threshold_tpr,
)

df_train shape: (902, 9)
features_train shape: (902, 8)
target_train shape: (902,)
Relatório salvo em: test_result_Random_Forest_2021_[2020]_True_False.txt


In [47]:
# Random Forest run: reference_year=2021, drop_year=[], data_filter=False.
random_state = 0
dados = '../data/selected_data.csv'
reference_year, drop_year = 2021, []
data_filter, concat_inputs = False, False

df = prepared_data_test(dados, data_filter, concat_inputs)

# Estimator with the hyperparameters fixed for this run; it shares
# random_state with the test_evaluated call below.
model_test = "Random_Forest"
base_model = RandomForestClassifier(
    n_estimators=300,
    bootstrap=True,
    criterion='gini',
    max_depth=20,
    max_features='log2',
    min_samples_leaf=5,
    min_samples_split=2,
    class_weight='balanced',
    random_state=random_state,
)

# Threshold values forwarded unchanged to test_evaluated.
optimal_threshold, best_threshold_tpr = 0.5, 0.5

test_evaluated(
    df, reference_year, drop_year, random_state,
    data_filter, concat_inputs,
    base_model, model_test, optimal_threshold, best_threshold_tpr,
)

df_train shape: (2636, 9)
features_train shape: (2636, 8)
target_train shape: (2636,)
Relatório salvo em: test_result_Random_Forest_2021_[]_False_False.txt


In [48]:
# Random Forest run: reference_year=2021, drop_year=[], data_filter=True.
random_state = 0
dados = '../data/selected_data.csv'
reference_year, drop_year = 2021, []
data_filter, concat_inputs = True, False

df = prepared_data_test(dados, data_filter, concat_inputs)

# Estimator with the hyperparameters fixed for this run; it shares
# random_state with the test_evaluated call below.
model_test = "Random_Forest"
base_model = RandomForestClassifier(
    n_estimators=300,
    bootstrap=True,
    criterion='entropy',
    max_depth=3,
    max_features='sqrt',
    min_samples_leaf=3,
    min_samples_split=2,
    class_weight='balanced',
    random_state=random_state,
)

# Threshold values forwarded unchanged to test_evaluated.
optimal_threshold, best_threshold_tpr = 0.5, 0.5

test_evaluated(
    df, reference_year, drop_year, random_state,
    data_filter, concat_inputs,
    base_model, model_test, optimal_threshold, best_threshold_tpr,
)

df_train shape: (1175, 9)
features_train shape: (1175, 8)
target_train shape: (1175,)
Relatório salvo em: test_result_Random_Forest_2021_[]_True_False.txt


In [49]:
# Random Forest run: reference_year=2022, drop_year=[2020], data_filter=False.
random_state = 0
dados = '../data/selected_data.csv'
reference_year, drop_year = 2022, [2020]
data_filter, concat_inputs = False, False

df = prepared_data_test(dados, data_filter, concat_inputs)

# Estimator with the hyperparameters fixed for this run; it shares
# random_state with the test_evaluated call below.
model_test = "Random_Forest"
base_model = RandomForestClassifier(
    n_estimators=300,
    bootstrap=True,
    criterion='entropy',
    max_depth=10,
    max_features='log2',
    min_samples_leaf=3,
    min_samples_split=2,
    class_weight='balanced',
    random_state=random_state,
)

# Threshold values forwarded unchanged to test_evaluated.
optimal_threshold, best_threshold_tpr = 0.61, 0.5

test_evaluated(
    df, reference_year, drop_year, random_state,
    data_filter, concat_inputs,
    base_model, model_test, optimal_threshold, best_threshold_tpr,
)

df_train shape: (2899, 9)
features_train shape: (2899, 8)
target_train shape: (2899,)
Relatório salvo em: test_result_Random_Forest_2022_[2020]_False_False.txt


In [50]:
# Random Forest run: reference_year=2022, drop_year=[2020], data_filter=True.
random_state = 0
dados = '../data/selected_data.csv'
reference_year, drop_year = 2022, [2020]
data_filter, concat_inputs = True, False

df = prepared_data_test(dados, data_filter, concat_inputs)

# Estimator with the hyperparameters fixed for this run; it shares
# random_state with the test_evaluated call below.
model_test = "Random_Forest"
base_model = RandomForestClassifier(
    n_estimators=300,
    bootstrap=True,
    criterion='entropy',
    max_depth=3,
    max_features='sqrt',
    min_samples_leaf=5,
    min_samples_split=2,
    class_weight='balanced',
    random_state=random_state,
)

# Threshold values forwarded unchanged to test_evaluated.
optimal_threshold, best_threshold_tpr = 0.61, 0.5

test_evaluated(
    df, reference_year, drop_year, random_state,
    data_filter, concat_inputs,
    base_model, model_test, optimal_threshold, best_threshold_tpr,
)

df_train shape: (1436, 9)
features_train shape: (1436, 8)
target_train shape: (1436,)
Relatório salvo em: test_result_Random_Forest_2022_[2020]_True_False.txt


In [51]:
# Random Forest run: reference_year=2022, drop_year=[], data_filter=False.
random_state = 0
dados = '../data/selected_data.csv'
reference_year, drop_year = 2022, []
data_filter, concat_inputs = False, False

df = prepared_data_test(dados, data_filter, concat_inputs)

# Estimator with the hyperparameters fixed for this run; it shares
# random_state with the test_evaluated call below.
model_test = "Random_Forest"
base_model = RandomForestClassifier(
    n_estimators=100,
    bootstrap=True,
    criterion='gini',
    max_depth=3,
    max_features=None,
    min_samples_leaf=3,
    min_samples_split=10,
    class_weight='balanced',
    random_state=random_state,
)

# Threshold values forwarded unchanged to test_evaluated.
optimal_threshold, best_threshold_tpr = 0.5, 0.5

test_evaluated(
    df, reference_year, drop_year, random_state,
    data_filter, concat_inputs,
    base_model, model_test, optimal_threshold, best_threshold_tpr,
)

df_train shape: (3985, 9)
features_train shape: (3985, 8)
target_train shape: (3985,)
Relatório salvo em: test_result_Random_Forest_2022_[]_False_False.txt


In [52]:
# Random Forest run: reference_year=2022, drop_year=[], data_filter=True.
random_state = 0
dados = '../data/selected_data.csv'
reference_year, drop_year = 2022, []
data_filter, concat_inputs = True, False

df = prepared_data_test(dados, data_filter, concat_inputs)

# Estimator with the hyperparameters fixed for this run; it shares
# random_state with the test_evaluated call below.
model_test = "Random_Forest"
base_model = RandomForestClassifier(
    n_estimators=100,
    bootstrap=True,
    criterion='gini',
    max_depth=3,
    max_features='sqrt',
    min_samples_leaf=3,
    min_samples_split=10,
    class_weight='balanced',
    random_state=random_state,
)

# Threshold values forwarded unchanged to test_evaluated.
optimal_threshold, best_threshold_tpr = 0.5, 0.5

test_evaluated(
    df, reference_year, drop_year, random_state,
    data_filter, concat_inputs,
    base_model, model_test, optimal_threshold, best_threshold_tpr,
)

df_train shape: (1709, 9)
features_train shape: (1709, 8)
target_train shape: (1709,)
Relatório salvo em: test_result_Random_Forest_2022_[]_True_False.txt


In [125]:
# Random Forest run: reference_year=2023, drop_year=[2020], data_filter=False.
random_state = 0
dados = '../data/selected_data.csv'
reference_year, drop_year = 2023, [2020]
data_filter, concat_inputs = False, False

df = prepared_data_test(dados, data_filter, concat_inputs)

# Estimator with the hyperparameters fixed for this run; it shares
# random_state with the test_evaluated call below.
model_test = "Random_Forest"
base_model = RandomForestClassifier(
    n_estimators=100,
    bootstrap=True,
    criterion='entropy',
    max_depth=3,
    max_features=None,
    min_samples_leaf=3,
    min_samples_split=10,
    class_weight='balanced',
    random_state=random_state,
)

# Threshold values forwarded unchanged to test_evaluated.
optimal_threshold, best_threshold_tpr = 0.65, 0.5

test_evaluated(
    df, reference_year, drop_year, random_state,
    data_filter, concat_inputs,
    base_model, model_test, optimal_threshold, best_threshold_tpr,
)

df_train shape: (3806, 9)
features_train shape: (3806, 8)
target_train shape: (3806,)
Relatório salvo em: test_result_Random_Forest_2023_[2020]_False_False.txt


In [54]:
# Random Forest run: reference_year=2023, drop_year=[2020], data_filter=True.
random_state = 0
dados = '../data/selected_data.csv'
reference_year, drop_year = 2023, [2020]
data_filter, concat_inputs = True, False

df = prepared_data_test(dados, data_filter, concat_inputs)

# Estimator with the hyperparameters fixed for this run; it shares
# random_state with the test_evaluated call below.
model_test = "Random_Forest"
base_model = RandomForestClassifier(
    n_estimators=100,
    bootstrap=True,
    criterion='gini',
    max_depth=3,
    max_features='sqrt',
    min_samples_leaf=5,
    min_samples_split=2,
    class_weight='balanced',
    random_state=random_state,
)

# Threshold values forwarded unchanged to test_evaluated.
optimal_threshold, best_threshold_tpr = 0.53, 0.5

test_evaluated(
    df, reference_year, drop_year, random_state,
    data_filter, concat_inputs,
    base_model, model_test, optimal_threshold, best_threshold_tpr,
)

df_train shape: (2039, 9)
features_train shape: (2039, 8)
target_train shape: (2039,)
Relatório salvo em: test_result_Random_Forest_2023_[2020]_True_False.txt


In [55]:
# Random Forest run: reference_year=2023, drop_year=[], data_filter=False.
random_state = 0
dados = '../data/selected_data.csv'
reference_year, drop_year = 2023, []
data_filter, concat_inputs = False, False

df = prepared_data_test(dados, data_filter, concat_inputs)

# Estimator with the hyperparameters fixed for this run; it shares
# random_state with the test_evaluated call below.
model_test = "Random_Forest"
base_model = RandomForestClassifier(
    n_estimators=300,
    bootstrap=True,
    criterion='entropy',
    max_depth=3,
    max_features=None,
    min_samples_leaf=5,
    min_samples_split=2,
    class_weight='balanced',
    random_state=random_state,
)

# Threshold values forwarded unchanged to test_evaluated.
optimal_threshold, best_threshold_tpr = 0.5, 0.5

test_evaluated(
    df, reference_year, drop_year, random_state,
    data_filter, concat_inputs,
    base_model, model_test, optimal_threshold, best_threshold_tpr,
)

df_train shape: (4892, 9)
features_train shape: (4892, 8)
target_train shape: (4892,)
Relatório salvo em: test_result_Random_Forest_2023_[]_False_False.txt


In [56]:
# Random Forest run: reference_year=2023, drop_year=[], data_filter=True.
random_state = 0
dados = '../data/selected_data.csv'
reference_year, drop_year = 2023, []
data_filter, concat_inputs = True, False

df = prepared_data_test(dados, data_filter, concat_inputs)

# Estimator with the hyperparameters fixed for this run; it shares
# random_state with the test_evaluated call below.
model_test = "Random_Forest"
base_model = RandomForestClassifier(
    n_estimators=100,
    bootstrap=True,
    criterion='gini',
    max_depth=3,
    max_features='sqrt',
    min_samples_leaf=1,
    min_samples_split=2,
    class_weight='balanced',
    random_state=random_state,
)

# Threshold values forwarded unchanged to test_evaluated.
optimal_threshold, best_threshold_tpr = 0.56, 0.5

test_evaluated(
    df, reference_year, drop_year, random_state,
    data_filter, concat_inputs,
    base_model, model_test, optimal_threshold, best_threshold_tpr,
)

df_train shape: (2312, 9)
features_train shape: (2312, 8)
target_train shape: (2312,)
Relatório salvo em: test_result_Random_Forest_2023_[]_True_False.txt


In [57]:
# SVC run: reference_year=2018, drop_year=[], data_filter=False.
random_state = 0
dados = '../data/selected_data.csv'
reference_year, drop_year = 2018, []
data_filter, concat_inputs = False, False

df = prepared_data_test(dados, data_filter, concat_inputs)

# Estimator with the hyperparameters fixed for this run; it shares
# random_state with the test_evaluated call below.
model_test = "Support_Vector_Machine"
base_model = SVC(
    kernel='linear',
    C=1,
    gamma='scale',
    degree=2,  # only read by the 'poly' kernel, so it has no effect here
    probability=True,
    class_weight='balanced',
    random_state=random_state,
)

# Threshold values forwarded unchanged to test_evaluated.
optimal_threshold, best_threshold_tpr = 0.68, 0.5

test_evaluated(
    df, reference_year, drop_year, random_state,
    data_filter, concat_inputs,
    base_model, model_test, optimal_threshold, best_threshold_tpr,
)

df_train shape: (497, 9)
features_train shape: (497, 8)
target_train shape: (497,)
Relatório salvo em: test_result_Support_Vector_Machine_2018_[]_False_False.txt


In [58]:
# SVC run: reference_year=2018, drop_year=[], data_filter=True.
random_state = 0
dados = '../data/selected_data.csv'
reference_year, drop_year = 2018, []
data_filter, concat_inputs = True, False

df = prepared_data_test(dados, data_filter, concat_inputs)

# Estimator with the hyperparameters fixed for this run; it shares
# random_state with the test_evaluated call below.
model_test = "Support_Vector_Machine"
base_model = SVC(
    kernel='sigmoid',
    C=10,
    gamma='auto',
    degree=2,  # only read by the 'poly' kernel, so it has no effect here
    probability=True,
    class_weight='balanced',
    random_state=random_state,
)

# Threshold values forwarded unchanged to test_evaluated.
optimal_threshold, best_threshold_tpr = 0.45, 0.5

test_evaluated(
    df, reference_year, drop_year, random_state,
    data_filter, concat_inputs,
    base_model, model_test, optimal_threshold, best_threshold_tpr,
)

df_train shape: (289, 9)
features_train shape: (289, 8)
target_train shape: (289,)
Relatório salvo em: test_result_Support_Vector_Machine_2018_[]_True_False.txt


In [59]:
# SVC run: reference_year=2019, drop_year=[], data_filter=False.
random_state = 0
dados = '../data/selected_data.csv'
reference_year, drop_year = 2019, []
data_filter, concat_inputs = False, False

df = prepared_data_test(dados, data_filter, concat_inputs)

# Estimator with the hyperparameters fixed for this run; it shares
# random_state with the test_evaluated call below.
model_test = "Support_Vector_Machine"
base_model = SVC(
    kernel='linear',
    C=1,
    gamma='scale',
    degree=2,  # only read by the 'poly' kernel, so it has no effect here
    probability=True,
    class_weight='balanced',
    random_state=random_state,
)

# Threshold values forwarded unchanged to test_evaluated.
optimal_threshold, best_threshold_tpr = 0.55, 0.5

test_evaluated(
    df, reference_year, drop_year, random_state,
    data_filter, concat_inputs,
    base_model, model_test, optimal_threshold, best_threshold_tpr,
)

df_train shape: (967, 9)
features_train shape: (967, 8)
target_train shape: (967,)
Relatório salvo em: test_result_Support_Vector_Machine_2019_[]_False_False.txt


In [60]:
# SVC run: reference_year=2019, drop_year=[], data_filter=True.
random_state = 0
dados = '../data/selected_data.csv'
reference_year, drop_year = 2019, []
data_filter, concat_inputs = True, False

df = prepared_data_test(dados, data_filter, concat_inputs)

# Estimator with the hyperparameters fixed for this run; it shares
# random_state with the test_evaluated call below.
model_test = "Support_Vector_Machine"
base_model = SVC(
    kernel='poly',
    C=1,
    gamma='scale',
    degree=2,  # polynomial degree; only the 'poly' kernel uses it
    probability=True,
    class_weight='balanced',
    random_state=random_state,
)

# Threshold values forwarded unchanged to test_evaluated.
optimal_threshold, best_threshold_tpr = 0.38, 0.5

test_evaluated(
    df, reference_year, drop_year, random_state,
    data_filter, concat_inputs,
    base_model, model_test, optimal_threshold, best_threshold_tpr,
)

df_train shape: (582, 9)
features_train shape: (582, 8)
target_train shape: (582,)
Relatório salvo em: test_result_Support_Vector_Machine_2019_[]_True_False.txt


In [61]:
# SVC run: reference_year=2020, drop_year=[], data_filter=False.
random_state = 0
dados = '../data/selected_data.csv'
reference_year, drop_year = 2020, []
data_filter, concat_inputs = False, False

df = prepared_data_test(dados, data_filter, concat_inputs)

# Estimator with the hyperparameters fixed for this run; it shares
# random_state with the test_evaluated call below.
model_test = "Support_Vector_Machine"
base_model = SVC(
    kernel='poly',
    C=0.1,
    gamma='scale',
    degree=3,  # polynomial degree; only the 'poly' kernel uses it
    probability=True,
    class_weight='balanced',
    random_state=random_state,
)

# Threshold values forwarded unchanged to test_evaluated.
optimal_threshold, best_threshold_tpr = 0.65, 0.5

test_evaluated(
    df, reference_year, drop_year, random_state,
    data_filter, concat_inputs,
    base_model, model_test, optimal_threshold, best_threshold_tpr,
)

df_train shape: (1550, 9)
features_train shape: (1550, 8)
target_train shape: (1550,)
Relatório salvo em: test_result_Support_Vector_Machine_2020_[]_False_False.txt


In [62]:
# SVC run: reference_year=2020, drop_year=[], data_filter=True.
random_state = 0
dados = '../data/selected_data.csv'
reference_year, drop_year = 2020, []
data_filter, concat_inputs = True, False

df = prepared_data_test(dados, data_filter, concat_inputs)

# Estimator with the hyperparameters fixed for this run; it shares
# random_state with the test_evaluated call below.
model_test = "Support_Vector_Machine"
base_model = SVC(
    kernel='poly',
    C=1,
    gamma='auto',
    degree=2,  # polynomial degree; only the 'poly' kernel uses it
    probability=True,
    class_weight='balanced',
    random_state=random_state,
)

# Threshold values forwarded unchanged to test_evaluated.
optimal_threshold, best_threshold_tpr = 0.56, 0.5

test_evaluated(
    df, reference_year, drop_year, random_state,
    data_filter, concat_inputs,
    base_model, model_test, optimal_threshold, best_threshold_tpr,
)

df_train shape: (902, 9)
features_train shape: (902, 8)
target_train shape: (902,)
Relatório salvo em: test_result_Support_Vector_Machine_2020_[]_True_False.txt


In [63]:
# SVC run: reference_year=2021, drop_year=[2020], data_filter=False.
random_state = 0
dados = '../data/selected_data.csv'
reference_year, drop_year = 2021, [2020]
data_filter, concat_inputs = False, False

df = prepared_data_test(dados, data_filter, concat_inputs)

# Estimator with the hyperparameters fixed for this run; it shares
# random_state with the test_evaluated call below.
model_test = "Support_Vector_Machine"
base_model = SVC(
    kernel='poly',
    C=0.1,
    gamma='scale',
    degree=3,  # polynomial degree; only the 'poly' kernel uses it
    probability=True,
    class_weight='balanced',
    random_state=random_state,
)

# Threshold values forwarded unchanged to test_evaluated.
optimal_threshold, best_threshold_tpr = 0.65, 0.5

test_evaluated(
    df, reference_year, drop_year, random_state,
    data_filter, concat_inputs,
    base_model, model_test, optimal_threshold, best_threshold_tpr,
)

df_train shape: (1550, 9)
features_train shape: (1550, 8)
target_train shape: (1550,)
Relatório salvo em: test_result_Support_Vector_Machine_2021_[2020]_False_False.txt


In [64]:
# SVC run: reference_year=2021, drop_year=[2020], data_filter=True.
random_state = 0
dados = '../data/selected_data.csv'
reference_year, drop_year = 2021, [2020]
data_filter, concat_inputs = True, False

df = prepared_data_test(dados, data_filter, concat_inputs)

# Estimator with the hyperparameters fixed for this run; it shares
# random_state with the test_evaluated call below.
model_test = "Support_Vector_Machine"
base_model = SVC(
    kernel='poly',
    C=1,
    gamma='auto',
    degree=2,  # polynomial degree; only the 'poly' kernel uses it
    probability=True,
    class_weight='balanced',
    random_state=random_state,
)

# Threshold values forwarded unchanged to test_evaluated.
optimal_threshold, best_threshold_tpr = 0.56, 0.5

test_evaluated(
    df, reference_year, drop_year, random_state,
    data_filter, concat_inputs,
    base_model, model_test, optimal_threshold, best_threshold_tpr,
)

df_train shape: (902, 9)
features_train shape: (902, 8)
target_train shape: (902,)
Relatório salvo em: test_result_Support_Vector_Machine_2021_[2020]_True_False.txt


In [65]:
# SVC run: reference_year=2021, drop_year=[], data_filter=False.
random_state = 0
dados = '../data/selected_data.csv'
reference_year, drop_year = 2021, []
data_filter, concat_inputs = False, False

df = prepared_data_test(dados, data_filter, concat_inputs)

# Estimator with the hyperparameters fixed for this run; it shares
# random_state with the test_evaluated call below.
model_test = "Support_Vector_Machine"
base_model = SVC(
    kernel='poly',
    C=0.1,
    gamma='scale',
    degree=2,  # polynomial degree; only the 'poly' kernel uses it
    probability=True,
    class_weight='balanced',
    random_state=random_state,
)

# Threshold values forwarded unchanged to test_evaluated.
optimal_threshold, best_threshold_tpr = 0.59, 0.5

test_evaluated(
    df, reference_year, drop_year, random_state,
    data_filter, concat_inputs,
    base_model, model_test, optimal_threshold, best_threshold_tpr,
)

df_train shape: (2636, 9)
features_train shape: (2636, 8)
target_train shape: (2636,)
Relatório salvo em: test_result_Support_Vector_Machine_2021_[]_False_False.txt


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [66]:
# SVC test run: reference_year=2021, drop_year=[], data_filter=True.
dados = '../data/selected_data.csv'
reference_year, drop_year = 2021, []
data_filter, concat_inputs = True, False
random_state = 0

df = prepared_data_test(dados, data_filter, concat_inputs)

# Label and thresholds handed to test_evaluated.
model_test = "Support_Vector_Machine"
optimal_threshold, best_threshold_tpr = 0.54, 0.5

base_model = SVC(
    kernel='poly',
    degree=2,  # only used by the 'poly' kernel
    gamma='auto',
    C=1,
    class_weight='balanced',
    probability=True,
    random_state=random_state,
)

test_evaluated(
    df, reference_year, drop_year, random_state,
    data_filter, concat_inputs,
    base_model, model_test, optimal_threshold, best_threshold_tpr,
)

df_train shape: (1175, 9)
features_train shape: (1175, 8)
target_train shape: (1175,)
Relatório salvo em: test_result_Support_Vector_Machine_2021_[]_True_False.txt


In [67]:
# SVC test run: reference_year=2022, drop_year=[2020], data_filter=False.
dados = '../data/selected_data.csv'
reference_year, drop_year = 2022, [2020]
data_filter, concat_inputs = False, False
random_state = 0

df = prepared_data_test(dados, data_filter, concat_inputs)

# Label and thresholds handed to test_evaluated.
model_test = "Support_Vector_Machine"
optimal_threshold, best_threshold_tpr = 0.53, 0.5

base_model = SVC(
    kernel='rbf',
    degree=2,  # only used by the 'poly' kernel, so it has no effect with 'rbf'
    gamma='scale',
    C=1,
    class_weight='balanced',
    probability=True,
    random_state=random_state,
)

test_evaluated(
    df, reference_year, drop_year, random_state,
    data_filter, concat_inputs,
    base_model, model_test, optimal_threshold, best_threshold_tpr,
)

df_train shape: (2899, 9)
features_train shape: (2899, 8)
target_train shape: (2899,)
Relatório salvo em: test_result_Support_Vector_Machine_2022_[2020]_False_False.txt


In [68]:
# SVC test run: reference_year=2022, drop_year=[2020], data_filter=True.
dados = '../data/selected_data.csv'
reference_year, drop_year = 2022, [2020]
data_filter, concat_inputs = True, False
random_state = 0

df = prepared_data_test(dados, data_filter, concat_inputs)

# Label and thresholds handed to test_evaluated.
model_test = "Support_Vector_Machine"
optimal_threshold, best_threshold_tpr = 0.7, 0.5

# Per the scikit-learn docs, degree applies to 'poly' only and gamma to
# 'rbf'/'poly'/'sigmoid'; both are kept as given but are unused by 'linear'.
base_model = SVC(
    kernel='linear',
    degree=2,
    gamma='scale',
    C=1,
    class_weight='balanced',
    probability=True,
    random_state=random_state,
)

test_evaluated(
    df, reference_year, drop_year, random_state,
    data_filter, concat_inputs,
    base_model, model_test, optimal_threshold, best_threshold_tpr,
)

df_train shape: (1436, 9)
features_train shape: (1436, 8)
target_train shape: (1436,)
Relatório salvo em: test_result_Support_Vector_Machine_2022_[2020]_True_False.txt


In [69]:
# SVC test run: reference_year=2022, drop_year=[], data_filter=False.
dados = '../data/selected_data.csv'
reference_year, drop_year = 2022, []
data_filter, concat_inputs = False, False
random_state = 0

df = prepared_data_test(dados, data_filter, concat_inputs)

# Label and thresholds handed to test_evaluated.
model_test = "Support_Vector_Machine"
optimal_threshold, best_threshold_tpr = 0.51, 0.5

base_model = SVC(
    kernel='sigmoid',
    degree=2,  # only used by the 'poly' kernel, so it has no effect with 'sigmoid'
    gamma='auto',
    C=0.1,
    class_weight='balanced',
    probability=True,
    random_state=random_state,
)

test_evaluated(
    df, reference_year, drop_year, random_state,
    data_filter, concat_inputs,
    base_model, model_test, optimal_threshold, best_threshold_tpr,
)

df_train shape: (3985, 9)
features_train shape: (3985, 8)
target_train shape: (3985,)
Relatório salvo em: test_result_Support_Vector_Machine_2022_[]_False_False.txt


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [70]:
# SVC test run: reference_year=2022, drop_year=[], data_filter=True.
dados = '../data/selected_data.csv'
reference_year, drop_year = 2022, []
data_filter, concat_inputs = True, False
random_state = 0

df = prepared_data_test(dados, data_filter, concat_inputs)

# Label and thresholds handed to test_evaluated.
model_test = "Support_Vector_Machine"
optimal_threshold, best_threshold_tpr = 0.55, 0.5

# Per the scikit-learn docs, degree applies to 'poly' only and gamma to
# 'rbf'/'poly'/'sigmoid'; both are kept as given but are unused by 'linear'.
base_model = SVC(
    kernel='linear',
    degree=2,
    gamma='scale',
    C=0.1,
    class_weight='balanced',
    probability=True,
    random_state=random_state,
)

test_evaluated(
    df, reference_year, drop_year, random_state,
    data_filter, concat_inputs,
    base_model, model_test, optimal_threshold, best_threshold_tpr,
)

df_train shape: (1709, 9)
features_train shape: (1709, 8)
target_train shape: (1709,)
Relatório salvo em: test_result_Support_Vector_Machine_2022_[]_True_False.txt


In [71]:
# SVC test run: reference_year=2023, drop_year=[2020], data_filter=False.
dados = '../data/selected_data.csv'
reference_year, drop_year = 2023, [2020]
data_filter, concat_inputs = False, False
random_state = 0

df = prepared_data_test(dados, data_filter, concat_inputs)

# Label and thresholds handed to test_evaluated.
model_test = "Support_Vector_Machine"
optimal_threshold, best_threshold_tpr = 0.54, 0.5

base_model = SVC(
    kernel='poly',
    degree=4,  # only used by the 'poly' kernel
    gamma='scale',
    C=0.1,
    class_weight='balanced',
    probability=True,
    random_state=random_state,
)

test_evaluated(
    df, reference_year, drop_year, random_state,
    data_filter, concat_inputs,
    base_model, model_test, optimal_threshold, best_threshold_tpr,
)

df_train shape: (3806, 9)
features_train shape: (3806, 8)
target_train shape: (3806,)
Relatório salvo em: test_result_Support_Vector_Machine_2023_[2020]_False_False.txt


In [72]:
# SVC test run: reference_year=2023, drop_year=[2020], data_filter=True.
dados = '../data/selected_data.csv'
reference_year, drop_year = 2023, [2020]
data_filter, concat_inputs = True, False
random_state = 0

df = prepared_data_test(dados, data_filter, concat_inputs)

# Label and thresholds handed to test_evaluated.
model_test = "Support_Vector_Machine"
optimal_threshold, best_threshold_tpr = 0.52, 0.5

base_model = SVC(
    kernel='poly',
    degree=3,  # only used by the 'poly' kernel
    gamma='auto',
    C=1,
    class_weight='balanced',
    probability=True,
    random_state=random_state,
)

test_evaluated(
    df, reference_year, drop_year, random_state,
    data_filter, concat_inputs,
    base_model, model_test, optimal_threshold, best_threshold_tpr,
)

df_train shape: (2039, 9)
features_train shape: (2039, 8)
target_train shape: (2039,)
Relatório salvo em: test_result_Support_Vector_Machine_2023_[2020]_True_False.txt


In [73]:
# SVC test run: reference_year=2023, drop_year=[], data_filter=False.
dados = '../data/selected_data.csv'
reference_year, drop_year = 2023, []
data_filter, concat_inputs = False, False
random_state = 0

df = prepared_data_test(dados, data_filter, concat_inputs)

# Label and thresholds handed to test_evaluated.
model_test = "Support_Vector_Machine"
optimal_threshold, best_threshold_tpr = 0.55, 0.5

base_model = SVC(
    kernel='sigmoid',
    degree=2,  # only used by the 'poly' kernel, so it has no effect with 'sigmoid'
    gamma='auto',
    C=0.1,
    class_weight='balanced',
    probability=True,
    random_state=random_state,
)

test_evaluated(
    df, reference_year, drop_year, random_state,
    data_filter, concat_inputs,
    base_model, model_test, optimal_threshold, best_threshold_tpr,
)

df_train shape: (4892, 9)
features_train shape: (4892, 8)
target_train shape: (4892,)
Relatório salvo em: test_result_Support_Vector_Machine_2023_[]_False_False.txt


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [74]:
# SVC test run: reference_year=2023, drop_year=[], data_filter=True.
dados = '../data/selected_data.csv'
reference_year, drop_year = 2023, []
data_filter, concat_inputs = True, False
random_state = 0

df = prepared_data_test(dados, data_filter, concat_inputs)

# Label and thresholds handed to test_evaluated.
model_test = "Support_Vector_Machine"
optimal_threshold, best_threshold_tpr = 0.46, 0.5

base_model = SVC(
    kernel='rbf',
    degree=2,  # only used by the 'poly' kernel, so it has no effect with 'rbf'
    gamma='auto',
    C=0.1,
    class_weight='balanced',
    probability=True,
    random_state=random_state,
)

test_evaluated(
    df, reference_year, drop_year, random_state,
    data_filter, concat_inputs,
    base_model, model_test, optimal_threshold, best_threshold_tpr,
)

df_train shape: (2312, 9)
features_train shape: (2312, 8)
target_train shape: (2312,)
Relatório salvo em: test_result_Support_Vector_Machine_2023_[]_True_False.txt


In [75]:
# XGBoost test run: reference_year=2018, drop_year=[], data_filter=False.
dados = '../data/selected_data.csv'
reference_year, drop_year = 2018, []
data_filter, concat_inputs = False, False
random_state = 0

df = prepared_data_test(dados, data_filter, concat_inputs)

# Label and thresholds handed to test_evaluated.
model_test = "XG_Boost"
optimal_threshold, best_threshold_tpr = 0.6, 0.5

# Class weight is looked up from the project helper using the reference year.
scale_pos_weight_val = dmu.get_scale_pos_weight(reference_year)

# subsample and colsample_bytree are not passed to the estimator here.
base_model = XGBClassifier(
    n_estimators=250,
    learning_rate=0.01,
    max_depth=3,
    gamma=0.2,
    reg_alpha=0.1,
    reg_lambda=2,
    scale_pos_weight=scale_pos_weight_val,
    eval_metric='logloss',
    random_state=random_state,
)

test_evaluated(
    df, reference_year, drop_year, random_state,
    data_filter, concat_inputs,
    base_model, model_test, optimal_threshold, best_threshold_tpr,
)

df_train shape: (497, 9)
features_train shape: (497, 8)
target_train shape: (497,)
Relatório salvo em: test_result_XG_Boost_2018_[]_False_False.txt


In [76]:
# XGBoost test run: reference_year=2018, drop_year=[], data_filter=True.
dados = '../data/selected_data.csv'
reference_year, drop_year = 2018, []
data_filter, concat_inputs = True, False
random_state = 0

df = prepared_data_test(dados, data_filter, concat_inputs)

# Label and thresholds handed to test_evaluated.
model_test = "XG_Boost"
optimal_threshold, best_threshold_tpr = 0.67, 0.5

# Class weight is looked up from the project helper using the reference year.
scale_pos_weight_val = dmu.get_scale_pos_weight(reference_year)

# subsample and colsample_bytree are not passed to the estimator here.
base_model = XGBClassifier(
    n_estimators=100,
    learning_rate=0.2,
    max_depth=3,
    gamma=0.1,
    reg_alpha=1,
    reg_lambda=1.5,
    scale_pos_weight=scale_pos_weight_val,
    eval_metric='logloss',
    random_state=random_state,
)

test_evaluated(
    df, reference_year, drop_year, random_state,
    data_filter, concat_inputs,
    base_model, model_test, optimal_threshold, best_threshold_tpr,
)

df_train shape: (289, 9)
features_train shape: (289, 8)
target_train shape: (289,)
Relatório salvo em: test_result_XG_Boost_2018_[]_True_False.txt


In [77]:
# XGBoost test run: reference_year=2019, drop_year=[], data_filter=False.
dados = '../data/selected_data.csv'
reference_year, drop_year = 2019, []
data_filter, concat_inputs = False, False
random_state = 0

df = prepared_data_test(dados, data_filter, concat_inputs)

# Label and thresholds handed to test_evaluated.
model_test = "XG_Boost"
optimal_threshold, best_threshold_tpr = 0.48, 0.5

# Class weight is looked up from the project helper using the reference year.
scale_pos_weight_val = dmu.get_scale_pos_weight(reference_year)

# subsample and colsample_bytree are not passed to the estimator here.
base_model = XGBClassifier(
    n_estimators=250,
    learning_rate=0.01,
    max_depth=3,
    gamma=0.2,
    reg_alpha=0.1,
    reg_lambda=1,
    scale_pos_weight=scale_pos_weight_val,
    eval_metric='logloss',
    random_state=random_state,
)

test_evaluated(
    df, reference_year, drop_year, random_state,
    data_filter, concat_inputs,
    base_model, model_test, optimal_threshold, best_threshold_tpr,
)

df_train shape: (967, 9)
features_train shape: (967, 8)
target_train shape: (967,)
Relatório salvo em: test_result_XG_Boost_2019_[]_False_False.txt


In [78]:
# XGBoost test run: reference_year=2019, drop_year=[], data_filter=True.
dados = '../data/selected_data.csv'
reference_year, drop_year = 2019, []
data_filter, concat_inputs = True, False
random_state = 0

df = prepared_data_test(dados, data_filter, concat_inputs)

# Label and thresholds handed to test_evaluated.
model_test = "XG_Boost"
optimal_threshold, best_threshold_tpr = 0.65, 0.5

# Class weight is looked up from the project helper using the reference year.
scale_pos_weight_val = dmu.get_scale_pos_weight(reference_year)

# subsample and colsample_bytree are not passed to the estimator here.
base_model = XGBClassifier(
    n_estimators=250,
    learning_rate=0.01,
    max_depth=3,
    gamma=0.1,
    reg_alpha=0,
    reg_lambda=1,
    scale_pos_weight=scale_pos_weight_val,
    eval_metric='logloss',
    random_state=random_state,
)

test_evaluated(
    df, reference_year, drop_year, random_state,
    data_filter, concat_inputs,
    base_model, model_test, optimal_threshold, best_threshold_tpr,
)

df_train shape: (582, 9)
features_train shape: (582, 8)
target_train shape: (582,)
Relatório salvo em: test_result_XG_Boost_2019_[]_True_False.txt


In [79]:
# XGBoost test run: reference_year=2020, drop_year=[], data_filter=False.
dados = '../data/selected_data.csv'
reference_year, drop_year = 2020, []
data_filter, concat_inputs = False, False
random_state = 0

df = prepared_data_test(dados, data_filter, concat_inputs)

# Label and thresholds handed to test_evaluated.
model_test = "XG_Boost"
optimal_threshold, best_threshold_tpr = 0.5, 0.5

# Class weight is looked up from the project helper using the reference year.
scale_pos_weight_val = dmu.get_scale_pos_weight(reference_year)

# subsample and colsample_bytree are not passed to the estimator here.
base_model = XGBClassifier(
    n_estimators=250,
    learning_rate=0.01,
    max_depth=7,
    gamma=0.1,
    reg_alpha=0.1,
    reg_lambda=1,
    scale_pos_weight=scale_pos_weight_val,
    eval_metric='logloss',
    random_state=random_state,
)

test_evaluated(
    df, reference_year, drop_year, random_state,
    data_filter, concat_inputs,
    base_model, model_test, optimal_threshold, best_threshold_tpr,
)

df_train shape: (1550, 9)
features_train shape: (1550, 8)
target_train shape: (1550,)
Relatório salvo em: test_result_XG_Boost_2020_[]_False_False.txt


In [80]:
# XGBoost test run: reference_year=2020, drop_year=[], data_filter=True.
dados = '../data/selected_data.csv'
reference_year, drop_year = 2020, []
data_filter, concat_inputs = True, False
random_state = 0

df = prepared_data_test(dados, data_filter, concat_inputs)

# Label and thresholds handed to test_evaluated.
model_test = "XG_Boost"
optimal_threshold, best_threshold_tpr = 0.72, 0.5

# Class weight is looked up from the project helper using the reference year.
scale_pos_weight_val = dmu.get_scale_pos_weight(reference_year)

# subsample and colsample_bytree are not passed to the estimator here.
base_model = XGBClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=7,
    gamma=0.1,
    reg_alpha=1,
    reg_lambda=1.5,
    scale_pos_weight=scale_pos_weight_val,
    eval_metric='logloss',
    random_state=random_state,
)

test_evaluated(
    df, reference_year, drop_year, random_state,
    data_filter, concat_inputs,
    base_model, model_test, optimal_threshold, best_threshold_tpr,
)

df_train shape: (902, 9)
features_train shape: (902, 8)
target_train shape: (902,)
Relatório salvo em: test_result_XG_Boost_2020_[]_True_False.txt


In [81]:
# XGBoost test run: reference_year=2021, drop_year=[2020], data_filter=False.
dados = '../data/selected_data.csv'
reference_year, drop_year = 2021, [2020]
data_filter, concat_inputs = False, False
random_state = 0

df = prepared_data_test(dados, data_filter, concat_inputs)

# Label and thresholds handed to test_evaluated.
model_test = "XG_Boost"
optimal_threshold, best_threshold_tpr = 0.5, 0.5

# Class weight is looked up from the project helper using the reference year.
scale_pos_weight_val = dmu.get_scale_pos_weight(reference_year)

# subsample and colsample_bytree are not passed to the estimator here.
base_model = XGBClassifier(
    n_estimators=250,
    learning_rate=0.01,
    max_depth=7,
    gamma=0.2,
    reg_alpha=0.1,
    reg_lambda=2,
    scale_pos_weight=scale_pos_weight_val,
    eval_metric='logloss',
    random_state=random_state,
)

test_evaluated(
    df, reference_year, drop_year, random_state,
    data_filter, concat_inputs,
    base_model, model_test, optimal_threshold, best_threshold_tpr,
)

df_train shape: (1550, 9)
features_train shape: (1550, 8)
target_train shape: (1550,)
Relatório salvo em: test_result_XG_Boost_2021_[2020]_False_False.txt


In [82]:
# XGBoost test run: reference_year=2021, drop_year=[2020], data_filter=True.
dados = '../data/selected_data.csv'
reference_year, drop_year = 2021, [2020]
data_filter, concat_inputs = True, False
random_state = 0

df = prepared_data_test(dados, data_filter, concat_inputs)

# Label and thresholds handed to test_evaluated.
model_test = "XG_Boost"
optimal_threshold, best_threshold_tpr = 0.71, 0.5

# Class weight is looked up from the project helper using the reference year.
scale_pos_weight_val = dmu.get_scale_pos_weight(reference_year)

# subsample and colsample_bytree are not passed to the estimator here.
base_model = XGBClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=5,
    gamma=0.2,
    reg_alpha=1,
    reg_lambda=2,
    scale_pos_weight=scale_pos_weight_val,
    eval_metric='logloss',
    random_state=random_state,
)

test_evaluated(
    df, reference_year, drop_year, random_state,
    data_filter, concat_inputs,
    base_model, model_test, optimal_threshold, best_threshold_tpr,
)

df_train shape: (902, 9)
features_train shape: (902, 8)
target_train shape: (902,)
Relatório salvo em: test_result_XG_Boost_2021_[2020]_True_False.txt


In [83]:
# XGBoost test run: reference_year=2021, drop_year=[], data_filter=False.
dados = '../data/selected_data.csv'
reference_year, drop_year = 2021, []
data_filter, concat_inputs = False, False
random_state = 0

df = prepared_data_test(dados, data_filter, concat_inputs)

# Label and thresholds handed to test_evaluated.
model_test = "XG_Boost"
optimal_threshold, best_threshold_tpr = 0.7, 0.5

# Class weight is looked up from the project helper using the reference year.
scale_pos_weight_val = dmu.get_scale_pos_weight(reference_year)

# subsample and colsample_bytree are not passed to the estimator here.
base_model = XGBClassifier(
    n_estimators=100,
    learning_rate=0.01,
    max_depth=5,
    gamma=0.1,
    reg_alpha=1,
    reg_lambda=2,
    scale_pos_weight=scale_pos_weight_val,
    eval_metric='logloss',
    random_state=random_state,
)

test_evaluated(
    df, reference_year, drop_year, random_state,
    data_filter, concat_inputs,
    base_model, model_test, optimal_threshold, best_threshold_tpr,
)

df_train shape: (2636, 9)
features_train shape: (2636, 8)
target_train shape: (2636,)
Relatório salvo em: test_result_XG_Boost_2021_[]_False_False.txt


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [84]:
# XGBoost test run: reference_year=2021, drop_year=[], data_filter=True.
dados = '../data/selected_data.csv'
reference_year, drop_year = 2021, []
data_filter, concat_inputs = True, False
random_state = 0

df = prepared_data_test(dados, data_filter, concat_inputs)

# Label and thresholds handed to test_evaluated.
model_test = "XG_Boost"
optimal_threshold, best_threshold_tpr = 0.79, 0.5

# Class weight is looked up from the project helper using the reference year.
scale_pos_weight_val = dmu.get_scale_pos_weight(reference_year)

# subsample and colsample_bytree are not passed to the estimator here.
base_model = XGBClassifier(
    n_estimators=250,
    learning_rate=0.01,
    max_depth=3,
    gamma=0,
    reg_alpha=1,
    reg_lambda=1,
    scale_pos_weight=scale_pos_weight_val,
    eval_metric='logloss',
    random_state=random_state,
)

test_evaluated(
    df, reference_year, drop_year, random_state,
    data_filter, concat_inputs,
    base_model, model_test, optimal_threshold, best_threshold_tpr,
)

df_train shape: (1175, 9)
features_train shape: (1175, 8)
target_train shape: (1175,)
Relatório salvo em: test_result_XG_Boost_2021_[]_True_False.txt


In [85]:
# XGBoost test run: reference_year=2022, drop_year=[2020], data_filter=False.
dados = '../data/selected_data.csv'
reference_year, drop_year = 2022, [2020]
data_filter, concat_inputs = False, False
random_state = 0

df = prepared_data_test(dados, data_filter, concat_inputs)

# Label and thresholds handed to test_evaluated.
model_test = "XG_Boost"
optimal_threshold, best_threshold_tpr = 0.74, 0.5

# Class weight is looked up from the project helper using the reference year.
scale_pos_weight_val = dmu.get_scale_pos_weight(reference_year)

# subsample and colsample_bytree are not passed to the estimator here.
base_model = XGBClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=3,
    gamma=0.2,
    reg_alpha=0,
    reg_lambda=1.5,
    scale_pos_weight=scale_pos_weight_val,
    eval_metric='logloss',
    random_state=random_state,
)

test_evaluated(
    df, reference_year, drop_year, random_state,
    data_filter, concat_inputs,
    base_model, model_test, optimal_threshold, best_threshold_tpr,
)

df_train shape: (2899, 9)
features_train shape: (2899, 8)
target_train shape: (2899,)
Relatório salvo em: test_result_XG_Boost_2022_[2020]_False_False.txt


In [86]:
# XGBoost test run: reference_year=2022, drop_year=[2020], data_filter=True.
dados = '../data/selected_data.csv'
reference_year, drop_year = 2022, [2020]
data_filter, concat_inputs = True, False
random_state = 0

df = prepared_data_test(dados, data_filter, concat_inputs)

# Label and thresholds handed to test_evaluated.
model_test = "XG_Boost"
optimal_threshold, best_threshold_tpr = 0.5, 0.5

# Class weight is looked up from the project helper using the reference year.
scale_pos_weight_val = dmu.get_scale_pos_weight(reference_year)

# subsample and colsample_bytree are not passed to the estimator here.
base_model = XGBClassifier(
    n_estimators=250,
    learning_rate=0.01,
    max_depth=3,
    gamma=0,
    reg_alpha=1,
    reg_lambda=2,
    scale_pos_weight=scale_pos_weight_val,
    eval_metric='logloss',
    random_state=random_state,
)

test_evaluated(
    df, reference_year, drop_year, random_state,
    data_filter, concat_inputs,
    base_model, model_test, optimal_threshold, best_threshold_tpr,
)

df_train shape: (1436, 9)
features_train shape: (1436, 8)
target_train shape: (1436,)
Relatório salvo em: test_result_XG_Boost_2022_[2020]_True_False.txt


In [87]:
# XGBoost test run: reference_year=2022, drop_year=[], data_filter=False.
dados = '../data/selected_data.csv'
reference_year, drop_year = 2022, []
data_filter, concat_inputs = False, False
random_state = 0

df = prepared_data_test(dados, data_filter, concat_inputs)

# Label and thresholds handed to test_evaluated.
model_test = "XG_Boost"
optimal_threshold, best_threshold_tpr = 0.5, 0.5

# Class weight is looked up from the project helper using the reference year.
scale_pos_weight_val = dmu.get_scale_pos_weight(reference_year)

# subsample and colsample_bytree are not passed to the estimator here.
base_model = XGBClassifier(
    n_estimators=100,
    learning_rate=0.01,
    max_depth=3,
    gamma=0.2,
    reg_alpha=1,
    reg_lambda=2,
    scale_pos_weight=scale_pos_weight_val,
    eval_metric='logloss',
    random_state=random_state,
)

test_evaluated(
    df, reference_year, drop_year, random_state,
    data_filter, concat_inputs,
    base_model, model_test, optimal_threshold, best_threshold_tpr,
)

df_train shape: (3985, 9)
features_train shape: (3985, 8)
target_train shape: (3985,)
Relatório salvo em: test_result_XG_Boost_2022_[]_False_False.txt


In [88]:
# XGBoost test run: reference_year=2022, drop_year=[], data_filter=True.
dados = '../data/selected_data.csv'
reference_year, drop_year = 2022, []
data_filter, concat_inputs = True, False
random_state = 0

df = prepared_data_test(dados, data_filter, concat_inputs)

# Label and thresholds handed to test_evaluated.
model_test = "XG_Boost"
optimal_threshold, best_threshold_tpr = 0.66, 0.5

# Class weight is looked up from the project helper using the reference year.
scale_pos_weight_val = dmu.get_scale_pos_weight(reference_year)

# subsample and colsample_bytree are not passed to the estimator here.
base_model = XGBClassifier(
    n_estimators=100,
    learning_rate=0.01,
    max_depth=3,
    gamma=0,
    reg_alpha=0.1,
    reg_lambda=2,
    scale_pos_weight=scale_pos_weight_val,
    eval_metric='logloss',
    random_state=random_state,
)

test_evaluated(
    df, reference_year, drop_year, random_state,
    data_filter, concat_inputs,
    base_model, model_test, optimal_threshold, best_threshold_tpr,
)

df_train shape: (1709, 9)
features_train shape: (1709, 8)
target_train shape: (1709,)
Relatório salvo em: test_result_XG_Boost_2022_[]_True_False.txt


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [124]:
# XGBoost test run: reference_year=2023, drop_year=[2020], data_filter=False.
dados = '../data/selected_data.csv'
reference_year, drop_year = 2023, [2020]
data_filter, concat_inputs = False, False
random_state = 0

df = prepared_data_test(dados, data_filter, concat_inputs)

# Label and thresholds handed to test_evaluated.
model_test = "XG_Boost"
optimal_threshold, best_threshold_tpr = 0.69, 0.5

# Class weight is looked up from the project helper using the reference year.
scale_pos_weight_val = dmu.get_scale_pos_weight(reference_year)

# subsample and colsample_bytree are not passed to the estimator here.
base_model = XGBClassifier(
    n_estimators=250,
    learning_rate=0.01,
    max_depth=3,
    gamma=0.1,
    reg_alpha=0.1,
    reg_lambda=1,
    scale_pos_weight=scale_pos_weight_val,
    eval_metric='logloss',
    random_state=random_state,
)

test_evaluated(
    df, reference_year, drop_year, random_state,
    data_filter, concat_inputs,
    base_model, model_test, optimal_threshold, best_threshold_tpr,
)

df_train shape: (3806, 9)
features_train shape: (3806, 8)
target_train shape: (3806,)
Relatório salvo em: test_result_XG_Boost_2023_[2020]_False_False.txt


In [90]:
# XGBoost test run: reference_year=2023, drop_year=[2020], data_filter=True.
dados = '../data/selected_data.csv'
reference_year, drop_year = 2023, [2020]
data_filter, concat_inputs = True, False
random_state = 0

df = prepared_data_test(dados, data_filter, concat_inputs)

# Label and thresholds handed to test_evaluated.
model_test = "XG_Boost"
optimal_threshold, best_threshold_tpr = 0.63, 0.5

# Class weight is looked up from the project helper using the reference year.
scale_pos_weight_val = dmu.get_scale_pos_weight(reference_year)

# subsample and colsample_bytree are not passed to the estimator here.
base_model = XGBClassifier(
    n_estimators=100,
    learning_rate=0.01,
    max_depth=3,
    gamma=0.2,
    reg_alpha=0,
    reg_lambda=1.5,
    scale_pos_weight=scale_pos_weight_val,
    eval_metric='logloss',
    random_state=random_state,
)

test_evaluated(
    df, reference_year, drop_year, random_state,
    data_filter, concat_inputs,
    base_model, model_test, optimal_threshold, best_threshold_tpr,
)

df_train shape: (2039, 9)
features_train shape: (2039, 8)
target_train shape: (2039,)
Relatório salvo em: test_result_XG_Boost_2023_[2020]_True_False.txt


In [91]:
# XGBoost test run: reference_year=2023, drop_year=[], data_filter=False.
dados = '../data/selected_data.csv'
reference_year, drop_year = 2023, []
data_filter, concat_inputs = False, False
random_state = 0

df = prepared_data_test(dados, data_filter, concat_inputs)

# Label and thresholds handed to test_evaluated.
model_test = "XG_Boost"
optimal_threshold, best_threshold_tpr = 0.5, 0.5

# Class weight is looked up from the project helper using the reference year.
scale_pos_weight_val = dmu.get_scale_pos_weight(reference_year)

# subsample and colsample_bytree are not passed to the estimator here.
base_model = XGBClassifier(
    n_estimators=250,
    learning_rate=0.01,
    max_depth=3,
    gamma=0,
    reg_alpha=0,
    reg_lambda=1,
    scale_pos_weight=scale_pos_weight_val,
    eval_metric='logloss',
    random_state=random_state,
)

test_evaluated(
    df, reference_year, drop_year, random_state,
    data_filter, concat_inputs,
    base_model, model_test, optimal_threshold, best_threshold_tpr,
)

df_train shape: (4892, 9)
features_train shape: (4892, 8)
target_train shape: (4892,)
Relatório salvo em: test_result_XG_Boost_2023_[]_False_False.txt


In [92]:
# XGBoost test run: reference_year=2023, drop_year=[], data_filter=True.
dados = '../data/selected_data.csv'
reference_year, drop_year = 2023, []
data_filter, concat_inputs = True, False
random_state = 0

df = prepared_data_test(dados, data_filter, concat_inputs)

# Label and thresholds handed to test_evaluated.
model_test = "XG_Boost"
optimal_threshold, best_threshold_tpr = 0.5, 0.5

# Class weight is looked up from the project helper using the reference year.
scale_pos_weight_val = dmu.get_scale_pos_weight(reference_year)

# subsample and colsample_bytree are not passed to the estimator here.
base_model = XGBClassifier(
    n_estimators=100,
    learning_rate=0.01,
    max_depth=5,
    gamma=0.2,
    reg_alpha=1,
    reg_lambda=1.5,
    scale_pos_weight=scale_pos_weight_val,
    eval_metric='logloss',
    random_state=random_state,
)

test_evaluated(
    df, reference_year, drop_year, random_state,
    data_filter, concat_inputs,
    base_model, model_test, optimal_threshold, best_threshold_tpr,
)

df_train shape: (2312, 9)
features_train shape: (2312, 8)
target_train shape: (2312,)
Relatório salvo em: test_result_XG_Boost_2023_[]_True_False.txt


In [93]:
# KNN test run: reference_year=2018, drop_year=[], data_filter=False.
dados = '../data/selected_data.csv'
reference_year, drop_year = 2018, []
data_filter, concat_inputs = False, False
random_state = 0

df = prepared_data_test(dados, data_filter, concat_inputs)

# Label and thresholds handed to test_evaluated.
model_test = "KNN"
optimal_threshold, best_threshold_tpr = 0.57, 0.5

# Base estimator; p is the Minkowski power parameter (p=1 matches Manhattan distance).
base_model = KNeighborsClassifier(
    n_neighbors=7,
    weights='uniform',
    metric='manhattan',
    p=1,
)

test_evaluated(
    df, reference_year, drop_year, random_state,
    data_filter, concat_inputs,
    base_model, model_test, optimal_threshold, best_threshold_tpr,
)

df_train shape: (497, 9)
features_train shape: (497, 8)
target_train shape: (497,)
Relatório salvo em: test_result_KNN_2018_[]_False_False.txt


found 0 physical cores < 1
  File "d:\Dropbox\Documentos\IFPB\Pesquisa\Taxa de Evasão\Dropout Rate Project\dropout_rate_project\.venv\Lib\site-packages\joblib\externals\loky\backend\context.py", line 282, in _count_physical_cores
    raise ValueError(f"found {cpu_count_physical} physical cores < 1")


In [94]:
# KNN run: reference year 2018, drop_year empty, data_filter on.
dados = "../data/selected_data.csv"
reference_year, drop_year = 2018, []
data_filter, concat_inputs = True, False
random_state = 0

# Build the dataframe for this configuration.
df = prepared_data_test(dados, data_filter, concat_inputs)

# KNN with fixed hyperparameters for this run.
base_model = KNeighborsClassifier(
    n_neighbors=7,
    metric="manhattan",
    p=1,
    weights="uniform",
)

# Report label and the threshold values forwarded to the evaluation helper.
model_test = "KNN"
optimal_threshold, best_threshold_tpr = 0.57, 0.5

test_evaluated(
    df, reference_year, drop_year, random_state,
    data_filter, concat_inputs,
    base_model, model_test, optimal_threshold, best_threshold_tpr,
)

df_train shape: (289, 9)
features_train shape: (289, 8)
target_train shape: (289,)
Relatório salvo em: test_result_KNN_2018_[]_True_False.txt


In [95]:
# KNN run: reference year 2019, drop_year empty, data_filter off.
dados = "../data/selected_data.csv"
reference_year, drop_year = 2019, []
data_filter, concat_inputs = False, False
random_state = 0

# Build the dataframe for this configuration.
df = prepared_data_test(dados, data_filter, concat_inputs)

# KNN with fixed hyperparameters for this run.
base_model = KNeighborsClassifier(
    n_neighbors=7,
    metric="manhattan",
    p=1,
    weights="uniform",
)

# Report label and the threshold values forwarded to the evaluation helper.
# NOTE(review): the recorded output of this run includes a scikit-learn
# `_warn_prf` warning (raised for ill-defined precision/recall/F-score);
# check whether this threshold leaves a class without predictions.
model_test = "KNN"
optimal_threshold, best_threshold_tpr = 0.86, 0.5

test_evaluated(
    df, reference_year, drop_year, random_state,
    data_filter, concat_inputs,
    base_model, model_test, optimal_threshold, best_threshold_tpr,
)

df_train shape: (967, 9)
features_train shape: (967, 8)
target_train shape: (967,)
Relatório salvo em: test_result_KNN_2019_[]_False_False.txt


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [96]:
# KNN run: reference year 2019, drop_year empty, data_filter on.
dados = "../data/selected_data.csv"
reference_year, drop_year = 2019, []
data_filter, concat_inputs = True, False
random_state = 0

# Build the dataframe for this configuration.
df = prepared_data_test(dados, data_filter, concat_inputs)

# KNN with fixed hyperparameters for this run.
# `p` is deliberately not passed: scikit-learn only uses it when
# metric='minkowski', so p=1 had no effect next to metric='euclidean' and
# wrongly suggested a Manhattan distance.
base_model = KNeighborsClassifier(
    n_neighbors=5,
    metric="euclidean",
    weights="uniform",
)

# Report label and the threshold values forwarded to the evaluation helper.
model_test = "KNN"
optimal_threshold, best_threshold_tpr = 0.6, 0.5

test_evaluated(
    df, reference_year, drop_year, random_state,
    data_filter, concat_inputs,
    base_model, model_test, optimal_threshold, best_threshold_tpr,
)

df_train shape: (582, 9)
features_train shape: (582, 8)
target_train shape: (582,)
Relatório salvo em: test_result_KNN_2019_[]_True_False.txt


In [97]:
# KNN run: reference year 2020, drop_year empty, data_filter off.
dados = "../data/selected_data.csv"
reference_year, drop_year = 2020, []
data_filter, concat_inputs = False, False
random_state = 0

# Build the dataframe for this configuration.
df = prepared_data_test(dados, data_filter, concat_inputs)

# KNN with fixed hyperparameters for this run.
base_model = KNeighborsClassifier(
    n_neighbors=7,
    metric="manhattan",
    p=1,
    weights="uniform",
)

# Report label and the threshold values forwarded to the evaluation helper.
model_test = "KNN"
optimal_threshold, best_threshold_tpr = 0.5, 0.5

test_evaluated(
    df, reference_year, drop_year, random_state,
    data_filter, concat_inputs,
    base_model, model_test, optimal_threshold, best_threshold_tpr,
)

df_train shape: (1550, 9)
features_train shape: (1550, 8)
target_train shape: (1550,)
Relatório salvo em: test_result_KNN_2020_[]_False_False.txt


In [98]:
# KNN run: reference year 2020, drop_year empty, data_filter on.
dados = "../data/selected_data.csv"
reference_year, drop_year = 2020, []
data_filter, concat_inputs = True, False
random_state = 0

# Build the dataframe for this configuration.
df = prepared_data_test(dados, data_filter, concat_inputs)

# KNN with fixed hyperparameters for this run.
base_model = KNeighborsClassifier(
    n_neighbors=7,
    metric="manhattan",
    p=1,
    weights="uniform",
)

# Report label and the threshold values forwarded to the evaluation helper.
model_test = "KNN"
optimal_threshold, best_threshold_tpr = 0.71, 0.5

test_evaluated(
    df, reference_year, drop_year, random_state,
    data_filter, concat_inputs,
    base_model, model_test, optimal_threshold, best_threshold_tpr,
)

df_train shape: (902, 9)
features_train shape: (902, 8)
target_train shape: (902,)
Relatório salvo em: test_result_KNN_2020_[]_True_False.txt


In [99]:
# KNN run: reference year 2021, drop_year [2020], data_filter off.
dados = "../data/selected_data.csv"
reference_year, drop_year = 2021, [2020]
data_filter, concat_inputs = False, False
random_state = 0

# Build the dataframe for this configuration.
df = prepared_data_test(dados, data_filter, concat_inputs)

# KNN with fixed hyperparameters for this run.
base_model = KNeighborsClassifier(
    n_neighbors=7,
    metric="manhattan",
    p=1,
    weights="uniform",
)

# Report label and the threshold values forwarded to the evaluation helper.
model_test = "KNN"
optimal_threshold, best_threshold_tpr = 0.5, 0.5

test_evaluated(
    df, reference_year, drop_year, random_state,
    data_filter, concat_inputs,
    base_model, model_test, optimal_threshold, best_threshold_tpr,
)

df_train shape: (1550, 9)
features_train shape: (1550, 8)
target_train shape: (1550,)
Relatório salvo em: test_result_KNN_2021_[2020]_False_False.txt


In [100]:
# KNN run: reference year 2021, drop_year [2020], data_filter on.
dados = "../data/selected_data.csv"
reference_year, drop_year = 2021, [2020]
data_filter, concat_inputs = True, False
random_state = 0

# Build the dataframe for this configuration.
df = prepared_data_test(dados, data_filter, concat_inputs)

# KNN with fixed hyperparameters for this run.
base_model = KNeighborsClassifier(
    n_neighbors=7,
    metric="manhattan",
    p=1,
    weights="uniform",
)

# Report label and the threshold values forwarded to the evaluation helper.
model_test = "KNN"
optimal_threshold, best_threshold_tpr = 0.71, 0.5

test_evaluated(
    df, reference_year, drop_year, random_state,
    data_filter, concat_inputs,
    base_model, model_test, optimal_threshold, best_threshold_tpr,
)

df_train shape: (902, 9)
features_train shape: (902, 8)
target_train shape: (902,)
Relatório salvo em: test_result_KNN_2021_[2020]_True_False.txt


In [101]:
# KNN run: reference year 2021, drop_year empty, data_filter off.
dados = "../data/selected_data.csv"
reference_year, drop_year = 2021, []
data_filter, concat_inputs = False, False
random_state = 0

# Build the dataframe for this configuration.
df = prepared_data_test(dados, data_filter, concat_inputs)

# KNN with fixed hyperparameters for this run.
# `p` is deliberately not passed: scikit-learn only uses it when
# metric='minkowski', so p=1 had no effect next to metric='euclidean' and
# wrongly suggested a Manhattan distance.
base_model = KNeighborsClassifier(
    n_neighbors=9,
    metric="euclidean",
    weights="uniform",
)

# Report label and the threshold values forwarded to the evaluation helper.
model_test = "KNN"
optimal_threshold, best_threshold_tpr = 0.5, 0.5

test_evaluated(
    df, reference_year, drop_year, random_state,
    data_filter, concat_inputs,
    base_model, model_test, optimal_threshold, best_threshold_tpr,
)

df_train shape: (2636, 9)
features_train shape: (2636, 8)
target_train shape: (2636,)
Relatório salvo em: test_result_KNN_2021_[]_False_False.txt


In [102]:
# KNN run: reference year 2021, drop_year empty, data_filter on.
dados = "../data/selected_data.csv"
reference_year, drop_year = 2021, []
data_filter, concat_inputs = True, False
random_state = 0

# Build the dataframe for this configuration.
df = prepared_data_test(dados, data_filter, concat_inputs)

# KNN with fixed hyperparameters for this run.
base_model = KNeighborsClassifier(
    n_neighbors=5,
    metric="manhattan",
    p=1,
    weights="uniform",
)

# Report label and the threshold values forwarded to the evaluation helper.
model_test = "KNN"
optimal_threshold, best_threshold_tpr = 0.08, 0.5

test_evaluated(
    df, reference_year, drop_year, random_state,
    data_filter, concat_inputs,
    base_model, model_test, optimal_threshold, best_threshold_tpr,
)

df_train shape: (1175, 9)
features_train shape: (1175, 8)
target_train shape: (1175,)
Relatório salvo em: test_result_KNN_2021_[]_True_False.txt


In [103]:
# KNN run: reference year 2022, drop_year [2020], data_filter off.
dados = "../data/selected_data.csv"
reference_year, drop_year = 2022, [2020]
data_filter, concat_inputs = False, False
random_state = 0

# Build the dataframe for this configuration.
df = prepared_data_test(dados, data_filter, concat_inputs)

# KNN with fixed hyperparameters for this run.
base_model = KNeighborsClassifier(
    n_neighbors=9,
    metric="manhattan",
    p=1,
    weights="uniform",
)

# Report label and the threshold values forwarded to the evaluation helper.
model_test = "KNN"
optimal_threshold, best_threshold_tpr = 0.67, 0.5

test_evaluated(
    df, reference_year, drop_year, random_state,
    data_filter, concat_inputs,
    base_model, model_test, optimal_threshold, best_threshold_tpr,
)

df_train shape: (2899, 9)
features_train shape: (2899, 8)
target_train shape: (2899,)
Relatório salvo em: test_result_KNN_2022_[2020]_False_False.txt


In [104]:
# KNN run: reference year 2022, drop_year [2020], data_filter on.
dados = "../data/selected_data.csv"
reference_year, drop_year = 2022, [2020]
data_filter, concat_inputs = True, False
random_state = 0

# Build the dataframe for this configuration.
df = prepared_data_test(dados, data_filter, concat_inputs)

# KNN with fixed hyperparameters for this run.
# `p` is deliberately not passed: scikit-learn only uses it when
# metric='minkowski', so p=1 had no effect next to metric='euclidean' and
# wrongly suggested a Manhattan distance.
base_model = KNeighborsClassifier(
    n_neighbors=9,
    metric="euclidean",
    weights="uniform",
)

# Report label and the threshold values forwarded to the evaluation helper.
model_test = "KNN"
optimal_threshold, best_threshold_tpr = 0.67, 0.5

test_evaluated(
    df, reference_year, drop_year, random_state,
    data_filter, concat_inputs,
    base_model, model_test, optimal_threshold, best_threshold_tpr,
)

df_train shape: (1436, 9)
features_train shape: (1436, 8)
target_train shape: (1436,)
Relatório salvo em: test_result_KNN_2022_[2020]_True_False.txt


In [105]:
# KNN run: reference year 2022, drop_year empty, data_filter off.
dados = "../data/selected_data.csv"
reference_year, drop_year = 2022, []
data_filter, concat_inputs = False, False
random_state = 0

# Build the dataframe for this configuration.
df = prepared_data_test(dados, data_filter, concat_inputs)

# KNN with fixed hyperparameters for this run.
# `p` is deliberately not passed: scikit-learn only uses it when
# metric='minkowski', so p=1 had no effect next to metric='euclidean' and
# wrongly suggested a Manhattan distance.
base_model = KNeighborsClassifier(
    n_neighbors=9,
    metric="euclidean",
    weights="uniform",
)

# Report label and the threshold values forwarded to the evaluation helper.
model_test = "KNN"
optimal_threshold, best_threshold_tpr = 0.5, 0.5

test_evaluated(
    df, reference_year, drop_year, random_state,
    data_filter, concat_inputs,
    base_model, model_test, optimal_threshold, best_threshold_tpr,
)

df_train shape: (3985, 9)
features_train shape: (3985, 8)
target_train shape: (3985,)
Relatório salvo em: test_result_KNN_2022_[]_False_False.txt


In [106]:
# KNN run: reference year 2022, drop_year empty, data_filter on.
dados = "../data/selected_data.csv"
reference_year, drop_year = 2022, []
data_filter, concat_inputs = True, False
random_state = 0

# Build the dataframe for this configuration.
df = prepared_data_test(dados, data_filter, concat_inputs)

# KNN with fixed hyperparameters for this run.
# `p` is deliberately not passed: scikit-learn only uses it when
# metric='minkowski', so p=1 had no effect next to metric='euclidean' and
# wrongly suggested a Manhattan distance.
base_model = KNeighborsClassifier(
    n_neighbors=9,
    metric="euclidean",
    weights="uniform",
)

# Report label and the threshold values forwarded to the evaluation helper.
model_test = "KNN"
optimal_threshold, best_threshold_tpr = 0.89, 0.5

test_evaluated(
    df, reference_year, drop_year, random_state,
    data_filter, concat_inputs,
    base_model, model_test, optimal_threshold, best_threshold_tpr,
)

df_train shape: (1709, 9)
features_train shape: (1709, 8)
target_train shape: (1709,)
Relatório salvo em: test_result_KNN_2022_[]_True_False.txt


In [126]:
# KNN run: reference year 2023, drop_year [2020], data_filter off.
dados = "../data/selected_data.csv"
reference_year, drop_year = 2023, [2020]
data_filter, concat_inputs = False, False
random_state = 0

# Build the dataframe for this configuration.
df = prepared_data_test(dados, data_filter, concat_inputs)

# KNN with fixed hyperparameters for this run.
base_model = KNeighborsClassifier(
    n_neighbors=7,
    metric="manhattan",
    p=1,
    weights="uniform",
)

# Report label and the threshold values forwarded to the evaluation helper.
model_test = "KNN"
optimal_threshold, best_threshold_tpr = 0.5, 0.5

test_evaluated(
    df, reference_year, drop_year, random_state,
    data_filter, concat_inputs,
    base_model, model_test, optimal_threshold, best_threshold_tpr,
)

df_train shape: (3806, 9)
features_train shape: (3806, 8)
target_train shape: (3806,)
Relatório salvo em: test_result_KNN_2023_[2020]_False_False.txt


In [108]:
# KNN run: reference year 2023, drop_year [2020], data_filter on.
dados = "../data/selected_data.csv"
reference_year, drop_year = 2023, [2020]
data_filter, concat_inputs = True, False
random_state = 0

# Build the dataframe for this configuration.
df = prepared_data_test(dados, data_filter, concat_inputs)

# KNN with fixed hyperparameters for this run.
# `p` is deliberately not passed: scikit-learn only uses it when
# metric='minkowski', so p=1 had no effect next to metric='euclidean' and
# wrongly suggested a Manhattan distance.
base_model = KNeighborsClassifier(
    n_neighbors=9,
    metric="euclidean",
    weights="uniform",
)

# Report label and the threshold values forwarded to the evaluation helper.
model_test = "KNN"
optimal_threshold, best_threshold_tpr = 0.78, 0.5

test_evaluated(
    df, reference_year, drop_year, random_state,
    data_filter, concat_inputs,
    base_model, model_test, optimal_threshold, best_threshold_tpr,
)

df_train shape: (2039, 9)
features_train shape: (2039, 8)
target_train shape: (2039,)
Relatório salvo em: test_result_KNN_2023_[2020]_True_False.txt


In [109]:
# KNN run: reference year 2023, drop_year empty, data_filter off.
dados = "../data/selected_data.csv"
reference_year, drop_year = 2023, []
data_filter, concat_inputs = False, False
random_state = 0

# Build the dataframe for this configuration.
df = prepared_data_test(dados, data_filter, concat_inputs)

# KNN with fixed hyperparameters for this run.
# `p` is deliberately not passed: scikit-learn only uses it when
# metric='minkowski', so p=1 had no effect next to metric='euclidean' and
# wrongly suggested a Manhattan distance.
base_model = KNeighborsClassifier(
    n_neighbors=9,
    metric="euclidean",
    weights="uniform",
)

# Report label and the threshold values forwarded to the evaluation helper.
model_test = "KNN"
optimal_threshold, best_threshold_tpr = 0.5, 0.5

test_evaluated(
    df, reference_year, drop_year, random_state,
    data_filter, concat_inputs,
    base_model, model_test, optimal_threshold, best_threshold_tpr,
)

df_train shape: (4892, 9)
features_train shape: (4892, 8)
target_train shape: (4892,)
Relatório salvo em: test_result_KNN_2023_[]_False_False.txt


In [110]:
# KNN run: reference year 2023, drop_year empty, data_filter on.
dados = "../data/selected_data.csv"
reference_year, drop_year = 2023, []
data_filter, concat_inputs = True, False
random_state = 0

# Build the dataframe for this configuration.
df = prepared_data_test(dados, data_filter, concat_inputs)

# KNN with fixed hyperparameters for this run.
base_model = KNeighborsClassifier(
    n_neighbors=7,
    metric="manhattan",
    p=1,
    weights="uniform",
)

# Report label and the threshold values forwarded to the evaluation helper.
model_test = "KNN"
optimal_threshold, best_threshold_tpr = 0.86, 0.5

test_evaluated(
    df, reference_year, drop_year, random_state,
    data_filter, concat_inputs,
    base_model, model_test, optimal_threshold, best_threshold_tpr,
)

df_train shape: (2312, 9)
features_train shape: (2312, 8)
target_train shape: (2312,)
Relatório salvo em: test_result_KNN_2023_[]_True_False.txt
