In [2]:
import json
import os

import pandas as pd
from pycaret.classification import setup, compare_models, save_model, pull, predict_model

# Load the run configuration from config.json in the current working directory.
# The file is opened as UTF-8 explicitly so that non-ASCII file or column names
# decode the same way on every platform instead of depending on the locale default.
try:
    with open('config.json', 'r', encoding='utf-8') as f:
        config = json.load(f)
except FileNotFoundError:
    # Tell the user what is missing, then re-raise so execution stops here.
    print("Arquivo config.json não encontrado.")
    raise
except json.JSONDecodeError as e:
    # Malformed JSON is reported in the same print-then-re-raise style as a
    # missing file, so the user sees where the file is broken.
    print(f"Arquivo config.json inválido: {e}")
    raise

# Build the dataset path from the configured file name and load it with the
# pandas reader that matches the file extension (CSV or XLSX).
dataset_path = f'./datasets/{config["file"]}'
# Compare the extension case-insensitively so names such as "DATA.CSV" or
# "file.XLSX" are accepted; the original-case path is still used for reading.
dataset_path_lower = dataset_path.lower()
try:
    if dataset_path_lower.endswith('.csv'):
        df = pd.read_csv(dataset_path)
    elif dataset_path_lower.endswith('.xlsx'):
        df = pd.read_excel(dataset_path)
    else:
        raise ValueError("Formato de arquivo não suportado. Use CSV ou XLSX.")
except Exception as e:
    # Any load failure (including the unsupported-format error raised above)
    # is printed for the user and then re-raised unchanged.
    print(f"Erro ao carregar o dataset: {e}")
    raise

# Sanity checks on the loaded dataframe before it is handed to PyCaret.
# Each check raises ValueError with a user-facing message on failure.

# Reject a dataframe that has no columns or no data.
if len(df.columns) == 0 or df.empty:
    raise ValueError("Erro: O dataset está vazio ou sem colunas.")

# The configured target must be one of the dataset's columns.
if not (config["target"] in df.columns):
    raise ValueError(f"A coluna alvo '{config['target']}' não está presente no dataset.")

# The target column must not contain any missing values.
if df[config["target"]].isna().any():
    raise ValueError(f"A coluna alvo '{config['target']}' contém valores nulos. Por favor, limpe ou preencha esses valores.")

# Resolve the optional settings from the config and initialise the PyCaret
# experiment.
#
# A companion value (normalisation method, multicollinearity threshold, fold
# count) is only read when the option it belongs to is enabled; otherwise it
# stays None.
normalize = config.get("normalize", False)
normalize_method = config.get("normalization_method") if normalize else None

remove_multicollinearity = config.get("remove_multicollinearity", False)
multicollinearity_threshold = (
    config.get("multicollinearity_threshold") if remove_multicollinearity else None
)

# An empty or missing fold strategy is treated as "not configured".
fold_strategy = config.get("fold_strategy") or None
# The fold count is ignored unless a fold strategy was chosen.
fold_number = config.get("fold_number", 10) if fold_strategy else None

# The training fraction is derived from test_size, so it must be a proper
# fraction; fail early with a clear message instead of inside setup().
test_size = config["test_size"]
if not 0 < test_size < 1:
    raise ValueError("O parâmetro 'test_size' deve estar entre 0 e 1 (exclusivo).")

# Forward only the optional arguments that actually have a value. Passing an
# explicit None would override the default declared by setup() (for example
# when "normalize" is enabled but no method is configured); omitting the
# argument lets setup() fall back to its own default instead.
optional_setup_args = {
    "normalize_method": normalize_method,
    "multicollinearity_threshold": multicollinearity_threshold,
    "fold_strategy": fold_strategy,
    "fold": fold_number,
}
setup_kwargs = {
    name: value for name, value in optional_setup_args.items() if value is not None
}

# PyCaret setup
clf = setup(
    data=df,
    target=config["target"],
    session_id=config["session_id"],
    normalize=normalize,
    train_size=1 - test_size,
    remove_multicollinearity=remove_multicollinearity,
    **setup_kwargs,
)

# Train and compare the candidate models; the result is kept as best_model.
best_model = compare_models()

# Normalise the result to a list of candidates: a list is used as-is, None
# becomes an empty list, and a single estimator becomes a one-element list.
if isinstance(best_model, list):
    candidate_models = best_model
else:
    candidate_models = [] if best_model is None else [best_model]

# Nothing came back (None or an empty list): stop with an explicit error.
if not candidate_models:
    raise ValueError("Nenhum modelo foi comparado ou retornado. Verifique as configurações.")

# Keep the first candidate as the best model.
best_model = candidate_models[0]

# Persist the training artefacts (model, results grid, predictions and
# hyperparameters) under ./models.

# Create the output directory first so the writes below do not fail when the
# folder does not exist yet (e.g. on a fresh checkout).
os.makedirs('./models', exist_ok=True)

# Save the best model.
save_model(best_model, './models/best_model')

# Fetch the results grid and save it as CSV.
# NOTE(review): pull() presumably returns the most recently displayed grid, so
# it has to stay before predict_model(), which shows its own metrics table.
results = pull()
results.to_csv('./models/results.csv', index=False)

# Run predict_model on the best model and save the predictions. No `data`
# argument is passed here.
# NOTE(review): per the PyCaret docs this scores the hold-out split created by
# setup(), not the full dataset - confirm this is the intended output.
predictions = predict_model(best_model)
predictions.to_csv('./models/predictions.csv', index=False)

# Save the best model's hyperparameters as JSON.
# default=str stringifies values the JSON encoder cannot handle (for example a
# nested estimator object), instead of raising TypeError and leaving a
# half-written file behind.
with open('./models/best_model_params.json', 'w', encoding='utf-8') as f:
    json.dump(best_model.get_params(), f, default=str)

# Record the class name of the best model in config.json under "model_name".

# Re-read the file so the update is applied on top of its current on-disk contents.
with open('config.json', 'r') as config_in:
    config = json.load(config_in)

# The model name is the class name of the selected estimator.
model_class_name = best_model.__class__.__name__
config.update({'model_name': model_class_name})

# Write the updated configuration back to the same file.
with open('config.json', 'w') as config_out:
    json.dump(config, config_out)

print("Treinamento e predições completos. Resultados foram salvos.")


Unnamed: 0,Description,Value
0,Session id,1245
1,Target,sex
2,Target type,Binary
3,Target mapping,"Female: 0, Male: 1"
4,Original data shape,"(333, 7)"
5,Transformed data shape,"(333, 11)"
6,Transformed train set shape,"(266, 11)"
7,Transformed test set shape,"(67, 11)"
8,Numeric features,4
9,Categorical features,2


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lda,Linear Discriminant Analysis,0.9135,0.9651,0.9135,0.9181,0.9133,0.8271,0.8316,0.04
ridge,Ridge Classifier,0.906,0.9648,0.906,0.9118,0.9057,0.8121,0.8178,0.475
lr,Logistic Regression,0.8947,0.9655,0.8947,0.8992,0.8944,0.7895,0.7939,0.82
svm,SVM - Linear Kernel,0.891,0.9623,0.891,0.8984,0.8904,0.7821,0.7893,0.02
knn,K Neighbors Classifier,0.8835,0.9474,0.8835,0.8934,0.8827,0.767,0.7768,0.44
gbc,Gradient Boosting Classifier,0.8722,0.9346,0.8722,0.8771,0.8717,0.7446,0.7494,0.055
rf,Random Forest Classifier,0.8647,0.9391,0.8647,0.8706,0.864,0.7295,0.7353,0.09
et,Extra Trees Classifier,0.8571,0.9557,0.8571,0.8656,0.8563,0.7144,0.7227,0.065
ada,Ada Boost Classifier,0.8421,0.9312,0.8421,0.8491,0.841,0.6845,0.6913,0.045
lightgbm,Light Gradient Boosting Machine,0.8271,0.9227,0.8271,0.8383,0.8251,0.6545,0.6653,0.075


Transformation Pipeline and Model Successfully Saved


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Linear Discriminant Analysis,0.8955,0.9688,0.8955,0.8958,0.8955,0.7909,0.7913


Treinamento e predições completos. Resultados foram salvos.


In [26]:
pip install pycaret


[notice] A new release of pip is available: 23.2.1 -> 24.2
[notice] To update, run: python3.10 -m pip install --upgrade pip
Note: you may need to restart the kernel to use updated packages.
