In [1]:
import json
import os

import pandas as pd
from pycaret.classification import setup, compare_models, save_model, pull, predict_model

# Load the run settings from config.json.
# The file is decoded explicitly as UTF-8 so that non-ASCII values (for
# example accented column names) are read the same way on every platform
# instead of depending on the locale's default encoding.
try:
    with open('config.json', 'r', encoding='utf-8') as f:
        config = json.load(f)
except FileNotFoundError:
    # Tell the user what is missing, then let the original error propagate.
    print("Arquivo config.json não encontrado.")
    raise

# Load the dataset named by config["file"] from ./datasets, choosing the
# pandas reader from the file extension.
# The extension is compared case-insensitively so that names such as
# "DATA.CSV" or "report.XLSX" are accepted as well as lowercase ones.
dataset_path = f'./datasets/{config["file"]}'
try:
    lowered_path = dataset_path.lower()
    if lowered_path.endswith('.csv'):
        df = pd.read_csv(dataset_path)
    elif lowered_path.endswith('.xlsx'):
        df = pd.read_excel(dataset_path)
    else:
        raise ValueError("Formato de arquivo não suportado. Use CSV ou XLSX.")
except Exception as e:
    # Report the failure (including the unsupported-format case raised above)
    # and re-raise so the cell stops instead of continuing without data.
    print(f"Erro ao carregar o dataset: {e}")
    raise

# Sanity checks on the loaded frame before it is handed to PyCaret.

# A zero-length axis means there are no rows or no columns to work with.
if 0 in df.shape:
    raise ValueError("Erro: O dataset está vazio ou sem colunas.")

target_column = config["target"]

# The configured target must be one of the dataset's columns.
if target_column not in df.columns:
    raise ValueError(f"A coluna alvo '{target_column}' não está presente no dataset.")

# Rows with a missing target value are rejected up front.
if df[target_column].isna().any():
    raise ValueError(f"A coluna alvo '{target_column}' contém valores nulos. Por favor, limpe ou preencha esses valores.")

# Resolve the optional experiment settings from the config.
# A dependent value is only looked up when its switch is enabled; otherwise
# it stays None, which here means "not configured".
normalize = config.get("normalize", False)
normalize_method = config.get("normalization_method") if normalize else None

remove_multicollinearity = config.get("remove_multicollinearity", False)
multicollinearity_threshold = (
    config.get("multicollinearity_threshold") if remove_multicollinearity else None
)

# A missing or empty fold strategy counts as "not selected"; the fold count is
# ignored in that case, as before.
fold_strategy = config.get("fold_strategy") or None
fold_number = config.get("fold_number", 10) if fold_strategy else None

# Settings that may legitimately be absent. They are forwarded to setup() only
# when a value exists, so PyCaret applies its own documented defaults instead
# of receiving an explicit None for a parameter that expects a string/number.
optional_settings = {
    "normalize_method": normalize_method,
    "multicollinearity_threshold": multicollinearity_threshold,
    "fold_strategy": fold_strategy,
    "fold": fold_number,
}

# Initialise the PyCaret classification experiment.
# config["session_id"] and config["test_size"] are required keys; the train
# fraction is the complement of the configured test fraction.
clf = setup(
    data=df,
    target=config["target"],
    session_id=config["session_id"],
    normalize=normalize,
    train_size=1 - config["test_size"],
    remove_multicollinearity=remove_multicollinearity,
    **{name: value for name, value in optional_settings.items() if value is not None},
)

# Train and rank the candidate models.
best_model = compare_models()

# compare_models() may hand back a single estimator or a list of them;
# remember which, so both the emptiness check and the unwrapping agree.
returned_list = isinstance(best_model, list)

# Stop if nothing usable came back (None or an empty list).
if best_model is None or (returned_list and not best_model):
    raise ValueError("Nenhum modelo foi comparado ou retornado. Verifique as configurações.")

# When a list was returned, keep its first entry as the best model.
if returned_list:
    best_model = best_model[0]

# Persist the training artefacts under ./models.
# The directory is created first so a fresh checkout does not fail on the
# first write.
os.makedirs('./models', exist_ok=True)

# Save the best model (PyCaret reports saving the transformation pipeline
# together with the model).
save_model(best_model, './models/best_model')

# Fetch the latest score grid and store it as CSV.
# NOTE(review): this is expected to be the compare_models() leaderboard; keep
# this call ahead of predict_model(), which displays a metrics grid of its own.
results = pull()
results.to_csv('./models/results.csv', index=False)

# Generate predictions with the best model and store them.
# NOTE(review): no data argument is passed, so per the PyCaret docs this should
# score the hold-out split created by setup() — confirm that is intended.
predictions = predict_model(best_model)
predictions.to_csv('./models/predictions.csv', index=False)

# Store the best model's hyper-parameters.
# get_params() can contain values json cannot encode natively (nested
# estimators, NumPy integers, callables); default=str writes their string form
# instead of aborting with a TypeError after the other artefacts were saved.
with open('./models/best_model_params.json', 'w', encoding='utf-8') as f:
    json.dump(best_model.get_params(), f, default=str)

# Record the winning model's class name in config.json.
# The file is re-read first so the update is applied on top of what is
# currently on disk. It is decoded explicitly as UTF-8 so non-ASCII values are
# not mangled by a platform-specific default encoding before being written back.
with open('config.json', 'r', encoding='utf-8') as f:
    config = json.load(f)

config['model_name'] = type(best_model).__name__  # class name of the selected estimator

# json.dump keeps its default ASCII escaping, so the written file stays
# readable under any encoding.
with open('config.json', 'w', encoding='utf-8') as f:
    json.dump(config, f)

print("Treinamento e predições completos. Resultados foram salvos.")


Unnamed: 0,Description,Value
0,Session id,1245
1,Target,species
2,Target type,Multiclass
3,Target mapping,"Adelie: 0, Chinstrap: 1, Gentoo: 2"
4,Original data shape,"(333, 7)"
5,Transformed data shape,"(333, 9)"
6,Transformed train set shape,"(233, 9)"
7,Transformed test set shape,"(100, 9)"
8,Numeric features,4
9,Categorical features,2


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
knn,K Neighbors Classifier,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.22
ridge,Ridge Classifier,1.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0125
lda,Linear Discriminant Analysis,1.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0125
lr,Logistic Regression,0.9957,0.0,0.9957,0.9959,0.9956,0.9932,0.9933,0.33
svm,SVM - Linear Kernel,0.9914,0.0,0.9914,0.9919,0.9914,0.9865,0.9868,0.205
et,Extra Trees Classifier,0.9914,0.9998,0.9914,0.992,0.9912,0.9864,0.9868,0.0325
rf,Random Forest Classifier,0.9828,0.9987,0.9828,0.9831,0.9827,0.9731,0.9733,0.0375
lightgbm,Light Gradient Boosting Machine,0.9784,0.9996,0.9784,0.9787,0.9782,0.9661,0.9665,0.185
gbc,Gradient Boosting Classifier,0.9742,0.0,0.9742,0.9745,0.9741,0.9596,0.9598,0.04
dt,Decision Tree Classifier,0.9399,0.9522,0.9399,0.9409,0.9398,0.9059,0.9064,0.0125


Transformation Pipeline and Model Successfully Saved


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,K Neighbors Classifier,0.98,0.9991,0.98,0.98,0.98,0.9686,0.9686


Treinamento e predições completos. Resultados foram salvos.


In [2]:
pip install pycaret











Note: you may need to restart the kernel to use updated packages.
