In [1]:
import json
import os
import re

import pandas as pd
from pycaret.classification import setup, compare_models, save_model, pull, predict_model

# Load the run configuration from config.json (read from the current
# working directory).
try:
    # JSON text is UTF-8 by specification. Passing the encoding explicitly
    # keeps non-ASCII values (e.g. accented column or file names) intact on
    # platforms whose default locale encoding is not UTF-8.
    with open('config.json', 'r', encoding='utf-8') as f:
        config = json.load(f)
except FileNotFoundError:
    # Tell the user which file is missing, then re-raise so the cell stops.
    print("Arquivo config.json não encontrado.")
    raise

# Build the dataset path from the configured file name and pick the pandas
# reader that matches the file extension.
dataset_path = f'./datasets/{config["file"]}'
# Compare the extension case-insensitively so that names such as "DATA.CSV"
# or "data.Xlsx" are not rejected as unsupported.
dataset_path_lower = dataset_path.lower()
try:
    if dataset_path_lower.endswith('.csv'):
        df = pd.read_csv(dataset_path)
    elif dataset_path_lower.endswith('.xlsx'):
        df = pd.read_excel(dataset_path)
    else:
        raise ValueError("Formato de arquivo não suportado. Use CSV ou XLSX.")
except Exception as e:
    # Report any loading problem (unsupported format, missing file, parse
    # error) and re-raise so the cell stops with the original traceback.
    print(f"Erro ao carregar o dataset: {e}")
    raise

# Replace every character outside [A-Za-z0-9_] in the column names with "_".
# The labels are cast to str first because the .str accessor only works on
# string labels (e.g. a spreadsheet with numeric headers would fail otherwise).
INVALID_NAME_CHARS = '[^A-Za-z0-9_]'
df.columns = df.columns.astype(str).str.replace(INVALID_NAME_CHARS, '_', regex=True)

# Apply the same sanitization to the configured target name. Without this, a
# target such as "Target Class" would be reported as missing even though the
# column exists (it was just renamed to "Target_Class" above). The in-memory
# config is updated so that later uses of config["target"] in this cell refer
# to the renamed column; the original name is kept for error messages.
target_raw = config["target"]
config["target"] = re.sub(INVALID_NAME_CHARS, '_', str(target_raw))

# The dataframe must contain at least one row and one column.
if df.empty or len(df.columns) == 0:
    raise ValueError("Erro: O dataset está vazio ou sem colunas.")

# Distinct original names can collapse to the same sanitized name
# (e.g. "a b" and "a-b" both become "a_b"); fail early with a clear message.
duplicated_columns = df.columns[df.columns.duplicated()].unique().tolist()
if duplicated_columns:
    raise ValueError(
        f"Nomes de colunas duplicados após a remoção de caracteres especiais: {duplicated_columns}."
    )

# The target column must exist in the dataset.
if config["target"] not in df.columns:
    raise ValueError(f"A coluna alvo '{target_raw}' não está presente no dataset.")

# The target column must not contain missing values.
if df[config["target"]].isnull().any():
    raise ValueError(f"A coluna alvo '{target_raw}' contém valores nulos. Por favor, limpe ou preencha esses valores.")

# Derive the PyCaret setup options from the configuration.
#
# Each optional preprocessing step has an on/off flag plus a companion value.
# When a step is disabled its companion value is passed as None. When a step
# is enabled but its companion key is missing or null, the default documented
# for pycaret.classification.setup is used instead of forwarding None.


def _config_or_default(key, default):
    """Return config[key], or `default` when the key is absent or null."""
    value = config.get(key)
    return default if value is None else value


normalize = config.get("normalize", False)
normalize_method = _config_or_default("normalization_method", "zscore") if normalize else None

remove_multicollinearity = config.get("remove_multicollinearity", False)
multicollinearity_threshold = (
    _config_or_default("multicollinearity_threshold", 0.9) if remove_multicollinearity else None
)

# NOTE(review): when "fold_strategy" is not configured, None is forwarded for
# both fold_strategy and fold, which overrides PyCaret's own defaults
# (documented as "stratifiedkfold" and 10) — confirm this is intended.
fold_strategy = config.get("fold_strategy", None)
fold_number = config.get("fold_number", 10) if fold_strategy else None

fix_imbalance = config.get("fix_imbalance", False)

pca = config.get("pca", False)
pca_method = _config_or_default("pca_method", "linear") if pca else None
# None is a valid value here, so no fallback is applied.
pca_components = config.get("pca_components", None) if pca else None

feature_selection = config.get("feature_selection", False)
n_features_to_select = _config_or_default("n_features_to_select", 0.2) if feature_selection else None

# Reject a feature count that is not smaller than the number of available
# features (all columns except the target).
if feature_selection and n_features_to_select is not None:
    num_features = df.shape[1] - 1
    if n_features_to_select >= num_features:
        raise ValueError(f"The number of features to select ({n_features_to_select}) must be less than the available features ({num_features}). Please adjust the value.")

# Initialise the PyCaret classification experiment with the options derived
# above. A ValueError raised during setup is reported and re-raised; any other
# exception propagates unchanged.
try:
    setup_kwargs = {
        "data": df,
        "target": config["target"],
        "session_id": config["session_id"],
        "normalize": normalize,
        "normalize_method": normalize_method,
        # The config stores the test fraction; setup expects the train fraction.
        "train_size": 1 - config["test_size"],
        "fold_strategy": fold_strategy,
        "fold": fold_number,
        "remove_multicollinearity": remove_multicollinearity,
        "multicollinearity_threshold": multicollinearity_threshold,
        "fix_imbalance": fix_imbalance,
        "pca": pca,
        "pca_method": pca_method,
        "pca_components": pca_components,
        "feature_selection": feature_selection,
        "n_features_to_select": n_features_to_select,
    }
    clf = setup(**setup_kwargs)
except ValueError as setup_error:
    print(f"Error during setup: {setup_error}")
    raise

# Make sure the output directory exists before anything is written to it;
# on a fresh checkout the first to_csv below would otherwise fail.
os.makedirs('./models', exist_ok=True)

# Pull the setup summary grid produced by setup() and save it.
setup_summary = pull()
setup_summary.to_csv('./models/setup_summary.csv', index=False)

# Train and compare the candidate models.
best_model = compare_models()

# Guard against compare_models returning nothing usable.
if best_model is None or (isinstance(best_model, list) and len(best_model) == 0):
    raise ValueError("Nenhum modelo foi comparado ou retornado. Verifique as configurações.")

# If a list of models was returned, keep the first one.
if isinstance(best_model, list):
    best_model = best_model[0]

# Capture the comparison grid before calling any other PyCaret function:
# pull() returns the most recently displayed grid, so reading it here does
# not depend on what the later calls display.
results = pull()
results.to_csv('./models/results.csv', index=False)

# Persist the best model (PyCaret reports saving the transformation pipeline
# together with the model).
save_model(best_model, './models/best_model')

# Generate predictions with the best model and save them.
# NOTE(review): no data argument is passed; per the PyCaret docs this scores
# the hold-out set rather than the full dataset — confirm this is intended.
predictions = predict_model(best_model)
predictions.to_csv('./models/predictions.csv', index=False)

# Save the hyper-parameters of the best model.
# get_params() may contain values that are not JSON-serialisable (e.g. nested
# estimator objects or NumPy scalars); default=str stores their string form
# instead of raising TypeError. Serialising before opening the file avoids
# leaving a truncated JSON file behind if serialisation fails.
best_model_params_json = json.dumps(best_model.get_params(), default=str)
with open('./models/best_model_params.json', 'w', encoding='utf-8') as f:
    f.write(best_model_params_json)

# Record the class name of the best model in config.json.
# The file is re-read from disk so that only "model_name" is added to what is
# stored there. It is decoded explicitly as UTF-8 so that non-ASCII values are
# not mis-decoded (and then written back corrupted) on platforms whose default
# locale encoding is not UTF-8.
with open('config.json', 'r', encoding='utf-8') as f:
    config = json.load(f)

config['model_name'] = best_model.__class__.__name__

with open('config.json', 'w', encoding='utf-8') as f:
    json.dump(config, f)

print("Treinamento e predições completos. Resultados foram salvos.")

Unnamed: 0,Description,Value
0,Session id,1245
1,Target,Species
2,Target type,Multiclass
3,Target mapping,"Iris-setosa: 0, Iris-versicolor: 1, Iris-virginica: 2"
4,Original data shape,"(150, 5)"
5,Transformed data shape,"(150, 5)"
6,Transformed train set shape,"(105, 5)"
7,Transformed test set shape,"(45, 5)"
8,Numeric features,4
9,Preprocess,True


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lda,Linear Discriminant Analysis,0.9811,0.0,0.9811,0.983,0.9811,0.9717,0.9727,0.0075
qda,Quadratic Discriminant Analysis,0.9719,0.0,0.9719,0.9728,0.9718,0.9578,0.9583,0.0075
lr,Logistic Regression,0.9626,0.0,0.9626,0.9643,0.9625,0.9439,0.9448,0.4125
knn,K Neighbors Classifier,0.9626,0.9889,0.9626,0.9644,0.9625,0.9439,0.9449,0.23
nb,Naive Bayes,0.9626,0.9959,0.9626,0.9643,0.9625,0.9439,0.9448,0.205
et,Extra Trees Classifier,0.9626,0.9969,0.9626,0.9643,0.9625,0.9439,0.9448,0.0275
dt,Decision Tree Classifier,0.9533,0.9649,0.9533,0.9567,0.9529,0.93,0.9321,0.2
rf,Random Forest Classifier,0.9533,0.9969,0.9533,0.9567,0.9529,0.93,0.9321,0.035
gbc,Gradient Boosting Classifier,0.9533,0.0,0.9533,0.9567,0.9529,0.93,0.9321,0.0275
lightgbm,Light Gradient Boosting Machine,0.953,0.9968,0.953,0.9537,0.9529,0.9294,0.9299,0.075


Transformation Pipeline and Model Successfully Saved


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Linear Discriminant Analysis,0.9778,0.9956,0.9778,0.9792,0.9778,0.9667,0.9674


Treinamento e predições completos. Resultados foram salvos.


In [2]:
pip install pycaret











Note: you may need to restart the kernel to use updated packages.
