In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier, Pool
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import StratifiedKFold
import optuna
import numpy as np


df = pd.read_csv(r'data\train_cleaned.csv')

  from .autonotebook import tqdm as notebook_tqdm


# Código para comparação entre XGBoost e CatBoost

## Preparação dos dados para o **XGBoost**

In [2]:
# Transformando o alvo em 0/1
y = df['Transported'].astype(int)

# Removendo colunas irrelevantes para o modelo (sem engenharia de features)
cols_to_drop = ['PassengerId', 'Group', 'Surname', 'Cabin', 'Transported']
X_raw = df.drop(columns=cols_to_drop)

# Copiando os dados por segurança
X_xgb = X_raw.copy()

# Converter colunas booleanas (CryoSleep, VIP) para  0 ou 1
bool_cols = ['CryoSleep', 'VIP']
for col in bool_cols:
    # Preenche nulos com False (ou moda) antes de converter para garantir integridade
    X_xgb[col] = X_xgb[col].fillna(False).astype(int)

# One-Hot Encoding para XGBoost
X_xgb = pd.get_dummies(X_xgb, columns=['HomePlanet', 'Destination'], dummy_na=False)

# Divisão Treino/Teste para XGBoost
X_train_xgb, X_test_xgb, y_train, y_test = train_test_split(
    X_xgb, y, test_size=0.2, random_state=42
)

print("--- Formato dos dados ---")
print(f"XGBoost Input Shape: {X_train_xgb.shape} (Colunas expandidas com One-Hot)")



--- Formato dos dados ---
XGBoost Input Shape: (6954, 16) (Colunas expandidas com One-Hot)


## Preparação dos dados para o **CatBoost**

In [3]:
# Copiando os dados por segurança
X_cat = X_raw.copy()

# Guardando as colunas categóricas para passar para o modelo
cat_features = ['HomePlanet', 'Destination', 'CryoSleep', 'VIP']

# Preencher nulos em categóricas com 'Missing' e garantir que são strings
for col in cat_features:
    X_cat[col] = X_cat[col].fillna('Missing').astype(str)

# Divisão Treino/Teste para Catboost
X_train_cat, X_val_cat, y_train, y_val = train_test_split(
    X_cat, y, test_size=0.2, random_state=42
)

print("--- Formato dos dados ---")
print(f"CatBoost Input Shape: {X_train_cat.shape} (Colunas originais mantidas)")

--- Formato dos dados ---
CatBoost Input Shape: (6954, 12) (Colunas originais mantidas)


## Treinamento

In [4]:
model_cat = CatBoostClassifier(
    iterations=1000,           # Quantidade de árvores (épocas)
    learning_rate=0.03,        # Velocidade de aprendizado
    depth=6,                   # Profundidade da árvore
    l2_leaf_reg=3,             # Regularização L2 (evita overfitting)
    loss_function='Logloss',   # Função de perda para classificação binária
    eval_metric='Accuracy',    # Métrica para acompanhar durante o treino
    cat_features=cat_features, # LISTA IMPORTANTE: avisa quais colunas são texto
    random_seed=42,            # Para reprodutibilidade
    verbose=100,               # Imprime o progresso a cada 100 iterações
    early_stopping_rounds=50   # Para se o modelo não melhorar após 50 rodadas
)

print("Iniciando treinamento do CatBoost...")
model_cat.fit(
    X_train_cat, y_train,
    eval_set=(X_val_cat, y_val), # Dados de validação para medir performance
    use_best_model=True            # Garante que ao final ele use a melhor versão encontrada
)

print("\n--- Resultados Finais ---")
y_pred = model_cat.predict(X_val_cat)
acc = accuracy_score(y_val, y_pred)
print(f"Acurácia no vale: {acc:.4f}")
print("\nRelatório de Classificação:\n", classification_report(y_val, y_pred))

Iniciando treinamento do CatBoost...
0:	learn: 0.7837216	test: 0.7682576	best: 0.7682576 (0)	total: 191ms	remaining: 3m 11s
100:	learn: 0.8006903	test: 0.7855089	best: 0.7860840 (86)	total: 4.04s	remaining: 36s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.7883841288
bestIteration = 113

Shrink model to first 114 iterations.

--- Resultados Finais ---
Acurácia no vale: 0.7884

Relatório de Classificação:
               precision    recall  f1-score   support

           0       0.82      0.73      0.77       861
           1       0.76      0.85      0.80       878

    accuracy                           0.79      1739
   macro avg       0.79      0.79      0.79      1739
weighted avg       0.79      0.79      0.79      1739



## Treinamento com Optuna

In [5]:
def objective(trial):
    """
    Função que o Optuna vai rodar várias vezes (trials).
    Em cada rodada, ele escolhe um conjunto diferente de parâmetros.
    """
    
    param = {
        'iterations': 1000,                         # Mantemos fixo alto, early_stopping cuida do resto
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.1, log=True),
        'depth': trial.suggest_int('depth', 4, 10), # Testa profundidades de 4 a 10
        'l2_leaf_reg': trial.suggest_int('l2_leaf_reg', 1, 10),
        'loss_function': 'Logloss',
        'eval_metric': 'Accuracy',
        'cat_features': cat_features,
        'random_seed': 42,
        'verbose': False,                           # Desliga prints para não poluir o log do Optuna
        'early_stopping_rounds': 50
    }

    
    model = CatBoostClassifier(**param)
    
    model.fit(
        X_train_cat, y_train,
        eval_set=(X_val_cat, y_val),
        use_best_model=True
    )

    
    preds = model.predict(X_val_cat)
    accuracy = accuracy_score(y_val, preds)
    
    return accuracy


print("Iniciando busca de hiperparâmetros com Optuna...")
study = optuna.create_study(direction='maximize') # Queremos maximizar a Acurácia
study.optimize(objective, n_trials=20)            # Executa 20 tentativas diferentes

print("\n--- Melhores Parâmetros Encontrados ---")
print(study.best_params)


print("\nTreinando modelo final...")
final_params = study.best_params
final_params['cat_features'] = cat_features # Adiciona fixos que não foram otimizados
final_params['iterations'] = 1000
final_params['early_stopping_rounds'] = 50
final_params['verbose'] = 100

best_model = CatBoostClassifier(**final_params)
best_model.fit(X_train_cat, y_train, eval_set=(X_val_cat, y_val), use_best_model=True)

print("\n--- Resultados Finais (Melhor Modelo) ---")
y_pred = best_model.predict(X_val_cat)
acc = accuracy_score(y_val, y_pred)
print(f"Acurácia Final: {acc:.4f}")

[I 2025-12-04 13:40:01,558] A new study created in memory with name: no-name-0e3e0309-247b-418d-a817-7abd65d46b9a


Iniciando busca de hiperparâmetros com Optuna...


[I 2025-12-04 13:40:07,462] Trial 0 finished with value: 0.7849338700402531 and parameters: {'learning_rate': 0.026605760701390455, 'depth': 8, 'l2_leaf_reg': 3}. Best is trial 0 with value: 0.7849338700402531.
[I 2025-12-04 13:40:13,133] Trial 1 finished with value: 0.777458309373203 and parameters: {'learning_rate': 0.001356218814094983, 'depth': 9, 'l2_leaf_reg': 5}. Best is trial 0 with value: 0.7849338700402531.
[I 2025-12-04 13:40:16,837] Trial 2 finished with value: 0.7809085681426107 and parameters: {'learning_rate': 0.032335773506104525, 'depth': 7, 'l2_leaf_reg': 9}. Best is trial 0 with value: 0.7849338700402531.
[I 2025-12-04 13:40:21,096] Trial 3 finished with value: 0.7786083956296722 and parameters: {'learning_rate': 0.002430829837345541, 'depth': 6, 'l2_leaf_reg': 4}. Best is trial 0 with value: 0.7849338700402531.
[I 2025-12-04 13:40:29,404] Trial 4 finished with value: 0.7883841288096607 and parameters: {'learning_rate': 0.06136781983687496, 'depth': 5, 'l2_leaf_reg':


--- Melhores Parâmetros Encontrados ---
{'learning_rate': 0.0809849723070665, 'depth': 5, 'l2_leaf_reg': 10}

Treinando modelo final...
0:	learn: 0.6647803	test: 0.6654199	best: 0.6654199 (0)	total: 55.3ms	remaining: 55.2s
100:	learn: 0.4132517	test: 0.4381817	best: 0.4381570 (99)	total: 6.34s	remaining: 56.4s
200:	learn: 0.3982980	test: 0.4338794	best: 0.4338794 (200)	total: 12.1s	remaining: 48.3s
300:	learn: 0.3829121	test: 0.4313740	best: 0.4313287 (299)	total: 18.2s	remaining: 42.2s
400:	learn: 0.3686503	test: 0.4304681	best: 0.4304681 (400)	total: 24.3s	remaining: 36.4s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.4301355772
bestIteration = 402

Shrink model to first 403 iterations.

--- Resultados Finais (Melhor Modelo) ---
Acurácia Final: 0.7930


## Treinamento com Optuna e Cross Validation

In [6]:
X_train_full, X_holdout, y_train_full, y_holdout = train_test_split(
    X_cat, y, test_size=0.2, random_state=42
)

def objective(trial):
    param = {
        'iterations': 1000,
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'depth': trial.suggest_int('depth', 4, 10),
        'l2_leaf_reg': trial.suggest_int('l2_leaf_reg', 1, 10),
        'loss_function': 'Logloss',
        'eval_metric': 'Accuracy',
        'cat_features': cat_features,
        'random_seed': 42,
        'verbose': False,
        'early_stopping_rounds': 50
    }

    
    kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    scores = []

    # O loop abaixo treina o modelo 5 vezes para CADA sugestão do Optuna
    for train_index, valid_index in kf.split(X_train_full, y_train_full):
        # Separa os dados da dobra atual
        X_tr, X_val = X_train_full.iloc[train_index], X_train_full.iloc[valid_index]
        y_tr, y_val = y_train_full.iloc[train_index], y_train_full.iloc[valid_index]

        # Cria e treina o modelo
        model = CatBoostClassifier(**param)
        model.fit(X_tr, y_tr, eval_set=(X_val, y_val), use_best_model=True)

        # Avalia
        preds = model.predict(X_val)
        score = accuracy_score(y_val, preds)
        scores.append(score)

   
    return np.mean(scores)

# --- Executa o Optuna ---
print("Iniciando otimização com Cross-Validation...")
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=10) # n_trials baixo para exemplo, use 20-50 na prática

print("\n--- Melhores Parâmetros (Validados via CV) ---")
print(study.best_params)

# --- Treino Final ---
# Agora treinamos com TODO o X_train_full usando os melhores parametros
print("\nTreinando modelo final...")
final_model = CatBoostClassifier(**study.best_params, 
                                 cat_features=cat_features,
                                 verbose=100)

final_model.fit(X_train_full, y_train_full)

# --- A PROVA DE FOGO ---
# Testamos no Holdout Set (que o Optuna nunca viu nos folds)
print("\n--- Teste no Holdout Set (Dados virgens) ---")
holdout_preds = final_model.predict(X_holdout)
print(f"Acurácia Real: {accuracy_score(y_holdout, holdout_preds):.4f}")

[I 2025-12-04 13:43:00,539] A new study created in memory with name: no-name-00ec66ff-0a72-4d17-8063-61b64fe288a9


Iniciando otimização com Cross-Validation...


[I 2025-12-04 13:43:37,920] Trial 0 finished with value: 0.7968083620810038 and parameters: {'learning_rate': 0.0830043590122327, 'depth': 6, 'l2_leaf_reg': 9}. Best is trial 0 with value: 0.7968083620810038.
[I 2025-12-04 13:44:11,110] Trial 1 finished with value: 0.7978144184867778 and parameters: {'learning_rate': 0.16680381460035978, 'depth': 7, 'l2_leaf_reg': 8}. Best is trial 1 with value: 0.7978144184867778.
[I 2025-12-04 13:44:48,606] Trial 2 finished with value: 0.7998281863366244 and parameters: {'learning_rate': 0.16597145835603172, 'depth': 6, 'l2_leaf_reg': 2}. Best is trial 2 with value: 0.7998281863366244.
[I 2025-12-04 13:45:15,854] Trial 3 finished with value: 0.7992530605278538 and parameters: {'learning_rate': 0.16663725902583437, 'depth': 5, 'l2_leaf_reg': 2}. Best is trial 2 with value: 0.7998281863366244.
[I 2025-12-04 13:45:50,107] Trial 4 finished with value: 0.7979588205783325 and parameters: {'learning_rate': 0.2067095825955767, 'depth': 7, 'l2_leaf_reg': 3}. 


--- Melhores Parâmetros (Validados via CV) ---
{'learning_rate': 0.15177994385423652, 'depth': 6, 'l2_leaf_reg': 2}

Treinando modelo final...
0:	learn: 0.6342131	total: 62.8ms	remaining: 1m 2s
100:	learn: 0.3688744	total: 7.22s	remaining: 1m 4s
200:	learn: 0.3164891	total: 14.5s	remaining: 57.5s
300:	learn: 0.2797671	total: 21.4s	remaining: 49.8s
400:	learn: 0.2531435	total: 28.5s	remaining: 42.5s
500:	learn: 0.2299864	total: 35.9s	remaining: 35.8s
600:	learn: 0.2102341	total: 43s	remaining: 28.6s
700:	learn: 0.1951697	total: 50s	remaining: 21.3s
800:	learn: 0.1826427	total: 57.3s	remaining: 14.2s
900:	learn: 0.1714366	total: 1m 4s	remaining: 7.09s
999:	learn: 0.1610578	total: 1m 11s	remaining: 0us

--- Teste no Holdout Set (Dados virgens) ---
Acurácia Real: 0.7803
