In [14]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from skopt import BayesSearchCV
from skopt.space import Integer, Real, Categorical

In [15]:
import pickle

# Load the pre-split Titanic dataset from the pickle file.
# NOTE(security): pickle.load can execute arbitrary code during
# deserialization -- only open .pkl files that come from a trusted source.
with open('Titanic.pkl', 'rb') as f:
    data = pickle.load(f)

# Show what kind of object was loaded and its contents.
print("Tipo de dados carregados:", type(data))
print("Conteúdo dos dados:", data)

# The loaded object is read positionally, so it must be a sequence
# (list or tuple) holding at least the four train/test pieces.
if not isinstance(data, (list, tuple)):
    raise TypeError("O arquivo .pkl não contém uma lista. Verifique a estrutura dos dados.")

# Fail with an explicit message instead of a bare IndexError when the
# container is shorter than expected.
if len(data) < 4:
    raise ValueError(
        f"O arquivo .pkl contém {len(data)} elemento(s); são esperados pelo menos 4 "
        "(X_treino, X_teste, y_treino, y_teste)."
    )

# Positions 0-3 are unpacked in the order: train features, test features,
# train labels, test labels.
X_treino, X_teste, y_treino, y_teste = data[:4]

# The fifth element (feature names) is optional.
feature_names = data[4] if len(data) > 4 else None

# Echo each loaded piece for a quick visual check.
print("X_treino:", X_treino)
print("X_teste:", X_teste)
print("y_treino:", y_treino)
print("y_teste:", y_teste)
print("Feature Names:", feature_names)

Tipo de dados carregados: <class 'list'>
Conteúdo dos dados: [     Pclass  Sex   Age  SibSp  Parch      Fare
331       1    1  45.5      0      0   28.5000
733       2    1  23.0      0      0   13.0000
382       3    1  32.0      0      0    7.9250
704       3    1  26.0      1      0    7.8542
813       3    0   6.0      4      2   31.2750
..      ...  ...   ...    ...    ...       ...
106       3    0  21.0      0      0    7.6500
270       1    1  30.0      0      0   31.0000
860       3    1  41.0      2      0   14.1083
435       1    0  14.0      1      2  120.0000
102       1    1  21.0      0      1   77.2875

[712 rows x 6 columns],      Pclass  Sex   Age  SibSp  Parch     Fare
709       3    1  30.0      1      1  15.2458
439       2    1  31.0      0      0  10.5000
840       3    1  20.0      0      0   7.9250
720       2    0   6.0      0      1  33.0000
39        3    0  14.0      1      0  11.2417
..      ...  ...   ...    ...    ...      ...
433       3    1  17.0     

In [16]:
# Hyperparameter search space for the Random Forest.
rf_params = dict(
    n_estimators=Integer(50, 500),                          # number of trees
    max_depth=Integer(3, 20),                               # maximum tree depth
    min_samples_split=Integer(2, 10),                       # minimum samples required to split a node
    min_samples_leaf=Integer(1, 5),                         # minimum samples required at a leaf
    max_features=Categorical(['sqrt', 'log2', None]),       # number of features considered per split
)

# Base estimator; the seed is fixed so runs are repeatable.
rf = RandomForestClassifier(random_state=42)

# Bayesian hyperparameter search: 32 search iterations, 5-fold
# cross-validation, using every available CPU core.
rf_bayes = BayesSearchCV(
    estimator=rf,
    search_spaces=rf_params,
    n_iter=32,
    cv=5,
    n_jobs=-1,
    random_state=42,
    verbose=1,
)

# Run the search on the training split.
rf_bayes.fit(X_treino, y_treino)


Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fi

In [17]:
# Report the best hyperparameters found by the Random Forest search.
rf_best_params = rf_bayes.best_params_
print("Melhores parâmetros (Random Forest):", rf_best_params)

Melhores parâmetros (Random Forest): OrderedDict({'max_depth': 19, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 111})


In [18]:
# Score the tuned Random Forest on the test split.
y_pred = rf_bayes.predict(X_teste)
rf_accuracy = accuracy_score(y_teste, y_pred)
print("Acurácia (Random Forest):", rf_accuracy)

Acurácia (Random Forest): 0.8212290502793296


In [19]:
# Hyperparameter search space for the Decision Tree.
dt_params = dict(
    max_depth=Integer(3, 30),
    min_samples_split=Integer(2, 10),
    min_samples_leaf=Integer(1, 5),
    max_features=Categorical(['sqrt', 'log2', None]),
    criterion=Categorical(['gini', 'entropy']),
)

# Base estimator; the seed is fixed so runs are repeatable.
dt = DecisionTreeClassifier(random_state=42)

# Bayesian hyperparameter search: 32 search iterations, 5-fold
# cross-validation, using every available CPU core.
dt_bayes = BayesSearchCV(
    estimator=dt,
    search_spaces=dt_params,
    n_iter=32,
    cv=5,
    n_jobs=-1,
    random_state=42,
    verbose=1,
)

# Run the search on the training split.
dt_bayes.fit(X_treino, y_treino)

Fitting 5 folds for each of 1 candidates, totalling 5 fits


Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fi

In [20]:
# Report the best hyperparameters found by the Decision Tree search.
dt_best_params = dt_bayes.best_params_
print("Melhores parâmetros (Árvore de Decisão):", dt_best_params)

Melhores parâmetros (Árvore de Decisão): OrderedDict({'criterion': 'gini', 'max_depth': 3, 'max_features': None, 'min_samples_leaf': 5, 'min_samples_split': 10})


In [21]:
# Score the tuned Decision Tree on the test split.
y_pred = dt_bayes.predict(X_teste)
dt_accuracy = accuracy_score(y_teste, y_pred)
print("Acurácia (Árvore de Decisão):", dt_accuracy)

Acurácia (Árvore de Decisão): 0.7988826815642458


In [22]:
# Hyperparameter search space for Gaussian Naive Bayes.
nb_params = dict(
    var_smoothing=Real(1e-10, 1e-2, prior='log-uniform'),  # variance smoothing, sampled on a log scale
)

# Base estimator.
nb = GaussianNB()

# Bayesian hyperparameter search: 32 search iterations, 5-fold
# cross-validation, using every available CPU core.
nb_bayes = BayesSearchCV(
    estimator=nb,
    search_spaces=nb_params,
    n_iter=32,
    cv=5,
    n_jobs=-1,
    random_state=42,
    verbose=1,
)

# Run the search on the training split.
nb_bayes.fit(X_treino, y_treino)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fi

In [23]:
# Report the best hyperparameters found by the Naive Bayes search.
nb_best_params = nb_bayes.best_params_
print("Melhores parâmetros (Naive Bayes):", nb_best_params)

Melhores parâmetros (Naive Bayes): OrderedDict({'var_smoothing': 2.224459696801683e-06})


In [24]:
# Score the tuned Naive Bayes model on the test split.
y_pred = nb_bayes.predict(X_teste)
nb_accuracy = accuracy_score(y_teste, y_pred)
print("Acurácia (Naive Bayes):", nb_accuracy)

Acurácia (Naive Bayes): 0.770949720670391
