In [38]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from skopt import gp_minimize
from skopt.space import Real, Categorical
import numpy as np




In [39]:
# Read the training and test splits from the local archive/ folder
# (paths are relative to the notebook's working directory).
trainData, testData = (
    pd.read_csv(csv_path)
    for csv_path in ("archive/train.csv", "archive/test.csv")
)

In [40]:

# Leave the training frame as the cell's last expression so the notebook
# renders it, giving a quick visual check of the columns and values.
trainData

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [41]:
# Numeric encodings for the categorical columns used as model inputs.
sex_mapping = {'male': 0, 'female': 1}
port_mapping = {'S': 1, 'C': 2, 'Q': 3}


def prepare_data(dataset: pd.DataFrame, reference: "pd.DataFrame | None" = None) -> pd.DataFrame:
    """Return a copy of ``dataset`` with missing values imputed and engineered columns added.

    The input frame is never modified. The returned copy gains three columns:

    * ``Woman`` - ``Sex`` encoded through ``sex_mapping`` (0 = male, 1 = female).
    * ``Port``  - ``Embarked`` encoded through ``port_mapping``.
    * ``child`` - 1 when ``Age`` is below 14, otherwise 0.

    Missing ``Age`` and ``Fare`` values are replaced by a median and missing
    ``Embarked`` values by the most frequent port.

    Args:
        dataset: Frame containing at least ``Sex``, ``Age``, ``Fare`` and
            ``Embarked`` columns.
        reference: Optional frame from which the imputation statistics
            (medians and mode) are computed. Pass the training frame when
            preparing a test split so both splits are filled with the same
            values. When omitted, the statistics come from ``dataset``
            itself, which is the original behaviour.

    Returns:
        A new ``DataFrame`` with the imputed and engineered columns.

    Raises:
        KeyError: If ``dataset`` (or ``reference``) lacks a required column.
    """
    stats_source = dataset if reference is None else reference
    prepared = dataset.copy()
    prepared['Woman'] = prepared['Sex'].map(sex_mapping)

    # Direct assignment is used instead of ``inplace=True``.
    for column in ('Age', 'Fare'):
        prepared[column] = prepared[column].fillna(stats_source[column].median())

    # ``mode()`` is empty when every value is missing; indexing it would raise
    # an opaque KeyError, so in that case the column is left unfilled.
    embarked_modes = stats_source['Embarked'].mode()
    if not embarked_modes.empty:
        prepared['Embarked'] = prepared['Embarked'].fillna(embarked_modes.iloc[0])

    # Values absent from the mapping (including still-missing ones) become NaN.
    prepared['Port'] = prepared['Embarked'].map(port_mapping)
    prepared['child'] = np.where(prepared['Age'] < 14, 1, 0)

    return prepared


In [42]:
# Target vector: the survival label from the training split.
y_train = trainData['Survived']

# Predictors: drop the label and the passenger identifier, then run the shared
# preparation step on each split.
X_train = prepare_data(trainData.drop(columns=['Survived', 'PassengerId']))
X_test = prepare_data(testData.drop(columns=['PassengerId']))

In [43]:
# Columns fed to the model; both splits are narrowed to the same list so their
# column order matches.
features = ['Age', 'Woman', 'Fare', 'Port', 'child']
X_train, X_test = X_train[features], X_test[features]

In [44]:
from skopt.space import Real, Categorical, Integer

# Search space for the random-forest hyperparameters. The order of the
# dimensions is the order in which values arrive in ``lr_predict``.
space = [
    Integer(100, 500, name='n_estimators'),
    Integer(1, 100, name='max_depth'),
    Categorical(['gini', 'entropy', 'log_loss'], name='criterion'),
    Integer(1, 100, name='min_samples_leaf'),
    Integer(2, 100, name='max_leaf_nodes'),
    Categorical([True, False], name='bootstrap')
]

# Objective value reported when a configuration cannot be scored. The objective
# is the negated mean accuracy (between -1 and 0), so 0.0 is the worst possible
# value. It must stay finite: an infinite or NaN objective cannot be used to
# fit the Gaussian-process surrogate behind ``gp_minimize``.
FAILED_EVAL_PENALTY = 0.0


def lr_predict(params):
    """Objective function for the hyperparameter search.

    Builds a ``RandomForestClassifier`` from ``params`` and scores it with
    5-fold cross-validated accuracy on the notebook-level ``X_train`` and
    ``y_train``.

    Args:
        params: Sequence of six values in the order defined by ``space``:
            n_estimators, max_depth, criterion, min_samples_leaf,
            max_leaf_nodes, bootstrap.

    Returns:
        The negated mean accuracy as a float (lower is better, because the
        optimizer minimizes), or ``FAILED_EVAL_PENALTY`` when the
        configuration could not be evaluated.
    """
    n_estimators, max_depth, criterion, min_samples_leaf, max_leaf_nodes, bootstrap = params

    try:
        # Defensive conversion: the optimizer may hand back NumPy scalar types,
        # so plain Python values are passed to the estimator.
        forest = RandomForestClassifier(
            n_estimators=int(n_estimators),
            max_depth=int(max_depth),
            criterion=str(criterion),
            min_samples_leaf=int(min_samples_leaf),
            max_leaf_nodes=int(max_leaf_nodes),
            bootstrap=bool(bootstrap),
            random_state=42
        )
        fold_scores = cross_val_score(forest, X_train, y_train, cv=5, scoring='accuracy')
    except Exception as e:  # Report the failure but keep the search running.
        print(f"Error during cross-validation: {e}")
        return FAILED_EVAL_PENALTY

    mean_accuracy = float(fold_scores.mean())
    # Failed fits can be recorded as NaN scores instead of raising, which would
    # otherwise hand a NaN objective to the optimizer.
    if not np.isfinite(mean_accuracy):
        print(f"Error during cross-validation: non-finite mean accuracy ({mean_accuracy})")
        return FAILED_EVAL_PENALTY

    return -mean_accuracy

In [45]:

# Run the optimization: 50 evaluations of the objective over the search space,
# seeded so the search is repeatable.
result = gp_minimize(
    func=lr_predict,
    dimensions=space,
    n_calls=50,
    random_state=42
)

# Results: pair each tuned value with the name of its search-space dimension,
# so the printed labels cannot drift from the order in which `space` is defined.
best_params = {dimension.name: value for dimension, value in zip(space, result.x)}

print("Melhores parâmetros encontrados:")
for param_name, param_value in best_params.items():
    print(f"{param_name}: {param_value}")

Melhores parâmetros encontrados:
n_estimators: 131
max_depth: 47
criterion: entropy
min_samples_leaf: 4
max_leaf_nodes: 57
bootstrap: True


In [46]:
from sklearn.metrics import mean_squared_error

# Map the tuned values to hyperparameter names through the search-space
# definition instead of relying on hand-maintained positional indices.
tuned_params = {dimension.name: value for dimension, value in zip(space, result.x)}

# Refit the tuned random forest on the full training split.
model_lr = RandomForestClassifier(**tuned_params, random_state=42)
model_lr.fit(X_train, y_train)

# Predictions for the test split.
y_pred = model_lr.predict(X_test)

# Predict the training split once and reuse it for both training metrics.
train_pred = model_lr.predict(X_train)
mse = mean_squared_error(y_train, train_pred)
train_accuracy = float((train_pred == y_train.to_numpy()).mean())
print("Mean Squared Error on training data:", (mse*100))
print("Training Accuracy:", train_accuracy*100)

# The two figures above are measured on the data the model was fitted on, so
# they are optimistic. The search objective is the negated mean 5-fold
# accuracy, so its best value gives a held-out estimate for the same settings.
print("Cross-validated Accuracy:", -result.fun*100)

Mean Squared Error on training data: 13.468013468013467
Training Accuracy: 86.53198653198653
