In [4]:
!pip install optuna pandas scikit-learn xgboost



In [None]:
import pandas as pd
import optuna
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier

# Step 1: Load the Titanic dataset
train_data = pd.read_csv("train.csv")
test_data = pd.read_csv("test.csv")

# Step 2: Preprocess the data
def preprocess(data):
    # Fill missing values
    data['Age'].fillna(data['Age'].median(), inplace=True)
    data['Embarked'].fillna(data['Embarked'].mode()[0], inplace=True)
    data['Fare'].fillna(data['Fare'].median(), inplace=True)

    # Convert categorical columns to numeric
    label_encoder = LabelEncoder()
    for col in ['Sex', 'Embarked']:
        data[col] = label_encoder.fit_transform(data[col])

    # Drop unnecessary columns
    data.drop(columns=['Name', 'Ticket', 'Cabin'], inplace=True, errors='ignore')
    return data

# Preprocess train and test datasets
train_data = preprocess(train_data)
test_data = preprocess(test_data)

# Separate features and target
X = train_data.drop(columns=['Survived'])
y = train_data['Survived']
X_test = test_data

# Split train data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=123)

# Step 3: Define the Optuna objective function
def objective(trial):
    # Suggest hyperparameters
    model_type = trial.suggest_categorical("model_type", ["RandomForest", "XGBoost"])

    if model_type == "RandomForest":
        n_estimators = trial.suggest_int("n_estimators", 50, 300)
        max_depth = trial.suggest_int("max_depth", 5, 50)
        min_samples_split = trial.suggest_int("min_samples_split", 2, 10)
        min_samples_leaf = trial.suggest_int("min_samples_leaf", 1, 4)

        model = RandomForestClassifier(
            n_estimators=n_estimators,
            max_depth=max_depth,
            min_samples_split=min_samples_split,
            min_samples_leaf=min_samples_leaf,
            random_state=123
        )

    elif model_type == "XGBoost":
        learning_rate = trial.suggest_float("learning_rate", 0.01, 0.2)
        n_estimators = trial.suggest_int("n_estimators", 50, 300)
        max_depth = trial.suggest_int("max_depth", 3, 20)
        colsample_bytree = trial.suggest_float("colsample_bytree", 0.5, 1.0)

        model = XGBClassifier(
            learning_rate=learning_rate,
            n_estimators=n_estimators,
            max_depth=max_depth,
            colsample_bytree=colsample_bytree,
            random_state=123,
            #use_label_encoder=False,
            eval_metric='logloss'
        )

    # Evaluate the model using cross-validation
    score = cross_val_score(model, X_train, y_train, cv=5, scoring="accuracy").mean()
    return score

# Step 4: Run Optuna optimization
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=50)

# Best hyperparameters
print("Best hyperparameters:", study.best_params)

# Step 5: Train the final model with optimal parameters
best_params = study.best_params

if best_params["model_type"] == "RandomForest":
    final_model = RandomForestClassifier(
        n_estimators=best_params["n_estimators"],
        max_depth=best_params["max_depth"],
        min_samples_split=best_params["min_samples_split"],
        min_samples_leaf=best_params["min_samples_leaf"],
        random_state=123
    )
elif best_params["model_type"] == "XGBoost":
    final_model = XGBClassifier(
        learning_rate=best_params["learning_rate"],
        n_estimators=best_params["n_estimators"],
        max_depth=best_params["max_depth"],
        colsample_bytree=best_params["colsample_bytree"],
        random_state=123,
        eval_metric='logloss'
    )

final_model.fit(X_train, y_train)

# Step 6: Evaluate on the validation set
accuracy = final_model.score(X_val, y_val)
print(f"Validation Accuracy: {accuracy:.2f}")

# Step 7: Make predictions on the test dataset
predictions = final_model.predict(X_test)
print("Test Predictions:", predictions[:10])

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['Age'].fillna(data['Age'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['Embarked'].fillna(data['Embarked'].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on 