In [7]:
# Import necessary libraries
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectKBest, f_classif
import optuna

# Step 1 & 2: Business Understanding & Data Understanding
# Load the Titanic dataset
train_data = pd.read_csv("train.csv")
test_data = pd.read_csv("test.csv")

# Save PassengerId for the test set for final submission
test_passenger_ids = test_data["PassengerId"]

# Drop unneeded columns
train_data = train_data.drop(columns=["PassengerId", "Name", "Ticket", "Cabin"])
test_data = test_data.drop(columns=["PassengerId", "Name", "Ticket", "Cabin"])

# Separate target and features
X = train_data.drop("Survived", axis=1)
y = train_data["Survived"]

# Step 3: Data Preparation
# Identify categorical and numerical columns
cat_features = ["Sex", "Embarked"]
num_features = ["Age", "SibSp", "Parch", "Fare"]

# Preprocessing pipeline
num_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())])

cat_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_transformer, num_features),
        ('cat', cat_transformer, cat_features)])

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Define an objective function for Optuna to optimize SelectKBest and RandomForest parameters
def objective(trial):
    # Define the number of features to select
    k = trial.suggest_int("k", 1, X_train.shape[1])

    # Define the RandomForest parameters
    n_estimators = trial.suggest_int("n_estimators", 50, 200)
    max_depth = trial.suggest_int("max_depth", 3, 20)

    # Create a pipeline with preprocessing, feature selection, and the RandomForest model
    model_pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('feature_selection', SelectKBest(score_func=f_classif, k=k)),
        ('classifier', RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth, random_state=42))
    ])

    # Train the model
    model_pipeline.fit(X_train, y_train)
    
    # Validate the model
    y_pred = model_pipeline.predict(X_val)
    accuracy = accuracy_score(y_val, y_pred)
    
    return accuracy

# Optimize with Optuna
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=20)

# Output the best parameters
print("Best trial:")
best_trial = study.best_trial
print(f" - Accuracy: {best_trial.value}")
print(f" - Parameters: {best_trial.params}")

# Step 4: Use the best parameters for the final model
best_k = best_trial.params['k']
best_n_estimators = best_trial.params['n_estimators']
best_max_depth = best_trial.params['max_depth']

# Build the final model pipeline with optimized parameters
final_model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('feature_selection', SelectKBest(score_func=f_classif, k=best_k)),
    ('classifier', RandomForestClassifier(n_estimators=best_n_estimators, max_depth=best_max_depth, random_state=42))
])

# Train the final model on the full training set
final_model.fit(X_train, y_train)

# Step 5: Evaluation
y_val_pred = final_model.predict(X_val)
print("Accuracy on validation set:", accuracy_score(y_val, y_val_pred))
print("Confusion Matrix:\n", confusion_matrix(y_val, y_val_pred))
print("Classification Report:\n", classification_report(y_val, y_val_pred))

# Step 6: Deployment (Generate predictions on test data)
X_test_prepared = test_data
test_predictions = final_model.predict(X_test_prepared)

# Save results to CSV, including PassengerId
submission = pd.DataFrame({
    "PassengerId": test_passenger_ids,
    "Survived": test_predictions
})
submission.to_csv("submission.csv", index=False)

[I 2024-10-25 19:56:51,124] A new study created in memory with name: no-name-48a689a3-75df-4a88-8c0a-d689409d6283
[I 2024-10-25 19:56:51,211] Trial 0 finished with value: 0.7821229050279329 and parameters: {'k': 5, 'n_estimators': 142, 'max_depth': 3}. Best is trial 0 with value: 0.7821229050279329.
[I 2024-10-25 19:56:51,300] Trial 1 finished with value: 0.7821229050279329 and parameters: {'k': 4, 'n_estimators': 133, 'max_depth': 3}. Best is trial 0 with value: 0.7821229050279329.
[I 2024-10-25 19:56:51,397] Trial 2 finished with value: 0.7821229050279329 and parameters: {'k': 1, 'n_estimators': 186, 'max_depth': 16}. Best is trial 0 with value: 0.7821229050279329.
[I 2024-10-25 19:56:51,455] Trial 3 finished with value: 0.7877094972067039 and parameters: {'k': 5, 'n_estimators': 91, 'max_depth': 5}. Best is trial 3 with value: 0.7877094972067039.
[I 2024-10-25 19:56:51,543] Trial 4 finished with value: 0.8044692737430168 and parameters: {'k': 3, 'n_estimators': 131, 'max_depth': 17}

Best trial:
 - Accuracy: 0.8044692737430168
 - Parameters: {'k': 3, 'n_estimators': 131, 'max_depth': 17}
Accuracy on validation set: 0.8044692737430168
Confusion Matrix:
 [[92 13]
 [22 52]]
Classification Report:
               precision    recall  f1-score   support

           0       0.81      0.88      0.84       105
           1       0.80      0.70      0.75        74

    accuracy                           0.80       179
   macro avg       0.80      0.79      0.79       179
weighted avg       0.80      0.80      0.80       179

