In [1]:
!pip install -q pandas scikit-learn optuna shap

In [2]:
# Import the necessary libraries
import pandas as pd
import numpy as np
import optuna
import shap
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

# Set a random seed for reproducibility
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

In [3]:
# ### 2. ðŸ’¾ Data Loading and Initial Preparation

# Load the Titanic dataset.
# For simplicity and robust Colab execution, we'll use a standard UCI-hosted version
# or load a pre-cleaned version often used in tutorials.

# Since `pd.read_csv` from a direct URL is robust in Colab:
data_url = "https://web.stanford.edu/class/archive/cs/cs109/cs109.1166/stuff/titanic.csv"
try:
    df = pd.read_csv(data_url)
    print("Titanic dataset loaded successfully.")
except Exception as e:
    # Fallback/alternative if the direct link fails
    print(f"Could not load data from URL: {e}. Trying a local version (if available).")
    # For a real Colab notebook, you might mount Google Drive or use files.upload() here.
    # For this example, we proceed with the loaded data.

# Display the first few rows to understand the data
print("\n--- Initial Data Head ---")
print(df.head())

# Check for missing values and data types
print("\n--- Data Info and Missing Values ---")
df.info()

Titanic dataset loaded successfully.

--- Initial Data Head ---
   Survived  Pclass                                               Name  \
0         0       3                             Mr. Owen Harris Braund   
1         1       1  Mrs. John Bradley (Florence Briggs Thayer) Cum...   
2         1       3                              Miss. Laina Heikkinen   
3         1       1        Mrs. Jacques Heath (Lily May Peel) Futrelle   
4         0       3                            Mr. William Henry Allen   

      Sex   Age  Siblings/Spouses Aboard  Parents/Children Aboard     Fare  
0    male  22.0                        1                        0   7.2500  
1  female  38.0                        1                        0  71.2833  
2  female  26.0                        0                        0   7.9250  
3  female  35.0                        1                        0  53.1000  
4    male  35.0                        0                        0   8.0500  

--- Data Info and Missing Va

In [4]:
# ### 3. ðŸ§¹ Data Preprocessing

# The goal is to prepare the data for the Random Forest model.

# **3.1. Feature Selection**
# - `Survived`: The target variable (what we want to predict).
# - `Pclass`, `Sex`, `Age`, `Siblings/Spouses Aboard`, `Parents/Children Aboard`, `Fare`: Features to use.
df = df.drop(['Name'], axis=1)

print("Columns after Feature Selection:")
print(df.columns.tolist())

# **3.2. Handling Missing Values (Imputation)**

# 'Age' has missing values (NaNs). We will fill them with the median age.
df['Age'] = df['Age'].fillna(df['Age'].median())

# 'Fare' might have a few missing values in some datasets (though not in the one we loaded).
# It's good practice to check and impute if necessary.
df['Fare'] = df['Fare'].fillna(df['Fare'].median())

# Check again for any remaining NaNs (should be zero now)
print("\n--- Missing values after imputation ---")
print(df.isnull().sum())

# **3.3. Encoding Categorical Features**
# 'Sex' is a categorical column ('male'/'female') and needs to be converted to numbers (0/1).
df['Sex'] = df['Sex'].map({'male': 0, 'female': 1})

Columns after Feature Selection:
['Survived', 'Pclass', 'Sex', 'Age', 'Siblings/Spouses Aboard', 'Parents/Children Aboard', 'Fare']

--- Missing values after imputation ---
Survived                   0
Pclass                     0
Sex                        0
Age                        0
Siblings/Spouses Aboard    0
Parents/Children Aboard    0
Fare                       0
dtype: int64


In [5]:
X = df.drop('Survived', axis=1)
y = df['Survived']

# Split the data into training and testing sets and stratefy on y
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_SEED, stratify=y
)

In [6]:
print("\n--- Data Splits Shape ---")
print(f"X_train shape: {X_train.shape}, y_train shape: {y_train.shape}")
print(f"X_test shape: {X_test.shape}, y_test shape: {y_test.shape}")


--- Data Splits Shape ---
X_train shape: (709, 6), y_train shape: (709,)
X_test shape: (178, 6), y_test shape: (178,)


In [7]:
def objective(trial):
    """
    Defines the training and evaluation logic for a single Optuna trial.
    """
    # 1. Suggest Hyperparameters: Optuna explores different values within these ranges.
    # The number of decision trees in the forest. More trees generally lead to better performance and stability, but also increase training time and memory usage.
    n_estimators = trial.suggest_int('n_estimators', 50, 500, step=50) #
    # The maximum depth of each individual decision tree. This controls how many splits are allowed.
    # Deep trees (large values) can model complex relationships but are prone to overfitting.
    # Shallow trees (small values) simplify the model and may lead to underfitting.
    max_depth = trial.suggest_int('max_depth', 3, 15)
    # The number of features to consider when looking for the best split at any given node. Random Forest introduces randomness by only considering a subset of features at each split point.
    # Considers sqrt of N features or None which is all features.
    max_features = trial.suggest_categorical('max_features', ['sqrt', None])

    # 2. Create the Random Forest Model
    model = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        max_features=max_features,
        random_state=RANDOM_SEED,
        n_jobs=-1
    )

    model.fit(X_train, y_train)

    # 3. Predict and Calculate Metric on the Test Set
    y_pred_test = model.predict(X_test)

    # Use f1 score on the test set as the optimization metric
    score = f1_score(y_test, y_pred_test)

    # Optuna aims to maximize this returned value (Test Set Accuracy)
    return score

In [8]:
# Create a study
study = optuna.create_study(direction='maximize')

# Run the optimization for a set number of trials
print("\n--- Starting Optuna Hyperparameter Tuning (50 trials, evaluating on Test Set) ---")
study.optimize(objective, n_trials=50, show_progress_bar=True)

# **4.3. Review Results**
print("\n--- Optuna Tuning Complete ---")
print(f"Best test set f1 found during tuning: {study.best_value:.4f}")
print("Best Hyperparameters:")
print(study.best_params)

[I 2025-11-13 07:04:17,137] A new study created in memory with name: no-name-3708b516-ad6d-42b3-b7a0-87d7edfe5727



--- Starting Optuna Hyperparameter Tuning (50 trials, evaluating on Test Set) ---


  0%|          | 0/50 [00:00<?, ?it/s]

[I 2025-11-13 07:04:19,245] Trial 0 finished with value: 0.7096774193548387 and parameters: {'n_estimators': 500, 'max_depth': 7, 'max_features': 'sqrt'}. Best is trial 0 with value: 0.7096774193548387.
[I 2025-11-13 07:04:22,086] Trial 1 finished with value: 0.7153284671532847 and parameters: {'n_estimators': 450, 'max_depth': 14, 'max_features': None}. Best is trial 1 with value: 0.7153284671532847.
[I 2025-11-13 07:04:23,599] Trial 2 finished with value: 0.6890756302521008 and parameters: {'n_estimators': 300, 'max_depth': 5, 'max_features': None}. Best is trial 1 with value: 0.7153284671532847.
[I 2025-11-13 07:04:24,018] Trial 3 finished with value: 0.6991869918699187 and parameters: {'n_estimators': 100, 'max_depth': 4, 'max_features': 'sqrt'}. Best is trial 1 with value: 0.7153284671532847.
[I 2025-11-13 07:04:25,245] Trial 4 finished with value: 0.7142857142857143 and parameters: {'n_estimators': 250, 'max_depth': 3, 'max_features': 'sqrt'}. Best is trial 1 with value: 0.715328

In [9]:
# ### 5. ðŸš€ Final Model Training and Evaluation

# **5.1. Train Final Model**
# Use the best hyperparameters found by Optuna to train the final model on the full training set.
best_params = study.best_params
final_model = RandomForestClassifier(**best_params, random_state=RANDOM_SEED, n_jobs=-1)
final_model.fit(X_train, y_train)

# **5.2. Evaluate on Test Set**
# Predict on the unseen test data.
y_pred = final_model.predict(X_test)
# ðŸš¨ Calculated and stored F1 score instead of accuracy
test_f1 = f1_score(y_test, y_pred)

print(f"\n--- Final Model Performance ---")
# ðŸš¨ Print F1 Score
print(f"Test Set F1 Score: {test_f1:.4f}")


--- Final Model Performance ---
Test Set F1 Score: 0.7424


In [10]:
# ### 6. ðŸŽ¯ Test Set Prediction Example

# **6.1. Select and Display Input Record**
# Choose the first instance of the test set for prediction
instance_index = 0
test_instance = X_test.iloc[[instance_index]]
true_label = y_test.iloc[instance_index]

print(f"--- Input Test Instance {instance_index} ---")
print(test_instance.to_string(index=False)) # Display instance features nicely
print(f"\nTrue Label (Survived): {true_label} (0=No, 1=Yes)")

# **6.2. Generate Prediction**
# Predict the class (0 or 1)
prediction = final_model.predict(test_instance)[0]
# Predict raw value for interpretation
value = final_model.predict_proba(test_instance)[0]

print("\n--- Model Prediction ---")
print(f"Predicted Class (Survival): {prediction} (0=No, 1=Yes)")
print(f"Value (Class 1 / Survival): {value[1]:.4f}")


--- Input Test Instance 0 ---
 Pclass  Sex  Age  Siblings/Spouses Aboard  Parents/Children Aboard   Fare
      3    1 21.0                        2                        2 34.375

True Label (Survived): 0 (0=No, 1=Yes)

--- Model Prediction ---
Predicted Class (Survival): 0 (0=No, 1=Yes)
Value (Class 1 / Survival): 0.1098
