In [1]:
import joblib
import numpy as np
import pandas as pd
import torch
from torch import nn

In [2]:
# Define custom functions so joblib knows what they are
drop_cols = ["Name", "Ticket", "Cabin"]

def drop_unneeded_cols_df(df: pd.DataFrame) -> pd.DataFrame:
    return df.drop(columns=drop_cols, errors="ignore")

def log1p_fare_df(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    df["Fare"] = np.log1p(df["Fare"])
    return df

# Define the model architecture so PyTorch can hold the saved weights
class SimpleNN(nn.Module):
    def __init__(self, input_size, hidden_size=16, num_layers=1, dropout=0.1):
        super(SimpleNN, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, int(hidden_size))
        self.fc3 = nn.Linear(int(hidden_size), 1)
        self.dropout = nn.Dropout(dropout)
        self.hidden_size = hidden_size
        self.num_layers = num_layers

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = self.dropout(x)
        for _ in range(1, self.num_layers):
            x = torch.relu(self.fc2(x))
            x = self.dropout(x)
        x = self.fc3(x)
        return x

In [3]:
# Load the Scikit-Learn Preprocessor
preprocess = joblib.load("../models/titanic_preprocessor.joblib")

# Create the model (Note: Make sure input_size matches your training data!
model = SimpleNN(input_size=10, hidden_size=16, num_layers=2, dropout=0.1)
model.load_state_dict(torch.load("../models/titanic_best_model.pth"))
model.eval() # CRITICAL: Turn off dropout for real predictions!

SimpleNN(
  (fc1): Linear(in_features=10, out_features=16, bias=True)
  (fc2): Linear(in_features=16, out_features=16, bias=True)
  (fc3): Linear(in_features=16, out_features=1, bias=True)
  (dropout): Dropout(p=0.1, inplace=False)
)

In [4]:
movie_characters = pd.DataFrame([
    {
        "PassengerId": 9991,
        "Pclass": 3,               # Jack was in 3rd class
        "Name": "Dawson, Mr. Jack",
        "Sex": "male",
        "Age": 20,
        "SibSp": 0,                # Traveled with Fabrizio, but no family
        "Parch": 0,
        "Ticket": "A/5 21171",     # Won in a poker game
        "Fare": 0.0,               # Technically didn't pay
        "Cabin": None,
        "Embarked": "S"            # Southampton
    },
    {
        "PassengerId": 9992,
        "Pclass": 1,               # Rose was in 1st class
        "Name": "DeWitt Bukater, Miss. Rose",
        "Sex": "female",
        "Age": 17,
        "SibSp": 1,                # Traveled with Cal (fiancé)
        "Parch": 1,                # Traveled with her mother
        "Ticket": "PC 17599",
        "Fare": 150.00,            # Expensive 1st class suite
        "Cabin": "B52",
        "Embarked": "S"
    }
]).set_index("PassengerId")

In [5]:
# Preprocess the data using ONLY .transform()
X_movie = preprocess.transform(movie_characters)

# Convert to PyTorch Tensor
X_movie_t = torch.tensor(X_movie, dtype=torch.float32)

# Make Predictions
with torch.no_grad():
    logits = model(X_movie_t)
    probabilities = torch.sigmoid(logits).numpy()

# Print the results beautifully
print("--- TITANIC SURVIVAL PREDICTIONS ---")
for i, name in enumerate(["Jack Dawson", "Rose DeWitt Bukater"]):
    prob = probabilities[i][0] * 100
    status = "SURVIVES" if prob >= 50 else "TRAGICALLY DIES"
    print(f"{name}: {prob:.1f}% chance of survival -> {status}")

--- TITANIC SURVIVAL PREDICTIONS ---
Jack Dawson: 7.2% chance of survival -> TRAGICALLY DIES
Rose DeWitt Bukater: 93.8% chance of survival -> SURVIVES


![title](../images/table.png)

In [6]:
# 1. Load the raw test data
test_df = pd.read_csv("../dataset/test.csv", index_col="PassengerId")

# 2. Preprocess the data (This drops the 'Name' column behind the scenes)
X_test_processed = preprocess.transform(test_df)
X_test_t = torch.tensor(X_test_processed, dtype=torch.float32)

# 3. Make Predictions
with torch.no_grad():
    logits = model(X_test_t)
    probabilities = torch.sigmoid(logits).numpy().flatten()
    predictions = (probabilities >= 0.5).astype(int)

# 4. Create a readable results table
# We pull the 'Name' directly from the original test_df!
results_df = pd.DataFrame({
    "Name": test_df["Name"],
    "Probability": probabilities,
    "Prediction": predictions
})

# Optional: Convert 1s and 0s into actual words for the audience
results_df["Outcome"] = results_df["Prediction"].map({1: "Survived", 0: "Tragically Died"})

# Format the probability to look like a percentage
results_df["Probability"] = (results_df["Probability"] * 100).round(1).astype(str) + "%"

# Drop the raw 0/1 prediction column to make it cleaner
results_df = results_df.drop(columns=["Prediction"])

# 5. Display the first 15 passengers
print("\n--- TEST DATASET PREDICTIONS ---")
results_df


--- TEST DATASET PREDICTIONS ---


Unnamed: 0_level_0,Name,Probability,Outcome
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
892,"Kelly, Mr. James",11.8%,Tragically Died
893,"Wilkes, Mrs. James (Ellen Needs)",31.7%,Tragically Died
894,"Myles, Mr. Thomas Francis",5.9%,Tragically Died
895,"Wirz, Mr. Albert",11.9%,Tragically Died
896,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",50.8%,Survived
...,...,...,...
1305,"Spector, Mr. Woolf",11.2%,Tragically Died
1306,"Oliva y Ocana, Dona. Fermina",94.8%,Survived
1307,"Saether, Mr. Simon Sivertsen",8.2%,Tragically Died
1308,"Ware, Mr. Frederick",11.2%,Tragically Died
