In [1220]:
import pandas as pd
import numpy as np

df = pd.read_csv("./data/train.csv")
df = df.drop("PassengerId", axis=1)
df = df.drop("Name", axis=1)
df = df.dropna()

In [1221]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df.loc[:, "Destination"] = le.fit_transform(df["Destination"])
df.loc[:, "HomePlanet"] = le.fit_transform(df["HomePlanet"])


In [1222]:
df['CryoSleep'] = df['CryoSleep'].apply(lambda x: np.array(x, dtype=bool))
df['VIP'] = df['VIP'].apply(lambda x: np.array(x, dtype=bool))

In [1223]:
df[['Deck', 'Room', 'Seat']] = df['Cabin'].str.extractall(r'\b([A-Za-z0-9]|[A-Za-z0-9]+)\b').unstack()

df.loc[:, "Deck"] = le.fit_transform(df["Deck"])
df.loc[:, "Seat"] = le.fit_transform(df["Seat"])

df = df.drop("Cabin", axis=1)

In [1224]:
selected_cols = ['HomePlanet', 'CryoSleep', 'Destination', 'Age', 'VIP', 'RoomService',
       'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'Transported', "Deck", "Seat"]
df = df.loc[:, selected_cols]

In [1225]:
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import MinMaxScaler

# cols_to_transform = ["RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck", "HomePlanet", "Destination", "Age", "Deck", "Room", "Seat"]
cols_to_transform = ["RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck", "HomePlanet", "Destination", "Age", "Deck", "Seat"]

ct = make_column_transformer(
    (MinMaxScaler(), cols_to_transform), remainder='passthrough'
)

In [1226]:
from sklearn.model_selection import train_test_split

X = df.drop("Transported", axis=1)
y = df["Transported"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [1227]:
ct.fit(X_train)
X_train_normal = ct.transform(X_train)
X_test_normal = ct.transform(X_test)

In [1228]:
import torch
from torch import nn

device = "cuda" if torch.cuda.is_available() else "cpu"

In [1229]:
train_data = torch.tensor(X_train_normal, dtype=torch.float32).to(device)
train_labels = torch.tensor(y_train.values, dtype=torch.float32).to(device)

test_data = torch.tensor(X_test_normal, dtype=torch.float32).to(device)
test_labels = torch.tensor(y_test.values, dtype=torch.float32).to(device)

In [1230]:
from torch.utils.data import DataLoader, TensorDataset

batch_size = 32

dataset = TensorDataset(train_data, train_labels)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

In [1231]:
input_size = train_data.shape[1]
hidden_size = 65
output_size = 1

model = nn.Sequential(
    nn.Linear(input_size, hidden_size),
    nn.ReLU(),
    nn.Linear(hidden_size, hidden_size),
    nn.ReLU(),
    nn.Linear(hidden_size, output_size),
    nn.Sigmoid()
).to(device)

In [1232]:
loss_fn = nn.BCELoss()

optimizer = torch.optim.Adam(params=model.parameters(), lr=0.001)
# optimizer = torch.optim.SGD(params=model.parameters(), lr=0.01)

In [1233]:
epochs = 100

for epoch in range(epochs):
    model.train()
    for batch, (data, labels) in enumerate(dataloader):
        # Move data to device
        data, labels = data.to(device), labels.to(device)
        
        # Forward pass
        pred = model(data)
        loss = loss_fn(pred.squeeze(), labels)
        
        # Backward pass and update weights
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    model.eval()
    with torch.inference_mode():
        test_pred = model(test_data)

        test_loss = loss_fn(test_pred.squeeze(),
                            test_labels)
        
        correct = (torch.round(test_pred).squeeze() == test_labels).sum().item()
        total = test_labels.shape[0]
        accuracy = correct / total

    if epoch % 5 == 0 or epoch == epochs-1:
        print(f"Epoch: {epoch+1} | Loss: {loss:.2f} | Test: {test_loss:.2f} | Test acc: {accuracy:.2f}")


Epoch: 1 | Loss: 0.80 | Test: 0.51 | Test acc: 0.75
Epoch: 6 | Loss: 0.50 | Test: 0.42 | Test acc: 0.81
Epoch: 11 | Loss: 0.96 | Test: 0.41 | Test acc: 0.82
Epoch: 16 | Loss: 0.95 | Test: 0.41 | Test acc: 0.81
Epoch: 21 | Loss: 0.37 | Test: 0.40 | Test acc: 0.82
Epoch: 26 | Loss: 0.11 | Test: 0.40 | Test acc: 0.82
Epoch: 31 | Loss: 0.09 | Test: 0.40 | Test acc: 0.81
Epoch: 36 | Loss: 0.40 | Test: 0.41 | Test acc: 0.81
Epoch: 41 | Loss: 0.86 | Test: 0.40 | Test acc: 0.81
Epoch: 46 | Loss: 0.41 | Test: 0.39 | Test acc: 0.82
Epoch: 51 | Loss: 0.21 | Test: 0.40 | Test acc: 0.82
Epoch: 56 | Loss: 0.49 | Test: 0.40 | Test acc: 0.81
Epoch: 61 | Loss: 0.36 | Test: 0.39 | Test acc: 0.83
Epoch: 66 | Loss: 0.18 | Test: 0.39 | Test acc: 0.83
Epoch: 71 | Loss: 0.26 | Test: 0.39 | Test acc: 0.82
Epoch: 76 | Loss: 0.03 | Test: 0.40 | Test acc: 0.81
Epoch: 81 | Loss: 0.19 | Test: 0.38 | Test acc: 0.83
Epoch: 86 | Loss: 0.25 | Test: 0.38 | Test acc: 0.82
Epoch: 91 | Loss: 1.23 | Test: 0.38 | Test acc: 

In [1234]:
i = 1003
model.eval()
with torch.inference_mode():
    test_pred = model(test_data[i])
float(test_pred[0]), "Correct" if test_pred[0] < 0.5 and not test_labels[i] or test_pred[0] > 0.5 and  test_labels[i] else "Wrong"


(0.0662808045744896, 'Correct')

In [1235]:
challenge_df = pd.read_csv("./data/test.csv")

In [1236]:
passenger_ids = challenge_df.pop("PassengerId")
passenger_ids = passenger_ids.to_frame()

In [1237]:
cols_with_nan = challenge_df.columns[challenge_df.isna().any()].tolist()

for col in cols_with_nan:
    most_common_value = challenge_df[col].value_counts().idxmax()
    challenge_df[col].fillna(most_common_value, inplace=True)
    

In [1238]:
challenge_df.loc[:, "Destination"] = le.fit_transform(challenge_df["Destination"])
challenge_df.loc[:, "HomePlanet"] = le.fit_transform(challenge_df["HomePlanet"])

challenge_df['CryoSleep'] = challenge_df['CryoSleep'].apply(lambda x: np.array(x, dtype=bool))
challenge_df['VIP'] = challenge_df['VIP'].apply(lambda x: np.array(x, dtype=bool))

In [1239]:
challenge_df[['Deck', 'Room', 'Seat']] = challenge_df['Cabin'].str.extractall(r'\b([A-Za-z0-9]|[A-Za-z0-9]+)\b').unstack()

challenge_df.loc[:, "Deck"] = le.fit_transform(challenge_df["Deck"])
challenge_df.loc[:, "Seat"] = le.fit_transform(challenge_df["Seat"])

challenge_df = challenge_df.drop("Cabin", axis=1)

In [1240]:
selected_cols = ['HomePlanet', 'CryoSleep', 'Destination', 'Age', 'VIP', 'RoomService',
       'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', "Deck", "Seat"]
challenge_df = challenge_df.loc[:, selected_cols]

In [1241]:
X_challenge_data = ct.transform(challenge_df)
X_challenge_data = torch.tensor(X_challenge_data, dtype=torch.float32).to(device)

In [1242]:
model.eval()
with torch.inference_mode():
    challenge_preds = torch.round(model(X_challenge_data)).to(dtype=bool).squeeze()

In [1243]:
bool_list = challenge_preds.tolist()
passenger_ids["Transported"] = bool_list

In [1246]:
passenger_ids.to_csv('./data/prediction_results.csv', index=False)