In [42]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import StratifiedKFold, cross_val_score

In [43]:
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')
train.shape, test.shape

((8693, 14), (4277, 13))

In [44]:
train.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [45]:
target = 'Transported'

X = train.drop(columns=['PassengerId', 'Name', target])
y = train[target]
# 10 folds
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
num_cols = X.select_dtypes(include=["int64", "float64"]).columns.tolist()
cat_cols = X.select_dtypes(include=["object"]).columns.tolist()

num_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", StandardScaler()),
])

cat_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

# Preprocessing
preprocessor = ColumnTransformer(transformers=[
    ("num", num_pipeline, num_cols),
    ("cat", cat_pipeline, cat_cols),
])

## Logistic Regression

In [46]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

logistic_model = Pipeline(steps=[
    ("preprocessing", preprocessor),
    ("classifier", LogisticRegression(
        max_iter=1000
    ))
])

logistic_scores = cross_val_score(
    logistic_model, X, y, cv=skf, scoring="accuracy", n_jobs=-1
)

print("Logistic CV Accuracy: ", logistic_scores.round(4))
print("Mean:", logistic_scores.mean().round(4))
print("Std:", logistic_scores.std().round(4))

  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)


Logistic CV Accuracy:  [0.7828 0.7989 0.808  0.7618 0.8113 0.7894 0.8055 0.7722 0.7906 0.7687]
Mean: 0.7889
Std: 0.0165


## SVM

In [47]:
from sklearn.svm import SVC

svm_model = Pipeline(steps=[
    ("preprocessing", preprocessor),
    ("classifier", SVC(kernel="linear"))
])

svm_scores = cross_val_score(
    svm_model, X, y, cv=skf, scoring="accuracy", n_jobs=-1
)

print("SVM CV Accuracy: ", svm_scores.mean().round(4))

SVM CV Accuracy:  0.7868


## MLP

In [48]:
import torch
from torch.utils.data import DataLoader, TensorDataset
import numpy as np

X_processed = preprocessor.fit_transform(X)
# Convert sparse to dense if needed
if hasattr(X_processed, "toarray"):
    X_processed = X_processed.toarray()
y_processed = y.astype(int).values
X_np = X_processed.astype(np.float32)
y_np = y.astype(int).values.astype(np.float32)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [32]:
import torch.nn as nn

class MLP(nn.Module):
    def __init__(self, input_dim):
        super().__init__()
        self.network = nn.Sequential(
            nn.Linear(input_dim, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(64, 1),
        )

    def forward(self, X):
        return self.network(X)

In [33]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

fold_accuracies = []

for fold, (train_idx, val_idx) in enumerate(skf.split(X_np, y_np)):

    print(f"\nFold {fold+1}")

    X_train_fold = torch.tensor(X_np[train_idx])
    y_train_fold = torch.tensor(y_np[train_idx])
    X_val_fold = torch.tensor(X_np[val_idx])
    y_val_fold = torch.tensor(y_np[val_idx])

    train_loader = DataLoader(
        TensorDataset(X_train_fold, y_train_fold),
        batch_size=64,
        shuffle=True
    )

    model = MLP(X_np.shape[1]).to(device)
    criterion = nn.BCEWithLogitsLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

    # Train
    epochs = 50
    for epoch in range(epochs):
        model.train()
        for xb, yb in train_loader:
            xb, yb = xb.to(device), yb.to(device)

            optimizer.zero_grad()
            outputs = model(xb).squeeze()
            loss = criterion(outputs, yb)
            loss.backward()
            optimizer.step()

    # Validate
    model.eval()
    with torch.no_grad():
        outputs = model(X_val_fold.to(device)).squeeze()
        preds = torch.sigmoid(outputs) > 0.5
        acc = (preds.cpu() == y_val_fold.bool()).float().mean()

    print("Validation Accuracy:", acc.item())
    fold_accuracies.append(acc.item())



Fold 1
Validation Accuracy: 0.7717078924179077

Fold 2
Validation Accuracy: 0.7234042286872864

Fold 3
Validation Accuracy: 0.7458309531211853

Fold 4
Validation Accuracy: 0.7220943570137024

Fold 5
Validation Accuracy: 0.7341772317886353


In [34]:
mlp_score = np.mean(fold_accuracies)
mlp_std = np.std(fold_accuracies)

print(f"MLP CV Accuracy: {mlp_score:.4f}")
print(f"Std: {mlp_std:.4f}")

MLP CV Accuracy: 0.7394
Std: 0.0183


In [35]:
print(f"Logistic: {logistic_scores.mean():.4f}")
print(f"SVM: {svm_scores.mean():.4f}")
print(f"MLP: {mlp_score:.4f}")

Logistic: 0.7864
SVM: 0.7889
MLP: 0.7394


## Fit best model on full data

In [54]:
def produce_submission(model, model_name):
    model.fit(X, y)
    X_test = test.copy()
    test_predictions = model.predict(X_test).astype(bool)
    submission = pd.DataFrame({
        "PassengerId": test["PassengerId"],
        "Transported": test_predictions
    })
    submission.to_csv(
        f"data/submission_{model_name}.csv",
        index=False
    )

In [65]:
produce_submission(logistic_model, "logistic")