In [None]:
import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

In [None]:
# Load train data
# Reads the training CSV from the project's GitHub raw URL into `df`, the
# frame that the training-side preprocessing cells below operate on.
datapath_train = "https://raw.githubusercontent.com/kagglechallengegroup18/Kaggle_Spaceship_Titanic/refs/heads/main/train.csv"
df = pd.read_csv(datapath_train)

In [None]:
# Drop 'Name' — it is removed before any feature is built from the frame.
# Reassign instead of mutating in place and tolerate an already-missing
# column, so re-running this cell is a no-op rather than a KeyError.
df = df.drop(columns=['Name'], errors='ignore')

In [None]:
# Create TotalSpending: row-wise sum of the five spending columns.
# DataFrame.sum skips NaN by default, so a missing amount adds nothing here.
# NOTE(review): this is computed before the spending columns are imputed, so
# the total does not reflect the imputed values — confirm that is intended.
df['TotalSpending'] = df.loc[
    :, ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
].sum(axis='columns')

In [None]:
# Column groups shared by the train and test preprocessing cells.
numerical_columns = [
    'Age',
    'RoomService',
    'FoodCourt',
    'ShoppingMall',
    'Spa',
    'VRDeck',
]
categorical_columns = ['CryoSleep', 'HomePlanet', 'Destination']
vip_column = ['VIP']
# VIP lives in its own list but is processed together with the categoricals.
cat_cols_plus_vip = [*categorical_columns, *vip_column]

In [None]:
# Split 'Cabin' on '/' into Deck, CabinNum and Side.
# A missing cabin is replaced by the placeholder 'Unknown/0/U', which yields
# Deck='Unknown', CabinNum=0 and Side='U' for those rows.
df[['Deck', 'CabinNum', 'Side']] = (
    df['Cabin'].fillna('Unknown/0/U').str.split('/', expand=True)
)
# Cabin numbers arrive as strings; anything non-numeric becomes NaN.
df['CabinNum'] = pd.to_numeric(df['CabinNum'], errors='coerce')
# The raw column is no longer needed once its parts are extracted.
del df['Cabin']

In [None]:
# Columns that are later expanded with pd.get_dummies(..., drop_first=True):
# the categorical + VIP columns plus the two string parts taken from 'Cabin'.
categorical_cols_for_dummies = [*cat_cols_plus_vip, 'Deck', 'Side']

In [None]:
# --- Numerical imputation ---
# Fit a 5-neighbour KNN imputer on the numeric features, then use it to fill
# their gaps. The fitted imputer is kept so the test set can reuse it.
# NOTE(review): the imputer is fitted on every training row before the
# train/validation split, so validation rows influence the imputed values.
knn_imputer_num = KNNImputer(n_neighbors=5).fit(df[numerical_columns])
df[numerical_columns] = knn_imputer_num.transform(df[numerical_columns])

In [None]:
# --- Categorical + VIP encoding + imputation ---

# One LabelEncoder per column, kept in a dict so the test set can reuse them.
# Each column is stringified first and the literal 'nan' is mapped to
# 'Unknown', so missing entries are encoded as a category of their own.
label_encoders = {col: LabelEncoder() for col in cat_cols_plus_vip}
for col, encoder in label_encoders.items():
    df[col] = encoder.fit_transform(df[col].astype(str).replace('nan', 'Unknown'))

# Fit one KNN imputer on the integer codes of all categorical + VIP columns.
# NOTE(review): the codes produced above appear to contain no NaN (missing
# values were already turned into 'Unknown'), so this step looks like it
# leaves the values unchanged — confirm whether 'Unknown' was meant to be
# imputed instead of kept as a category.
knn_imputer_cat = KNNImputer(n_neighbors=5)
df[cat_cols_plus_vip] = knn_imputer_cat.fit_transform(df[cat_cols_plus_vip])

# The imputer returns floats: round back to integer codes, then map the codes
# to their original string labels.
for col, encoder in label_encoders.items():
    codes = df[col].round(0).astype(int)
    df[col] = encoder.inverse_transform(codes)

# --- One-hot encode categorical variables ---
# drop_first=True removes the first level of every encoded column.
df = pd.get_dummies(df, columns=categorical_cols_for_dummies, drop_first=True)

# Features are everything except the target and the passenger identifier;
# the target is cast to 0/1 integers.
X = df.drop(columns=['Transported', 'PassengerId'])
y = df['Transported'].astype(int)

In [None]:
# Hold out 20% of the rows for validation; the fixed seed makes the split
# reproducible across runs.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [None]:
# Fit a 100-tree random forest on the training split (seeded for
# reproducibility). `fit` returns the estimator itself, so the fitted model
# is bound to `model` directly.
model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

In [None]:
# Score the fitted model on the held-out validation split: overall accuracy
# first, then the per-class precision / recall / F1 table.
y_pred = model.predict(X_val)
val_accuracy = accuracy_score(y_val, y_pred)
print("Validation Accuracy:", val_accuracy)
val_report = classification_report(y_val, y_pred)
print(val_report)

Validation Accuracy: 0.7947096032202415
              precision    recall  f1-score   support

           0       0.78      0.82      0.80       861
           1       0.81      0.77      0.79       878

    accuracy                           0.79      1739
   macro avg       0.80      0.79      0.79      1739
weighted avg       0.80      0.79      0.79      1739



In [None]:
# === TEST DATA PREPROCESSING ===
# Reads the test CSV from the same GitHub repository as the training data
# into `df_test`, which the cells below transform with the train-fitted
# imputers and encoders.
datapath_test = "https://raw.githubusercontent.com/kagglechallengegroup18/Kaggle_Spaceship_Titanic/refs/heads/main/test.csv"
df_test = pd.read_csv(datapath_test)


In [None]:
# Drop Name, mirroring the training frame.
# Reassign instead of mutating in place and tolerate an already-missing
# column, so re-running this cell is a no-op rather than a KeyError.
df_test = df_test.drop(columns=['Name'], errors='ignore')

In [None]:
# Total spending feature: row-wise sum of the five spending columns.
# DataFrame.sum skips NaN by default, so a missing amount adds nothing here.
df_test['TotalSpending'] = df_test.loc[
    :, ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
].sum(axis='columns')

In [None]:
# Handle Cabin split same way as train: a missing cabin becomes the
# placeholder 'Unknown/0/U' (Deck='Unknown', CabinNum=0, Side='U').
df_test['Cabin'] = df_test['Cabin'].fillna('Unknown/0/U')
df_test[['Deck', 'CabinNum', 'Side']] = df_test['Cabin'].str.split('/', expand=True)
df_test['CabinNum'] = pd.to_numeric(df_test['CabinNum'], errors='coerce')
df_test.drop(columns=['Cabin'], inplace=True)

# Impute numerical columns with the imputer fitted on the training data
# (transform only — nothing is re-fitted on the test set).
df_test[numerical_columns] = knn_imputer_num.transform(df_test[numerical_columns])

# Categorical + VIP: stringify, map missing to 'Unknown', then encode with
# the train-fitted LabelEncoders. LabelEncoder.transform raises ValueError if
# the test set holds a label the encoder did not see during fitting.
for col in cat_cols_plus_vip:
    df_test[col] = df_test[col].astype(str).replace('nan', 'Unknown')
    df_test[col] = label_encoders[col].transform(df_test[col])

# Impute categorical + VIP columns in test data with the train-fitted imputer
df_test[cat_cols_plus_vip] = knn_imputer_cat.transform(df_test[cat_cols_plus_vip])

# Round the imputer's float output back to integer codes and restore labels
for col in cat_cols_plus_vip:
    df_test[col] = df_test[col].round(0).astype(int)
    df_test[col] = label_encoders[col].inverse_transform(df_test[col])

# One-hot encode every level present in the test set. drop_first is
# deliberately NOT used here: it would drop the test set's own first level,
# which is only the level dropped in training when both sets contain exactly
# the same categories. Otherwise a real training feature would silently be
# replaced by an all-zero column below.
df_test = pd.get_dummies(df_test, columns=categorical_cols_for_dummies)

# Align to the training feature matrix in one step: keep exactly X's columns
# in X's order (this removes the level training dropped, any level unseen in
# training, and PassengerId) and add columns the test set lacks, filled
# with 0.
df_test = df_test.reindex(columns=X.columns, fill_value=0)

In [None]:
# Predict on test data
# By this point df_test has been restricted to the columns of X, in X's
# order, so it can be passed straight to the fitted model.
test_preds = model.predict(df_test)

In [None]:
# Prepare submission
# df_test was reduced to the model's feature columns, so it no longer carries
# PassengerId; the ids are read again from the source CSV. Row order is the
# same as when df_test was loaded, so ids and predictions line up by position.
# Predictions are cast to bool for the 'Transported' column.
submission = (
    pd.read_csv(datapath_test)[['PassengerId']]
    .assign(Transported=test_preds.astype(bool))
)


In [None]:
# Write the submission frame to 'submission.csv' in the working directory.
# index=False keeps the DataFrame index out of the file, so only the columns
# of `submission` are written.
submission.to_csv('submission.csv', index=False)
print("Submission saved to submission.csv")

Submission saved to submission.csv


In [None]:
# Offer the CSV as a browser download when running inside Google Colab.
# google.colab only exists in the Colab runtime, so the import is guarded:
# anywhere else the file is already on local disk and a bare import would
# abort a "Run All" with ImportError.
try:
    from google.colab import files
except ImportError:
    print("google.colab is not available; submission.csv was left in the working directory")
else:
    files.download('submission.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>