In [None]:
import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier

In [None]:
# Training set: train.csv served from the project's GitHub repository.
datapath_train = "https://raw.githubusercontent.com/kagglechallengegroup18/Kaggle_Spaceship_Titanic/refs/heads/main/train.csv"
# Read the remote CSV into the working DataFrame used by the training cells.
df = pd.read_csv(filepath_or_buffer=datapath_train)

In [None]:
# Preview the first five rows to sanity-check the loaded columns.
df.head(n=5)

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [None]:
# Count missing values per column (isna is the canonical spelling of isnull).
df.isna().sum()

Unnamed: 0,0
PassengerId,0
HomePlanet,201
CryoSleep,217
Cabin,199
Destination,182
Age,179
VIP,203
RoomService,181
FoodCourt,183
ShoppingMall,208


In [None]:
# A cabin string has three '/'-separated parts (e.g. "B/0/P");
# store them as 'Deck', 'CabinNum' and 'Side'.
cabin_parts = df['Cabin'].str.split(pat='/', expand=True)
df[['Deck', 'CabinNum', 'Side']] = cabin_parts

In [None]:
# An age of exactly 0.0 is treated as invalid: mark it as missing so the
# numeric imputation step further down fills it in.
df['Age'] = df['Age'].replace({0.0: np.nan})

In [None]:
# Impute categorical columns with their most frequent value (mode).
# The filled Series is assigned back to the frame: calling
# fillna(inplace=True) on the `df[col]` selection operates on an intermediate
# object, which pandas warns will never update `df` from pandas 3.0 onwards.
for col in ['HomePlanet', 'CryoSleep', 'Destination', 'VIP', 'Deck', 'Side']:
    df[col] = df[col].fillna(df[col].mode()[0])

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mode()[0], inplace=True)
  df[col].fillna(df[col].mode()[0], inplace=True)


In [None]:
# Encode boolean columns as 0/1.
# The lookup table accepts real booleans as well as the strings 'True'/'False';
# any other value maps to NaN.
bool_to_int = {'True': 1, 'False': 0, True: 1, False: 0}
for flag_col in ('CryoSleep', 'VIP'):
    df[flag_col] = df[flag_col].map(bool_to_int)
# The target is mapped from real booleans only.
df['Transported'] = df['Transported'].map({True: 1, False: 0})

In [None]:
# Feature engineering: total spend across the five on-board services.
service_cols = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
total_spend = df[service_cols].sum(axis=1)
df['TotalSpend'] = total_spend
# log1p compresses the scale of the total while keeping zero spend at 0.
df['TotalSpend_log'] = np.log1p(total_spend)

In [None]:
# Encode each passenger's travel group by its size.
# The group id is the part of PassengerId before the underscore.
group_ids = df['PassengerId'].str.split('_').str[0]
group_freq = group_ids.value_counts().to_dict()
df['Group'] = group_ids.map(group_freq)

In [None]:
# Feature: flag passengers whose recorded service spending adds up to zero.
# (sum skips missing values, so a row with only NaN spend is flagged too.)
no_spend_mask = df[service_cols].sum(axis=1).eq(0)
df['NoSpending'] = no_spend_mask.astype(int)

In [None]:
# Remove the columns that are not passed to the model as features.
unused_cols = ['Name', 'Cabin', 'PassengerId', 'CabinNum', 'TotalSpend']
df = df.drop(columns=unused_cols)

In [None]:
# One-hot encode the remaining categorical columns; drop_first omits the
# first level of each so no redundant indicator column is created.
df = pd.get_dummies(
    data=df,
    columns=['HomePlanet', 'Destination', 'Deck', 'Side'],
    drop_first=True,
)

In [None]:
# KNN imputation for the numeric columns.

# Numeric columns that may still contain missing values at this point.
numeric_cols = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'TotalSpend_log']
# Each gap is filled from the 5 nearest rows.
# NOTE(review): the imputer is fit on the full frame, i.e. before the
# train/validation split, so validation rows contribute to the imputation.
imputer = KNNImputer(n_neighbors=5)
imputer.fit(df[numeric_cols])
df[numeric_cols] = imputer.transform(df[numeric_cols])

In [None]:
# Model inputs: target y and feature matrix X (everything except the target).
y = df['Transported']
X = df.drop(columns='Transported')

# Hold out 20% for validation, stratified on the target, with a fixed seed.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)


In [None]:
# Hyperparameter tuning: randomized search over an XGBoost parameter space,
# scored by 3-fold cross-validated accuracy on the training split.
param_dist = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7, 10],
    'learning_rate': [0.01, 0.05, 0.1],
    'subsample': [0.7, 0.8, 1.0],
    'colsample_bytree': [0.7, 0.8, 1.0],
    'reg_alpha': [0, 0.01, 0.1],
    'reg_lambda': [0.5, 1.0, 2.0]
}

# `use_label_encoder` is not passed: XGBoost reports it as an unused
# parameter ('Parameters: { "use_label_encoder" } are not used.').
xgb = XGBClassifier(eval_metric='logloss', random_state=42)
search = RandomizedSearchCV(
    estimator=xgb,
    param_distributions=param_dist,
    n_iter=25,
    cv=3,
    scoring='accuracy',
    verbose=1,
    n_jobs=-1,
    # Seed the candidate sampling so the same 25 settings are tried on every
    # run; without it the selected parameters can change between runs.
    random_state=42,
)
search.fit(X_train, y_train)

Fitting 3 folds for each of 25 candidates, totalling 75 fits


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [None]:
# Model evaluation: take the best estimator found by the search and score it
# on the held-out validation split.
best_model = search.best_estimator_
print("\n✅ Best Parameters:", search.best_params_)

val_preds = best_model.predict(X_val)
val_acc = accuracy_score(y_true=y_val, y_pred=val_preds)
print(f"✅ Validation Accuracy: {val_acc:.4f}")


✅ Best Parameters: {'subsample': 0.7, 'reg_lambda': 1.0, 'reg_alpha': 0.01, 'n_estimators': 100, 'max_depth': 5, 'learning_rate': 0.05, 'colsample_bytree': 0.8}
✅ Validation Accuracy: 0.8120


In [None]:
# Test set: test.csv from the same GitHub repository as the training data.
datapath_test = "https://raw.githubusercontent.com/kagglechallengegroup18/Kaggle_Spaceship_Titanic/refs/heads/main/test.csv"
test_df = pd.read_csv(filepath_or_buffer=datapath_test)
# Keep the ids aside: PassengerId is dropped during preprocessing but is
# needed again for the submission file.
passenger_ids = test_df["PassengerId"].copy(deep=True)

In [None]:
# Preprocessing: apply the same steps to the test set as to the training set.
# Split 'Cabin' into its three '/'-separated parts.
test_df[['Deck', 'CabinNum', 'Side']] = test_df['Cabin'].str.split('/', expand=True)
# Treat an age of 0.0 as missing so it is imputed later.
test_df['Age'] = test_df['Age'].replace(0.0, np.nan)

# Mode impute categorical
# The filled Series is assigned back to the frame: calling
# fillna(inplace=True) on the `test_df[col]` selection operates on an
# intermediate object, which pandas warns will never update `test_df` from
# pandas 3.0 onwards.
for col in ['HomePlanet', 'CryoSleep', 'Destination', 'VIP', 'Deck', 'Side']:
    test_df[col] = test_df[col].fillna(test_df[col].mode()[0])

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test_df[col].fillna(test_df[col].mode()[0], inplace=True)
  test_df[col].fillna(test_df[col].mode()[0], inplace=True)


In [None]:
# Boolean conversion: the lookup table accepts real booleans as well as the
# strings 'True'/'False'; any other value maps to NaN.
bool_to_int = {'True': 1, 'False': 0, True: 1, False: 0}
for flag_col in ('CryoSleep', 'VIP'):
    test_df[flag_col] = test_df[flag_col].map(bool_to_int)

# Total spend across the five services, plus its log1p transform.
spend_cols = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
total_spend = test_df[spend_cols].sum(axis=1)
test_df['TotalSpend'] = total_spend
test_df['TotalSpend_log'] = np.log1p(total_spend)

# Group frequency: the group id is the part of PassengerId before the
# underscore; each passenger gets the number of rows sharing that id.
group_ids = test_df['PassengerId'].str.split('_').str[0]
group_freq = group_ids.value_counts().to_dict()
test_df['Group'] = group_ids.map(group_freq)

In [None]:
# No spending: flag rows whose recorded service spending sums to zero
# (missing values are skipped by the sum).
no_spend_mask = test_df[spend_cols].sum(axis=1).eq(0)
test_df['NoSpending'] = no_spend_mask.astype(int)

# Drop the columns that are not passed to the model as features.
unused_cols = ['Name', 'Cabin', 'PassengerId', 'CabinNum', 'TotalSpend']
test_df = test_df.drop(columns=unused_cols)

# One-hot encode the categorical columns, omitting the first level of each.
test_df = pd.get_dummies(
    data=test_df,
    columns=['HomePlanet', 'Destination', 'Deck', 'Side'],
    drop_first=True,
)

In [None]:
# KNN impute numerics
num_cols = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'TotalSpend_log']
# Reuse the `imputer` already fitted on the training frame rather than
# fitting a fresh one on the test set, so test rows are imputed from the same
# reference data the model was trained on. The column list matches the one
# the imputer was fitted with.
test_df[num_cols] = imputer.transform(test_df[num_cols])

In [None]:
# Predict on the preprocessed test set and assemble the submission table.
preds = best_model.predict(test_df)
submission = pd.DataFrame(
    data={
        "PassengerId": passenger_ids,
        # Cast the predicted labels to booleans for the submission.
        "Transported": preds.astype(bool),
    }
)

In [None]:
# Write the submission without the index column and report the file that was
# actually written (the message previously named a different file).
submission_path = "submission_XGBoost1.csv"
submission.to_csv(submission_path, index=False)
print(f" {submission_path} is saved")

 submission.csv is saved


In [None]:
# Offer the submission file as a browser download when running on Google Colab.
# The google.colab package is only available on Colab, so the import is
# guarded to keep the notebook runnable end-to-end in other environments.
try:
    from google.colab import files
except ImportError:
    files = None

if files is not None:
    files.download('submission_XGBoost1.csv')
else:
    print("Not running on Google Colab; skipping browser download.")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>