In [1]:
import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier

In [2]:
# Load train data
datapath_train = "https://raw.githubusercontent.com/kagglechallengegroup18/Kaggle_Spaceship_Titanic/refs/heads/main/train.csv"
df = pd.read_csv(datapath_train)

In [3]:
df.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [4]:
df.isnull().sum()

Unnamed: 0,0
PassengerId,0
HomePlanet,201
CryoSleep,217
Cabin,199
Destination,182
Age,179
VIP,203
RoomService,181
FoodCourt,183
ShoppingMall,208


In [5]:
df[['Deck', 'CabinNum', 'Side']] = df['Cabin'].str.split('/', expand=True)
df['Age'] = df['Age'].replace(0.0, np.nan)

In [6]:
# Mode imputation
for col in ['HomePlanet', 'CryoSleep', 'Destination', 'VIP', 'Deck', 'Side']:
    df[col].fillna(df[col].mode()[0], inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mode()[0], inplace=True)
  df[col].fillna(df[col].mode()[0], inplace=True)


In [7]:
# Boolean encoding
df['CryoSleep'] = df['CryoSleep'].map({'True': 1, 'False': 0, True: 1, False: 0})
df['VIP'] = df['VIP'].map({'True': 1, 'False': 0, True: 1, False: 0})
df['Transported'] = df['Transported'].map({True: 1, False: 0})


In [8]:
# Spending features
spend_cols = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
df['TotalSpend'] = df[spend_cols].sum(axis=1)
df['TotalSpend_log'] = np.log1p(df['TotalSpend'])


In [9]:
# Group features
df['Group'] = df['PassengerId'].apply(lambda x: x.split('_')[0])
group_sizes = df['Group'].value_counts().to_dict()
df['GroupSize'] = df['Group'].map(group_sizes)
df['NoSpendFlag'] = (df[spend_cols].sum(axis=1) == 0).astype(int)

In [10]:
# Frequency encode group
df['Group'] = df['Group'].map(group_sizes)

# Drop unused
df.drop(['Name', 'Cabin', 'PassengerId', 'CabinNum', 'TotalSpend'], axis=1, inplace=True)

# One-hot encoding
df = pd.get_dummies(df, columns=['HomePlanet', 'Destination', 'Deck', 'Side'], drop_first=True)

In [11]:
#  KNN Imputation
num_cols = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'TotalSpend_log']
imputer = KNNImputer(n_neighbors=5)
df[num_cols] = imputer.fit_transform(df[num_cols])


In [12]:
#  Split & Train model
X = df.drop('Transported', axis=1)
y = df['Transported']
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

In [13]:

#  Grid SearchCV
param_grid = {
    'n_estimators': [150, 200, 250],
    'max_depth': [4, 5, 6],
    'learning_rate': [0.03, 0.05, 0.07],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0],
    'gamma': [0, 0.1],
    'reg_alpha': [0.01, 0.1],
    'reg_lambda': [1.0, 2.0]
}


In [14]:
model = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
grid = GridSearchCV(
    model,
    param_grid=param_grid,
    cv=3,
    scoring='accuracy',
    verbose=1,
    n_jobs=-1
)

grid.fit(X_train, y_train)

Fitting 3 folds for each of 864 candidates, totalling 2592 fits


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [15]:
#  Evaluate Best Model
best_model = grid.best_estimator_
y_pred = best_model.predict(X_val)
val_acc = accuracy_score(y_val, y_pred)

print("\nBest Parameters:", grid.best_params_)
print(f" Validation Accuracy: {val_acc:.4f}")


Best Parameters: {'colsample_bytree': 1.0, 'gamma': 0, 'learning_rate': 0.05, 'max_depth': 4, 'n_estimators': 200, 'reg_alpha': 0.01, 'reg_lambda': 1.0, 'subsample': 1.0}
 Validation Accuracy: 0.8091


In [16]:
datapath_test="https://raw.githubusercontent.com/kagglechallengegroup18/Kaggle_Spaceship_Titanic/refs/heads/main/test.csv"
test_df= pd.read_csv(datapath_test)
passenger_ids = test_df["PassengerId"].copy()

In [17]:
# Preprocessing (same as train)
test_df[['Deck', 'CabinNum', 'Side']] = test_df['Cabin'].str.split('/', expand=True)
test_df['Age'] = test_df['Age'].replace(0.0, np.nan)

for col in ['HomePlanet', 'CryoSleep', 'Destination', 'VIP', 'Deck', 'Side']:
    test_df[col].fillna(test_df[col].mode()[0], inplace=True)

test_df['CryoSleep'] = test_df['CryoSleep'].map({'True': 1, 'False': 0, True: 1, False: 0})
test_df['VIP'] = test_df['VIP'].map({'True': 1, 'False': 0, True: 1, False: 0})

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test_df[col].fillna(test_df[col].mode()[0], inplace=True)
  test_df[col].fillna(test_df[col].mode()[0], inplace=True)


In [18]:
# Spending features
spend_cols = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
test_df['TotalSpend'] = test_df[spend_cols].sum(axis=1)
test_df['TotalSpend_log'] = np.log1p(test_df['TotalSpend'])

In [19]:
# Group + engineered features
test_df['Group'] = test_df['PassengerId'].apply(lambda x: x.split('_')[0])
group_sizes = test_df['Group'].value_counts().to_dict()
test_df['GroupSize'] = test_df['Group'].map(group_sizes)
test_df['NoSpendFlag'] = (test_df[spend_cols].sum(axis=1) == 0).astype(int)
test_df['Group'] = test_df['Group'].map(group_sizes)

In [20]:
# Drop unused
test_df.drop(['Name', 'Cabin', 'PassengerId', 'CabinNum', 'TotalSpend'], axis=1, inplace=True)

# One-hot encoding
test_df = pd.get_dummies(test_df, columns=['HomePlanet', 'Destination', 'Deck', 'Side'], drop_first=True)

In [21]:
#  KNN Imputation
num_cols = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'TotalSpend_log']
imputer = KNNImputer(n_neighbors=5)
test_df[num_cols] = imputer.fit_transform(test_df[num_cols])


In [22]:
preds = best_model.predict(test_df)

In [23]:
#  Prepare submission
submission = pd.DataFrame({
    "PassengerId": passenger_ids,
    "Transported": preds.astype(bool)
})

In [24]:
submission.to_csv("submission_XGBoost_gridsearch.csv", index=False)
print("Submission file saved as submission.csv")

Submission file saved as submission.csv


In [26]:
from google.colab import files

files.download('submission_XGBoost_gridsearch.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>