In [None]:
pip install catboost

Collecting catboost
  Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8


In [None]:
import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score
from catboost import CatBoostClassifier

In [None]:
# Load train data
# The CSV is read directly over HTTP from the project's GitHub repository, so this cell
# needs network access. `df` is the working training frame that the following
# preprocessing cells modify step by step.
datapath_train = "https://raw.githubusercontent.com/kagglechallengegroup18/Kaggle_Spaceship_Titanic/refs/heads/main/train.csv"
df = pd.read_csv(datapath_train)

In [None]:
# --- Preprocessing & Feature Engineering ---
# 'Cabin' is a '/'-separated string; break it into its three components.
cabin_parts = df['Cabin'].str.split('/', expand=True)
df[['Deck', 'CabinNum', 'Side']] = cabin_parts
# The middle component is converted to a number; unparseable values become NaN.
df['CabinNum'] = pd.to_numeric(df['CabinNum'], errors='coerce')
# An age of exactly 0.0 is treated as missing.
df['Age'] = df['Age'].replace(to_replace=0.0, value=np.nan)

In [None]:
# Fill missing values in the categorical columns with each column's most frequent value.
# The result is assigned back to the column: `df[col].fillna(..., inplace=True)` operates
# on an intermediate object, which pandas warns about and which stops updating `df`
# in pandas 3.0.
for col in ['HomePlanet', 'CryoSleep', 'Destination', 'VIP', 'Deck', 'Side']:
    df[col] = df[col].fillna(df[col].mode()[0])


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mode()[0], inplace=True)
  df[col].fillna(df[col].mode()[0], inplace=True)


In [None]:
# Encode the boolean-like columns as 1/0. The two feature columns accept both real
# booleans and their string spellings; the target maps real booleans only.
flag_to_int = {'True': 1, 'False': 0, True: 1, False: 0}
for flag_col in ('CryoSleep', 'VIP'):
    df[flag_col] = df[flag_col].map(flag_to_int)
df['Transported'] = df['Transported'].map({True: 1, False: 0})

In [None]:
spend_cols = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
# Row-wise total over the five amenity columns (missing amounts are skipped by sum).
total_spend = df[spend_cols].sum(axis=1)
df['TotalSpend'] = total_spend
# log(1 + total) compresses the long right tail of the spend distribution.
df['TotalSpend_log'] = np.log1p(total_spend)
# 1 when the log-scaled total is below 1.0, else 0.
df['LowSpender'] = df['TotalSpend_log'].lt(1.0).astype(int)
# Name of the amenity column holding the largest amount in each row.
df['SpendTypeDominant'] = df[spend_cols].idxmax(axis=1)

In [None]:
# --- Group features ---
# The group id is the part of PassengerId before the underscore.
df['Group'] = df['PassengerId'].apply(lambda x: x.split('_')[0])
# Number of rows sharing each group id, as a {group id: count} dict.
group_sizes = df['Group'].value_counts().to_dict()
df['GroupSize'] = df['Group'].map(group_sizes)
df['IsAlone'] = (df['GroupSize'] == 1).astype(int)
# 'Group' is overwritten with the group size here, so from this line on it is a numeric
# duplicate of 'GroupSize' rather than an identifier.
df['Group'] = df['Group'].map(group_sizes)
# 1 when the five spending columns sum to zero for the passenger.
df['NoSpendFlag'] = (df[spend_cols].sum(axis=1) == 0).astype(int)
# NOTE(review): because 'Group' now holds the size, the two groupbys below aggregate over
# every passenger whose group has the same size, not over each individual travel group.
# If per-group values were intended, group by the original id instead -- and apply the
# same change to the test-set preprocessing so train and test features stay consistent.
df['GroupHasVIP'] = df.groupby('Group')['VIP'].transform('max')
df['GroupCryoRate'] = df.groupby('Group')['CryoSleep'].transform('mean')

In [None]:
# --- Destination features ---
# Indicator for passengers whose destination is '55 Cancri e'.
df['To_55_Cancri_E'] = df['Destination'].eq('55 Cancri e').astype(int)
# Joint category: the CryoSleep value rendered as text, joined to the destination by '_'.
cryo_as_text = df['CryoSleep'].astype(str)
df['Cryo_Dest_Combo'] = cryo_as_text + '_' + df['Destination']

In [None]:
# --- Family-aware features ---
# Surname is the last space-separated token of the passenger's name.
name_tokens = df['Name'].str.split(' ')
df['Surname'] = name_tokens.str[-1]
# Key built from the 'Group' column (as text) and the surname.
df['FamilyGroup'] = df['Group'].astype(str) + '_' + df['Surname']
# Number of rows sharing each key, mapped back onto every row.
family_counts = df['FamilyGroup'].value_counts()
df['FamilyGroupSize'] = df['FamilyGroup'].map(family_counts)
# 1 when more than one row shares the key.
df['IsFamilyGroup'] = df['FamilyGroupSize'].gt(1).astype(int)

In [None]:
# --- Feature interactions ---
# Pairwise products, plus ratios whose denominators are shifted by 1 so that a zero
# denominator value cannot cause a division by zero.
df['Age_VIP'] = df['Age'].mul(df['VIP'])
df['GroupSize_Cryo'] = df['GroupSize'].mul(df['CryoSleep'])
df['Spend_per_Age'] = df['TotalSpend_log'].div(df['Age'].add(1))
df['Room_Food_Ratio'] = df['RoomService'].div(df['FoodCourt'].add(1))
df['Spa_VR_Ratio'] = df['Spa'].div(df['VRDeck'].add(1))

In [None]:
# Remove identifier/helper columns that should not reach the model; labels that are
# not present in the frame are skipped rather than raising.
drop_cols = ['Name', 'Cabin', 'PassengerId', 'TotalSpend', 'Surname', 'FamilyGroup']
df.drop(columns=drop_cols, errors='ignore', inplace=True)

In [None]:
# --- One-hot encode ---
# Categorical columns expanded into dummy indicators; the first level of each is
# dropped (drop_first=True).
onehot_cols = [
    'SpendTypeDominant',
    'Cryo_Dest_Combo',
    'HomePlanet',
    'Destination',
    'Deck',
    'Side',
]
df = pd.get_dummies(df, columns=onehot_cols, drop_first=True)

In [None]:
# --- Impute numerics ---
# Missing numeric values are filled from the 5 nearest neighbouring rows.
# NOTE(review): the imputer is fitted on the full frame, before the train/validation
# split, so validation rows contribute to the imputation -- consider fitting on the
# training split only.
num_cols = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'TotalSpend_log', 'CabinNum']
imputer = KNNImputer(n_neighbors=5)
imputer.fit(df[num_cols])
df[num_cols] = imputer.transform(df[num_cols])

In [None]:
# --- Split dataset ---
# 'Transported' is the target; every other column is a feature.
y = df['Transported']
X = df.drop(columns='Transported')
# 80/20 hold-out split, stratified on the target, with a fixed seed.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)


In [None]:
# --- Train CatBoost with GridSearchCV ---
# Hyper-parameter grid: 2 depths x 2 learning rates x 1 iteration count x 2 L2 strengths.
params = dict(
    depth=[4, 6],
    learning_rate=[0.03, 0.1],
    iterations=[200],
    l2_leaf_reg=[1, 3],
)

In [None]:
# Base classifier with a fixed seed and training output silenced.
model = CatBoostClassifier(
    loss_function='Logloss',
    verbose=0,
    random_state=42,
)
# 3-fold cross-validated search over `params`, scored by accuracy, with n_jobs=-1.
grid = GridSearchCV(
    estimator=model,
    param_grid=params,
    scoring='accuracy',
    cv=3,
    n_jobs=-1,
)
grid.fit(X_train, y_train)


In [None]:
# --- Evaluation ---
# Take the best estimator found by the grid search and score it on the held-out split.
best_model = grid.best_estimator_
val_preds = best_model.predict(X_val)
val_acc = accuracy_score(y_true=y_val, y_pred=val_preds)

In [None]:
# Report the hold-out accuracy and the winning hyper-parameter combination.
for label, value in (
    ("Validation Accuracy:", val_acc),
    ("Best Parameters:", grid.best_params_),
):
    print(label, value)

Validation Accuracy: 0.8033352501437608
Best Parameters: {'depth': 4, 'iterations': 200, 'l2_leaf_reg': 3, 'learning_rate': 0.1}


In [None]:
# Load the test set from the same GitHub repository as the training data
# (requires network access).
datapath_test="https://raw.githubusercontent.com/kagglechallengegroup18/Kaggle_Spaceship_Titanic/refs/heads/main/test.csv"
test_df_processed= pd.read_csv(datapath_test)
# Keep a copy of the ids for the submission file: 'PassengerId' is dropped from the
# frame during preprocessing.
passenger_ids = test_df_processed["PassengerId"].copy()

In [None]:
# --- Test-set preprocessing ---
# Repeats the training feature engineering on the test frame so that the model receives
# the same columns, in the same order, as the ones it was fitted on.

# Cabin decomposition
if 'Cabin' in test_df_processed.columns:
    test_df_processed[['Deck', 'CabinNum', 'Side']] = test_df_processed['Cabin'].str.split('/', expand=True)
    test_df_processed['CabinNum'] = pd.to_numeric(test_df_processed['CabinNum'], errors='coerce')
else:
    test_df_processed['Deck'] = test_df_processed['CabinNum'] = test_df_processed['Side'] = np.nan

# Replace Age == 0.0 with NaN
test_df_processed['Age'] = test_df_processed['Age'].replace(0.0, np.nan)

# Fill mode for categoricals.
# The result is assigned back to the column: `frame[col].fillna(..., inplace=True)` acts
# on an intermediate object, which pandas warns about and which stops working in
# pandas 3.0. A column with no non-missing values has no mode and is left untouched
# instead of raising on `mode()[0]`.
for col in ['HomePlanet', 'CryoSleep', 'Destination', 'VIP', 'Deck', 'Side']:
    if col in test_df_processed.columns:
        col_modes = test_df_processed[col].mode()
        if not col_modes.empty:
            test_df_processed[col] = test_df_processed[col].fillna(col_modes.iloc[0])

# Boolean mapping
test_df_processed['CryoSleep'] = test_df_processed['CryoSleep'].map({'True': 1, 'False': 0, True: 1, False: 0})
test_df_processed['VIP'] = test_df_processed['VIP'].map({'True': 1, 'False': 0, True: 1, False: 0})

# Spending features
spend_cols = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
test_df_processed['TotalSpend'] = test_df_processed[spend_cols].sum(axis=1)
test_df_processed['TotalSpend_log'] = np.log1p(test_df_processed['TotalSpend'])
test_df_processed['LowSpender'] = (test_df_processed['TotalSpend_log'] < 1.0).astype(int)
test_df_processed['SpendTypeDominant'] = test_df_processed[spend_cols].idxmax(axis=1)

# Group features
test_df_processed['Group'] = test_df_processed['PassengerId'].apply(lambda x: x.split('_')[0])
group_sizes = test_df_processed['Group'].value_counts().to_dict()
test_df_processed['GroupSize'] = test_df_processed['Group'].map(group_sizes)
test_df_processed['IsAlone'] = (test_df_processed['GroupSize'] == 1).astype(int)
# 'Group' is overwritten with the group size, exactly as in the training preprocessing.
test_df_processed['Group'] = test_df_processed['Group'].map(group_sizes)
test_df_processed['NoSpendFlag'] = (test_df_processed[spend_cols].sum(axis=1) == 0).astype(int)
# NOTE(review): 'Group' holds the size at this point, so these groupbys aggregate over
# all passengers whose group has the same size rather than per travel group. Left as-is
# to stay consistent with the training features; change both places together.
test_df_processed['GroupHasVIP'] = test_df_processed.groupby('Group')['VIP'].transform('max')
test_df_processed['GroupCryoRate'] = test_df_processed.groupby('Group')['CryoSleep'].transform('mean')

# Destination-based features
test_df_processed['To_55_Cancri_E'] = (test_df_processed['Destination'] == '55 Cancri e').astype(int)
test_df_processed['Cryo_Dest_Combo'] = test_df_processed['CryoSleep'].astype(str) + '_' + test_df_processed['Destination']

# Name-based family features
test_df_processed['Surname'] = test_df_processed['Name'].str.split(' ').str[-1]
test_df_processed['FamilyGroup'] = test_df_processed['Group'].astype(str) + '_' + test_df_processed['Surname']
test_df_processed['FamilyGroupSize'] = test_df_processed['FamilyGroup'].map(test_df_processed['FamilyGroup'].value_counts())
test_df_processed['IsFamilyGroup'] = (test_df_processed['FamilyGroupSize'] > 1).astype(int)

# Interaction features
test_df_processed['Age_VIP'] = test_df_processed['Age'] * test_df_processed['VIP']
test_df_processed['GroupSize_Cryo'] = test_df_processed['GroupSize'] * test_df_processed['CryoSleep']
test_df_processed['Spend_per_Age'] = test_df_processed['TotalSpend_log'] / (test_df_processed['Age'] + 1)
test_df_processed['Room_Food_Ratio'] = test_df_processed['RoomService'] / (test_df_processed['FoodCourt'] + 1)
test_df_processed['Spa_VR_Ratio'] = test_df_processed['Spa'] / (test_df_processed['VRDeck'] + 1)

# Drop unused
drop_cols = ['Name', 'Cabin', 'PassengerId', 'TotalSpend', 'Surname', 'FamilyGroup']
test_df_processed.drop([col for col in drop_cols if col in test_df_processed.columns], axis=1, inplace=True)

# One-hot encode new categorical
test_df_processed = pd.get_dummies(test_df_processed, columns=[
    'SpendTypeDominant', 'Cryo_Dest_Combo', 'HomePlanet', 'Destination', 'Deck', 'Side'
], drop_first=True)

# Align columns with training set: columns the test frame lacks are added filled with 0,
# columns the training set never had are discarded, and the training order is applied.
test_df_processed = test_df_processed.reindex(columns=X.columns, fill_value=0)

# Impute numerics with the imputer that was already fitted on the training data, so the
# test rows go through the same transformation the model's training features did
# (a fresh imputer fitted on the test set would fill values from different neighbours).
num_cols = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'TotalSpend_log', 'CabinNum']
test_df_processed[num_cols] = imputer.transform(test_df_processed[num_cols])

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test_df_processed[col].fillna(test_df_processed[col].mode()[0], inplace=True)
  test_df_processed[col].fillna(test_df_processed[col].mode()[0], inplace=True)


In [None]:

# --- Predict and Save Submission ---
test_preds = best_model.predict(test_df_processed)
# Cast the predicted labels to bool for the 'Transported' column of the submission.
transported_flags = test_preds.astype(bool)
submission = pd.DataFrame({"PassengerId": passenger_ids, "Transported": transported_flags})
# Written to the current working directory, without the index column.
submission.to_csv("CATBooster2.csv", index=False)
print("CATBooster2.csv saved ")

CATBooster2.csv saved 


In [None]:
# Offer the submission file as a browser download when running in Google Colab.
# Outside Colab the `google.colab` package cannot be imported, so the download is
# skipped instead of failing the cell at the very end of a full run.
try:
    from google.colab import files
except ImportError:
    files = None

if files is not None:
    files.download('CATBooster2.csv')
else:
    print("google.colab not available; skipping browser download of CATBooster2.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>