In [1]:
pip install catboost

Collecting catboost
  Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m22.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8


In [2]:
import pandas as pd
import numpy as np
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.impute import KNNImputer
from sklearn.metrics import accuracy_score

In [3]:
# Load train data
datapath_train = "https://raw.githubusercontent.com/kagglechallengegroup18/Kaggle_Spaceship_Titanic/refs/heads/main/train.csv"
df = pd.read_csv(datapath_train)

In [4]:
df[['Deck', 'CabinNum', 'Side']] = df['Cabin'].str.split('/', expand=True)
df['Age'] = df['Age'].replace(0.0, np.nan)

In [5]:
# Categorical mode fill
for col in ['HomePlanet', 'CryoSleep', 'Destination', 'VIP', 'Deck', 'Side']:
    df[col].fillna(df[col].mode()[0], inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mode()[0], inplace=True)
  df[col].fillna(df[col].mode()[0], inplace=True)


In [6]:
# Boolean encoding
df['CryoSleep'] = df['CryoSleep'].map({'True': 1, 'False': 0, True: 1, False: 0})
df['VIP'] = df['VIP'].map({'True': 1, 'False': 0, True: 1, False: 0})
df['Transported'] = df['Transported'].map({True: 1, False: 0})

In [7]:
# Spending and log
spend_cols = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
df['TotalSpend'] = df[spend_cols].sum(axis=1)
df['TotalSpend_log'] = np.log1p(df['TotalSpend'])

In [8]:
# Group features
df['Group'] = df['PassengerId'].apply(lambda x: x.split('_')[0])
group_sizes = df['Group'].value_counts().to_dict()
df['GroupSize'] = df['Group'].map(group_sizes)
df['NoSpendFlag'] = (df[spend_cols].sum(axis=1) == 0).astype(int)
df['Group'] = df['Group'].map(group_sizes)

In [9]:
# Interaction features
df['Age_VIP'] = df['Age'] * df['VIP']
df['GroupSize_Cryo'] = df['GroupSize'] * df['CryoSleep']
df['Spend_per_Age'] = df['TotalSpend_log'] / (df['Age'] + 1)
df['Room_Food_Ratio'] = df['RoomService'] / (df['FoodCourt'] + 1)
df['Spa_VR_Ratio'] = df['Spa'] / (df['VRDeck'] + 1)

In [10]:
# Drop unused
df.drop(['Name', 'Cabin', 'PassengerId', 'CabinNum', 'TotalSpend'], axis=1, inplace=True)

In [11]:
# KNN impute numerics
num_cols = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'TotalSpend_log']
imputer = KNNImputer(n_neighbors=5)
df[num_cols] = imputer.fit_transform(df[num_cols])


In [12]:

#  Split
X = df.drop('Transported', axis=1)
y = df['Transported']

In [13]:
# Categorical feature indices (CatBoost native)
cat_features = [X.columns.get_loc(col) for col in X.select_dtypes(include='object').columns]

In [14]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

In [15]:
#  Train CatBoost
params = {
    'depth': [4, 6],
    'learning_rate': [0.03, 0.1],
    'iterations': [200, 300],
    'l2_leaf_reg': [1, 3]
}

In [16]:
model = CatBoostClassifier(loss_function='Logloss', verbose=0, random_state=42)
grid = GridSearchCV(estimator=model, param_grid=params, cv=3, scoring='accuracy', n_jobs=-1)
grid.fit(X_train, y_train, cat_features=cat_features)

In [17]:
best_model = grid.best_estimator_
val_preds = best_model.predict(X_val)
val_acc = accuracy_score(y_val, val_preds)

In [18]:
print("Best Parameters:", grid.best_params_)
print(f"Validation Accuracy: {val_acc:.4f}")

Best Parameters: {'depth': 6, 'iterations': 300, 'l2_leaf_reg': 3, 'learning_rate': 0.03}
Validation Accuracy: 0.8120


In [19]:
datapath_test="https://raw.githubusercontent.com/kagglechallengegroup18/Kaggle_Spaceship_Titanic/refs/heads/main/test.csv"
test_df= pd.read_csv(datapath_test)
passenger_ids = test_df["PassengerId"].copy()

In [20]:
# Preprocess like training
test_df[['Deck', 'CabinNum', 'Side']] = test_df['Cabin'].str.split('/', expand=True)
test_df['Age'] = test_df['Age'].replace(0.0, np.nan)

In [21]:
for col in ['HomePlanet', 'CryoSleep', 'Destination', 'VIP', 'Deck', 'Side']:
    test_df[col].fillna(test_df[col].mode()[0], inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test_df[col].fillna(test_df[col].mode()[0], inplace=True)
  test_df[col].fillna(test_df[col].mode()[0], inplace=True)


In [22]:
test_df['CryoSleep'] = test_df['CryoSleep'].map({'True': 1, 'False': 0, True: 1, False: 0})
test_df['VIP'] = test_df['VIP'].map({'True': 1, 'False': 0, True: 1, False: 0})

spend_cols = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
test_df['TotalSpend'] = test_df[spend_cols].sum(axis=1)
test_df['TotalSpend_log'] = np.log1p(test_df['TotalSpend'])

In [23]:
test_df['Group'] = test_df['PassengerId'].apply(lambda x: x.split('_')[0])
group_sizes = test_df['Group'].value_counts().to_dict()
test_df['GroupSize'] = test_df['Group'].map(group_sizes)
test_df['NoSpendFlag'] = (test_df[spend_cols].sum(axis=1) == 0).astype(int)
test_df['Group'] = test_df['Group'].map(group_sizes)

In [24]:
# Interaction features
test_df['Age_VIP'] = test_df['Age'] * test_df['VIP']
test_df['GroupSize_Cryo'] = test_df['GroupSize'] * test_df['CryoSleep']
test_df['Spend_per_Age'] = test_df['TotalSpend_log'] / (test_df['Age'] + 1)
test_df['Room_Food_Ratio'] = test_df['RoomService'] / (test_df['FoodCourt'] + 1)
test_df['Spa_VR_Ratio'] = test_df['Spa'] / (test_df['VRDeck'] + 1)

In [25]:
# Drop unused
test_df.drop(['Name', 'Cabin', 'PassengerId', 'CabinNum', 'TotalSpend'], axis=1, inplace=True)

# Align test to train features
test_df = test_df.reindex(columns=X.columns, fill_value=0)

In [26]:
# KNN impute
num_cols = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'TotalSpend_log']
imputer = KNNImputer(n_neighbors=5)
test_df[num_cols] = imputer.fit_transform(test_df[num_cols])

In [27]:

# --- Predict and Save Submission ---
test_preds = best_model.predict(test_df)
submission = pd.DataFrame({
    "PassengerId": passenger_ids,
    "Transported": test_preds.astype(bool)
})
submission.to_csv("CATBooster1.csv", index=False)
print("CATBooster1.csv saved ")

CATBooster1.csv saved 


In [28]:
from google.colab import files

#files.download('CATBoost.csv')