In [None]:

pip install catboost

Collecting catboost
  Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m22.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8


In [None]:
import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from catboost import CatBoostClassifier, Pool
import gc

In [None]:
# Training split (features + Transported label), read directly from the group's GitHub repo.
datapath_train = (
    "https://raw.githubusercontent.com/kagglechallengegroup18/"
    "Kaggle_Spaceship_Titanic/refs/heads/main/train.csv"
)
df_train = pd.read_csv(filepath_or_buffer=datapath_train)

In [None]:
# Unlabelled test split from the same repo; a copy of the passenger IDs is set
# aside before any preprocessing touches the frame.
datapath_test = (
    "https://raw.githubusercontent.com/kagglechallengegroup18/"
    "Kaggle_Spaceship_Titanic/refs/heads/main/test.csv"
)
df_test = pd.read_csv(filepath_or_buffer=datapath_test)
passenger_ids = df_test.loc[:, "PassengerId"].copy()

In [None]:
#  PREPROCESSING FUNCTION
def preprocess(df, imputer=None, is_train=True):
    """Clean a raw passenger frame and derive the model features.

    The input frame is never modified; all work happens on a copy.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data containing PassengerId, HomePlanet, CryoSleep, Cabin,
        Destination, Age, VIP, Name and the five spend columns. When
        ``is_train`` is True it must also contain Transported.
    imputer : object with a ``transform`` method, optional
        Already-fitted numeric imputer. Required when ``is_train`` is False.
        Ignored when ``is_train`` is True, where a new KNNImputer is fitted.
    is_train : bool, default True
        True fits the imputer and splits off the target; False only applies
        the supplied imputer.

    Returns
    -------
    tuple (X, y, imputer) when ``is_train`` is True, otherwise the feature
    frame alone. PassengerId, Name and Cabin are dropped in both cases.

    Raises
    ------
    ValueError
        If ``is_train`` is False and no imputer was supplied.
    """
    if not is_train and imputer is None:
        # Fail up front with an actionable message instead of an
        # AttributeError on None deep inside the function.
        raise ValueError(
            "preprocess(is_train=False) needs the imputer returned by the training call"
        )

    df = df.copy()

    # An age of exactly 0 is treated as "unknown" and left for the imputer.
    df['Age'] = df['Age'].replace(0, np.nan)
    cat_cols = ['HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'VIP', 'Name']
    df[cat_cols] = df[cat_cols].fillna('Missing')
    df['CryoSleep'] = df['CryoSleep'].astype(str)
    df['VIP'] = df['VIP'].astype(str)

    # Cabin is "deck/num/side". Pull the parts out by position rather than with
    # expand=True: expand yields fewer than three columns when no row carries a
    # full cabin string (e.g. every Cabin is missing), which made the original
    # three-column assignment raise.
    cabin_tokens = df['Cabin'].str.split('/', n=2)
    for position, part_name in enumerate(['Deck', 'CabinNum', 'Side']):
        df[part_name] = cabin_tokens.str[position].fillna('Missing')

    # PassengerId is "gggg_pp"; the prefix before the underscore is kept as Group.
    df['Group'] = df['PassengerId'].str.split('_').str[0]
    spend_cols = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
    # Computed before imputation on purpose: sum() skips NaN and NaN > 0 is
    # False, so these reflect only spending that was actually recorded.
    df['TotalSpend'] = df[spend_cols].sum(axis=1)
    for col in spend_cols:
        df[f'{col}_Spent'] = (df[col] > 0).astype(int)

    # Name was filled above, so a missing name contributes len('Missing') here.
    df['NameLength'] = df['Name'].astype(str).str.len()
    df['HasName'] = (df['Name'] != 'Missing').astype(int)

    num_cols = ['Age'] + spend_cols
    if is_train:
        imputer = KNNImputer(n_neighbors=5)
        df[num_cols] = imputer.fit_transform(df[num_cols])
    else:
        df[num_cols] = imputer.transform(df[num_cols])

    drop_cols = ['PassengerId', 'Name', 'Cabin']
    if is_train:
        y = df['Transported'].astype(int)
        X = df.drop(columns=drop_cols + ['Transported'])
        return X, y, imputer
    return df.drop(columns=drop_cols)

In [None]:
# Fit the numeric imputer on the training frame, then reuse that same fitted
# imputer for the test frame so both go through an identical transformation.
X, y, imputer = preprocess(df=df_train, is_train=True)
X_test = preprocess(df=df_test, is_train=False, imputer=imputer)

# Every feature still stored as object dtype is handed to CatBoost as categorical.
cat_cols = X.dtypes[X.dtypes == "object"].index.tolist()

In [None]:
# STRATIFIED K-FOLD CV ON GPU
# Five shuffled, label-stratified folds with a fixed seed. One CatBoost model is
# trained per fold and kept in `models`; per-fold accuracies go to `cv_scores`.
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = []
models = []

for fold, (train_idx, val_idx) in enumerate(kf.split(X, y)):
    print(f"\n🔁 Fold {fold+1}")
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    train_pool = Pool(X_train, y_train, cat_features=cat_cols)
    val_pool = Pool(X_val, y_val, cat_features=cat_cols)

    model = CatBoostClassifier(
        task_type='GPU',  # requires a CUDA device; switch to 'CPU' when none is available
        iterations=1000,
        learning_rate=0.05,
        depth=6,
        random_seed=42,
        early_stopping_rounds=50,
        verbose=100
    )

    # The validation fold doubles as the early-stopping monitor, so the accuracy
    # reported below is measured on data that influenced the stopping point.
    model.fit(train_pool, eval_set=val_pool)
    # Score through the Pool so the categorical columns are declared explicitly,
    # exactly as they were for training.
    preds = model.predict(val_pool)
    acc = accuracy_score(y_val, preds)
    print(f" Fold {fold+1} Accuracy: {acc:.4f}")

    cv_scores.append(acc)
    # The model stays referenced from `models`, so there is nothing to free
    # per fold; the former `del model` / gc.collect() released no memory.
    models.append(model)

print(f"\n Mean CV Accuracy: {np.mean(cv_scores):.4f}")


🔁 Fold 1
0:	learn: 0.6691644	test: 0.6673266	best: 0.6673266 (0)	total: 47.2ms	remaining: 47.1s
100:	learn: 0.4050262	test: 0.4119958	best: 0.4119958 (100)	total: 3.61s	remaining: 32.1s
200:	learn: 0.3873229	test: 0.4028500	best: 0.4028500 (200)	total: 7.08s	remaining: 28.1s
300:	learn: 0.3771120	test: 0.4000466	best: 0.3998917 (272)	total: 10.4s	remaining: 24.2s
400:	learn: 0.3713039	test: 0.3982386	best: 0.3982264 (397)	total: 13.6s	remaining: 20.3s
500:	learn: 0.3636505	test: 0.3966979	best: 0.3966978 (499)	total: 16.8s	remaining: 16.8s
600:	learn: 0.3590242	test: 0.3956791	best: 0.3956658 (595)	total: 20s	remaining: 13.3s
700:	learn: 0.3535127	test: 0.3943636	best: 0.3942438 (695)	total: 23.3s	remaining: 9.95s
800:	learn: 0.3489582	test: 0.3936976	best: 0.3936525 (797)	total: 26.6s	remaining: 6.61s
900:	learn: 0.3441885	test: 0.3936469	best: 0.3933207 (865)	total: 30s	remaining: 3.29s
bestTest = 0.3933207253
bestIteration = 865
Shrink model to first 866 iterations.
✅ Fold 1 Accura

In [None]:
# FINAL MODEL ON GPU
# Refit on every labelled row for the submission. fit() receives no eval_set, so
# an overfitting detector has nothing to monitor and training always runs the
# full 1000 iterations; the former early_stopping_rounds=50 had no effect here
# and is omitted so the configuration says what actually happens.
final_model = CatBoostClassifier(
    task_type='GPU',
    devices='0',
    iterations=1000,
    learning_rate=0.05,
    depth=6,
    random_seed=42,
    verbose=100
)

final_pool = Pool(X, y, cat_features=cat_cols)
final_model.fit(final_pool);  # trailing ';' keeps the model repr out of the cell output

0:	learn: 0.6673812	total: 36.4ms	remaining: 36.4s
100:	learn: 0.4030496	total: 3.39s	remaining: 30.2s
200:	learn: 0.3843093	total: 6.7s	remaining: 26.6s
300:	learn: 0.3742187	total: 10s	remaining: 23.3s
400:	learn: 0.3671586	total: 13.2s	remaining: 19.8s
500:	learn: 0.3606411	total: 16.6s	remaining: 16.6s
600:	learn: 0.3556532	total: 20s	remaining: 13.2s
700:	learn: 0.3507442	total: 23.3s	remaining: 9.95s
800:	learn: 0.3466523	total: 26.6s	remaining: 6.61s
900:	learn: 0.3424710	total: 29.9s	remaining: 3.29s
999:	learn: 0.3392797	total: 33.2s	remaining: 0us


<catboost.core.CatBoostClassifier at 0x78c6c3d5b110>

In [None]:
# OPTIMIZE THRESHOLD ON OUT-OF-FOLD PREDICTIONS
# final_model was fitted on every training row, so scoring it on a slice of
# that same data (the X_val / y_val left over from the last CV fold) would tune
# the threshold on examples the model has already seen. Instead, each fold model
# scores only the rows it was not trained on. kf uses a fixed random_state, so
# calling split() again reproduces exactly the folds the models were built on.
oof_probs = np.zeros(len(X))
for fold_model, (_, fold_val_idx) in zip(models, kf.split(X, y)):
    fold_pool = Pool(X.iloc[fold_val_idx], cat_features=cat_cols)
    oof_probs[fold_val_idx] = fold_model.predict_proba(fold_pool)[:, 1]

best_threshold = 0.5
best_score = 0

# Scan cut-offs 0.30 .. 0.70 in steps of 0.01; ties keep the lowest threshold.
for t in np.arange(0.3, 0.71, 0.01):
    oof_preds = (oof_probs > t).astype(int)
    score = accuracy_score(y, oof_preds)  # or use f1_score
    if score > best_score:
        best_score = score
        best_threshold = t

print(f"\n Best threshold: {best_threshold:.2f} with accuracy: {best_score:.4f}")


 Best threshold: 0.51 with accuracy: 0.8280


In [None]:
#  PREDICT ON TEST SET USING OPTIMIZED THRESHOLD
# Positive-class probability for each test passenger, turned into a boolean
# label by comparing against the tuned cut-off.
test_pool = Pool(data=X_test, cat_features=cat_cols)
test_probs = final_model.predict_proba(test_pool)[:, 1]
test_preds = np.greater(test_probs, best_threshold).astype(bool)

In [None]:
#  CREATE SUBMISSION FILE
# Two columns in Kaggle's expected order: the untouched PassengerId from the raw
# test frame and the boolean prediction for each row.
submission = pd.DataFrame(
    {
        'PassengerId': df_test['PassengerId'],
        'Transported': test_preds,
    }
)
submission.to_csv(path_or_buf="submission10.csv", index=False)

In [None]:
# Offer the submission as a browser download when running inside Google Colab.
# google.colab only exists in the Colab runtime; anywhere else the CSV is
# already on disk in the working directory, so report that instead of failing
# the whole Run All on an ImportError.
try:
    from google.colab import files
except ImportError:
    print("Not running in Colab; submission10.csv is in the working directory.")
else:
    files.download('submission10.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>