In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install catboost



In [3]:
!pip install category_encoders



In [23]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.model_selection import StratifiedKFold
from category_encoders import TargetEncoder

In [24]:
# Handle missing values
for col in ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']:
    train_data[col] = train_data[col].fillna(train_data[col].mean())
    test_data[col] = test_data[col].fillna(test_data[col].mean())

In [25]:
# Convert categorical columns to strings and fill NaN
for col in ['CryoSleep', 'VIP', 'HomePlanet', 'Destination']:
    train_data[col] = train_data[col].astype(str).fillna('Unknown')
    test_data[col] = test_data[col].astype(str).fillna('Unknown')

train_data['Cabin'] = train_data['Cabin'].astype(str).fillna('Unknown/0/Unknown')
test_data['Cabin'] = test_data['Cabin'].astype(str).fillna('Unknown/0/Unknown')

In [26]:
# Feature engineering
# Split Cabin into Deck, Num, and Side
train_data[['Deck', 'Num', 'Side']] = train_data['Cabin'].str.split('/', expand=True)
test_data[['Deck', 'Num', 'Side']] = test_data['Cabin'].str.split('/', expand=True)
train_data['Num'] = pd.to_numeric(train_data['Num'], errors='coerce').fillna(0).astype(int)
test_data['Num'] = pd.to_numeric(test_data['Num'], errors='coerce').fillna(0).astype(int)
train_data['Deck'] = train_data['Deck'].fillna('Unknown')
train_data['Side'] = train_data['Side'].fillna('Unknown')
test_data['Deck'] = test_data['Deck'].fillna('Unknown')
test_data['Side'] = test_data['Side'].fillna('Unknown')

In [27]:
# Extract family size from Name
train_data['Surname'] = train_data['Name'].str.split().str[-1].fillna('Unknown')
test_data['Surname'] = test_data['Name'].str.split().str[-1].fillna('Unknown')
train_data['FamilySize'] = train_data.groupby('Surname')['PassengerId'].transform('count')
test_data['FamilySize'] = test_data.groupby('Surname')['PassengerId'].transform('count')
train_data['FamilySize'] = train_data['FamilySize'].fillna(1)
test_data['FamilySize'] = test_data['FamilySize'].fillna(1)

In [28]:
# Calculate TotalSpend and new features
train_data['TotalSpend'] = train_data[['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']].sum(axis=1)
test_data['TotalSpend'] = test_data[['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']].sum(axis=1)
train_data['CryoSleep_Binary'] = train_data['CryoSleep'].map({'True': 1, 'False': 0, 'Unknown': 0}).fillna(0).astype(int)
test_data['CryoSleep_Binary'] = test_data['CryoSleep'].map({'True': 1, 'False': 0, 'Unknown': 0}).fillna(0).astype(int)
train_data['CryoSpendInteraction'] = train_data['CryoSleep_Binary'] * train_data['TotalSpend']
test_data['CryoSpendInteraction'] = test_data['CryoSleep_Binary'] * test_data['TotalSpend']

In [29]:
# Polynomial features
poly = PolynomialFeatures(degree=2, include_bias=False)
poly_features = ['Age', 'TotalSpend', 'Num']
poly_train = poly.fit_transform(train_data[poly_features])
poly_test = poly.transform(test_data[poly_features])
poly_columns = [f'poly_{i}' for i in range(poly_train.shape[1])]
train_data[poly_columns] = poly_train
test_data[poly_columns] = poly_test

In [30]:
# Scale numerical features
numerical_features = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'TotalSpend', 'Num', 'CryoSpendInteraction', 'FamilySize', 'CryoSleep_Binary'] + poly_columns
scaler = StandardScaler()
train_data[numerical_features] = scaler.fit_transform(train_data[numerical_features])
test_data[numerical_features] = scaler.transform(test_data[numerical_features])


In [31]:
# Prepare features and target
categorical_features = ['HomePlanet', 'CryoSleep', 'Destination', 'VIP', 'Deck', 'Side', 'Surname']
X_train = train_data.drop(columns=['PassengerId', 'Name', 'Transported', 'Cabin'])
y_train = train_data['Transported'].astype(int)
X_test = test_data.drop(columns=['PassengerId', 'Name', 'Cabin'])

In [32]:
# Check class imbalance
print("Class distribution:", y_train.value_counts())
class_counts = y_train.value_counts()
class_weight_0 = class_counts[1] / len(y_train)
class_weight_1 = class_counts[0] / len(y_train)
class_weights = {0: class_weight_0 * 1.5, 1: class_weight_1 * 1.5}

Class distribution: Transported
1    4378
0    4315
Name: count, dtype: int64


In [33]:
# Target encoding with manual cross-validation
encoder = TargetEncoder(cols=categorical_features)
# Initialize encoded DataFrames with numerical columns only
X_train_encoded = pd.DataFrame(train_data[numerical_features].copy(), columns=numerical_features)
X_test_encoded = pd.DataFrame(test_data[numerical_features].copy(), columns=numerical_features)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
for train_idx, val_idx in skf.split(X_train, y_train):
    X_train_fold = X_train.iloc[train_idx]
    y_train_fold = y_train.iloc[train_idx]
    X_val_fold = X_train.iloc[val_idx]
    encoder.fit(X_train_fold[categorical_features], y_train_fold)
    encoded_vals = encoder.transform(X_val_fold[categorical_features])
    # Assign encoded values to the validation indices
    for col in categorical_features:
        X_train_encoded.loc[val_idx, col] = encoded_vals[col].values
encoder.fit(X_train[categorical_features], y_train)  # Fit on full train for test
X_test_encoded[categorical_features] = encoder.transform(X_test[categorical_features])


In [34]:
# Ensure all encoded columns are float
for col in categorical_features:
    X_train_encoded[col] = X_train_encoded[col].astype(float)
    X_test_encoded[col] = X_test_encoded[col].astype(float)

# Combine encoded and numerical features (already combined in initialization)
X_train_final = X_train_encoded
X_test_final = X_test_encoded

In [35]:
# Cross-validation and model training
n_splits = 5
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
test_preds = np.zeros(len(test_data))
models = []

from catboost import CatBoostClassifier
param_grid = {
    'learning_rate': [0.01, 0.05, 0.1, 0.15],  # Slightly broadened
    'depth': [3, 4, 5],
    'l2_leaf_reg': [5, 10, 15]
}

best_score = 0
best_params = None
for lr in param_grid['learning_rate']:
    for d in param_grid['depth']:
        for l2 in param_grid['l2_leaf_reg']:
            model = CatBoostClassifier(
                iterations=500,
                learning_rate=lr,
                depth=d,
                l2_leaf_reg=l2,
                eval_metric='Accuracy',
                early_stopping_rounds=30,
                verbose=0,
                class_weights=class_weights
            )
            cv_scores = []
            for train_idx, val_idx in skf.split(X_train_final, y_train):
                X_train_fold = X_train_final.iloc[train_idx]
                y_train_fold = y_train.iloc[train_idx]
                X_val_fold = X_train_final.iloc[val_idx]
                y_val_fold = y_train.iloc[val_idx]
                model.fit(X_train_fold, y_train_fold, eval_set=[(X_val_fold, y_val_fold)], verbose=0)
                val_preds = model.predict_proba(X_val_fold)[:, 1]
                val_preds = (val_preds > 0.5).astype(int)
                cv_scores.append((val_preds == y_val_fold).mean())
            mean_cv_score = np.mean(cv_scores)
            if mean_cv_score > best_score:
                best_score = mean_cv_score
                best_params = {'learning_rate': lr, 'depth': d, 'l2_leaf_reg': l2}

print(f"Best CV Score: {best_score:.4f} with params: {best_params}")

Best CV Score: 0.8120 with params: {'learning_rate': 0.15, 'depth': 5, 'l2_leaf_reg': 5}


In [36]:
# Train final model with best parameters and log feature importance
best_model = CatBoostClassifier(
    iterations=500,
    learning_rate=best_params['learning_rate'],
    depth=best_params['depth'],
    l2_leaf_reg=best_params['l2_leaf_reg'],
    eval_metric='Accuracy',
    early_stopping_rounds=30,
    verbose=0,
    class_weights=class_weights
)

In [37]:
# Fit on full training data to get feature importance
best_model.fit(X_train_final, y_train)
feature_importance = best_model.get_feature_importance()
feature_names = X_train_final.columns
importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': feature_importance})
print("Feature Importance:\n", importance_df.sort_values(by='Importance', ascending=False))

for train_idx, val_idx in skf.split(X_train_final, y_train):
    X_train_fold = X_train_final.iloc[train_idx]
    y_train_fold = y_train.iloc[train_idx]
    X_val_fold = X_train_final.iloc[val_idx]
    y_val_fold = y_train.iloc[val_idx]
    best_model.fit(X_train_fold, y_train_fold, eval_set=[(X_val_fold, y_val_fold)], verbose=0)
    models.append(best_model)
    test_preds += best_model.predict_proba(X_test_final)[:, 1] / n_splits

Feature Importance:
                  Feature  Importance
24                  Deck    9.296289
20            HomePlanet    9.160655
4                    Spa    6.930574
12                poly_1    6.439589
5                 VRDeck    6.368154
2              FoodCourt    6.073270
26               Surname    4.931209
25                  Side    4.508336
7                    Num    4.447268
1            RoomService    3.855540
18                poly_7    3.546942
3           ShoppingMall    3.527178
17                poly_6    2.919483
14                poly_3    2.918155
22           Destination    2.851151
16                poly_5    2.850233
10      CryoSleep_Binary    2.841244
15                poly_4    2.698337
13                poly_2    2.354907
9             FamilySize    1.961645
19                poly_8    1.847215
21             CryoSleep    1.770067
8   CryoSpendInteraction    1.508348
6             TotalSpend    1.312818
11                poly_0    1.156212
0                

In [38]:
# Test a refined threshold range
thresholds = [0.45, 0.47, 0.48, 0.49, 0.50, 0.51, 0.52, 0.53]
best_score = 0
best_predictions = None

for thresh in thresholds:
    predictions = (test_preds > thresh).astype(int)
    submission = pd.DataFrame({
        'PassengerId': test_data['PassengerId'],
        'Transported': predictions.astype(bool)
    })
    # Simulate score using CV accuracy
    cv_scores = []
    for i, (train_idx, val_idx) in enumerate(skf.split(X_train_final, y_train)):
        X_train_fold = X_train_final.iloc[train_idx]
        y_train_fold = y_train.iloc[train_idx]
        X_val_fold = X_train_final.iloc[val_idx]
        y_val_fold = y_train.iloc[val_idx]
        val_preds = models[i].predict_proba(X_val_fold)[:, 1]
        val_preds = (val_preds > thresh).astype(int)
        cv_scores.append((val_preds == y_val_fold).mean())
    mean_cv_score = np.mean(cv_scores)
    print(f"Threshold {thresh}: CV Accuracy = {mean_cv_score:.4f}")
    if mean_cv_score > best_score:
        best_score = mean_cv_score
        best_predictions = predictions

Threshold 0.45: CV Accuracy = 0.8574
Threshold 0.47: CV Accuracy = 0.8582
Threshold 0.48: CV Accuracy = 0.8594
Threshold 0.49: CV Accuracy = 0.8595
Threshold 0.5: CV Accuracy = 0.8614
Threshold 0.51: CV Accuracy = 0.8605
Threshold 0.52: CV Accuracy = 0.8591
Threshold 0.53: CV Accuracy = 0.8602


In [39]:
# Use the best predictions
submission = pd.DataFrame({
    'PassengerId': test_data['PassengerId'],
    'Transported': best_predictions.astype(bool)
})

# Save to CSV
submission.to_csv('predictions.csv', index=False)

print(f"Best Threshold: {thresholds[np.argmax([np.mean(cv_scores) for thresh in thresholds])]}")
print(f"Cross-Validation Accuracy: {best_score:.4f}")

Best Threshold: 0.45
Cross-Validation Accuracy: 0.8614
