In [1]:
# Mount Google Drive inside the Colab runtime so files stored there are
# reachable under /content/drive (the CSVs read later live in MyDrive).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [17]:
# Install CatBoost into the kernel's own environment (%pip, not !pip) and pin
# the version so a fresh runtime reproduces the same results.
%pip install -q catboost==1.2.8

Collecting catboost
  Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8


In [18]:
from catboost import CatBoostClassifier, Pool
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

In [19]:
# Read the train and test CSVs from the mounted Drive folder.
# No preprocessing happens here; the frames are loaded as-is.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/content/drive/MyDrive/Untitled folder/train.csv",
        "/content/drive/MyDrive/Untitled folder/test.csv",
    )
)

In [20]:
# Impute missing numeric values and keep a per-column missingness flag.
#
# For every numeric column:
#   1. add a boolean `<column>_missing` indicator to both frames, computed
#      BEFORE filling so the information is not lost;
#   2. fill the gaps in both frames with the mean of the TRAINING column.
#
# The training mean is used for the test frame as well so that "missing" maps
# to the same value in train and test, and so that no statistic of the test
# set enters preprocessing (the scaler further down is fit on train only too).
for column in ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']:
    train_data[f'{column}_missing'] = train_data[column].isna()
    test_data[f'{column}_missing'] = test_data[column].isna()

    # Series.mean() skips NaN by default, so this is the mean of observed values.
    train_mean = train_data[column].mean()
    train_data[column] = train_data[column].fillna(train_mean)
    test_data[column] = test_data[column].fillna(train_mean)

In [21]:
# Categorical gaps become an explicit 'Unknown' level and every value is cast
# to str. Missing cabins get a placeholder that still has the three
# slash-separated parts the cabin string is split on later.
for frame in (train_data, test_data):
    for column in ['CryoSleep', 'VIP', 'HomePlanet', 'Destination']:
        filled = frame[column].fillna('Unknown')
        frame[column] = filled.astype(str)
    frame['Cabin'] = frame['Cabin'].fillna('Unknown/0/Unknown')


In [22]:
# Derived features: Deck and Side from the slash-separated Cabin string, and
# TotalSpend as the row-wise sum of the five spending columns.
def _cabin_deck(cabin):
    """Return the first slash-separated part of a cabin string."""
    return cabin.split('/')[0]


def _cabin_side(cabin):
    """Return the third slash-separated part, or 'Unknown' if there is no slash."""
    if '/' not in cabin:
        return 'Unknown'
    return cabin.split('/')[2]


spend_columns = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

for frame in (train_data, test_data):
    frame['Deck'] = frame['Cabin'].apply(_cabin_deck)
    frame['Side'] = frame['Cabin'].apply(_cabin_side)
    frame['TotalSpend'] = frame[spend_columns].sum(axis=1)

In [23]:
# Standardise the numeric columns. The scaler learns its statistics from the
# training frame only and is then applied unchanged to both frames.
numerical_features = [
    'Age',
    'RoomService',
    'FoodCourt',
    'ShoppingMall',
    'Spa',
    'VRDeck',
    'TotalSpend',
]
scaler = StandardScaler()
scaler.fit(train_data[numerical_features])
for frame in (train_data, test_data):
    frame[numerical_features] = scaler.transform(frame[numerical_features])

In [24]:
# Build the model inputs: categorical columns as strings, identifier/free-text
# columns removed, and the target converted to integers.
categorical_features = ['HomePlanet', 'CryoSleep', 'Destination', 'VIP', 'Deck', 'Side']
for frame in (train_data, test_data):
    for col in categorical_features:
        frame[col] = frame[col].astype(str)

# Columns present in both frames that are not used as features.
non_feature_columns = ['PassengerId', 'Cabin', 'Name']

X_train = train_data.drop(columns=non_feature_columns + ['Transported'])
y_train = train_data['Transported'].astype(int)
X_test = test_data.drop(columns=non_feature_columns)

In [25]:
# Wrap the feature frames in CatBoost Pools, declaring which columns are
# categorical. Only the training pool carries labels.
train_pool = Pool(
    data=X_train,
    label=y_train,
    cat_features=categorical_features,
)
test_pool = Pool(
    data=X_test,
    cat_features=categorical_features,
)

In [26]:
# Build the classifier from a fixed set of hyperparameters and fit it on the
# full training pool. verbose=10 logs the training loss every 10 iterations.
catboost_params = {
    'iterations': 200,
    'depth': 8,
    'learning_rate': 0.05,
    'l2_leaf_reg': 3,
    'verbose': 10,
}
model = CatBoostClassifier(**catboost_params)
model.fit(train_pool)

0:	learn: 0.6709115	total: 202ms	remaining: 40.2s
10:	learn: 0.5303761	total: 1.68s	remaining: 28.9s
20:	learn: 0.4741713	total: 3.11s	remaining: 26.5s
30:	learn: 0.4434617	total: 4.21s	remaining: 22.9s
40:	learn: 0.4265011	total: 4.93s	remaining: 19.1s
50:	learn: 0.4120001	total: 5.65s	remaining: 16.5s
60:	learn: 0.4038480	total: 6.52s	remaining: 14.9s
70:	learn: 0.3970137	total: 7.32s	remaining: 13.3s
80:	learn: 0.3917205	total: 7.9s	remaining: 11.6s
90:	learn: 0.3863753	total: 8.47s	remaining: 10.2s
100:	learn: 0.3822715	total: 8.9s	remaining: 8.73s
110:	learn: 0.3807631	total: 9.18s	remaining: 7.36s
120:	learn: 0.3784132	total: 9.43s	remaining: 6.16s
130:	learn: 0.3759957	total: 9.73s	remaining: 5.12s
140:	learn: 0.3731584	total: 10s	remaining: 4.2s
150:	learn: 0.3686448	total: 10.3s	remaining: 3.35s
160:	learn: 0.3657483	total: 10.6s	remaining: 2.58s
170:	learn: 0.3630277	total: 10.9s	remaining: 1.85s
180:	learn: 0.3597865	total: 11.3s	remaining: 1.18s
190:	learn: 0.3571454	total:

<catboost.core.CatBoostClassifier at 0x78178a1aec10>

In [27]:
# Score the test pool and write the submission file.
predictions = model.predict(test_pool)

# One row per test passenger: the id plus the predicted label as a boolean.
submission = test_data[['PassengerId']].assign(
    Transported=predictions.astype(bool)
)
submission.to_csv('predictions.csv', index=False)

# In-sample check only: this is accuracy on the same pool the model was
# fit on, not a held-out estimate.
train_predictions = model.predict(train_pool)
is_correct = train_predictions == y_train
accuracy = is_correct.mean()
print(f"Training Accuracy: {accuracy:.4f}")

Training Accuracy: 0.8322
