In [20]:
pip install catboost



In [31]:
# Required libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from catboost import CatBoostClassifier

# Read the training CSV into a DataFrame and preview its first rows
train_csv_path = '/content/train.csv'
df = pd.read_csv(train_csv_path)
print(df.head())

  PassengerId HomePlanet CryoSleep  Cabin  Destination   Age    VIP  \
0     0001_01     Europa     False  B/0/P  TRAPPIST-1e  39.0  False   
1     0002_01      Earth     False  F/0/S  TRAPPIST-1e  24.0  False   
2     0003_01     Europa     False  A/0/S  TRAPPIST-1e  58.0   True   
3     0003_02     Europa     False  A/0/S  TRAPPIST-1e  33.0  False   
4     0004_01      Earth     False  F/1/S  TRAPPIST-1e  16.0  False   

   RoomService  FoodCourt  ShoppingMall     Spa  VRDeck               Name  \
0          0.0        0.0           0.0     0.0     0.0    Maham Ofracculy   
1        109.0        9.0          25.0   549.0    44.0       Juanna Vines   
2         43.0     3576.0           0.0  6715.0    49.0      Altark Susent   
3          0.0     1283.0         371.0  3329.0   193.0       Solam Susent   
4        303.0       70.0         151.0   565.0     2.0  Willy Santantines   

   Transported  
0        False  
1         True  
2        False  
3        False  
4         True  


In [32]:
# Identify categorical columns in the DataFrame
categorical_cols = ['HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'VIP']

# Turn every categorical column into string labels, marking missing entries as
# 'Unknown'. The missing-value mask is taken BEFORE the cast: astype(str) turns
# NaN into the literal text 'nan', so a fillna() issued after the cast would
# find nothing left to fill.
for col in categorical_cols:
    was_missing = df[col].isna()
    df[col] = df[col].astype(str).mask(was_missing, 'Unknown')

# Cabin is stored as a pandas categorical; its missing values are already
# labelled 'Unknown' by the loop above, so no extra fill is needed here.
df['Cabin'] = df['Cabin'].astype('category')

# CryoSleep and VIP deliberately stay as the strings 'True' / 'False' /
# 'Unknown'. They are listed in categorical_cols, and mapping them back to
# booleans would leave mixed bool/str object columns.

# Impute missing ages with the column mean. The result is assigned back to the
# column rather than using df['Age'].fillna(..., inplace=True): that chained
# in-place form operates on a temporary and is not guaranteed to update df.
df['Age'] = df['Age'].fillna(df['Age'].mean())

In [27]:
# Columns that are identifiers or the label itself, not model inputs
non_feature_cols = ['PassengerId', 'Name', 'Transported']

# Full-data feature matrix (X) and integer-encoded target (y)
y = df['Transported'].astype(int)
X = df.drop(columns=non_feature_cols)

# Draw a reproducible 10% sample of the rows for quicker experimentation
df_sample = df.sample(frac=0.1, random_state=42)
y_sample = df_sample['Transported'].astype(int)
X_sample = df_sample.drop(columns=non_feature_cols)

# Hold out 33% of the sample for testing; the rest is used for training
sample_split = train_test_split(
    X_sample,
    y_sample,
    test_size=0.33,
    random_state=42,
)
X_train_sample, X_test_sample, y_train_sample, y_test_sample = sample_split

In [28]:
# Hyperparameter candidates (reduced grid: 2 x 2 x 1 x 2 = 8 combinations)
param_grid = dict(
    depth=[4, 6],
    learning_rate=[0.01, 0.1],
    iterations=[100],
    l2_leaf_reg=[1, 3],
)

# Base estimator whose hyperparameters are tuned below
model = CatBoostClassifier(verbose=0)

# Exhaustive search over the grid, scored by F1 with 2-fold cross-validation
grid_search = GridSearchCV(
    model,
    param_grid,
    scoring='f1',
    cv=2,  # only 2 folds to keep the search quick
    n_jobs=-1,
    verbose=2,
)

# Run the search on the sampled training split; cat_features is forwarded to
# the estimator's fit() so CatBoost knows which columns are categorical
grid_search.fit(X_train_sample, y_train_sample, cat_features=categorical_cols)

# Keep the estimator refit with the best-scoring parameter combination
best_model = grid_search.best_estimator_

# Predict class labels for the held-out part of the sample
y_pred = best_model.predict(X_test_sample)


Fitting 2 folds for each of 8 candidates, totalling 16 fits


In [30]:
# Label-based metrics computed from the hard predictions
accuracy = accuracy_score(y_test_sample, y_pred)
f1 = f1_score(y_test_sample, y_pred)

# AUC-ROC is computed from scores rather than labels: take column 1 of
# predict_proba as the score for each held-out row
y_pred_proba = best_model.predict_proba(X_test_sample)[:, 1]
auc_roc = roc_auc_score(y_test_sample, y_pred_proba)

# Report the winning hyperparameters followed by the three metrics
print("Best Hyperparameters:", grid_search.best_params_)
print(f"Accuracy: {accuracy:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"AUC-ROC Score: {auc_roc:.4f}")

Best Hyperparameters: {'depth': 6, 'iterations': 100, 'l2_leaf_reg': 3, 'learning_rate': 0.1}
Accuracy: 0.7979
F1 Score: 0.7914
AUC-ROC Score: 0.8611
