In [9]:
pip install catboost



In [22]:
# Required libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from catboost import CatBoostClassifier

# Read the training CSV into a DataFrame and preview its first five rows
train_csv_path = '/content/train.csv'
df = pd.read_csv(train_csv_path)
print(df.head())

  PassengerId HomePlanet CryoSleep  Cabin  Destination   Age    VIP  \
0     0001_01     Europa     False  B/0/P  TRAPPIST-1e  39.0  False   
1     0002_01      Earth     False  F/0/S  TRAPPIST-1e  24.0  False   
2     0003_01     Europa     False  A/0/S  TRAPPIST-1e  58.0   True   
3     0003_02     Europa     False  A/0/S  TRAPPIST-1e  33.0  False   
4     0004_01      Earth     False  F/1/S  TRAPPIST-1e  16.0  False   

   RoomService  FoodCourt  ShoppingMall     Spa  VRDeck               Name  \
0          0.0        0.0           0.0     0.0     0.0    Maham Ofracculy   
1        109.0        9.0          25.0   549.0    44.0       Juanna Vines   
2         43.0     3576.0           0.0  6715.0    49.0      Altark Susent   
3          0.0     1283.0         371.0  3329.0   193.0       Solam Susent   
4        303.0       70.0         151.0   565.0     2.0  Willy Santantines   

   Transported  
0        False  
1         True  
2        False  
3        False  
4         True  


In [24]:
# Columns that are later passed to CatBoost as categorical features
categorical_cols = ['HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'VIP']

# Label missing categorical values as 'Unknown' and store every value as text.
# The missing-value mask is taken BEFORE the string cast: astype(str) turns NaN
# into the literal string 'nan', after which fillna() finds nothing to fill and
# the missing entries would silently be labelled 'nan' instead of 'Unknown'.
for col in categorical_cols:
    is_missing = df[col].isna()
    df[col] = df[col].astype(str).mask(is_missing, 'Unknown')

# Cabin is stored with the pandas 'category' dtype
df['Cabin'] = df['Cabin'].astype('category')

# Map the 'True'/'False' strings back to Python booleans; any other label
# (e.g. 'Unknown') is left untouched, so these columns stay object-typed.
df['CryoSleep'] = df['CryoSleep'].replace({'True': True, 'False': False})
df['VIP'] = df['VIP'].replace({'True': True, 'False': False})

# Impute missing ages with the column mean. Plain assignment is used instead of
# `df['Age'].fillna(..., inplace=True)`, which operates on a temporary Series
# and is deprecated chained assignment in recent pandas.
df['Age'] = df['Age'].fillna(df['Age'].mean())

# Prepare features (X) and target variable (y): identifier-like columns and the
# label are dropped from the features; the boolean label is encoded as 0/1.
X = df.drop(columns=['PassengerId', 'Name', 'Transported'])
y = df['Transported'].astype(int)

In [27]:
# Work on a reproducible 5% random subset of the rows to keep experiments fast
df_sample = df.sample(frac=0.05, random_state=42)

# Features: everything except the identifier, the name and the label.
# Target: the label encoded as an integer.
non_feature_cols = ['PassengerId', 'Name', 'Transported']
X_sample = df_sample.drop(columns=non_feature_cols)
y_sample = df_sample['Transported'].astype(int)

# Hold out 33% of the subset for testing (fixed seed for repeatability)
(
    X_train_sample,
    X_test_sample,
    y_train_sample,
    y_test_sample,
) = train_test_split(X_sample, y_sample, test_size=0.33, random_state=42)

# Base estimator for the hyperparameter search, created with verbose=0
model = CatBoostClassifier(verbose=0)

In [29]:
# Show the dtype of every column of the preprocessed frame
column_dtypes = df.dtypes
print(column_dtypes)

PassengerId       object
HomePlanet        object
CryoSleep         object
Cabin           category
Destination       object
Age              float64
VIP               object
RoomService      float64
FoodCourt        float64
ShoppingMall     float64
Spa              float64
VRDeck           float64
Name              object
Transported         bool
dtype: object


In [30]:
# Count the missing entries of every column
null_values = df.isna().sum()

# Report only the columns that still contain at least one missing value
has_missing = null_values > 0
print(null_values.loc[has_missing])


RoomService     181
FoodCourt       183
ShoppingMall    208
Spa             183
VRDeck          188
Name            200
dtype: int64


In [26]:
# Search space: 3 * 2 * 2 * 2 * 2 * 2 = 96 possible combinations
param_distributions = dict(
    depth=[4, 5, 7],
    learning_rate=[0.01, 0.05],
    iterations=[100, 150],
    l2_leaf_reg=[1, 5],
    border_count=[32, 50],
    scale_pos_weight=[1, 2],
)

# Randomized search over the space above: 20 sampled candidates, each scored
# with F1 on 2-fold cross-validation, with n_jobs=-1 and a fixed seed.
# NOTE: `model` must already exist (it is created in the sampling cell).
random_search = RandomizedSearchCV(
    model,
    param_distributions,
    n_iter=20,
    scoring='f1',  # swap for 'accuracy' or 'roc_auc' if needed
    cv=2,
    n_jobs=-1,
    random_state=42,
    verbose=2,
)

# Run the search on the training part of the sample; cat_features is forwarded
# to the estimator's fit() so CatBoost treats those columns as categorical
random_search.fit(
    X_train_sample,
    y_train_sample,
    cat_features=categorical_cols,
)

# Keep the refitted best estimator and report the winning parameter set
best_model = random_search.best_estimator_
print("Best Hyperparameters:", random_search.best_params_)

# Class predictions for the held-out part of the sample
y_pred = best_model.predict(X_test_sample)


Fitting 2 folds for each of 20 candidates, totalling 40 fits
Best Hyperparameters: {'scale_pos_weight': 1, 'learning_rate': 0.05, 'l2_leaf_reg': 1, 'iterations': 150, 'depth': 5, 'border_count': 32}


In [21]:
# Score the class predictions on the held-out sample
accuracy = accuracy_score(y_test_sample, y_pred)
f1 = f1_score(y_test_sample, y_pred)

# AUC-ROC needs scores rather than labels: use column 1 of predict_proba,
# i.e. the predicted probability of class 1 for the 0/1 target
y_pred_proba = best_model.predict_proba(X_test_sample)[:, 1]
auc_roc = roc_auc_score(y_test_sample, y_pred_proba)

# Report all three metrics with four decimal places
print(f"Accuracy: {accuracy:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"AUC-ROC Score: {auc_roc:.4f}")


Accuracy: 0.7500
F1 Score: 0.7692
AUC-ROC Score: 0.7893
