In [None]:
!pip install catboost openpyxl --quiet

# Imports
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from catboost import CatBoostClassifier
from sklearn.metrics import f1_score



# Load Data
# Three workbooks are read: the labelled training rows, the unlabelled test
# rows, and the sample submission whose 'Transported' column is overwritten
# with predictions at the end of the script.
train = pd.read_excel("/content/train.xlsx")
test = pd.read_excel("/content/test.xlsx")
submission = pd.read_excel("/content/sample_submission.xlsx")

# Combine for preprocessing
# Tag every row with its origin and give the test rows a placeholder target so
# both frames expose the same columns before they are stacked.
train['is_train'] = 1
test['is_train'] = 0
test['Transported'] = np.nan
# read_excel is called without index_col, so each frame carries its own default
# 0-based index. ignore_index=True gives the stacked frame one fresh, unique
# index instead of repeating those labels, which keeps any later label-aligned
# assignment on df unambiguous. Train rows come first, then test rows.
df = pd.concat([train, test], ignore_index=True)

# --- Feature Engineering ---
# Cabin is split on '/' into three pieces, stored as Deck, Num and Side.
cabin_parts = df['Cabin'].str.split('/', expand=True)
df[['Deck', 'Num', 'Side']] = cabin_parts

# Derived features
# TotalSpend adds up the five amenity columns row by row. It is computed
# before the missing-value fill below, so missing amounts are simply skipped.
spend_cols = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
df['TotalSpend'] = df[spend_cols].sum(axis=1)
df['NoSpending'] = df['TotalSpend'].eq(0)

# The text before the first '_' in PassengerId is used as a group key; a row
# whose key occurs exactly once in the combined frame is flagged as Solo.
df['FamilyGroup'] = df['PassengerId'].str.split('_').str[0]
group_sizes = df['FamilyGroup'].value_counts()
df['Solo'] = df['FamilyGroup'].map(group_sizes).eq(1)

# Missing values: numeric columns get their column median (taken over the
# combined train + test frame), the remaining columns get a fixed default.
num_cols = ['Age', *spend_cols]
column_medians = df[num_cols].median()
df[num_cols] = df[num_cols].fillna(column_medians)

fill_defaults = {
    'HomePlanet': 'Earth',
    'CryoSleep': False,
    'Deck': 'F',
    'Side': 'S',
    'VIP': False,
    'Destination': 'TRAPPIST-1e',
    'Name': 'Unknown',
}
for column_name, default_value in fill_defaults.items():
    df[column_name] = df[column_name].fillna(default_value)

# Categorical encoding: each column is cast to str and replaced by the integer
# codes of a LabelEncoder fitted on that column alone.
cat_cols = ['HomePlanet', 'CryoSleep', 'VIP', 'Deck', 'Side', 'Destination', 'NoSpending', 'Solo']
for col in cat_cols:
    as_text = df[col].astype(str)
    df[col] = LabelEncoder().fit_transform(as_text)

# Identifier, free-text and helper columns are not passed to the model.
unused_columns = ['PassengerId', 'Name', 'Cabin', 'Num', 'FamilyGroup', 'is_train']
df = df.drop(columns=unused_columns)

# Split back
# The combined frame was stacked train-first, so the first n_train rows are
# the training set and the rest are the test set. The row count is captured
# before `train` is rebound so both slices use the same boundary.
n_train = len(train)

# .copy() makes `train` an independent frame. Assigning a column on a bare
# slice of df is what triggers pandas' SettingWithCopyWarning.
train = df.iloc[:n_train].copy()
test = df.iloc[n_train:].drop(columns='Transported')

# After concatenation with the NaN placeholders the target column is no longer
# a plain boolean column; restore it for the training rows.
train['Transported'] = train['Transported'].astype(bool)


# Model inputs: every remaining column except the target is a feature.
X = train.drop(columns='Transported')
y = train['Transported']

test_X = test.copy()

# --- Cross-validated training, threshold tuning and submission ---
# A 5-fold stratified split; one CatBoost model is trained per fold. Each model
# contributes 1/n_splits of its test-set probability to the final prediction.
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
test_preds = np.zeros(len(test))   # fold-averaged positive-class probability per test row
val_preds = np.zeros(len(train))   # out-of-fold positive-class probability per training row
thresholds = []                    # best decision threshold found on each fold

# Candidate decision thresholds. The grid does not depend on the fold, so it is
# built once. np.arange is half-open: it starts at 0.40 and stops before 0.60.
threshold_grid = np.arange(0.40, 0.60, 0.01)

for fold, (train_idx, val_idx) in enumerate(kf.split(X, y)):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    model = CatBoostClassifier(
        iterations=1500,
        learning_rate=0.02,
        depth=8,
        l2_leaf_reg=3,
        eval_metric='Accuracy',
        random_seed=42,
        verbose=0
    )

    # NOTE(review): the validation fold drives early stopping here and is also
    # used for threshold selection below, so validation scores are likely
    # optimistic. Confirm this is acceptable.
    model.fit(X_train, y_train, eval_set=(X_val, y_val), early_stopping_rounds=100)

    val_prob = model.predict_proba(X_val)[:, 1]
    test_prob = model.predict_proba(test_X)[:, 1]

    # Optimize threshold per fold: keep the first candidate with the strictly
    # highest F1; fall back to 0.5 if no candidate scores above 0.
    # NOTE(review): the model is monitored with Accuracy but the threshold is
    # tuned for F1. Confirm which metric the submission is judged on.
    best_thresh = 0.5
    best_f1 = 0
    for t in threshold_grid:
        score = f1_score(y_val, val_prob > t)
        if score > best_f1:
            best_f1 = score
            best_thresh = t
    thresholds.append(best_thresh)

    val_preds[val_idx] = val_prob
    test_preds += test_prob / kf.n_splits

# Use average optimized threshold
final_threshold = np.mean(thresholds)
print(f"Optimal Threshold: {final_threshold:.4f}")

# Every training row belongs to exactly one validation fold, so val_preds is
# fully populated here. Scoring it at the final threshold gives a single
# validation figure for the whole run.
oof_f1 = f1_score(y, val_preds > final_threshold)
print(f"OOF F1 at final threshold: {oof_f1:.4f}")

# Assumes the sample submission rows are in the same order as the test rows.
submission['Transported'] = test_preds > final_threshold
submission.to_csv("submission_catboost_boosted.csv", index=False)
print("submission saved as 'submission_catboost_boosted.csv'")


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['Transported'] = train['Transported'].astype(bool)


Optimal Threshold: 0.4500
submission saved as 'submission_catboost_boosted.csv'
