# In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score
from sklearn.preprocessing import LabelEncoder

from catboost import CatBoostClassifier, Pool

# Read the raw training and test tables from the working directory.
train, test = (pd.read_csv(path) for path in ("train.csv", "test.csv"))

# Encode the binary target as integers: "Stayed" -> 1, "Left" -> 0.
# A label outside this mapping becomes NaN, which makes the int cast raise.
train['retention_status'] = (
    train['retention_status']
    .map({"Left": 0, "Stayed": 1})
    .astype(int)
)




def merge_rare(df, col, th=0.005):
    """Collapse infrequent values of ``df[col]`` into the label ``"OTHER"``.

    A value counts as infrequent when its share of the non-missing entries
    of the column is strictly below ``th``. The column is overwritten on
    ``df`` itself and nothing is returned; missing entries are left as-is.
    """
    shares = df[col].value_counts(normalize=True)
    infrequent = shares.index[shares.lt(th).to_numpy()]
    is_infrequent = df[col].isin(infrequent)
    df[col] = df[col].mask(is_infrequent, "OTHER")

# Collapse rare levels of every string column in train, then bring test onto
# the vocabulary that survived in train.
for col in train.select_dtypes(include="object").columns:
    # The row identifier is not a feature. Each ID occurs (at most) a handful
    # of times, so merging would overwrite the IDs with "OTHER" and corrupt the
    # submission file that is keyed on this column.
    if col == "founder_id":
        continue

    merge_rare(train, col)

    # Rarity is decided on train only. Judging test by its own frequencies can
    # keep a level in one frame and merge it in the other purely through
    # sampling noise, so the model would see labels it was never trained on.
    # Any non-missing test value that is not a surviving train level is mapped
    # to "OTHER"; missing values stay missing, as in merge_rare.
    kept_levels = train[col].dropna().unique()
    outside_vocab = test[col].notna() & ~test[col].isin(kept_levels)
    test[col] = test[col].mask(outside_vocab, "OTHER")

# --- Feature engineering ---
# The imputation value and the category -> integer-code vocabularies are
# learned from train only and then applied to both frames. Computing them per
# frame (as `astype('category').cat.codes` does) assigns codes by the sorted
# set of levels present in *that* frame, so the same raw value could receive
# different codes in train and test whenever the observed level sets differ.
founding_median = train['years_since_founding'].median()
coded_cols = ['team_size_category', 'venture_satisfaction', 'work_life_balance_rating']
code_levels = {c: train[c].astype('category').cat.categories for c in coded_cols}

for df in [train, test]:
    df['years_since_founding'] = df['years_since_founding'].fillna(founding_median)

    # Ratio features. The "+ 1" keeps a denominator of 0 from producing inf.
    df['revenue_per_year'] = df['monthly_revenue_generated'] / (df['years_since_founding'] + 1)
    df['life_investment_ratio'] = df['years_with_startup'] / (df['founder_age'] + 1)
    df['age_at_founding'] = df['founder_age'] - df['years_with_startup']
    df['revenue_per_round'] = df['monthly_revenue_generated'] / (df['funding_rounds_led'] + 1)

    # Integer codes against the train vocabulary. Missing values, and values
    # that never occur in train, get code -1.
    codes = {
        c: pd.Series(pd.Categorical(df[c], categories=levels).codes, index=df.index)
        for c, levels in code_levels.items()
    }

    # Interaction features
    df['commitment_x_team'] = df['years_with_startup'] * codes['team_size_category']
    df['satisfaction_x_balance'] = (
        codes['venture_satisfaction'] * codes['work_life_balance_rating']
    )
TARGET = "retention_status"
ID_COL = "founder_id"

# Training features and labels: everything except the target and the row ID.
X = train.drop(columns=[TARGET, ID_COL])
y = train[TARGET]

# Select the test features by train's column names so both matrices have the
# same columns in the same order. A column missing from test raises KeyError
# here instead of surfacing later as a feature mismatch at prediction time.
# `.copy()` gives an independent frame for the in-place casts below.
X_test = test[X.columns].copy()

cat_cols = X.select_dtypes(include="object").columns.tolist()

# Ensure all categorical columns are proper strings (no NaN, no floats);
# the cast turns missing values into the literal string "nan".
for col in cat_cols:
    X[col] = X[col].astype(str)
    X_test[col] = X_test[col].astype(str)


# CatBoost binary classifier shared by every cross-validation fold.
cat_model = CatBoostClassifier(
    # Objective and the metric monitored on the evaluation set
    loss_function="Logloss",
    eval_metric="F1",
    # Boosting schedule
    iterations=1500,
    learning_rate=0.03,
    early_stopping_rounds=50,
    # Tree shape and regularisation
    depth=8,
    l2_leaf_reg=4,
    # Runtime settings
    task_type="CPU",
    random_state=42,
    verbose=False,
)

# Stratified 5-fold cross-validation: collects out-of-fold probabilities for
# train and the fold-averaged probabilities for test.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

oof_preds = np.zeros(len(train))
test_preds = np.zeros(len(test))

# The test features are identical for every fold, so wrap them in a Pool once
# instead of letting each predict call convert the DataFrame again.
test_pool = Pool(X_test, cat_features=cat_cols)

for fold, (train_idx, valid_idx) in enumerate(skf.split(X, y), start=1):

    print(f"Training Fold {fold}...")

    X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
    y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx]

    train_pool = Pool(X_train, y_train, cat_features=cat_cols)
    valid_pool = Pool(X_valid, y_valid, cat_features=cat_cols)

    # The held-out fold doubles as the evaluation set for the fit.
    model = cat_model.fit(train_pool, eval_set=valid_pool)

    # OOF predictions: probability of class 1 for the held-out rows
    oof_preds[valid_idx] = model.predict_proba(valid_pool)[:, 1]

    # Test predictions: each fold contributes an equal share of the average
    test_preds += model.predict_proba(test_pool)[:, 1] / skf.n_splits

    print(f"Fold {fold} F1:", f1_score(y_valid, (model.predict(valid_pool)).astype(int)))

# Probabilities at or above this cut-off are labelled 1, the rest 0.
threshold = 0.5

# Overall cross-validated score from the out-of-fold probabilities.
final_oof = (oof_preds >= threshold).astype(int)
print("\nOOF F1 Score:", f1_score(y, final_oof))

final_test_pred = (test_preds >= threshold).astype(int)

# Submission frame: the test IDs plus the predicted label, written back in
# the original string form (1 -> "Stayed", 0 -> "Left").
submission = test[["founder_id"]].copy()
submission["retention_status"] = pd.Series(
    final_test_pred, index=submission.index
).map({0: "Left", 1: "Stayed"})

submission.to_csv("submission_catboost_best.csv", index=False)
print(" Submission Saved as submission_catboost_best.csv")
