# In [None]:
import pandas as pd
import numpy as np
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import f1_score

# 1. PREPROCESSING
#
# Loads the train/test CSVs, splits features from the target, encodes the
# target as integers, and builds the shared ColumnTransformer that every model
# pipeline reuses.

print("Loading Data...")
train = pd.read_csv('BinaryTrain.csv')
test = pd.read_csv('TestBinary.csv')

TARGET = 'retention_status'
ID_COL = 'founder_id'

# Separate features and target. The ID column is dropped from the features in
# both frames; errors='ignore' tolerates a test file without the ID column.
X = train.drop(columns=[TARGET, ID_COL])
y = train[TARGET]
X_test = test.drop(columns=[ID_COL], errors='ignore')

# Encode the target labels as integers. LabelEncoder numbers the classes in
# sorted label order, so inspect `le.classes_` for the actual label -> code
# mapping; `le.inverse_transform` restores the original labels later.
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# Define Preprocessing
# Partition the feature columns into "numeric" and "everything else" instead
# of listing specific dtypes. With hard-coded dtype lists, a column whose dtype
# is in neither list (e.g. int32, float32, or a pandas string dtype) would be
# silently discarded by remainder='drop' below.
num_cols = X.select_dtypes(include='number').columns.tolist()
cat_cols = X.select_dtypes(exclude='number').columns.tolist()

# Numeric features: fill missing values with the median, then standardize.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Non-numeric features: fill missing values with the most frequent value, then
# one-hot encode into a dense array. handle_unknown='ignore' keeps transform
# from failing on categories that were not present when the encoder was fit.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

# Columns are selected by name, so the same transformer applies to X and X_test.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, num_cols),
        ('cat', categorical_transformer, cat_cols)
    ],
    remainder='drop'
)

# Helper function to Train, Tune, and Save
def run_model(name, model, param_dist, *, n_iter=10, cv=3, scoring='f1_weighted'):
    """Tune ``model`` with a randomized search, predict the test set, save a submission.

    The estimator is wrapped in a pipeline behind the module-level
    ``preprocessor``, so every key in ``param_dist`` must carry the
    ``model__`` prefix. The function reads the module-level ``X``,
    ``y_encoded``, ``X_test``, ``le``, ``test``, ``ID_COL`` and ``TARGET``.

    Args:
        name: Display name of the model. It is lower-cased and its spaces are
            replaced with underscores to build the output file name
            ``submission_<name>.csv``.
        model: Unfitted estimator placed as the ``'model'`` pipeline step.
        param_dist: Parameter distributions handed to ``RandomizedSearchCV``.
        n_iter: Number of parameter settings sampled by the search.
        cv: Cross-validation setting forwarded to ``RandomizedSearchCV``.
        scoring: Scoring name forwarded to ``RandomizedSearchCV``.

    Returns:
        The fitted ``RandomizedSearchCV`` object, so callers can inspect
        ``best_params_``, ``best_score_`` or ``best_estimator_``.
    """
    print(f"\n=== Training {name} ===")

    # Preprocessing lives inside the pipeline so it is fitted as part of each
    # candidate fit performed by the search rather than once on all of X.
    clf = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', model)
    ])

    # Randomized hyperparameter search over the whole pipeline.
    search = RandomizedSearchCV(
        clf,
        param_distributions=param_dist,
        n_iter=n_iter,
        scoring=scoring,
        cv=cv,
        n_jobs=-1,             # Use all CPU cores
        random_state=42,       # Fixed seed for the parameter sampling
        verbose=1
    )

    # Fit
    search.fit(X, y_encoded)
    print(f"  Best Params: {search.best_params_}")
    print(f"  Best CV Score: {search.best_score_:.4f}")

    # Predict on the test set with the best pipeline found by the search, then
    # map the integer predictions back to the original target labels.
    best_model = search.best_estimator_
    preds_encoded = best_model.predict(X_test)
    preds_labels = le.inverse_transform(preds_encoded)

    # Save the submission. Column names come from the shared ID_COL / TARGET
    # constants so they cannot drift from the ones used to split the data.
    filename = f"submission_{name.lower().replace(' ', '_')}.csv"
    sub = pd.DataFrame({
        ID_COL: test[ID_COL],
        TARGET: preds_labels
    })
    sub.to_csv(filename, index=False)
    print(f"Saved: {filename}")

    return search

# 2. MODEL DEFINITIONS
#
# Every search space below targets the 'model' step of the pipeline built by
# run_model, which is why each key carries the ``model__`` prefix.

# Search space: Random Forest
rf_params = dict(
    model__n_estimators=[100, 200, 300],
    model__max_depth=[10, 20, None],
    model__min_samples_split=[2, 5, 10],
    model__class_weight=['balanced', 'balanced_subsample', None],
)

# Search space: XGBoost
xgb_params = dict(
    model__learning_rate=[0.01, 0.05, 0.1, 0.2],
    model__n_estimators=[100, 300, 500],
    model__max_depth=[3, 5, 7],
    model__subsample=[0.8, 1.0],
    model__colsample_bytree=[0.8, 1.0],
)

# Search space: LightGBM
lgbm_params = dict(
    model__learning_rate=[0.01, 0.05, 0.1],
    model__n_estimators=[100, 300, 500],
    model__num_leaves=[31, 50, 70],
    model__class_weight=['balanced', None],
)

# Search space: CatBoost
cat_params = dict(
    model__learning_rate=[0.01, 0.05, 0.1],
    model__depth=[4, 6, 8],
    model__iterations=[200, 500, 800],
)

# Search space: Decision Tree
dt_params = dict(
    model__max_depth=[5, 10, 20, None],
    model__min_samples_leaf=[1, 5, 10, 20],
    model__criterion=['gini', 'entropy'],
)

# Train, tune and save a submission for each model, in this order.
for model_name, estimator, search_space in (
    ('Random Forest',
     RandomForestClassifier(random_state=42, n_jobs=-1),
     rf_params),
    ('XGBoost',
     xgb.XGBClassifier(tree_method='hist', random_state=42, n_jobs=-1),
     xgb_params),
    ('LightGBM',
     lgb.LGBMClassifier(random_state=42, n_jobs=-1, verbose=-1),
     lgbm_params),
    ('CatBoost',
     CatBoostClassifier(random_state=42, verbose=0, thread_count=-1),
     cat_params),
    ('Decision Tree',
     DecisionTreeClassifier(random_state=42),
     dt_params),
):
    run_model(model_name, estimator, search_space)

print("\nAll models trained and submissions saved!")