# In [None]:
#  LIGHTGBM + ADVANCED SVM SOFT-VOTING ENSEMBLE
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.svm import LinearSVC
from sklearn.kernel_approximation import Nystroem
from sklearn.preprocessing import OneHotEncoder, RobustScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.model_selection import RandomizedSearchCV

# Read the training and test splits from the working directory.
print("Loading data...")
train, test = (pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv"))

# Column names shared by the rest of the script.
TARGET, ID_COL = "retention_status", "founder_id"

# Encode the label as an integer: "Stayed" -> 1, "Left" -> 0.
# Any other label maps to NaN, which makes the integer cast raise.
train[TARGET] = train[TARGET].map({"Left": 0, "Stayed": 1}).astype(int)


# Engineered features, added in place to both frames.
#
# Everything "learned" here (the imputation median and the level -> code
# mapping) comes from the training frame and is reused for the test frame, so
# the same raw value always produces the same feature value in both splits.
founding_median = train["years_since_founding"].median()
code_dtypes = {
    col: pd.CategoricalDtype(train[col].astype("category").cat.categories)
    for col in (
        "team_size_category",
        "venture_satisfaction",
        "work_life_balance_rating",
    )
}

for df in [train, test]:

    df["years_since_founding"] = df["years_since_founding"].fillna(founding_median)

    # Ratio features; the +1 keeps a zero-valued denominator from dividing by zero.
    df["revenue_per_year"] = df["monthly_revenue_generated"] / (df["years_since_founding"] + 1)
    df["life_investment_ratio"] = df["years_with_startup"] / (df["founder_age"] + 1)
    df["age_at_founding"] = df["founder_age"] - df["years_with_startup"]
    df["revenue_per_round"] = df["monthly_revenue_generated"] / (df["funding_rounds_led"] + 1)

    # Integer code per level, using the levels found in the training column.
    # Missing values and levels not present in training get code -1.
    # The codes are widened to int64: raw category codes are int8, and an
    # int8 * int8 product stays int8, which the later
    # select_dtypes(include=["int64", "float64"]) would not pick up, so the
    # feature would be dropped by the ColumnTransformer.
    team_code = df["team_size_category"].astype(
        code_dtypes["team_size_category"]
    ).cat.codes.astype("int64")
    satisfaction_code = df["venture_satisfaction"].astype(
        code_dtypes["venture_satisfaction"]
    ).cat.codes.astype("int64")
    balance_code = df["work_life_balance_rating"].astype(
        code_dtypes["work_life_balance_rating"]
    ).cat.codes.astype("int64")

    # Interaction features built from the level codes.
    df["commitment_x_team"] = df["years_with_startup"] * team_code
    df["satisfaction_x_balance"] = satisfaction_code * balance_code


# Model inputs: everything except the label, the row identifier and the
# "founder_visibility" column.
X = train.drop(columns=[TARGET, ID_COL, "founder_visibility"])
y = train[TARGET]
X_test = test.drop(columns=[ID_COL, "founder_visibility"])


# Split the columns by dtype for the ColumnTransformer.
# "number" selects every integer/float width, not only int64/float64. Listing
# just those two would leave narrower numeric columns (e.g. an int8 product of
# category codes) in neither list, and the ColumnTransformer silently drops
# columns it was not given (its default is remainder="drop").
num_cols = X.select_dtypes(include="number").columns.tolist()
cat_cols = X.select_dtypes(include=["object", "category", "bool"]).columns.tolist()

# Categorical columns: fill gaps with the most frequent level, then one-hot
# encode into a dense array. Levels unseen during fit are ignored at transform
# time instead of raising.
categorical_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
    ]
)

# Numeric columns: fill gaps with the column median, then apply RobustScaler.
numeric_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", RobustScaler()),
    ]
)

# Route each column group through its own sub-pipeline.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, num_cols),
        ("cat", categorical_transformer, cat_cols),
    ]
)


# lgbm
# LightGBM classifier behind the shared preprocessor.
lgbm = Pipeline([
    ("preprocessor", preprocessor),
    ("model", lgb.LGBMClassifier(random_state=42))
])

# Randomized search: 10 sampled candidates out of the 36 combinations below,
# each scored with weighted F1 under 3-fold cross-validation.
lgbm_search = RandomizedSearchCV(
    lgbm,
    param_distributions={
        "model__learning_rate": [0.01, 0.05],
        "model__n_estimators": [300, 500, 800],
        "model__num_leaves": [31, 50, 70],
        "model__class_weight": [None, "balanced"]
    },
    n_iter=10,
    scoring="f1_weighted",
    cv=3,
    n_jobs=-1,   # use all available cores for the candidate fits
    verbose=1,
    random_state=42
)

print("\n Training LightGBM…")
lgbm_search.fit(X, y)
# best_score_ is the mean cross-validated score of the best candidate.
print("LightGBM Best:", lgbm_search.best_score_)


# svm
# Approximate RBF-kernel SVM: the Nystroem feature map followed by a linear SVM.
# LinearSVC is seeded like every other component in this script so that
# repeated runs give the same model.
svm_pipe = Pipeline([
    ("preprocessor", preprocessor),
    ("feature_map", Nystroem(kernel="rbf", random_state=42)),
    ("svm", LinearSVC(class_weight="balanced", max_iter=8000, random_state=42))
])

# Randomized search: 10 sampled candidates out of the 64 combinations below,
# each scored with weighted F1 under 3-fold cross-validation.
svm_search = RandomizedSearchCV(
    svm_pipe,
    param_distributions={
        "feature_map__gamma": [0.0005, 0.001, 0.003, 0.005],
        "feature_map__n_components": [800, 1000, 1200, 1400],
        "svm__C": [0.3, 1, 3, 10]
    },
    n_iter=10,
    scoring="f1_weighted",
    cv=3,
    n_jobs=1,   # SVM safe mode: candidates are fitted sequentially
    verbose=1,
    random_state=42
)

print("\n Training Advanced SVM…")
svm_search.fit(X, y)
# best_score_ is the mean cross-validated score of the best candidate.
print("SVM Best:", svm_search.best_score_)
print("\n🔮 Creating Ensemble Predictions…")

# LightGBM probability of class 1 ("Stayed") from the refit best pipeline.
lgbm_preds = lgbm_search.best_estimator_.predict_proba(X_test)[:, 1]

# SVM does NOT give proba → convert decision_function to pseudo-proba.
# The sigmoid only squashes the signed margin into (0, 1); it is not a
# calibrated probability.
svm_decision = svm_search.best_estimator_.decision_function(X_test)
svm_probs = 1 / (1 + np.exp(-svm_decision))   # sigmoid

# Weighted average of the two scores (LightGBM 0.55, SVM 0.45).
ensemble_probs = (0.55 * lgbm_preds) + (0.45 * svm_probs)

# Convert to final labels: a blended score of 0.5 or more is "Stayed".
final_preds = (ensemble_probs >= 0.5).astype(int)
final_labels = np.where(final_preds == 1, "Stayed", "Left")

# Column names come from the shared constants instead of repeated literals.
submission = pd.DataFrame({
    ID_COL: test[ID_COL],
    TARGET: final_labels
})

submission.to_csv("submission_lgbm_svm_ensemble.csv", index=False)
print("\n Saved submission_lgbm_svm_ensemble.csv")
