# In [None]:
#  ADVANCED NONLINEAR SVM USING NYSTROEM + LINEAR SVM  (FULL PIPELINE)
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.kernel_approximation import Nystroem
from sklearn.svm import LinearSVC
from sklearn.impute import SimpleImputer
from sklearn.model_selection import RandomizedSearchCV

# 1. LOAD DATA
# Both CSV files are read from the current working directory, train first.
train, test = (pd.read_csv(csv_path) for csv_path in ("train.csv", "test.csv"))

# Encode target as an integer column: "Stayed" -> 1, "Left" -> 0.
# Any other label is mapped to NaN, which makes the integer cast raise.
train["retention_status"] = (
    train["retention_status"]
    .map({"Stayed": 1, "Left": 0})
    .astype(int)
)

# 2. FEATURE ENGINEERING
# Every statistic needed to transform the data (the imputation median and the
# category -> integer code mapping) is learned from `train` only and then
# applied to both frames, so a given raw value always yields the same
# engineered value in train and in test.
founding_median = train["years_since_founding"].median()

# Category levels (as pandas orders them for `train`) for the columns whose
# integer codes feed the interaction features below.
code_levels = {
    col: train[col].astype("category").cat.categories
    for col in ("team_size_category", "venture_satisfaction", "work_life_balance_rating")
}

for df in [train, test]:
    df["years_since_founding"] = df["years_since_founding"].fillna(founding_median)

    # Ratio features; the "+ 1" in each denominator avoids division by zero
    # when the denominator column is 0.
    df["revenue_per_year"] = df["monthly_revenue_generated"] / (df["years_since_founding"] + 1)
    df["life_investment_ratio"] = df["years_with_startup"] / (df["founder_age"] + 1)
    df["age_at_founding"] = df["founder_age"] - df["years_with_startup"]
    df["revenue_per_round"] = df["monthly_revenue_generated"] / (df["funding_rounds_led"] + 1)

    # Integer codes based on the train-derived levels. Missing values and
    # labels not present in `train` get code -1. The codes are widened to
    # int64 because pandas returns them in a narrow integer dtype (int8 for a
    # handful of levels); a product of two such columns would keep that narrow
    # dtype and would not match an int64/float64 column selection later on.
    codes = {
        col: pd.Series(
            pd.Categorical(df[col], categories=levels).codes,
            index=df.index,
            dtype="int64",
        )
        for col, levels in code_levels.items()
    }

    df["commitment_x_team"] = df["years_with_startup"] * codes["team_size_category"]
    df["satisfaction_x_balance"] = (
        codes["venture_satisfaction"] * codes["work_life_balance_rating"]
    )


TARGET = "retention_status"
ID_COL = "founder_id"

# Model inputs exclude the label, the row identifier and "founder_visibility"
# (NOTE(review): the reason for dropping "founder_visibility" is not recorded
# in this script).
X = train.drop(columns=[TARGET, ID_COL, "founder_visibility"])
y = train[TARGET]
X_test = test.drop(columns=[ID_COL, "founder_visibility"])

# 3. IDENTIFY COLUMNS
# Select every numeric dtype rather than only int64/float64. A column with a
# narrower dtype (e.g. int8 produced by multiplying category codes, or int32)
# would otherwise appear in neither list, and the ColumnTransformer built from
# these lists drops unlisted columns by default (remainder="drop").
num_cols = X.select_dtypes(include="number").columns.tolist()
cat_cols = X.select_dtypes(include=["object", "category", "bool"]).columns.tolist()

# 4. PREPROCESSORS
# Numeric branch: fill missing values with the column median, then standardize.
numeric_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
)

# Categorical branch: fill missing values with the most frequent level, then
# one-hot encode. Levels unseen during fit are ignored (encoded as all zeros)
# and the encoder emits a sparse matrix.
categorical_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=True)),
    ]
)

# Route each column list to its branch; columns in neither list are not
# passed on (ColumnTransformer's default remainder is "drop").
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, num_cols),
        ("cat", categorical_transformer, cat_cols),
    ]
)

# 5. ADVANCED SVM PIPELINE (NYSTROEM + LINEARSVC)
# preprocess -> approximate RBF kernel feature map -> linear SVM.
# Training a linear model on the Nystroem features approximates a nonlinear
# (RBF-kernel) SVM.
svm_pipeline = Pipeline([
    ("preprocessor", preprocessor),

    # Nystroem builds an approximate RBF feature map from a subset of the
    # training rows. n_components here is only the starting value; the
    # hyperparameter search overrides it (and sets gamma) per candidate.
    ("feature_map", Nystroem(
        kernel="rbf",
        n_components=1200,
        random_state=42,
    )),

    # class_weight="balanced" reweights classes inversely to their frequency.
    # random_state is fixed, like the other seeded components of this script,
    # so that fits using the dual solver (which shuffles the data) are
    # reproducible from run to run.
    ("svm", LinearSVC(
        class_weight="balanced",
        max_iter=5000,
        random_state=42,
    )),
])

# 6. HYPERPARAMETER SEARCH
# Candidate values, keyed as "<pipeline step>__<parameter>" so they reach the
# matching step of svm_pipeline. The grid holds 3 * 3 * 5 = 45 combinations.
param_dist = dict(
    feature_map__n_components=[600, 800, 1000],
    feature_map__gamma=[0.001, 0.005, 0.01],
    svm__C=[0.1, 0.3, 1, 3, 10],
)

# Randomly sample 10 of the combinations and score each one with 3-fold
# cross-validated F1. The search runs in a single process (n_jobs=1) to
# prevent a memory crash.
search = RandomizedSearchCV(
    estimator=svm_pipeline,
    param_distributions=param_dist,
    n_iter=10,
    scoring="f1",
    cv=3,
    n_jobs=1,
    verbose=1,
    random_state=42,
)

print(" Running advanced nonlinear SVM search...")
search.fit(X, y)

# Report the winning configuration and its mean cross-validated score.
print("Best Params:", search.best_params_)
print("Best CV F1:", search.best_score_)

# 7. TRAIN FINAL MODEL
# The search was created with its default refit behaviour, so best_estimator_
# is the best pipeline already refit on all of X, y.
final_model = search.best_estimator_

# 8. PREDICT ON TEST
test_predictions = final_model.predict(X_test)
# Translate back to the original string labels: 1 -> "Stayed", otherwise "Left".
test_labels = np.where(test_predictions != 1, "Left", "Stayed")

# 9. SUBMISSION
# Two columns: the founder identifier from the test file and the predicted label.
submission = test[["founder_id"]].assign(retention_status=test_labels)

submission.to_csv("submission_advanced_svm.csv", index=False)
print(" Saved submission_advanced_svm.csv")