In [3]:
import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.kernel_approximation import Nystroem
from sklearn.metrics import classification_report, f1_score
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import QuantileTransformer, OneHotEncoder
from sklearn.svm import LinearSVC

# 1. LOAD DATA
# Both CSVs are read from the current working directory.
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

# Encode target: "Stayed" becomes 1 and "Left" becomes 0. Any other value maps
# to NaN, which makes the integer cast below raise instead of passing silently.
label_to_int = {"Stayed": 1, "Left": 0}
encoded_target = train["retention_status"].map(label_to_int)
train["retention_status"] = encoded_target.astype(int)

# 2. FEATURE ENGINEERING
# The fill value for missing `years_since_founding` is learned from the
# training frame only and reused for the test frame, so both frames are imputed
# with the same statistic and nothing is estimated from the test data.
# NOTE: the median is still computed before the train/validation split, so
# validation rows contribute to it.
founding_median = train["years_since_founding"].median()

for df in (train, test):
    df["years_since_founding"] = df["years_since_founding"].fillna(founding_median)

    # Ratios. The +1 in each denominator presumably guards against division by
    # zero when the raw value is 0.
    df["revenue_per_year"] = df["monthly_revenue_generated"] / (df["years_since_founding"] + 1)
    df["life_investment_ratio"] = df["years_with_startup"] / (df["founder_age"] + 1)

    # Founder's age minus the years spent with the startup.
    df["age_at_founding"] = df["founder_age"] - df["years_with_startup"]
    

TARGET = "retention_status"
ID_COL = "founder_id"

# Columns that are never fed to the models: the row identifier and
# `founder_visibility`. The training frame additionally loses the target.
non_feature_cols = [ID_COL, "founder_visibility"]

y = train[TARGET]
X = train.drop(columns=[TARGET, *non_feature_cols])
X_test_submission = test.drop(columns=non_feature_cols)

# 3. SPLIT: 80% TRAIN, 20% VALIDATION
# The split is stratified on y so both parts keep the class proportions, and
# the seed is fixed so the same rows land in each part on every run.
split_options = {"test_size": 0.20, "stratify": y, "random_state": 42}
X_train, X_val, y_train, y_val = train_test_split(X, y, **split_options)

print(f" Training Set:   {X_train.shape[0]} rows")
print(f" Validation Set: {X_val.shape[0]} rows")


# 4. PREPROCESSORS
# Column groups are chosen by dtype from the full feature frame X.
numeric_columns = X.select_dtypes(include=["number"]).columns
categorical_columns = X.select_dtypes(include=["object", "category", "bool"]).columns

# Numeric branch: fill gaps with the column median, then map each feature onto
# a normal distribution with QuantileTransformer (intended to help SVM/NN
# convergence).
numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", QuantileTransformer(output_distribution="normal", random_state=42)),
])

# Categorical branch: fill gaps with the most frequent value, then one-hot
# encode into a dense array. Categories unseen during fit are ignored.
categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
])

preprocessor = ColumnTransformer(transformers=[
    ("num", numeric_transformer, numeric_columns),
    ("cat", categorical_transformer, categorical_columns),
])

# 5. DEFINE MODELS
# Each pipeline receives its own unfitted copy of the preprocessor. Pipeline
# fits its steps in place, so sharing one ColumnTransformer instance would let
# fitting one pipeline overwrite the preprocessing state used by the other.

# --- SVM Pipeline (Nystroem + LinearSVC) ---
# Nystroem builds an approximate RBF feature map, so a linear SVM can be
# trained on top of it.
svm_pipeline = Pipeline([
    ("preprocessor", clone(preprocessor)),
    ("feature_map", Nystroem(kernel="rbf", n_components=1200, random_state=42)),
    ("svm", LinearSVC(class_weight="balanced", max_iter=5000, dual=False)),
])

# --- Neural Network Pipeline (MLP) ---
nn_pipeline = Pipeline([
    ("preprocessor", clone(preprocessor)),
    ("nn", MLPClassifier(
        hidden_layer_sizes=(128, 64),
        max_iter=1000,
        early_stopping=True,  # Stops if validation score stalls
        random_state=42,
    )),
])

# 6. TRAIN & VALIDATE
# Each pipeline is fitted on the training split, then scored with F1 on its
# predictions for the validation split.
print("\n Training SVM...")
svm_val_preds = svm_pipeline.fit(X_train, y_train).predict(X_val)
svm_score = f1_score(y_true=y_val, y_pred=svm_val_preds)

print(" Training Neural Network...")
nn_val_preds = nn_pipeline.fit(X_train, y_train).predict(X_val)
nn_score = f1_score(y_true=y_val, y_pred=nn_val_preds)

# 7. COMPARISON REPORT
# Both scores are F1 on the 20% hold-out split (X_val / y_val). The models were
# fitted on the remaining 80%, not on the full dataset, and the verdict below
# says so.

print(f" SVM F1 Score: {svm_score:.4f}")
print(f" NN  F1 Score: {nn_score:.4f}")
print("-" * 40)

# Report a tie explicitly instead of crediting it to either model.
if svm_score > nn_score:
    print("SVM performed better on the 20% validation split.")
elif nn_score > svm_score:
    print("Neural Network performed better on the 20% validation split.")
else:
    print("SVM and Neural Network tied on the 20% validation split.")

# 8. GENERATE SUBMISSION FILES
# For each model: predict on the test features, turn the 1/0 predictions back
# into the "Stayed"/"Left" labels, and write an id + label CSV without the index.
print("\n Generating Submission Files..")

# SVM
svm_test_preds = svm_pipeline.predict(X_test_submission)
svm_labels = np.where(svm_test_preds == 1, "Stayed", "Left")
svm_submission = pd.DataFrame({
    "founder_id": test["founder_id"],
    "retention_status": svm_labels,
})
svm_submission.to_csv("submission_svm_split.csv", index=False)
print("Saved: submission_svm_split.csv")

# NN
nn_test_preds = nn_pipeline.predict(X_test_submission)
nn_labels = np.where(nn_test_preds == 1, "Stayed", "Left")
nn_submission = pd.DataFrame({
    "founder_id": test["founder_id"],
    "retention_status": nn_labels,
})
nn_submission.to_csv("submission_nn_split.csv", index=False)
print(" Saved: submission_nn_split.csv")

 Training Set:   47688 rows
 Validation Set: 11923 rows

 Training SVM...
 Training Neural Network...
 SVM F1 Score: 0.7529
 NN  F1 Score: 0.7632
----------------------------------------
Neural Network performed better when trained on full dataset.

 Generating Submission Files..
Saved: submission_svm_split.csv
 Saved: submission_nn_split.csv
