In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, f1_score
from sklearn.preprocessing import QuantileTransformer, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.svm import LinearSVC
from sklearn.kernel_approximation import Nystroem
from sklearn.neural_network import MLPClassifier

# 1. LOAD DATA
# Read both CSVs in one pass; tuple unpacking keeps the train-then-test order.
train, test = (pd.read_csv(csv_path) for csv_path in ("train.csv", "test.csv"))

# Encode target: "Stayed" -> 1, "Left" -> 0.
# Labels outside this mapping become NaN, which the integer cast rejects.
target_codes = {"Stayed": 1, "Left": 0}
encoded_target = train["retention_status"].map(target_codes)
train["retention_status"] = encoded_target.astype(int)

# 2. FEATURE ENGINEERING
# The fill value for years_since_founding is learned from the training frame
# only and reused for the test frame, so both frames are imputed with the same
# statistic instead of each frame using its own median.
# NOTE(review): this median is still computed before the train/validation
# split further down, so validation rows contribute to it -- confirm that is
# acceptable for the internal comparison.
founding_median = train["years_since_founding"].median()

for df in [train, test]:
    df["years_since_founding"] = df["years_since_founding"].fillna(founding_median)
    # The +1 keeps each denominator non-zero when the raw value is 0.
    df["revenue_per_year"] = df["monthly_revenue_generated"] / (df["years_since_founding"] + 1)
    df["life_investment_ratio"] = df["years_with_startup"] / (df["founder_age"] + 1)
    # log(1 + revenue): defined at 0 and compresses large revenue values.
    df["log_revenue"] = np.log1p(df["monthly_revenue_generated"])

TARGET = "retention_status"
ID_COL = "founder_id"

# Columns removed from both frames before modelling; the training frame
# additionally loses the target column.
non_feature_cols = [ID_COL, "founder_visibility"]

# Prepare X / y (Train) and X_test_submission (Submission Data)
y = train[TARGET]
X = train.drop(columns=[TARGET, *non_feature_cols])
X_test_submission = test.drop(columns=non_feature_cols)

# 3. SPLIT: TRAIN ON 20%, TEST ON 80% (Internal Check)
# Stratified hold-out: train_size=0.20 keeps only 20% of the rows for fitting
# the models; the remaining 80% become the internal validation set.
split_parts = train_test_split(
    X,
    y,
    train_size=0.20,
    stratify=y,
    random_state=42,
)
X_small_train, X_large_val, y_small_train, y_large_val = split_parts

print(f" Training on Small Subset: {len(X_small_train)} samples (20%)")
print(f" Validating on Remaining:  {len(X_large_val)} samples (80%)")

# 4. PREPROCESSORS
# Column groups are resolved by dtype from the full feature frame X.
numeric_cols = X.select_dtypes(include=["number"]).columns
categorical_cols = X.select_dtypes(include=["object", "category", "bool"]).columns

# Numeric branch: median imputation, then a quantile mapping onto a normal
# output distribution.
numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", QuantileTransformer(output_distribution="normal", random_state=42)),
])

# Categorical branch: most-frequent imputation, then dense one-hot encoding
# that ignores categories not seen during fit.
categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
])

preprocessor = ColumnTransformer(transformers=[
    ("num", numeric_transformer, numeric_cols),
    ("cat", categorical_transformer, categorical_cols),
])

# 5. DEFINE MODELS
# Both pipelines reference the same `preprocessor` object as their first step.

# --- SVM (Better for small data usually) ---
# Nystroem builds an approximate RBF feature map that a linear SVM is then
# trained on.
rbf_feature_map = Nystroem(kernel="rbf", n_components=800, random_state=42)
linear_svm = LinearSVC(class_weight="balanced", max_iter=5000, dual=False)
svm_pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("nystroem", rbf_feature_map),
    ("svm", linear_svm),
])

# --- Neural Network (Data hungry) ---
mlp_classifier = MLPClassifier(hidden_layer_sizes=(64, 32), max_iter=1000, random_state=42)
nn_pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("nn", mlp_classifier),
])

# 6. TRAIN & COMPARE (Internal)
# Each pipeline is fitted on the 20% subset and scored (F1) on the 80% hold-out.
print("\n Training SVM on 20% data...")
svm_preds = svm_pipeline.fit(X_small_train, y_small_train).predict(X_large_val)
svm_f1 = f1_score(y_true=y_large_val, y_pred=svm_preds)

print(" Training NN on 20% data...")
nn_preds = nn_pipeline.fit(X_small_train, y_small_train).predict(X_large_val)
nn_f1 = f1_score(y_true=y_large_val, y_pred=nn_preds)

# 7. REPORT
# Print both validation F1 scores and a one-line verdict on which model did
# better when trained on the 20% subset.
print("\n" + "="*30)
print("RESULTS (Trained on 20% Data)")
print("="*30)
print(f" SVM F1: {svm_f1:.4f}")
print(f" NN  F1: {nn_f1:.4f}")

# A tie is reported explicitly instead of being credited to the neural network.
if svm_f1 > nn_f1:
    print("\n Conclusion: SVM generalizes better with limited data.")
elif nn_f1 > svm_f1:
    print("\n Conclusion: Neural Network managed to learn better despite limited data.")
else:
    print("\n Conclusion: SVM and Neural Network scored the same F1 on this split.")

# 8. GENERATE SUBMISSION FILES
def _decode_labels(encoded_preds):
    """Map encoded predictions back to labels: 1 -> "Stayed", otherwise "Left"."""
    return np.where(encoded_preds == 1, "Stayed", "Left")


def _save_submission(labels, filename):
    """Write a founder_id / retention_status CSV to `filename` and report it."""
    submission = pd.DataFrame({
        "founder_id": test["founder_id"],
        "retention_status": labels,
    })
    submission.to_csv(filename, index=False)
    print(f"Saved: {filename}")


print("\n Generating Submission Files using test.csv...")

# SVM Submission
svm_test_preds = svm_pipeline.predict(X_test_submission)
svm_labels = _decode_labels(svm_test_preds)
_save_submission(svm_labels, "submission_svm_20percent.csv")

# NN Submission
nn_test_preds = nn_pipeline.predict(X_test_submission)
nn_labels = _decode_labels(nn_test_preds)
_save_submission(nn_labels, "submission_nn_20percent.csv")

 Training on Small Subset: 11922 samples (20%)
 Validating on Remaining:  47689 samples (80%)

 Training SVM on 20% data...
 Training NN on 20% data...

RESULTS (Trained on 20% Data)
 SVM F1: 0.7488
 NN  F1: 0.6980

 Conclusion: SVM generalizes better with limited data.

 Generating Submission Files using test.csv...
Saved: submission_svm_20percent.csv
Saved: submission_nn_20percent.csv
