In [None]:
import pandas as pd
import numpy as np
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import RobustScaler, OneHotEncoder, LabelEncoder # Changed to RobustScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score

# 1. Load Data
# Read the training and test splits from CSV files in the working directory.
train, test = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

# 2. FEATURE ENGINEERING
# Every engineered feature is a ratio of the form
#     numerator / (denominator + 1)
# The +1 keeps a zero-valued denominator from causing a division by zero.
# Mapping: new column name -> (numerator column, denominator column).
RATIO_FEATURES = {
    'revenue_per_year': ('monthly_revenue_generated', 'years_since_founding'),
    'life_investment_ratio': ('years_with_startup', 'founder_age'),
    'revenue_per_round': ('monthly_revenue_generated', 'funding_rounds_led'),
}

# The columns are added in place to both frames so that train and test end up
# with the same engineered features.
for df in (train, test):
    for feature_name, (numerator, denominator) in RATIO_FEATURES.items():
        df[feature_name] = df[numerator] / (df[denominator] + 1)

# 3. Setup
TARGET = 'retention_status'
ID_COL = 'founder_id'

# Columns removed from the feature matrices on both the train and test side.
non_feature_cols = [ID_COL, "founder_visibility"]

# Training side: features are everything except the label and the columns
# above; the label column itself becomes y.
X = train.drop(columns=[TARGET] + non_feature_cols)
y = train[TARGET]

# Test side: the ids are kept for the submission file. errors='ignore' lets the
# drop succeed even if one of the listed columns is absent from the test file.
test_ids = test[ID_COL]
X_test = test.drop(columns=non_feature_cols, errors='ignore')

# Encode Target
# LabelEncoder maps the label values to integer codes; the fitted encoder is
# kept so predicted codes can be converted back to the original labels later.
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# 4. PREPROCESSING
# Split the feature columns by dtype.
# 'number' matches every numeric dtype (e.g. int32, float32, unsigned ints),
# not only int64/float64. Without this, a numeric column of another width
# would appear in neither list and be silently discarded by remainder='drop'
# in the ColumnTransformer below. Plain bool columns are not matched by
# 'number', so they stay on the categorical side and the lists do not overlap.
num_cols = X.select_dtypes(include='number').columns.tolist()
cat_cols = X.select_dtypes(include=['object', 'category', 'bool']).columns.tolist()

# Numeric columns: fill missing values with the column median, then scale with
# RobustScaler, which is less sensitive to outliers than standard scaling.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', RobustScaler()),
])

# Categorical columns: fill missing values with the most frequent value, then
# one-hot encode. handle_unknown='ignore' means a category that was not seen
# during fit does not raise at transform time; sparse_output=False returns a
# dense array.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])

# Route each column group through its own transformer. Any column that is in
# neither num_cols nor cat_cols is dropped from the model input.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, num_cols),
        ('cat', categorical_transformer, cat_cols),
    ],
    remainder='drop',
)

# 5. MODEL
# Hyperparameters of the multi-layer perceptron, collected in one place.
mlp_params = {
    'hidden_layer_sizes': (64, 32),   # two hidden layers: 64 units, then 32
    'activation': 'relu',
    'solver': 'adam',
    'alpha': 0.05,                    # L2 penalty, far above sklearn's 1e-4 default
    'batch_size': 256,
    'learning_rate_init': 0.001,
    'max_iter': 500,                  # upper bound on training epochs
    'early_stopping': True,           # stop when the held-out score stalls
    'validation_fraction': 0.1,       # share of training rows held out for that check
    'n_iter_no_change': 20,           # epochs without improvement before stopping
    'random_state': 42,
}

# Preprocessing and classifier are chained in one estimator, so calling fit()
# fits the preprocessor on exactly the rows the classifier is trained on.
nn_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('mlp', MLPClassifier(**mlp_params)),
])

# 6. TRAINING
# Stratified, shuffled K-fold cross-validation over the encoded labels.
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
# Take the fold count from the splitter itself so the test-set averaging below
# stays correct if n_splits is ever changed.
n_folds = kf.get_n_splits()
# Running mean of the class-1 probability for every test row.
test_preds_accum = np.zeros(len(X_test))
cv_scores = []

print("Starting Robust Training...")

for fold, (train_idx, val_idx) in enumerate(kf.split(X, y_encoded)):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y_encoded[train_idx], y_encoded[val_idx]

    # Refit the whole pipeline (preprocessing + MLP) on this fold's training rows.
    nn_pipeline.fit(X_train, y_train)

    # Column 1 of predict_proba is the probability of the class encoded as 1;
    # this indexing assumes a binary target. Validate with the standard 0.5
    # threshold.
    val_probs = nn_pipeline.predict_proba(X_val)[:, 1]
    val_preds = (val_probs >= 0.5).astype(int)

    score = f1_score(y_val, val_preds)
    cv_scores.append(score)
    print(f"Fold {fold+1} F1 (Thresh 0.5): {score:.4f}")

    # Each fold contributes an equal 1/n_folds share, so after the loop
    # test_preds_accum holds the mean test probability across all folds.
    test_preds_accum += nn_pipeline.predict_proba(X_test)[:, 1] / n_folds

print(f"\nAverage CV F1: {np.mean(cv_scores):.4f}")

# 7. SUBMISSION
# Threshold the fold-averaged probability at 0.5 to get the encoded class
# (0 or 1), then map the codes back to the original label values with the
# encoder that was fitted on the training target.
final_preds_binary = (test_preds_accum >= 0.5).astype(int)
final_preds_labels = le.inverse_transform(final_preds_binary)

# Column names come from the ID_COL / TARGET constants rather than repeated
# string literals, so the submission header always matches the columns that
# were read from the input data.
submission = pd.DataFrame({
    ID_COL: test_ids,
    TARGET: final_preds_labels,
})

submission.to_csv('submission_nn.csv', index=False)
print("Submission Saved. Upload this one!")