In [None]:
import pandas as pd
import numpy as np
import os
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, StackingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# --- 1. LOAD DATA ---
print("Loading Data...")
# Both CSVs are read relative to the current working directory; change the
# paths here if the data lives somewhere else.
train = pd.read_csv('../data/train.csv')
test = pd.read_csv('../data/test.csv')

# Tag each row with its origin so the two sets can be separated again once the
# shared preprocessing has run.
for frame, origin_flag in ((train, 1), (test, 0)):
    frame['is_train'] = origin_flag

# Test rows get a placeholder label so both frames expose a
# 'retention_status' column when stacked.
test['retention_status'] = 'Unknown'

# Stack train on top of test (row-wise); the original row labels are kept.
combined = pd.concat([train, test])

# --- 2. ADVANCED PRE-PROCESSING ---
# Cleans, imputes, derives interaction features and encodes every categorical
# column of `combined` (train and test rows stacked together).
print("Applying Advanced Feature Engineering...")

# A. CLEANING
# These columns are removed before any feature is built.
cols_to_drop = ['founder_id', 'founder_visibility', 'innovation_support']
combined = combined.drop(columns=cols_to_drop)

# B. IMPUTE
# NOTE: the fill statistics are computed over train and test rows together.
for col in ('monthly_revenue_generated', 'years_since_founding'):
    combined[col] = combined[col].fillna(combined[col].median())
combined['num_dependents'] = combined['num_dependents'].fillna(combined['num_dependents'].mode()[0])
# Missing ratings become the explicit 'Unknown' category, which sat_map below
# scores as 2.
for col in ('work_life_balance_rating', 'venture_satisfaction'):
    combined[col] = combined[col].fillna('Unknown')

# C. **NEW** INTERACTION FEATURES
# 1. Revenue Efficiency: log-compressed revenue per year of existence
#    (+1 in the denominator avoids division by zero for year 0).
combined['revenue_efficiency'] = np.log1p(combined['monthly_revenue_generated']) / (combined['years_since_founding'] + 1)

# 2. Founder Experience Gap: founder age minus years with the startup.
combined['prior_experience'] = combined['founder_age'] - combined['years_with_startup']

# 3. Convert the Yes/No columns to 0/1 in place so they can be used in
#    arithmetic below. Values other than 'Yes'/'No' map to NaN.
binary_map = {'No': 0, 'Yes': 1}
for col in ('working_overtime', 'remote_operations'):
    combined[col] = combined[col].map(binary_map)

# 4. "Burnout Risk" (uses the now-numeric 'working_overtime')
sat_map = {'Unknown': 2, 'Low': 0, 'Poor': 0, 'Below Average': 1, 'Fair': 2, 'Medium': 2, 'Average': 2, 'Good': 3, 'High': 3, 'Very High': 4, 'Excellent': 4}
# Labels absent from sat_map fall back to 2, the same fallback the ordinal
# encoding in section D uses; without it they would become NaN and flow into
# burnout_index and the feature matrix.
combined['satisfaction_score'] = combined['venture_satisfaction'].map(sat_map).fillna(2)
# +1 keeps the denominator non-zero when the satisfaction score is 0.
combined['burnout_index'] = combined['working_overtime'] / (combined['satisfaction_score'] + 1)

# D. ORDINAL ENCODING
# All rating-like columns share sat_map; unmapped labels fall back to 2.
for col in ['work_life_balance_rating', 'venture_satisfaction', 'startup_performance_rating', 'startup_reputation']:
    combined[col] = combined[col].map(sat_map).fillna(2)

# Unmapped stage labels fall back to 1.
stage_map = {'Entry': 1, 'Mid': 2, 'Senior': 3, 'Growth': 3, 'Established': 4}
combined['startup_stage'] = combined['startup_stage'].map(stage_map).fillna(1)

# E. ONE-HOT ENCODING
# drop_first=True removes one level per column to avoid redundant dummies.
combined = pd.get_dummies(combined, columns=['founder_gender', 'education_background', 'personal_status', 'founder_role', 'team_size_category', 'leadership_scope'], drop_first=True)

# --- 3. PREPARE MATRICES ---
print("Scaling Data...")

# Split the stacked frame back into its two origins using the is_train flag.
train_rows = combined['is_train'] == 1
test_rows = combined['is_train'] == 0
train_final = combined[train_rows].drop(columns=['is_train'])
test_final = combined[test_rows].drop(columns=['is_train', 'retention_status'])

# Target: 0 for founders who stayed, 1 for founders who left.
label_codes = {'Stayed': 0, 'Left': 1}
y = train_final['retention_status'].map(label_codes)

# Features: everything except the label; the submission matrix is reindexed to
# the exact same column order as the training matrix.
X = train_final.drop(columns=['retention_status'])
X_submit = test_final[X.columns]

# The scaler is fitted on the training features only and then reused, unchanged,
# on the submission features.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_submit_scaled = scaler.transform(X_submit)

# ==============================================================================
# 4. STACKING ENSEMBLE
# ==============================================================================
print("Training Stacking Ensemble (This will take 1-2 mins)...")

# Base learners: two tree ensembles and one neural network, all seeded with the
# same random_state.
gb_model = GradientBoostingClassifier(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=4,
    random_state=42,
)
rf_model = RandomForestClassifier(
    n_estimators=300,
    max_depth=10,
    random_state=42,
)
mlp_model = MLPClassifier(
    hidden_layer_sizes=(100, 50),
    max_iter=500,
    random_state=42,
)
estimators = [('gb', gb_model), ('rf', rf_model), ('mlp', mlp_model)]

# A logistic regression combines the base learners' predictions, using 3-fold
# cross-validation; n_jobs=-1 requests all available cores.
meta_learner = LogisticRegression()
stacking_model = StackingClassifier(
    estimators=estimators,
    final_estimator=meta_learner,
    cv=3,
    n_jobs=-1,
)

# Fit on the scaled training matrix, then predict classes for the submission set.
stacking_model.fit(X_scaled, y)
stack_preds = stacking_model.predict(X_submit_scaled)

# Save
# Translate the numeric class predictions back to their label strings and pair
# them with the founder ids taken from the original `test` frame (founder_id
# was dropped from `combined`, not from `test`).
inverse_map = {0: 'Stayed', 1: 'Left'}
submission = pd.DataFrame({
    'founder_id': test['founder_id'],
    'retention_status': [inverse_map[p] for p in stack_preds]
})

# Create the output directory on first use; exist_ok makes re-runs harmless.
output_dir = '../output'
os.makedirs(output_dir, exist_ok=True)
filename = f'{output_dir}/submission_Advanced_Stacking.csv'
# index=False keeps the DataFrame's row labels out of the CSV.
submission.to_csv(filename, index=False)
# Status marker is a proper check-mark emoji (repaired from mojibake).
print(f"✅ Saved Stacking Submission to: {filename}")

Loading Data...
Applying Advanced Feature Engineering...
Scaling Data...
Training Stacking Ensemble (This will take 1-2 mins)...
