In [1]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import StratifiedKFold, cross_val_predict
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import f1_score, confusion_matrix
import lightgbm as lgb
from scipy.optimize import minimize 

TRAIN_PATH = os.path.join('..', 'data', 'train.csv')
TEST_PATH = os.path.join('..', 'data', 'test.csv')

# Load Data
try:
    train_df = pd.read_csv(TRAIN_PATH)
    test_df = pd.read_csv(TEST_PATH)
    print(f"Successfully loaded data from '{TRAIN_PATH}' and '{TEST_PATH}'.")
except FileNotFoundError:
    print(f"Error: Files not found. Ensure 'train.csv' and 'test.csv' are located at the path: {os.path.abspath(os.path.join(os.getcwd(), '..', 'data'))}")
    exit()

# Isolate the founder_id for the submission file
test_ids = test_df['founder_id']

Successfully loaded data from '..\data\train.csv' and '..\data\test.csv'.


In [2]:
def feature_engineering(df):
    """
    Applies the specified feature engineering steps.
    """
    df = df.copy()

    # Create Age at Founding
    df['age_at_founding'] = df['founder_age'] - df['years_since_founding']

    # Create Founder Tenure Ratio
    epsilon = 1e-6
    df['tenure_ratio'] = df['years_with_startup'] / (df['years_since_founding'] + epsilon)
    
    # Create proxy for 'unhappy' and 'overtime' interaction
    df['unhappy_overtime'] = (df['working_overtime'] == 'Yes').astype(int) * \
                             (df['venture_satisfaction'].map({'Low': 1, 'Medium': 0.5, 'High': 0, 'Very High': 0}))
    
    # Drop the ID column
    return df.drop('founder_id', axis=1, errors='ignore')

train_df = feature_engineering(train_df)
test_df = feature_engineering(test_df)

# Target Encoding: Left=1, Stayed=0
train_df['retention_status'] = train_df['retention_status'].map({'Left': 1, 'Stayed': 0})

X = train_df.drop('retention_status', axis=1)
y = train_df['retention_status']
X_test = test_df

In [3]:
# --- 3.1 Define Feature Lists and Ordinal Maps ---
numerical_features = ['founder_age', 'years_with_startup', 'monthly_revenue_generated', 
                      'funding_rounds_led', 'distance_from_investor_hub', 
                      'num_dependents', 'years_since_founding', 'age_at_founding', 
                      'tenure_ratio', 'unhappy_overtime']

# Define categories and their order
ordinal_mappings = [
    ('work_life_balance_rating', ['Poor', 'Fair', 'Good', 'Excellent', 'Missing']),
    ('venture_satisfaction', ['Low', 'Medium', 'High', 'Very High', 'Missing']),
    ('startup_performance_rating', ['Below Average', 'Low', 'Average', 'High', 'Excellent']),
    ('startup_reputation', ['Low', 'Moderate', 'High', 'Excellent']),
    ('founder_visibility', ['Low', 'Medium', 'High', 'Very High'])
]
ordinal_cols = [col for col, _ in ordinal_mappings]
ordinal_categories = [order for _, order in ordinal_mappings]

nominal_cols = ['founder_gender', 'founder_role', 'working_overtime', 
                'education_background', 'personal_status', 
                'startup_stage', 'team_size_category', 
                'remote_operations', 'leadership_scope', 'innovation_support']


# --- 3.2 Define Preprocessing Steps ---

# 1. Numerical Pipeline
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# 2. Ordinal Pipeline (Impute 'Missing', then Ordinal Encode)
ordinal_transformer = Pipeline(steps=[
    ('imputer_cat', SimpleImputer(strategy='constant', fill_value='Missing', missing_values=np.nan)),
    ('ordinal', OrdinalEncoder(categories=ordinal_categories, handle_unknown='use_encoded_value', unknown_value=-1))
])

# 3. Nominal Pipeline (One-Hot Encode)
nominal_transformer = OneHotEncoder(handle_unknown='ignore', sparse_output=False)


# --- 3.3 Create Column Transformer ---
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('ord', ordinal_transformer, ordinal_cols),
        ('nom', nominal_transformer, nominal_cols)
    ],
    remainder='drop', 
    n_jobs=-1
)

# Fit the preprocessor on training data
X_processed = preprocessor.fit_transform(X)
X_test_processed = preprocessor.transform(X_test)

In [4]:
# --- 4.1 Define Model ---
lgbm_clf = lgb.LGBMClassifier(
    objective='binary',
    metric='None',
    n_estimators=1000,
    learning_rate=0.05,
    num_leaves=31,
    max_depth=-1,
    min_child_samples=20,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    n_jobs=-1,
    verbose=-1
)

# --- 4.2 Cross-Validation for Threshold Optimization ---
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

print("Performing 5-Fold Cross-Validation for Threshold Optimization...")
oof_probas = cross_val_predict(
    lgbm_clf,
    X_processed,
    y,
    cv=skf,
    method='predict_proba',
    n_jobs=-1
)[:, 1]

# --- 4.3 Optimize Classification Threshold ---
from scipy.optimize import minimize_scalar

# Function to minimize (negative F1-score)
def objective_f1(threshold):
    y_pred_binary = (oof_probas >= threshold).astype(int)
    return -f1_score(y, y_pred_binary)

# Scalar bounded minimization
result = minimize_scalar(
    objective_f1,
    bounds=(0.01, 0.99),
    method='bounded',
    options={'xatol': 1e-5}
)

optimal_threshold = result.x
optimal_f1 = -result.fun

print(f"\n--- F1 Optimization Results ---")
print(f"Optimal Threshold: {optimal_threshold:.4f}")
print(f"Maximized F1-Score (OOF): {optimal_f1:.4f}")

# Sanity Check: Print Confusion Matrix at Optimal Threshold
y_pred_optimal = (oof_probas >= optimal_threshold).astype(int)
print("\nConfusion Matrix (OOF - predicting 'Left' as 1):")
print(confusion_matrix(y, y_pred_optimal))

# --- 4.4 Final Model Training and Prediction ---
print("\nTraining final model on full dataset...")
final_model = lgbm_clf.fit(X_processed, y)

# Predict probabilities on the test set
test_probas = final_model.predict_proba(X_test_processed)[:, 1]


Performing 5-Fold Cross-Validation for Threshold Optimization...

--- F1 Optimization Results ---
Optimal Threshold: 0.3354
Maximized F1-Score (OOF): 0.7509

Confusion Matrix (OOF - predicting 'Left' as 1):
[[18806 12459]
 [ 3816 24530]]

Training final model on full dataset...




In [5]:
# --- 5. Submission File Creation with Hard Labels (Corrected) ---

# 1. Use the optimal threshold found in Section 4.3
# CORRECTED LINE: Access result.x directly, as it is likely a scalar
optimal_threshold = result.x 

# Ensure it's a standard float for comparison, if necessary (good practice)
if isinstance(optimal_threshold, np.ndarray):
    optimal_threshold = optimal_threshold[0]
    
print(f"Using Optimal Threshold: {optimal_threshold:.4f}")

# 2. Convert the predicted probabilities (test_probas) into hard labels
# If probability >= optimal_threshold, predict 'Left' (the positive class, 1)
# If probability < optimal_threshold, predict 'Stayed' (the negative class, 0)
hard_labels = np.where(test_probas >= optimal_threshold, 'Left', 'Stayed')

# 3. Create the submission DataFrame
submission_df_final = pd.DataFrame({
    'founder_id': test_ids,
    'retention_status': hard_labels 
})

# 4. Save the submission file
submission_df_final.to_csv('lgbm_f1_hard_labels_submission_final.csv', index=False, encoding='utf-8', sep=',')

print("\n--- FINAL SUBMISSION CREATED ---")
print(f"File: 'lgbm_f1_hard_labels_submission_final.csv'")
print("First 5 rows of the submission:")
print(submission_df_final.head())

Using Optimal Threshold: 0.3354

--- FINAL SUBMISSION CREATED ---
File: 'lgbm_f1_hard_labels_submission_final.csv'
First 5 rows of the submission:
   founder_id retention_status
0       52685             Left
1       30585             Left
2       54656           Stayed
3       33442             Left
4       15667           Stayed


Trying Catboost+Xg+Lgb
 run these after preprocess


In [None]:
import xgboost as xgb
import catboost as cb
from sklearn.model_selection import StratifiedKFold, cross_val_predict
from sklearn.metrics import f1_score
from scipy.optimize import minimize
import numpy as np
import pandas as pd # Assuming test_ids and test_probas from LGBM are available

In [None]:
print("--- Training XGBoost Model ---")

xgb_clf = xgb.XGBClassifier(
    objective='binary:logistic',
    n_estimators=1000,
    learning_rate=0.05,
    max_depth=7,
    subsample=0.8,
    colsample_bytree=0.8,
    use_label_encoder=False, 
    eval_metric='logloss',
    random_state=42,
    n_jobs=-1
)

# 2.1. Train
xgb_clf.fit(X_processed, y)

# 2.2. Predict Probabilities
xgb_test_probas = xgb_clf.predict_proba(X_test_processed)[:, 1]
print("XGBoost Test Predictions Complete.")

In [None]:
print("\n--- Training CatBoost Model ---")

cb_clf = cb.CatBoostClassifier(
    iterations=1000,
    learning_rate=0.05,
    depth=6,
    l2_leaf_reg=3,
    loss_function='Logloss',
    verbose=0, # Suppress training output
    random_state=42
)

# 3.1. Train
cb_clf.fit(X_processed, y)

# 3.2. Predict Probabilities
cb_test_probas = cb_clf.predict_proba(X_test_processed)[:, 1]
print("CatBoost Test Predictions Complete.")

In [None]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Generate XGB OOF predictions
xgb_oof_probas = cross_val_predict(
    xgb_clf, 
    X_processed, 
    y, 
    cv=skf, 
    method='predict_proba', 
    n_jobs=-1
)[:, 1]

# Generate CB OOF predictions
cb_oof_probas = cross_val_predict(
    cb_clf, 
    X_processed, 
    y, 
    cv=skf, 
    method='predict_proba', 
    n_jobs=-1
)[:, 1]

In [None]:
# 4.1. Create Ensemble OOF Probability
# Note: oof_probas is from your original LGBM run
ensemble_oof_probas = (oof_probas + xgb_oof_probas + cb_oof_probas) / 3.0

# 4.2. Optimize Classification Threshold for the Ensemble

# The objective function remains the same
def objective_f1_ensemble(threshold):
    """Returns negative F1-score for minimization."""
    y_pred_binary = (ensemble_oof_probas >= threshold).astype(int)
    return -f1_score(y, y_pred_binary)

# CORRECTED: Change method='bounded' to method='L-BFGS-B' 
result_ensemble = minimize(
    objective_f1_ensemble, 
    x0=0.5,             
    method='L-BFGS-B',   # Corrected Solver Name
    bounds=[(0.01, 0.99)], 
    tol=1e-5
)

optimal_threshold_ensemble = result_ensemble.x[0] if isinstance(result_ensemble.x, np.ndarray) else result_ensemble.x
optimal_f1_ensemble = -result_ensemble.fun

print(f"\n--- Ensemble F1 Optimization Results ---")
print(f"Optimal Ensemble Threshold: {optimal_threshold_ensemble:.4f}")
print(f"Maximized Ensemble F1-Score (OOF): {optimal_f1_ensemble:.4f}")



In [None]:
# 4.3. Create Final Ensemble Test Probability
# Note: test_probas is from your original LGBM run
ensemble_test_probas = (test_probas + xgb_test_probas + cb_test_probas) / 3.0

# 4.4. Convert to Hard Labels
hard_labels_ensemble = np.where(
    ensemble_test_probas >= optimal_threshold_ensemble, 
    'Left', 
    'Stayed'
)

# 4.5. Create and Save Submission
submission_df_ensemble = pd.DataFrame({
    'founder_id': test_ids,
    'retention_status': hard_labels_ensemble 
})

submission_df_ensemble.to_csv('ensemble_xgb_lgbm_cb_f1_submission.csv', index=False, encoding='utf-8', sep=',')

print("\nEnsemble Submission File 'ensemble_xgb_lgbm_cb_f1_submission.csv' created successfully.")