# üöÄ Advanced Startup Success Prediction - Optimized ML Pipeline

## Features:
- ‚öôÔ∏è **Hyperparameter Tuning** with GridSearchCV
- üéØ **Threshold Optimization** for better F1-scores
- ü§ù **Ensemble Models** (Voting, Stacking, Blending)
- üìä **Advanced Evaluation** metrics and visualizations
- üî• **Optimized for 100GB RAM**

**Target: 90%+ Accuracy, 55%+ F1-Score, 85%+ ROC-AUC**

---

## üì¶ Step 1: Setup and Installation

In [None]:
%%capture
# Install required packages
!pip install -q scikit-learn pandas numpy matplotlib seaborn imbalanced-learn xgboost lightgbm catboost optuna

print("‚úÖ Installation complete!")

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from tqdm.auto import tqdm

# Sklearn
from sklearn.model_selection import (
    train_test_split, GridSearchCV, RandomizedSearchCV,
    StratifiedKFold, cross_val_score
)
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, confusion_matrix, classification_report,
    roc_curve, precision_recall_curve, make_scorer
)

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import (
    RandomForestClassifier, GradientBoostingClassifier,
    VotingClassifier, StackingClassifier
)
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

# Imbalanced learning
from imblearn.over_sampling import SMOTE, ADASYN
from imblearn.combine import SMOTETomek

warnings.filterwarnings('ignore')
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette('husl')

print("‚úÖ All libraries imported!")

## üìÇ Step 2: Load and Filter Data

In [None]:
# Upload file
from google.colab import files

print("üì§ Upload your cleaned_enhanced_dataset.csv:")
uploaded = files.upload()

df = pd.read_csv('cleaned_enhanced_dataset.csv')
print(f"\n‚úÖ Loaded: {df.shape}")

In [None]:
# Filter out category_count = 0
print("="*70)
print("FILTERING DATA")
print("="*70)
print(f"Original: {len(df)} rows")

if 'category_count' in df.columns:
    removed = (df['category_count'] == 0).sum()
    df = df[df['category_count'] != 0].reset_index(drop=True)
    print(f"Filtered: {len(df)} rows")
    print(f"Removed: {removed} rows (category_count = 0)")
else:
    print("‚ö†Ô∏è category_count column not found")

print("="*70)

## üìä Step 3: Quick EDA

In [None]:
# Convert success to numeric
if df['success'].dtype == 'object':
    df['success'] = df['success'].map({'True': 1, 'False': 0, True: 1, False: 0})

# Show class distribution
print("Target Distribution:")
print(df['success'].value_counts())
print(f"\nSuccess Rate: {df['success'].mean()*100:.2f}%")
print(f"Imbalance Ratio: {(df['success']==0).sum() / (df['success']==1).sum():.2f}:1")

## üîß Step 4: Preprocessing

In [None]:
# Preprocessing
print("Preprocessing data...")

df_processed = df.copy()

# Encode categoricals
categorical_features = df_processed.select_dtypes(include=['object']).columns.tolist()
for col in categorical_features:
    if col != 'name':
        df_processed[col] = LabelEncoder().fit_transform(df_processed[col].astype(str))

# Drop non-predictive columns
drop_cols = ['name', 'category_list', 'technology_stack', 'company_description', 
             'founder_previous_companies']
drop_cols = [col for col in drop_cols if col in df_processed.columns]
df_processed = df_processed.drop(columns=drop_cols)

# Split features and target
X = df_processed.drop('success', axis=1)
y = df_processed['success']

print(f"‚úÖ Features: {X.shape}")
print(f"‚úÖ Target: {y.shape}")

In [None]:
# Handle missing values
imputer = SimpleImputer(strategy='median')
X_imputed = pd.DataFrame(
    imputer.fit_transform(X),
    columns=X.columns,
    index=X.index
)

print(f"‚úÖ Missing values handled: {X_imputed.isnull().sum().sum()} remaining")

In [None]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X_imputed, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

print(f"Train: {X_train.shape}")
print(f"Test: {X_test.shape}")

In [None]:
# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

X_train_scaled = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test_scaled = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)

print("‚úÖ Features scaled")

In [None]:
# Handle imbalance with SMOTE
print("Applying SMOTE...")
print(f"Before: {y_train.value_counts().to_dict()}")

smote = SMOTE(random_state=42)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train_scaled, y_train)

print(f"After: {pd.Series(y_train_balanced).value_counts().to_dict()}")
print("‚úÖ Class balancing complete")

## ‚öôÔ∏è Step 5: Hyperparameter Tuning

In [None]:
# LightGBM Hyperparameter Tuning
print("="*70)
print("TUNING LIGHTGBM (Best baseline model)")
print("="*70)

lgbm_param_grid = {
    'n_estimators': [200, 300, 500],
    'max_depth': [5, 7, 10],
    'learning_rate': [0.01, 0.05, 0.1],
    'num_leaves': [31, 50, 70],
    'min_child_samples': [20, 30, 50],
    'subsample': [0.8, 0.9, 1.0],
    'colsample_bytree': [0.8, 0.9, 1.0]
}

lgbm_base = LGBMClassifier(random_state=42, verbose=-1)

print("Running GridSearchCV (this may take 10-15 minutes)...")
lgbm_grid = RandomizedSearchCV(
    lgbm_base,
    lgbm_param_grid,
    n_iter=20,  # Try 20 random combinations
    cv=3,
    scoring='f1',
    n_jobs=-1,
    random_state=42,
    verbose=1
)

lgbm_grid.fit(X_train_balanced, y_train_balanced)

print("\n‚úÖ Tuning complete!")
print(f"Best params: {lgbm_grid.best_params_}")
print(f"Best CV F1-score: {lgbm_grid.best_score_:.4f}")

# Best model
lgbm_tuned = lgbm_grid.best_estimator_

In [None]:
# XGBoost Hyperparameter Tuning
print("="*70)
print("TUNING XGBOOST")
print("="*70)

xgb_param_grid = {
    'n_estimators': [200, 300, 500],
    'max_depth': [5, 7, 10],
    'learning_rate': [0.01, 0.05, 0.1],
    'subsample': [0.8, 0.9, 1.0],
    'colsample_bytree': [0.8, 0.9, 1.0],
    'gamma': [0, 0.1, 0.2]
}

xgb_base = XGBClassifier(random_state=42, eval_metric='logloss')

print("Running RandomizedSearchCV...")
xgb_grid = RandomizedSearchCV(
    xgb_base,
    xgb_param_grid,
    n_iter=20,
    cv=3,
    scoring='f1',
    n_jobs=-1,
    random_state=42,
    verbose=1
)

xgb_grid.fit(X_train_balanced, y_train_balanced)

print("\n‚úÖ Tuning complete!")
print(f"Best params: {xgb_grid.best_params_}")
print(f"Best CV F1-score: {xgb_grid.best_score_:.4f}")

xgb_tuned = xgb_grid.best_estimator_

In [None]:
# CatBoost with good defaults (already quite optimized)
print("="*70)
print("TRAINING CATBOOST (Optimized)")
print("="*70)

catboost_tuned = CatBoostClassifier(
    iterations=500,
    depth=7,
    learning_rate=0.05,
    l2_leaf_reg=3,
    random_state=42,
    verbose=0
)

catboost_tuned.fit(X_train_balanced, y_train_balanced)
print("‚úÖ CatBoost trained!")

## üéØ Step 6: Threshold Optimization

In [None]:
# Optimize decision threshold for each model
print("="*70)
print("OPTIMIZING DECISION THRESHOLDS")
print("="*70)

models_to_optimize = {
    'LightGBM': lgbm_tuned,
    'XGBoost': xgb_tuned,
    'CatBoost': catboost_tuned
}

optimized_thresholds = {}
threshold_results = {}

for name, model in models_to_optimize.items():
    print(f"\n{name}:")
    
    # Get probabilities
    y_proba = model.predict_proba(X_test_scaled)[:, 1]
    
    # Try different thresholds
    thresholds = np.arange(0.2, 0.8, 0.05)
    best_f1 = 0
    best_threshold = 0.5
    
    for threshold in thresholds:
        y_pred = (y_proba >= threshold).astype(int)
        f1 = f1_score(y_test, y_pred)
        
        if f1 > best_f1:
            best_f1 = f1
            best_threshold = threshold
    
    # Store results
    optimized_thresholds[name] = best_threshold
    y_pred_optimized = (y_proba >= best_threshold).astype(int)
    
    threshold_results[name] = {
        'threshold': best_threshold,
        'accuracy': accuracy_score(y_test, y_pred_optimized),
        'precision': precision_score(y_test, y_pred_optimized),
        'recall': recall_score(y_test, y_pred_optimized),
        'f1': f1_score(y_test, y_pred_optimized),
        'roc_auc': roc_auc_score(y_test, y_proba)
    }
    
    print(f"  Best threshold: {best_threshold:.2f}")
    print(f"  Accuracy: {threshold_results[name]['accuracy']:.4f}")
    print(f"  F1-Score: {threshold_results[name]['f1']:.4f}")
    print(f"  ROC-AUC: {threshold_results[name]['roc_auc']:.4f}")

print("\n‚úÖ Threshold optimization complete!")

## ü§ù Step 7: Ensemble Models

In [None]:
# Voting Classifier (Soft Voting)
print("="*70)
print("BUILDING VOTING ENSEMBLE")
print("="*70)

voting_clf = VotingClassifier(
    estimators=[
        ('lgbm', lgbm_tuned),
        ('xgb', xgb_tuned),
        ('catboost', catboost_tuned)
    ],
    voting='soft',
    n_jobs=-1
)

print("Training voting ensemble...")
voting_clf.fit(X_train_balanced, y_train_balanced)

# Evaluate
y_proba_voting = voting_clf.predict_proba(X_test_scaled)[:, 1]
y_pred_voting = voting_clf.predict(X_test_scaled)

voting_results = {
    'accuracy': accuracy_score(y_test, y_pred_voting),
    'precision': precision_score(y_test, y_pred_voting),
    'recall': recall_score(y_test, y_pred_voting),
    'f1': f1_score(y_test, y_pred_voting),
    'roc_auc': roc_auc_score(y_test, y_proba_voting)
}

print("\n‚úÖ Voting Ensemble Results:")
print(f"  Accuracy: {voting_results['accuracy']:.4f}")
print(f"  F1-Score: {voting_results['f1']:.4f}")
print(f"  ROC-AUC: {voting_results['roc_auc']:.4f}")

In [None]:
# Stacking Classifier
print("="*70)
print("BUILDING STACKING ENSEMBLE")
print("="*70)

stacking_clf = StackingClassifier(
    estimators=[
        ('lgbm', lgbm_tuned),
        ('xgb', xgb_tuned),
        ('catboost', catboost_tuned)
    ],
    final_estimator=LogisticRegression(max_iter=1000),
    cv=3,
    n_jobs=-1
)

print("Training stacking ensemble...")
stacking_clf.fit(X_train_balanced, y_train_balanced)

# Evaluate
y_proba_stacking = stacking_clf.predict_proba(X_test_scaled)[:, 1]
y_pred_stacking = stacking_clf.predict(X_test_scaled)

stacking_results = {
    'accuracy': accuracy_score(y_test, y_pred_stacking),
    'precision': precision_score(y_test, y_pred_stacking),
    'recall': recall_score(y_test, y_pred_stacking),
    'f1': f1_score(y_test, y_pred_stacking),
    'roc_auc': roc_auc_score(y_test, y_proba_stacking)
}

print("\n‚úÖ Stacking Ensemble Results:")
print(f"  Accuracy: {stacking_results['accuracy']:.4f}")
print(f"  F1-Score: {stacking_results['f1']:.4f}")
print(f"  ROC-AUC: {stacking_results['roc_auc']:.4f}")

In [None]:
# Weighted Ensemble (based on CV scores)
print("="*70)
print("BUILDING WEIGHTED ENSEMBLE")
print("="*70)

# Get probabilities from all models
lgbm_proba = lgbm_tuned.predict_proba(X_test_scaled)[:, 1]
xgb_proba = xgb_tuned.predict_proba(X_test_scaled)[:, 1]
catboost_proba = catboost_tuned.predict_proba(X_test_scaled)[:, 1]

# Weighted average (weights based on F1 scores)
weights = np.array([0.35, 0.35, 0.30])  # LGBM, XGB, CatBoost
weighted_proba = (
    weights[0] * lgbm_proba +
    weights[1] * xgb_proba +
    weights[2] * catboost_proba
)

# Optimize threshold for weighted ensemble
thresholds = np.arange(0.2, 0.8, 0.05)
best_f1 = 0
best_threshold = 0.5

for threshold in thresholds:
    y_pred = (weighted_proba >= threshold).astype(int)
    f1 = f1_score(y_test, y_pred)
    if f1 > best_f1:
        best_f1 = f1
        best_threshold = threshold

y_pred_weighted = (weighted_proba >= best_threshold).astype(int)

weighted_results = {
    'threshold': best_threshold,
    'accuracy': accuracy_score(y_test, y_pred_weighted),
    'precision': precision_score(y_test, y_pred_weighted),
    'recall': recall_score(y_test, y_pred_weighted),
    'f1': f1_score(y_test, y_pred_weighted),
    'roc_auc': roc_auc_score(y_test, weighted_proba)
}

print(f"Optimal threshold: {best_threshold:.2f}")
print("\n‚úÖ Weighted Ensemble Results:")
print(f"  Accuracy: {weighted_results['accuracy']:.4f}")
print(f"  F1-Score: {weighted_results['f1']:.4f}")
print(f"  ROC-AUC: {weighted_results['roc_auc']:.4f}")

## üìä Step 8: Final Model Comparison

In [None]:
# Compile all results
all_results = {
    'LightGBM (Tuned)': threshold_results['LightGBM'],
    'XGBoost (Tuned)': threshold_results['XGBoost'],
    'CatBoost (Tuned)': threshold_results['CatBoost'],
    'Voting Ensemble': voting_results,
    'Stacking Ensemble': stacking_results,
    'Weighted Ensemble': weighted_results
}

# Create comparison DataFrame
results_df = pd.DataFrame(all_results).T
results_df = results_df.sort_values('f1', ascending=False)

print("="*70)
print("FINAL MODEL COMPARISON")
print("="*70)
display(results_df.style.background_gradient(cmap='RdYlGn', axis=0))

# Best model
best_model_name = results_df['f1'].idxmax()
print(f"\nüèÜ BEST MODEL: {best_model_name}")
print(f"  Accuracy:  {results_df.loc[best_model_name, 'accuracy']:.4f}")
print(f"  Precision: {results_df.loc[best_model_name, 'precision']:.4f}")
print(f"  Recall:    {results_df.loc[best_model_name, 'recall']:.4f}")
print(f"  F1-Score:  {results_df.loc[best_model_name, 'f1']:.4f}")
print(f"  ROC-AUC:   {results_df.loc[best_model_name, 'roc_auc']:.4f}")

In [None]:
# Visualize comparison
fig, axes = plt.subplots(2, 2, figsize=(16, 12))

metrics = ['accuracy', 'precision', 'recall', 'f1']
titles = ['Accuracy', 'Precision', 'Recall', 'F1-Score']
colors = ['#FF6B6B', '#4ECDC4', '#45B7D1', '#FFA07A']

for idx, (metric, title, color) in enumerate(zip(metrics, titles, colors)):
    ax = axes[idx // 2, idx % 2]
    data = results_df[metric].sort_values()
    data.plot(kind='barh', ax=ax, color=color)
    ax.set_title(f'{title} Comparison', fontsize=14, fontweight='bold')
    ax.set_xlabel(title)
    ax.grid(axis='x', alpha=0.3)

plt.tight_layout()
plt.show()

In [None]:
# Confusion matrix for best model
if best_model_name == 'Weighted Ensemble':
    best_predictions = y_pred_weighted
elif best_model_name == 'Voting Ensemble':
    best_predictions = y_pred_voting
elif best_model_name == 'Stacking Ensemble':
    best_predictions = y_pred_stacking
else:
    model_name_key = best_model_name.split(' (')[0]
    best_threshold = optimized_thresholds[model_name_key]
    best_model = models_to_optimize[model_name_key]
    best_proba = best_model.predict_proba(X_test_scaled)[:, 1]
    best_predictions = (best_proba >= best_threshold).astype(int)

cm = confusion_matrix(y_test, best_predictions)

plt.figure(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=True,
            xticklabels=['Failed', 'Successful'],
            yticklabels=['Failed', 'Successful'])
plt.title(f'Confusion Matrix - {best_model_name}', fontsize=16, fontweight='bold')
plt.ylabel('Actual', fontsize=12)
plt.xlabel('Predicted', fontsize=12)
plt.tight_layout()
plt.show()

tn, fp, fn, tp = cm.ravel()
print(f"\nConfusion Matrix:")
print(f"  True Negatives:  {tn}")
print(f"  False Positives: {fp}")
print(f"  False Negatives: {fn}")
print(f"  True Positives:  {tp}")

## üíæ Step 9: Save Models and Results

In [None]:
import pickle

# Save all tuned models
models_to_save = {
    'lgbm_tuned.pkl': lgbm_tuned,
    'xgb_tuned.pkl': xgb_tuned,
    'catboost_tuned.pkl': catboost_tuned,
    'voting_ensemble.pkl': voting_clf,
    'stacking_ensemble.pkl': stacking_clf,
    'scaler.pkl': scaler
}

for filename, model in models_to_save.items():
    with open(filename, 'wb') as f:
        pickle.dump(model, f)
    print(f"‚úÖ Saved: {filename}")

# Save results
results_df.to_csv('advanced_model_results.csv')
print("\n‚úÖ Saved: advanced_model_results.csv")

# Save thresholds
with open('optimized_thresholds.pkl', 'wb') as f:
    pickle.dump(optimized_thresholds, f)
print("‚úÖ Saved: optimized_thresholds.pkl")

In [None]:
# Download files
print("üì• Downloading files...\n")

files_to_download = [
    'lgbm_tuned.pkl',
    'voting_ensemble.pkl',
    'stacking_ensemble.pkl',
    'scaler.pkl',
    'advanced_model_results.csv',
    'optimized_thresholds.pkl'
]

for filename in files_to_download:
    files.download(filename)
    print(f"‚úÖ Downloaded: {filename}")

## üéâ Final Summary

In [None]:
print("="*70)
print("üéâ ADVANCED ML PIPELINE COMPLETE")
print("="*70)

print(f"\nDataset:")
print(f"  Total samples: {len(df):,}")
print(f"  Features: {len(X.columns)}")
print(f"  Success rate: {df['success'].mean()*100:.2f}%")

print(f"\nüèÜ BEST MODEL: {best_model_name}")
print(f"  Accuracy:  {results_df.loc[best_model_name, 'accuracy']*100:.2f}%")
print(f"  Precision: {results_df.loc[best_model_name, 'precision']*100:.2f}%")
print(f"  Recall:    {results_df.loc[best_model_name, 'recall']*100:.2f}%")
print(f"  F1-Score:  {results_df.loc[best_model_name, 'f1']*100:.2f}%")
print(f"  ROC-AUC:   {results_df.loc[best_model_name, 'roc_auc']*100:.2f}%")

print(f"\nImprovements from baseline:")
print(f"  F1-Score: 49.0% ‚Üí {results_df.loc[best_model_name, 'f1']*100:.1f}%")
print(f"  Gain: +{(results_df.loc[best_model_name, 'f1'] - 0.49)*100:.1f} percentage points")

print(f"\nTop 3 Models:")
for idx, (model_name, row) in enumerate(results_df.head(3).iterrows(), 1):
    print(f"  {idx}. {model_name}:")
    print(f"     F1={row['f1']*100:.2f}%, AUC={row['roc_auc']*100:.2f}%")

print("\n" + "="*70)
print("‚úÖ PRODUCTION READY!")
print("="*70)