In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from scipy.stats import chi2_contingency

# --- CONFIGURATION ---
sns.set(style="whitegrid")
# Adjust this path if your file is in a different location
DATA_PATH = '../data/train.csv'

# Load the training data; everything below in this cell reads from `df`.
print("Loading Dataset...")
try:
    df = pd.read_csv(DATA_PATH)
except FileNotFoundError:
    print(f"Error: File not found at {DATA_PATH}. Please check the path.")
    # Stop with a non-zero status. The bare `exit()` helper is meant for the
    # interactive shell and reports success (status 0) even though loading failed.
    raise SystemExit(1) from None
else:
    print(f"Data Loaded Successfully. Shape: {df.shape}")

# ==============================================================================
# PART 1: NUMERICAL DISTRIBUTION & SKEW (The "MRI")
# ==============================================================================
print("\n" + "="*60)
print("      PART 1: NUMERICAL DISTRIBUTION & SKEW ANALYSIS")
print("="*60)

# Numeric feature names, leaving the identifier column out of the statistics
num_cols = [
    c
    for c in df.select_dtypes(include=['float64', 'int64']).columns
    if c != 'founder_id'
]

# One row per numeric feature: shape statistics plus the missing-value count
dist_summary = pd.DataFrame(
    {
        'Skewness': df[num_cols].skew(),
        'Kurtosis': df[num_cols].kurt(),
        'Null_Count': df[num_cols].isnull().sum(),
    },
    index=num_cols,
)

# Recommendation logic
def recommend_transform(row):
    """Suggest a preprocessing step for one feature based on its skewness.

    Args:
        row: Mapping-like object (for example one row of the summary table)
            that exposes a ``'Skewness'`` entry.

    Returns:
        A human-readable recommendation. Only the high-skew label contains
        the word "Transform"; the undefined-skew label deliberately avoids it.
    """
    skew = row['Skewness']

    # NaN is the only value that is not equal to itself. Without this guard an
    # undefined skewness would fail both comparisons below and be reported as
    # "Normal-ish", which is misleading.
    if skew != skew:
        return "Skew Undefined (Constant or Too Few Values)"

    magnitude = abs(skew)
    if magnitude > 1:
        return "Log/Power Transform Needed (High Skew)"
    if magnitude > 0.5:
        return "Moderate Skew (Consider Scaling)"
    return "Normal-ish (StandardScaler ok)"

# Attach the per-feature advice derived from each row's skewness
dist_summary['Recommendation'] = dist_summary.apply(recommend_transform, axis=1)

print("\nDISTRIBUTION HEALTH CHECK:")
# Rank by absolute skewness so the most distorted features come first
ranked_summary = dist_summary.sort_values(by='Skewness', key=abs, ascending=False)
print(ranked_summary)

# --- VISUALIZATION: Histograms + Q-Q Plots ---
# Features whose recommendation mentions a transform are the ones plotted
needs_transform = dist_summary['Recommendation'].str.contains("Transform")
skewed_cols = dist_summary[needs_transform].index.tolist()

if not skewed_cols:
    print("\nNo heavily skewed columns found (Skewness < 1.0).")
else:
    print(f"\n[Visualizing Top Skewed Features]: {skewed_cols}")
    # One figure row per skewed feature: histogram on the left, Q-Q on the right
    rows = len(skewed_cols)
    plt.figure(figsize=(14, 5 * rows))

    for position, col in enumerate(skewed_cols):
        observed = df[col].dropna()
        left_slot = 2 * position + 1

        # Left panel: histogram with a KDE overlay
        ax1 = plt.subplot(rows, 2, left_slot)
        sns.histplot(observed, kde=True, color='purple', ax=ax1)
        ax1.set_title(f'{col} Distribution (Skew: {df[col].skew():.2f})')

        # Right panel: probability plot against the normal distribution
        ax2 = plt.subplot(rows, 2, left_slot + 1)
        stats.probplot(observed, dist="norm", plot=ax2)
        ax2.set_title(f'{col} Q-Q Plot (Normality Check)')

    plt.tight_layout()
    plt.show()


# ==============================================================================
# PART 2: CATEGORICAL PREDICTIVE POWER (Cramer's V)
# ==============================================================================
# Section banner: a 60-character rule above and below the heading
section_rule = "=" * 60
print("\n" + section_rule)
print("      PART 2: CATEGORICAL FEATURE STRENGTH (CRAMER'S V)")
print(section_rule)

def cramers_v(x, y):
    """Return the bias-corrected Cramer's V association between two categoricals.

    The statistic is derived from the chi-squared value of the ``x`` by ``y``
    contingency table, with the small-sample correction applied to both the
    phi-squared term and the table dimensions. ``chi2_contingency`` is called
    with its defaults, so its continuity correction applies to 2x2 tables.

    Args:
        x: Categorical values (e.g. a pandas Series).
        y: Categorical values aligned with ``x``.

    Returns:
        A float in ``[0, 1]``. Degenerate inputs (fewer than two observations,
        or a variable with a single category) yield ``0.0``.
    """
    confusion_matrix = pd.crosstab(x, y)
    n = confusion_matrix.sum().sum()
    r, k = confusion_matrix.shape

    # A table with under two observations, or only one row/column, carries no
    # association signal. Bailing out here also avoids running the chi-squared
    # test on an empty table and dividing by n - 1 == 0 below.
    if n < 2 or r < 2 or k < 2:
        return 0.0

    chi2 = chi2_contingency(confusion_matrix)[0]
    phi2 = chi2 / n

    # Small-sample corrections; phi2corr is clamped because the subtraction
    # can dip below zero for weak associations.
    phi2corr = max(0, phi2 - ((k - 1) * (r - 1)) / (n - 1))
    rcorr = r - ((r - 1) ** 2) / (n - 1)
    kcorr = k - ((k - 1) ** 2) / (n - 1)

    # Use <= rather than == so a denominator that is zero (or marginally
    # negative from rounding) can never produce inf/NaN.
    denominator = min(kcorr - 1, rcorr - 1)
    if denominator <= 0:
        return 0.0

    return float(np.sqrt(phi2corr / denominator))

# Candidate features: every object-typed column
cat_cols = df.select_dtypes(include=['object']).columns.tolist()

# Score each categorical column against the target 'retention_status'
correlations = {}
print("Calculating correlations...")
for col in cat_cols:
    if col == 'retention_status':
        continue  # the target is not compared with itself
    # Rows missing either value are excluded for this pair only
    clean_data = df[[col, 'retention_status']].dropna()
    if clean_data.empty:
        continue  # nothing left to tabulate
    correlations[col] = cramers_v(clean_data[col], clean_data['retention_status'])

# Tabulate the scores, strongest association first
cat_corr_df = pd.DataFrame.from_dict(
    correlations, orient='index', columns=['Cramers_V']
).sort_values(by='Cramers_V', ascending=False)

print("\nSTRENGTH OF ASSOCIATION WITH TARGET (0.0 to 1.0):")
print(cat_corr_df)

# --- VISUALIZATION: Feature Importance Bar Plot ---
plt.figure(figsize=(10, 8))
feature_names = cat_corr_df.index
# The features double as the hue variable (legend suppressed) so the palette
# is applied per bar.
bar_ax = sns.barplot(
    x=cat_corr_df['Cramers_V'],
    y=feature_names,
    hue=feature_names,
    legend=False,
    palette='magma',
)
bar_ax.set_title("Which Categorical Features Matter Most? (Cramer's V)")
bar_ax.set_xlabel("Cramer's V Score (0=Noise, 1=Perfect Predictor)")
# Dashed reference line marking the score treated as noise
bar_ax.axvline(x=0.05, color='red', linestyle='--', label='Noise Threshold (0.05)')
bar_ax.legend()
plt.tight_layout()
plt.show()


# ==============================================================================
# PART 3: MULTIVARIATE INTERACTIONS (3D Analysis)
# ==============================================================================
print("\n" + "="*60)
print("      PART 3: MULTIVARIATE INTERACTIONS & PATTERNS")
print("="*60)

# 1. Scatter: founder age against monthly revenue, coloured by retention
scatter_fig, scatter_ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(
    data=df,
    x='founder_age',
    y='monthly_revenue_generated',
    hue='retention_status',
    alpha=0.6,
    palette='coolwarm',
    ax=scatter_ax,
)
scatter_ax.set_title('Interaction: Does High Revenue keep Young Founders?')
scatter_fig.tight_layout()
plt.show()

# 2. Violin Plot: Satisfaction vs Revenue
# Check if "Money" (Revenue) compensates for "Happiness" (Satisfaction)
if 'venture_satisfaction' in df.columns:
    # Preferred low-to-high ordering. It is narrowed to the levels that occur
    # in the data, and any level not listed here is appended, so a different
    # label vocabulary neither leaves empty slots nor silently hides groups.
    preferred_order = ['Low', 'Medium', 'High', 'Very High']
    observed_levels = set(df['venture_satisfaction'].dropna())
    satisfaction_order = [lvl for lvl in preferred_order if lvl in observed_levels]
    satisfaction_order += sorted(observed_levels.difference(preferred_order), key=str)
    if not satisfaction_order:
        # Column is entirely missing: keep the fixed ordering.
        satisfaction_order = preferred_order

    # A split violin pairs two hue groups back to back, so only request it
    # when the target really has two classes.
    # NOTE(review): older seaborn releases appear to raise for any other
    # number of hue levels; confirm against the installed version.
    use_split = df['retention_status'].nunique() == 2

    plt.figure(figsize=(12, 6))
    sns.violinplot(
        data=df,
        x='venture_satisfaction',
        y='monthly_revenue_generated',
        hue='retention_status',
        split=use_split,
        order=satisfaction_order,
        palette='muted'
    )
    plt.title('Distribution of Revenue by Satisfaction & Retention Status')
    plt.tight_layout()
    plt.show()

# 3. Pairplot for Key Numerical Factors
# A quick glance at the pairwise relationships between selected numeric
# variables, coloured by the target when it is available.
key_numeric = ['founder_age', 'years_with_startup', 'monthly_revenue_generated', 'retention_status']
# Keep only the columns that actually exist in this dataset
plot_cols = [c for c in key_numeric if c in df.columns]

if len(plot_cols) > 1:
    # The hue column is subject to the same existence filter as the others;
    # only pass it (and its palette) when it survived, otherwise pairplot
    # would be asked to colour by a column that is not in the frame.
    hue_col = 'retention_status' if 'retention_status' in plot_cols else None

    print("Generating Pairplot... (This might take a moment)")
    sns.pairplot(
        df[plot_cols],
        hue=hue_col,
        palette='husl' if hue_col else None,
        corner=True,
    )
    plt.suptitle("Pairwise Relationships of Key Numerical Features", y=1.02)
    plt.show()

print("\n" + "="*60)
print("      EXTENSIVE EDA COMPLETE")
print("="*60)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

# --- CONFIGURATION ---
sns.set(style="whitegrid")
# Raise the open-figure warning threshold; this cell draws many plots
plt.rcParams['figure.max_open_warning'] = 50

# Locate the dataset: the first candidate path that exists wins, else None
possible_paths = ['../data/train.csv', 'train.csv']
DATA_PATH = next(
    (candidate for candidate in possible_paths if os.path.exists(candidate)),
    None,
)

# Run the three-part feature-vs-target analysis only when the dataset was found.
if DATA_PATH is None:
    print("❌ Error: 'train.csv' not found.")
else:
    print(f"✅ Data Loaded from {DATA_PATH}")
    df = pd.read_csv(DATA_PATH)

    # ==============================================================================
    # 1. THE MASTER CORRELATION HEATMAP (Numerical Only)
    # ==============================================================================
    print("\n" + "="*60)
    print("      1. MASTER CORRELATION HEATMAP")
    print("="*60)

    # Numeric columns only, without the identifier column when present
    numeric_df = df.select_dtypes(include=['float64', 'int64'])
    if 'founder_id' in numeric_df.columns:
        numeric_df = numeric_df.drop(columns=['founder_id'])
    num_cols = numeric_df.columns.tolist()

    if num_cols:
        # Compute the correlation matrix once and reuse it for mask and plot
        corr_matrix = numeric_df.corr()
        # Hide the upper triangle and the diagonal: the matrix is symmetric,
        # so only the lower triangle carries distinct information.
        mask = np.triu(np.ones_like(corr_matrix, dtype=bool))

        plt.figure(figsize=(12, 10))
        sns.heatmap(
            corr_matrix,
            mask=mask,
            annot=True,
            fmt=".2f",
            cmap='coolwarm',
            linewidths=0.5,
            vmin=-1, vmax=1
        )
        plt.title('Correlation Matrix of All Numerical Features')
        plt.tight_layout()
        plt.show()
    else:
        print("No numerical features found; skipping the correlation heatmap.")

    # ==============================================================================
    # 2. NUMERICAL FEATURES vs TARGET (Boxplots)
    # ==============================================================================
    print("\n" + "="*60)
    print("      2. COMPARING NUMERICAL INPUTS WITH OUTPUT")
    print("      (Look for shift in the box/median)")
    print("="*60)

    # Without numeric features the grid would have zero rows (and a
    # zero-height figure), so the section is skipped in that case.
    if num_cols:
        # Grid layout: three plots per row, as many rows as needed
        n_cols = 3
        n_rows = (len(num_cols) - 1) // n_cols + 1

        plt.figure(figsize=(15, 4 * n_rows))

        for i, col in enumerate(num_cols):
            ax = plt.subplot(n_rows, n_cols, i + 1)

            # One box per retention class; a shifted median or spread hints
            # that the feature separates the classes.
            sns.boxplot(
                data=df,
                x='retention_status',
                y=col,
                hue='retention_status',
                palette='Set2',
                legend=False,
                ax=ax
            )
            ax.set_title(f'{col} vs Retention')
            ax.set_xlabel('')

        plt.tight_layout()
        plt.show()
    else:
        print("No numerical features found; skipping the boxplots.")

    # ==============================================================================
    # 3. CATEGORICAL FEATURES vs TARGET (Stacked Bar Charts)
    # ==============================================================================
    print("\n" + "="*60)
    print("      3. COMPARING CATEGORICAL INPUTS WITH OUTPUT")
    print("      (Look for bars with different Red/Blue proportions)")
    print("="*60)

    # Object-typed inputs, excluding the target and any column with 20 or
    # more distinct values (too many categories to chart usefully)
    cat_cols = [
        c
        for c in df.select_dtypes(include=['object']).columns
        if c != 'retention_status' and df[c].nunique() < 20
    ]

    for col in cat_cols:
        # Contingency table of category vs. retention status
        cross_tab = pd.crosstab(df[col], df['retention_status'])

        # Row-normalise so each category sums to 1; proportions stay
        # comparable across categories of very different sizes.
        cross_tab_prop = cross_tab.div(cross_tab.sum(axis=1), axis=0)

        ax = cross_tab_prop.plot(
            kind='bar',
            stacked=True,
            figsize=(10, 4),
            colormap='viridis',
            edgecolor='black'
        )

        plt.title(f'Retention Rates by {col} (Normalized)', fontsize=14)
        plt.xlabel(col)
        plt.ylabel('Proportion')
        plt.legend(title='Retention Status', bbox_to_anchor=(1.05, 1), loc='upper left')

        # Label each bar segment with its percentage, centred in the segment
        for patch in ax.patches:
            seg_width, seg_height = patch.get_width(), patch.get_height()
            seg_x, seg_y = patch.get_xy()
            if seg_height > 0.05:  # skip segments too thin to hold a label
                ax.text(
                    seg_x + seg_width / 2,
                    seg_y + seg_height / 2,
                    f'{seg_height * 100:.0f}%',
                    horizontalalignment='center',
                    verticalalignment='center',
                    color='white',
                    weight='bold',
                )

        plt.tight_layout()
        plt.show()

In [4]:
import pandas as pd
import numpy as np
import time
import os
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import StandardScaler

# --- CONFIGURATION ---
OUTPUT_DIR = '../output'
# Make sure the destination folder for the submission files exists
os.makedirs(OUTPUT_DIR, exist_ok=True)

# --- 1. LOAD DATA ---
print("Loading Data...")
train = pd.read_csv('../data/train.csv')
test = pd.read_csv('../data/test.csv')

# Tag each frame with its origin so the rows can be separated again after
# the shared preprocessing, then give the test rows a placeholder target.
for frame, origin_flag in ((train, 1), (test, 0)):
    frame['is_train'] = origin_flag
test['retention_status'] = 'Unknown'

# Stack both sets row-wise so every transformation is applied consistently
combined = pd.concat((train, test), axis=0)

# --- 2. FORENSIC PRE-PROCESSING ---
print("Applying Forensic Cleaning Pipeline...")

# A. Remove the columns judged to be noise
cols_to_drop = ['founder_id', 'founder_role', 'leadership_scope',
                'founder_visibility', 'innovation_support', 'team_size_category']
combined = combined.drop(columns=cols_to_drop)

# B. Reduce revenue skew: median-fill the gaps, then apply log(1 + x)
revenue = combined['monthly_revenue_generated']
combined['monthly_revenue_generated'] = np.log1p(revenue.fillna(revenue.median()))

# C. Impute the remaining gaps: median / most frequent value for numeric
#    columns, an explicit 'Unknown' level for the rating columns
fill_values = {
    'years_since_founding': combined['years_since_founding'].median(),
    'num_dependents': combined['num_dependents'].mode()[0],
    'work_life_balance_rating': 'Unknown',
    'venture_satisfaction': 'Unknown',
}
for col, fill_value in fill_values.items():
    combined[col] = combined[col].fillna(fill_value)

# D. Feature engineering: floor the age at 18, then derive the age at which
#    the founder joined the startup
clipped_age = combined['founder_age'].clip(lower=18)
combined['founder_age'] = clipped_age
combined['start_age'] = clipped_age - combined['years_with_startup']

# E. Ordinal encoding: rating labels become ranks, anything unmapped becomes 0
rating_map = {'Unknown': 0, 'Low': 1, 'Poor': 1, 'Below Average': 2,
              'Fair': 3, 'Medium': 3, 'Average': 3,
              'Good': 4, 'High': 4, 'Very High': 5, 'Excellent': 5}
rating_cols = ['work_life_balance_rating', 'venture_satisfaction',
               'startup_performance_rating', 'startup_reputation']
for col in rating_cols:
    encoded = combined[col].map(rating_map)
    combined[col] = encoded.fillna(0)

# Stage labels become ranks; unmapped stages default to 1
stage_map = {'Entry': 1, 'Mid': 2, 'Senior': 3, 'Growth': 3, 'Established': 4}
stage_rank = combined['startup_stage'].map(stage_map)
combined['startup_stage'] = stage_rank.fillna(1)

# Yes/No flags become 1/0
binary_map = {'No': 0, 'Yes': 1}
for col in ['working_overtime', 'remote_operations']:
    combined[col] = combined[col].map(binary_map)

# F. One-hot encode the nominal columns, dropping the first level of each
nominal_cols = ['founder_gender', 'education_background', 'personal_status']
combined = pd.get_dummies(combined, columns=nominal_cols, drop_first=True)

# --- 3. PREPARE MATRICES ---
print("Scaling Data...")
# Split the combined frame back into its two origins using the flag column
train_rows = combined[combined['is_train'] == 1]
test_rows = combined[combined['is_train'] == 0]
train_final = train_rows.drop(columns=['is_train'])
test_final = test_rows.drop(columns=['is_train', 'retention_status'])

# Label encoding for the target and the reverse lookup used for submissions
target_map = {'Stayed': 0, 'Left': 1}
inverse_map = {0: 'Stayed', 1: 'Left'}

y = train_final['retention_status'].map(target_map)
X = train_final.drop(columns=['retention_status'])
# Give the submission matrix the same column order as the training matrix
X_submit = test_final[X.columns]

# Standardise features: statistics are fitted on the training rows only and
# then applied to both matrices
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)
X_submit_scaled = scaler.transform(X_submit)

# ==============================================================================
# 4. DEFINE MODELS & PARAMETER GRIDS
# ==============================================================================
# Each entry pairs an unfitted estimator with the candidate values that the
# hyperparameter search draws from.
models_config = {
    "GradientBoosting": {
        "estimator": GradientBoostingClassifier(random_state=42),
        "params": {
            "n_estimators": [100, 200, 300],
            "learning_rate": [0.01, 0.05, 0.1],
            "max_depth": [3, 4, 5],
            "subsample": [0.8, 0.9, 1.0],
        },
    },
    "RandomForest": {
        "estimator": RandomForestClassifier(random_state=42),
        "params": {
            "n_estimators": [100, 200, 300],
            "max_depth": [10, 20, None],
            "min_samples_split": [2, 5, 10],
            "min_samples_leaf": [1, 2, 4],
        },
    },
    "SVM": {
        "estimator": SVC(random_state=42),
        "params": {
            "C": [0.1, 1, 10],
            "kernel": ["rbf"],
            "gamma": ["scale", "auto"],
        },
    },
    "NeuralNetwork": {
        "estimator": MLPClassifier(max_iter=500, random_state=42),
        "params": {
            "hidden_layer_sizes": [(50,), (100,), (50, 50), (100, 50)],
            "activation": ["relu", "tanh"],
            "alpha": [0.0001, 0.001],
            "learning_rate_init": [0.001, 0.01],
        },
    },
}

# ==============================================================================
# 5. EXECUTION LOOP
# ==============================================================================
# For every configured model: tune it with a randomized search, predict the
# test set with the best estimator found, and write one submission CSV.
print("\n" + "="*50)
print("   STARTING MULTI-MODEL GENERATION")
print("="*50)

for name, config in models_config.items():
    print(f"\nProcessing Model: {name}...")
    start = time.time()

    # Number of distinct combinations in this model's grid. Asking the search
    # for more iterations than that only triggers a warning (the SVM grid has
    # 6 combinations), so the request is capped at the grid size.
    grid_size = 1
    for candidate_values in config['params'].values():
        grid_size *= len(candidate_values)

    # 1. Hyperparameter Tuning (Random Search)
    search = RandomizedSearchCV(
        estimator=config['estimator'],
        param_distributions=config['params'],
        n_iter=min(10, grid_size),
        cv=3,
        scoring='accuracy',
        n_jobs=-1,
        random_state=42,
        verbose=1
    )

    search.fit(X_scaled, y)
    best_model = search.best_estimator_

    print(f"  -> Best Params: {search.best_params_}")
    print(f"  -> Best CV Score: {search.best_score_:.4%}")

    # 2. Predict on Test Set using the BEST model
    preds = best_model.predict(X_submit_scaled)

    # 3. Save CSV to OUTPUT_DIR, translating numeric classes back to labels
    filename = f"{OUTPUT_DIR}/submission_{name}.csv"
    submission = pd.DataFrame({
        'founder_id': test['founder_id'],
        'retention_status': [inverse_map[p] for p in preds]
    })
    submission.to_csv(filename, index=False)

    elapsed = time.time() - start
    print(f"  -> Saved to '{filename}' (Time: {elapsed:.1f}s)")

print("\n" + "="*50)
print(f"   ALL FILES GENERATED IN {OUTPUT_DIR}/")
print("="*50)

Loading Data...
Applying Forensic Cleaning Pipeline...
Scaling Data...

   STARTING MULTI-MODEL GENERATION

Processing Model: GradientBoosting...
Fitting 3 folds for each of 10 candidates, totalling 30 fits
  -> Best Params: {'subsample': 0.8, 'n_estimators': 200, 'max_depth': 3, 'learning_rate': 0.05}
  -> Best CV Score: 75.3334%
  -> Saved to '../output/submission_GradientBoosting.csv' (Time: 37.0s)

Processing Model: RandomForest...
Fitting 3 folds for each of 10 candidates, totalling 30 fits
  -> Best Params: {'n_estimators': 200, 'min_samples_split': 5, 'min_samples_leaf': 4, 'max_depth': 10}
  -> Best CV Score: 74.7094%
  -> Saved to '../output/submission_RandomForest.csv' (Time: 17.4s)

Processing Model: SVM...
Fitting 3 folds for each of 6 candidates, totalling 18 fits




  -> Best Params: {'kernel': 'rbf', 'gamma': 'scale', 'C': 1}
  -> Best CV Score: 74.2346%
  -> Saved to '../output/submission_SVM.csv' (Time: 236.3s)

Processing Model: NeuralNetwork...
Fitting 3 folds for each of 10 candidates, totalling 30 fits




  -> Best Params: {'learning_rate_init': 0.001, 'hidden_layer_sizes': (50,), 'alpha': 0.001, 'activation': 'relu'}
  -> Best CV Score: 74.0769%
  -> Saved to '../output/submission_NeuralNetwork.csv' (Time: 62.2s)

   ALL FILES GENERATED IN ../output/


In [5]:
import pandas as pd
import numpy as np
import os
from scipy.stats import mode

# --- CONFIGURATION ---
OUTPUT_DIR = '../output'
files = [
    'submission_GradientBoosting.csv',
    'submission_RandomForest.csv',
    'submission_SVM.csv',
    'submission_NeuralNetwork.csv'
]

# Collect one numeric prediction vector per submission file that exists.
print("Loading predictions...")
dfs = []
for f in files:
    path = os.path.join(OUTPUT_DIR, f)
    if not os.path.exists(path):
        print(f"Warning: {f} not found. Skipping.")
        continue

    # Read into a dedicated name so the notebook-wide `df` is not overwritten
    predictions = pd.read_csv(path)
    # Convert to numbers for math (Stayed=0, Left=1)
    retention_numeric = predictions['retention_status'].map({'Stayed': 0, 'Left': 1})
    # An unexpected label maps to NaN and would corrupt the vote; fail loudly
    if retention_numeric.isnull().any():
        raise ValueError(f"{f} contains labels other than 'Stayed'/'Left'.")
    dfs.append(retention_numeric.values)
    print(f"Loaded: {f}")

if not dfs:
    print("No files found!")
    # Abort with a non-zero status; the bare `exit()` helper is intended for
    # the interactive shell and would report success.
    raise SystemExit(1)

# Voting compares predictions row by row, so every file must cover the same
# number of rows.
if len({len(votes) for votes in dfs}) > 1:
    raise ValueError("Submission files have differing numbers of rows.")

# --- HARD VOTING ---
# Stack the per-model label vectors (one row per model) and take the most
# common label in each column, i.e. the majority vote per test row.
stacked_preds = np.array(dfs)
# mode returns (values, counts); only the values are needed.
# axis=0 means looking down the columns (across the models).
# NOTE(review): with an even number of models a tied vote appears to resolve
# to the smallest label (0 = 'Stayed'); confirm against the installed scipy.
final_votes, _ = mode(stacked_preds, axis=0)

# Flatten the array (some scipy versions keep the reduced axis)
final_votes = final_votes.ravel()

# NOTE: true soft voting averages predicted probabilities. The submission
# files loaded here only hold hard 'Stayed'/'Left' labels, so it cannot be
# reproduced from them; averaging these labels and thresholding at 0.5 is
# just another form of majority voting.

# --- SAVE ---
inverse_map = {0: 'Stayed', 1: 'Left'}
# Use the first submission file that actually exists as the template; the
# first configured file may have been skipped during loading.
template_path = next(
    candidate
    for candidate in (os.path.join(OUTPUT_DIR, fname) for fname in files)
    if os.path.exists(candidate)
)
submission_ensemble = pd.read_csv(template_path)
submission_ensemble['retention_status'] = [inverse_map[p] for p in final_votes]

output_path = os.path.join(OUTPUT_DIR, 'submission_Ensemble_Voting.csv')
submission_ensemble.to_csv(output_path, index=False)

print(f"\n✅ Created Ensemble Submission: {output_path}")
print("This file combines the 'wisdom' of all your models.")

Loading predictions...
Loaded: submission_GradientBoosting.csv
Loaded: submission_RandomForest.csv
Loaded: submission_SVM.csv
Loaded: submission_NeuralNetwork.csv

✅ Created Ensemble Submission: ../output/submission_Ensemble_Voting.csv
This file combines the 'wisdom' of all your models.
