In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.metrics import classification_report, precision_recall_curve
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import make_pipeline

# Read the raw CSV and turn literal '#N/A' cells into NaN
df = pd.read_csv('sp500_esg_datafinal.csv')
df = df.replace('#N/A', np.nan)

# Keep only rows that carry a label, then remove exact duplicate rows
df = df.dropna(subset=['Interpretation'])
df = df.drop_duplicates()

# Ordinal target: the position in this list becomes the integer code
# (0 = low, 1 = moderate, 2 = high)
risk_levels = [
    'Low Risk of Bankruptcy',
    'Moderate Risk of Bankruptcy',
    'High Risk of Bankruptcy',
]
encoder = OrdinalEncoder(categories=[risk_levels])
df['Risk'] = encoder.fit_transform(df[['Interpretation']]).astype(int)

# Enhanced feature engineering
def create_features(df):
    """Return a copy of ``df`` with asset-scaled ratios and ESG interaction terms.

    Added columns:
        Liquidity, Profitability, Leverage, EarningsRetention,
        RevenueEfficiency: the corresponding balance-sheet / income item
            divided by ``totalAssets``.
        ESG_Profitability, Controversy_Leverage, Gov_Revenue: products of an
            ESG column with one of the ratios above.

    Rows whose ``totalAssets`` is exactly zero get NaN (not +/-inf) in every
    derived column, so a downstream imputer can fill them instead of the
    pipeline failing on infinite values.

    Args:
        df: DataFrame containing the columns 'Working Capital', 'EBIT',
            'totalDebt', 'retainedEarnings', 'totalRevenue', 'totalAssets',
            'totalEsg', 'highestControversy' and 'governanceScore'.

    Returns:
        A new DataFrame; the input is not modified.
    """
    df = df.copy()

    # Mask zero denominators: x / 0 would otherwise produce +/-inf.
    total_assets = df['totalAssets'].where(df['totalAssets'] != 0)

    df['Liquidity'] = df['Working Capital'] / total_assets
    df['Profitability'] = df['EBIT'] / total_assets
    df['Leverage'] = df['totalDebt'] / total_assets
    df['EarningsRetention'] = df['retainedEarnings'] / total_assets
    df['RevenueEfficiency'] = df['totalRevenue'] / total_assets

    # Interaction terms between ESG scores and the financial ratios
    df['ESG_Profitability'] = df['totalEsg'] * df['Profitability']
    df['Controversy_Leverage'] = df['highestControversy'] * df['Leverage']
    df['Gov_Revenue'] = df['governanceScore'] * df['RevenueEfficiency']

    return df

# Add the derived ratio and interaction columns to the working frame
df = create_features(df)

# Column groups fed to the models
financial_features = [
    'Liquidity',
    'Profitability',
    'Leverage',
    'EarningsRetention',
    'RevenueEfficiency',
]

esg_features = [
    'totalEsg',
    'highestControversy',
    'ESG_Profitability',
    'Controversy_Leverage',
    'Gov_Revenue',
]

# Custom threshold tuning function
def tune_thresholds(model, X_test, y_test):
    """Pick, per class, the probability threshold that maximizes one-vs-rest F1.

    Args:
        model: Fitted classifier exposing ``predict_proba``.
        X_test: Features passed straight to ``model.predict_proba``; they must
            already be in the form the model was trained on.
        y_test: True integer class labels aligned with ``X_test``. Label ``i``
            is compared against column ``i`` of the probability matrix.

    Returns:
        dict mapping class index to its chosen threshold, inserted in
        ascending class order. Falls back to 0.5 for a class when the best
        F1 sits at the final precision/recall point, which has no threshold.
    """
    # Score once up front: predict_proba does not depend on the class index.
    probabilities = np.asarray(model.predict_proba(X_test))
    labels = np.asarray(y_test)

    thresholds = {}
    # One column per class, so the class count follows the model output.
    for i in range(probabilities.shape[1]):
        precision, recall, thresh = precision_recall_curve(
            labels == i, probabilities[:, i]
        )
        # Small epsilon keeps the ratio defined when precision + recall == 0
        f1 = 2 * (precision * recall) / (precision + recall + 1e-8)
        best_idx = np.argmax(f1)
        # precision/recall carry one more entry than thresh
        thresholds[i] = thresh[best_idx] if best_idx < len(thresh) else 0.5
    return thresholds

# Model pipeline
def run_model(X, y, model_name='Financial'):
    """Fit an impute -> SMOTE -> scale -> XGBoost pipeline and print its report.

    The data is split 80/20 (stratified, fixed seed), the pipeline is fitted
    on the training part, per-class thresholds are obtained from
    ``tune_thresholds`` and predictions are taken as the argmax of
    probability / threshold. A classification report is printed.

    Args:
        X: Feature matrix (DataFrame or array).
        y: Integer class labels aligned with ``X``.
        model_name: Label used in the printed header.

    Returns:
        The fitted imblearn pipeline.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    # SMOTE targets: lift every class to at least max(25, 2 * smallest class),
    # never below its current size (SMOTE can only add samples).
    unique_classes, class_counts = np.unique(y_train, return_counts=True)
    min_samples = max(25, int(2 * np.min(class_counts)))
    sampling_strategy = {
        cls: max(min_samples, count)
        for cls, count in zip(unique_classes, class_counts)
    }

    pipeline = make_pipeline(
        SimpleImputer(strategy='median'),
        SMOTE(sampling_strategy=sampling_strategy, random_state=42),
        StandardScaler(),
        XGBClassifier(objective='multi:softmax', random_state=42)
    )
    pipeline.fit(X_train, y_train)

    # Run the preprocessing steps over the test features once and reuse the
    # result for both threshold tuning and the final predictions.
    classifier = pipeline[-1]
    X_test_prepared = pipeline[:-1].transform(X_test)

    # NOTE(review): thresholds are tuned on the same test split that is
    # reported below, so the printed metrics are optimistically biased.
    # Tune on a separate validation split before relying on these numbers.
    thresholds = tune_thresholds(classifier, X_test_prepared, y_test)

    y_probs = classifier.predict_proba(X_test_prepared)
    # Index thresholds by class explicitly rather than relying on dict order
    threshold_vector = np.array(
        [thresholds[i] for i in range(y_probs.shape[1])]
    )
    y_pred = np.argmax(y_probs / threshold_vector, axis=1)

    print(f"\n{model_name} Model Performance:")
    print(classification_report(y_test, y_pred, target_names=encoder.categories_[0]))

    return pipeline

# Baseline: financial ratios only
print("=== Financial Model ===")
fin_pipeline = run_model(
    X=df[financial_features],
    y=df['Risk'],
    model_name='Financial',
)

# Same target, financial ratios plus the ESG-derived columns
print("\n=== ESG Model ===")
esg_pipeline = run_model(
    X=df[[*financial_features, *esg_features]],
    y=df['Risk'],
    model_name='ESG',
)


=== Financial Model ===

Financial Model Performance:
                             precision    recall  f1-score   support

     Low Risk of Bankruptcy       0.85      0.93      0.89        56
Moderate Risk of Bankruptcy       0.36      0.36      0.36        11
    High Risk of Bankruptcy       1.00      0.74      0.85        19

                   accuracy                           0.81        86
                  macro avg       0.74      0.68      0.70        86
               weighted avg       0.82      0.81      0.81        86


=== ESG Model ===

ESG Model Performance:
                             precision    recall  f1-score   support

     Low Risk of Bankruptcy       0.88      0.91      0.89        56
Moderate Risk of Bankruptcy       0.33      0.27      0.30        11
    High Risk of Bankruptcy       0.79      0.79      0.79        19

                   accuracy                           0.80        86
                  macro avg       0.67      0.66      0.66        86
 