In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
import warnings
warnings.filterwarnings('ignore')

# Load the pre-encoded bankruptcy dataset from the working directory.
df = pd.read_csv('encoded_bankruptcy_data.csv')

# Quick look at the first rows and the column dtypes / non-null counts.
print(df.head())
# DataFrame.info() writes its summary to stdout itself and returns None,
# so wrapping it in print() would emit a stray "None" line after the summary.
df.info()

ValueError: numpy.dtype size changed, may indicate binary incompatibility. Expected 96 from C header, got 88 from PyObject

## DATA PREPROCESSING

In [None]:
# Separate the label column from the predictors; 'company_name' is dropped
# from the predictors together with the label itself.
target_column = 'Bankruptcy_Status'
y = df[target_column]
X = df.drop(columns=['company_name', target_column])

# 80/20 hold-out split, stratified on the label and seeded for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Report the size and class balance of each split.
print(f"\nTrain set: {len(X_train)} samples")
print(f"Test set: {len(X_test)} samples")
print(f"\nTrain target distribution:\n{y_train.value_counts()}")
print(f"Test target distribution:\n{y_test.value_counts()}")

## FEATURE ENGINEERING (SCALING)

In [None]:
# Standardise the features. The scaler's statistics are learned from the
# training split only and then re-used, unchanged, on the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Sanity check: overall mean / std of each scaled matrix.
train_mean, train_std = X_train_scaled.mean(), X_train_scaled.std()
test_mean, test_std = X_test_scaled.mean(), X_test_scaled.std()
print(f"\nScaled train set - Mean: {train_mean:.6f}, Std: {train_std:.6f}")
print(f"Scaled test set - Mean: {test_mean:.6f}, Std: {test_std:.6f}")

## OPTIMIZATION FOR IMBALANCED DATA

In [None]:
# Class imbalance handling: XGBoost receives an explicit scale_pos_weight,
# LightGBM uses class_weight instead.
# The weight is the count of label 0 divided by the count of label 1 in the
# training split (both looked up by label value, so both must be present).
train_class_counts = y_train.value_counts()
scale_pos_weight = train_class_counts[0] / train_class_counts[1]
print(f"\nClass imbalance ratio (scale_pos_weight): {scale_pos_weight:.2f}")

## XGBOOST MODEL TRAINING 

In [None]:
# XGBoost: GridSearchCV
#
# `use_label_encoder` is deliberately not passed: recent XGBoost releases have
# deprecated it and ignore it, so supplying it only produces an
# "unused parameter" warning on every fit.
xgb = XGBClassifier(
    objective='binary:logistic',
    scale_pos_weight=scale_pos_weight,  # up-weight the positive class
    eval_metric='logloss',
    random_state=42,
    tree_method='hist',  # Faster training
    # GridSearchCV below already runs candidates/folds in parallel
    # (n_jobs=-1); keeping each individual fit single-threaded avoids
    # oversubscribing the CPU with nested thread pools.
    n_jobs=1,
)

# 2 * 3 * 3 * 2 * 3 = 108 candidates, each fitted on 5 folds.
xgb_params = {
    'n_estimators': [100, 200],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.8, 1.0],
    'min_child_weight': [1, 3, 5]  # Added to prevent overfitting
}


print("\nFitting the XGBoost model using GridSearchCV......")
# Candidates are ranked by mean F1 over the 5 folds; with the default
# refit=True the winner is refitted on the whole training matrix.
xgb_grid = GridSearchCV(
    xgb, xgb_params, cv=5, scoring='f1', n_jobs=-1, verbose=1
)
xgb_grid.fit(X_train_scaled, y_train)
xgb_best = xgb_grid.best_estimator_

print(f"\nBest XGBoost params: {xgb_grid.best_params_}")
print(f"Best XGBoost CV F1 score: {xgb_grid.best_score_:.4f}")


## LightGBM Model Training

In [None]:

# LightGBM: GridSearchCV
#
# `min_data_in_leaf` is deliberately NOT fixed here. It is LightGBM's primary
# name for the parameter that `min_child_samples` aliases, and when both are
# supplied LightGBM keeps the primary name and ignores the alias. Pinning
# `min_data_in_leaf=20` therefore made the `min_child_samples` dimension of
# the grid below a no-op (three identical fits per combination).
lgbm = LGBMClassifier(
    class_weight='balanced',
    random_state=42,
    force_col_wise=True,  # Fix for auto-choosing warnings
    verbose=-1,  # Suppress iteration logs
    min_child_samples=20,  # Prevent overfitting (overridden by the grid below)
    # GridSearchCV below already runs candidates/folds in parallel
    # (n_jobs=-1); keeping each individual fit single-threaded avoids
    # oversubscribing the CPU with nested thread pools.
    n_jobs=1,
)

# 2 * 3 * 3 * 3 * 3 * 3 = 486 candidates, each fitted on 5 folds.
lgbm_params = {
    'n_estimators': [100, 200],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
    'num_leaves': [15, 31, 50],  # Adjusted range
    'min_child_samples': [20, 30, 40],  # Added
    'feature_fraction': [0.8, 0.9, 1.0]  # Added for regularization
}

# Candidates are ranked by mean F1 over the 5 folds; with the default
# refit=True the winner is refitted on the whole training matrix.
lgbm_grid = GridSearchCV(
    lgbm, lgbm_params, cv=5, scoring='f1', n_jobs=-1, verbose=1
)

print("\nFitting LightGBM GridSearchCV...")
lgbm_grid.fit(X_train_scaled, y_train)
lgbm_best = lgbm_grid.best_estimator_

print(f"\nBest LightGBM params: {lgbm_grid.best_params_}")
print(f"Best LightGBM CV F1 score: {lgbm_grid.best_score_:.4f}")



## Cross Validation (Metric => F1-Score) 

In [None]:
# 5-fold cross-validated F1 for each tuned estimator on the scaled training
# data. cross_val_score clones and refits the estimator on every fold.
cv_options = {'cv': 5, 'scoring': 'f1'}
xgb_cv_f1 = cross_val_score(xgb_best, X_train_scaled, y_train, **cv_options)
lgbm_cv_f1 = cross_val_score(lgbm_best, X_train_scaled, y_train, **cv_options)

# Report the mean fold score with a +/- two-standard-deviation band.
xgb_f1_mean, xgb_f1_band = xgb_cv_f1.mean(), xgb_cv_f1.std() * 2
lgbm_f1_mean, lgbm_f1_band = lgbm_cv_f1.mean(), lgbm_cv_f1.std() * 2
print(f"\nXGBoost CV F1: {xgb_f1_mean:.4f} (+/- {xgb_f1_band:.4f})")
print(f"LightGBM CV F1: {lgbm_f1_mean:.4f} (+/- {lgbm_f1_band:.4f})")

## CHECKING THE MODELS ON TEST DATA

In [None]:
# Evaluation on test set

def evaluate(model, X_test, y_test, model_name):
    """Print hold-out metrics for an already-fitted classifier.

    Args:
        model: Fitted estimator; only its ``predict`` method is used.
        X_test: Feature matrix handed to ``model.predict``.
        y_test: True labels matching ``X_test``.
        model_name: Label shown in the banner above the metrics.

    Returns:
        None. Accuracy, precision, recall, F1 and the full classification
        report are written to stdout.
    """
    rule = "=" * 80
    print("\n" + rule)
    print(f"{model_name} TEST SET EVALUATION")
    print(rule)

    y_pred = model.predict(X_test)

    # zero_division=0 reports 0 rather than warning when a metric's
    # denominator is empty (e.g. no positive predictions).
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, zero_division=0)
    recall = recall_score(y_test, y_pred, zero_division=0)
    f1 = f1_score(y_test, y_pred, zero_division=0)

    print(f"\nAccuracy:  {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall:    {recall:.4f}")
    print(f"F1 Score:  {f1:.4f}")

    print("\nClassification Report:")
    print(classification_report(y_test, y_pred, zero_division=0))

# Score both tuned models on the same scaled hold-out split.
for fitted_model, display_name in ((xgb_best, "XGBOOST"), (lgbm_best, "LIGHTGBM")):
    evaluate(fitted_model, X_test_scaled, y_test, display_name)

## SAVING MODELS

In [None]:
import joblib

# Save both trained models
joblib.dump(xgb_best, "xgb_best_model.pkl")
joblib.dump(lgbm_best, "lgbm_best_model.pkl")

# Both models were fitted on StandardScaler-transformed features, so the
# fitted scaler is part of the inference pipeline: new data must go through
# the same transform before predict(). Persist it next to the models so the
# saved artefacts are usable on their own.
joblib.dump(scaler, "scaler.pkl")

print("Models saved successfully!")

In [None]:
# Restore the two tuned models from the pickle files written by the save cell.
# NOTE: joblib.load unpickles arbitrary Python objects, so it should only be
# pointed at files this notebook (or another trusted source) produced.
xgb_model_path = "xgb_best_model.pkl"
lgbm_model_path = "lgbm_best_model.pkl"

xgb_best = joblib.load(xgb_model_path)
lgbm_best = joblib.load(lgbm_model_path)

print("Models loaded successfully!")

## FEATURE IMPORTANCE BY CALCULATING INFORMATION GAIN

In [None]:
from sklearn.feature_selection import mutual_info_classif
from imblearn.over_sampling import SMOTE

# Rank every feature by its mutual information with the target.
print("INFORMATION GAIN (MUTUAL INFORMATION) ANALYSIS")


# The scores are estimated on the scaled training matrix (before any
# resampling); random_state pins the estimator's internal randomness.
info_gain = mutual_info_classif(X_train_scaled, y_train, random_state=42)

# Pair each score with its column name and order from most to least informative.
feature_importance_df = pd.DataFrame(
    {'Feature': X.columns, 'Information_Gain': info_gain}
).sort_values(by='Information_Gain', ascending=False)

top_features = feature_importance_df.head(15)
print("\nTop 15 Features by Information Gain:")
print(top_features.to_string(index=False))

print("\n\nAll Features Information Gain:")
print(feature_importance_df.to_string(index=False))

# Persist the ranked table for later use.
importance_path = "feature_importance.pkl"
joblib.dump(feature_importance_df, importance_path)
print("\nFeature importance saved to 'feature_importance.pkl'")

## Trying SMOTE for better results

In [None]:
print("SMOTE RESAMPLING ANALYSIS")

# Class balance of the training split before resampling.
original_counts = y_train.value_counts()
print("\nOriginal Training Set Distribution:")
print(original_counts)
print(f"Class 0: {original_counts[0]} samples")
print(f"Class 1: {original_counts[1]} samples")
print(f"Imbalance Ratio: {original_counts[0] / original_counts[1]:.2f}:1")

# Oversample with SMOTE (5 nearest neighbours, fixed seed). Only the scaled
# training split is resampled.
smote = SMOTE(random_state=42, k_neighbors=5)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_scaled, y_train)

# Class balance after resampling.
resampled_counts = pd.Series(y_train_resampled).value_counts()
print("\nAfter SMOTE Resampling:")
print(resampled_counts)
print(f"Class 0: {resampled_counts[0]} samples")
print(f"Class 1: {resampled_counts[1]} samples")
print(f"Total samples increased from {len(y_train)} to {len(y_train_resampled)}")

## Evaluate loaded models on SMOTE data

In [None]:
print("EVALUATING PRE-TRAINED MODELS ON SMOTE DATA")

def evaluate_on_smote(model, X_resampled, y_resampled, X_test, y_test, model_name):
    """Print metrics for a fitted model on the resampled set and on the test set.

    NOTE(review): the model is only used for prediction here; it is not
    refitted on the resampled data, so the first group of metrics describes
    how the existing model scores on the SMOTE-augmented training set.

    Args:
        model: Fitted estimator; only its ``predict`` method is used.
        X_resampled: Resampled training features.
        y_resampled: Labels matching ``X_resampled``.
        X_test: Hold-out features.
        y_test: Labels matching ``X_test``.
        model_name: Label used in the printed headings.

    Returns:
        None. Accuracy, precision, recall and F1 for both datasets are
        written to stdout.
    """

    def _report(heading, features, targets):
        # Print the heading, predict, then emit the four summary metrics.
        print(heading)
        predicted = model.predict(features)
        accuracy = accuracy_score(targets, predicted)
        precision = precision_score(targets, predicted, zero_division=0)
        recall = recall_score(targets, predicted, zero_division=0)
        f1 = f1_score(targets, predicted, zero_division=0)
        print(f"Accuracy:  {accuracy:.4f}")
        print(f"Precision: {precision:.4f}")
        print(f"Recall:    {recall:.4f}")
        print(f"F1 Score:  {f1:.4f}")

    _report(f"\n{model_name} - Training Set (SMOTE) Evaluation:", X_resampled, y_resampled)
    _report(f"\n{model_name} - Test Set Evaluation:", X_test, y_test)

# Run the same report for each reloaded model.
for fitted_model, display_name in ((xgb_best, "XGBoost"), (lgbm_best, "LightGBM")):
    evaluate_on_smote(
        fitted_model,
        X_train_resampled,
        y_train_resampled,
        X_test_scaled,
        y_test,
        display_name,
    )