In [None]:
import warnings

import joblib
import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from lightgbm import LGBMClassifier
from sklearn.feature_selection import mutual_info_classif
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier
warnings.filterwarnings('ignore')

# Load the bankruptcy dataset from the Kaggle input directory.
df = pd.read_csv('/kaggle/input/encoded-bankruptcy-data/encoded_bankruptcy_data.csv')

# Quick look at the first rows, then the column summary.
print(df.head())
# DataFrame.info() writes its summary straight to stdout and returns None,
# so it must not be wrapped in print() (that emitted a stray "None" line).
df.info()

# Separate the label column from the predictors; the company name column is
# removed from the predictors together with the label.
y = df['Bankruptcy_Status']
X = df.drop(columns=['company_name', 'Bankruptcy_Status'])

# Hold out 20% of the rows for testing. The split is stratified on `y` and
# seeded so that it is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

print(f"\nTrain set: {len(X_train)} samples")
print(f"Test set: {len(X_test)} samples")
print("\nTrain target distribution BEFORE SMOTE:", y_train.value_counts(), sep="\n")
print("Test target distribution:", y_test.value_counts(), sep="\n")

# Standardise the features before any oversampling. The scaler statistics are
# learned from the training partition only and then reused for the test
# partition.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Report the overall mean / standard deviation of each scaled matrix.
print(
    f"\nScaled train set - Mean: {X_train_scaled.mean():.6f}, "
    f"Std: {X_train_scaled.std():.6f}"
)
print(
    f"Scaled test set - Mean: {X_test_scaled.mean():.6f}, "
    f"Std: {X_test_scaled.std():.6f}"
)

# Oversample the scaled training partition with SMOTE so both classes are
# equally represented.
# NOTE(review): the synthetic rows are interpolated from the whole training
# partition, so cross-validation run directly on this resampled set will score
# optimistically.
print(f"\n{'=' * 80}")
print("APPLYING SMOTE FOR CLASS IMBALANCE")
print(f"{'=' * 80}")

smote = SMOTE(k_neighbors=5, random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_scaled, y_train)

print(
    "\nTrain target distribution AFTER SMOTE:",
    pd.Series(y_train_resampled).value_counts(),
    sep="\n",
)
print(f"Original train samples: {len(X_train_scaled)}")
print(f"Resampled train samples: {len(X_train_resampled)}")

# Rank every feature by its mutual information with the target.
print(f"\n{'=' * 80}")
print("INFORMATION GAIN (MUTUAL INFORMATION) FOR FEATURES")
print(f"{'=' * 80}")

# Estimated on the scaled training partition as it was before SMOTE.
info_gain = mutual_info_classif(X_train_scaled, y_train, random_state=42)

# One row per feature, highest information gain first.
feature_importance_df = pd.DataFrame(
    {'Feature': X.columns, 'Information_Gain': info_gain}
)
feature_importance_df = feature_importance_df.sort_values(
    by='Information_Gain', ascending=False
)

print("\nFeature Information Gain Rankings:")
print(feature_importance_df.to_string(index=False))

# Print (not plot) the ten highest-ranked features in a fixed-width layout.
print("\n\nTop 10 Features by Information Gain:")
top_features = feature_importance_df.head(10)
for feature_name, gain in zip(top_features['Feature'], top_features['Information_Gain']):
    print(f"{feature_name:30s}: {gain:.6f}")

# Ratio of label-0 rows to label-1 rows in the training labels before SMOTE.
# In the code visible here the value is only printed for reference.
class_counts = y_train.value_counts()
scale_pos_weight = class_counts[0] / class_counts[1]
print(f"\nOriginal class imbalance ratio (scale_pos_weight): {scale_pos_weight:.2f}")
print("Note: Using SMOTE, so class imbalance is now balanced in training data")

# XGBoost: GridSearchCV with SMOTE applied inside every CV training fold
print("\n" + "=" * 80)
print("TRAINING XGBOOST WITH SMOTE DATA")
print("=" * 80)

xgb = XGBClassifier(
    objective='binary:logistic',
    use_label_encoder=False,
    eval_metric='logloss',
    random_state=42,
    tree_method='hist'
)

xgb_params = {
    'n_estimators': [100, 200],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.8, 1.0],
    'min_child_weight': [1, 3, 5]
}

# Oversampling must happen inside the cross-validation loop. Searching on a
# training set that was SMOTE-resampled *before* being split into folds puts
# synthetic neighbours of the validation rows into the training folds, which
# inflates the CV F1 score and biases the hyper-parameter choice. The imblearn
# Pipeline resamples during fit only, so each validation fold keeps its
# original, imbalanced distribution.
xgb_pipeline = Pipeline([
    ('smote', SMOTE(random_state=42, k_neighbors=5)),
    ('clf', xgb),
])
# Pipeline parameters are addressed as "<step>__<param>".
xgb_pipeline_params = {f"clf__{name}": values for name, values in xgb_params.items()}

print("\nFitting XGBoost GridSearchCV...")
xgb_grid = GridSearchCV(
    xgb_pipeline, xgb_pipeline_params, cv=5, scoring='f1', n_jobs=-1, verbose=1
)
xgb_grid.fit(X_train_scaled, y_train)

# With the default refit=True the winning pipeline is refitted on the whole
# training partition (SMOTE, then the classifier). Keep only the fitted
# classifier so `xgb_best` remains a plain XGBClassifier for prediction and
# for pickling.
xgb_best = xgb_grid.best_estimator_.named_steps['clf']

# Strip the "clf__" step prefix so the report shows plain XGBoost names.
xgb_best_params = {
    name.split('__', 1)[1]: value for name, value in xgb_grid.best_params_.items()
}
print(f"\nBest XGBoost params: {xgb_best_params}")
print(f"Best XGBoost CV F1 score: {xgb_grid.best_score_:.4f}")

# LightGBM: GridSearchCV with SMOTE applied inside every CV training fold
print("\n" + "=" * 80)
print("TRAINING LIGHTGBM WITH SMOTE DATA")
print("=" * 80)

lgbm = LGBMClassifier(
    random_state=42,
    force_col_wise=True,
    verbose=-1,
    # Only the scikit-learn style name is set. `min_data_in_leaf` is the
    # native LightGBM name for the same setting; when both are supplied
    # LightGBM keeps `min_data_in_leaf` and ignores `min_child_samples`,
    # which silently disabled the `min_child_samples` grid below.
    min_child_samples=20
)

lgbm_params = {
    'n_estimators': [100, 200],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
    'num_leaves': [15, 31, 50],
    'min_child_samples': [20, 30, 40],
    'feature_fraction': [0.8, 0.9, 1.0]
}

# Oversampling must happen inside the cross-validation loop. Searching on a
# training set that was SMOTE-resampled *before* being split into folds puts
# synthetic neighbours of the validation rows into the training folds, which
# inflates the CV F1 score and biases the hyper-parameter choice. The imblearn
# Pipeline resamples during fit only, so each validation fold keeps its
# original, imbalanced distribution.
lgbm_pipeline = Pipeline([
    ('smote', SMOTE(random_state=42, k_neighbors=5)),
    ('clf', lgbm),
])
# Pipeline parameters are addressed as "<step>__<param>".
lgbm_pipeline_params = {f"clf__{name}": values for name, values in lgbm_params.items()}

print("\nFitting LightGBM GridSearchCV...")
lgbm_grid = GridSearchCV(
    lgbm_pipeline, lgbm_pipeline_params, cv=5, scoring='f1', n_jobs=-1, verbose=1
)
lgbm_grid.fit(X_train_scaled, y_train)

# With the default refit=True the winning pipeline is refitted on the whole
# training partition (SMOTE, then the classifier). Keep only the fitted
# classifier so `lgbm_best` remains a plain LGBMClassifier for prediction and
# for pickling.
lgbm_best = lgbm_grid.best_estimator_.named_steps['clf']

# Strip the "clf__" step prefix so the report shows plain LightGBM names.
lgbm_best_params = {
    name.split('__', 1)[1]: value for name, value in lgbm_grid.best_params_.items()
}
print(f"\nBest LightGBM params: {lgbm_best_params}")
print(f"Best LightGBM CV F1 score: {lgbm_grid.best_score_:.4f}")

# Cross-validation scores
# Each tuned model is scored on the original (imbalanced) scaled training
# partition, with SMOTE re-fitted inside every training fold by the pipeline.
# Scoring on data that was resampled before the fold split would let synthetic
# neighbours of the validation rows into training and overstate F1.
# cross_val_score clones the estimator, so the fitted models are not modified.
xgb_cv_f1 = cross_val_score(
    Pipeline([('smote', SMOTE(random_state=42, k_neighbors=5)), ('clf', xgb_best)]),
    X_train_scaled, y_train, cv=5, scoring='f1'
)
lgbm_cv_f1 = cross_val_score(
    Pipeline([('smote', SMOTE(random_state=42, k_neighbors=5)), ('clf', lgbm_best)]),
    X_train_scaled, y_train, cv=5, scoring='f1'
)

# Mean F1 across folds with a two-standard-deviation spread.
print(f"\nXGBoost CV F1: {xgb_cv_f1.mean():.4f} (+/- {xgb_cv_f1.std() * 2:.4f})")
print(f"LightGBM CV F1: {lgbm_cv_f1.mean():.4f} (+/- {lgbm_cv_f1.std() * 2:.4f})")

# Evaluation on test set
def evaluate(model, X_test, y_test, model_name):
    """Print classification metrics for a fitted model on held-out data.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_test: Feature matrix handed to ``model.predict``.
        y_test: True labels aligned with ``X_test``.
        model_name: Label shown in the section banner.

    Returns:
        None. Accuracy, precision, recall, F1 and the full classification
        report are written to stdout.
    """
    banner = "=" * 80
    print("\n" + banner)
    print(f"{model_name} TEST SET EVALUATION")
    print(banner)

    predictions = model.predict(X_test)

    # (output prefix, metric function, extra keyword arguments); precision,
    # recall and F1 are called with zero_division=0, accuracy takes no such
    # argument.
    metric_rows = (
        ("\nAccuracy:  ", accuracy_score, {}),
        ("Precision: ", precision_score, {"zero_division": 0}),
        ("Recall:    ", recall_score, {"zero_division": 0}),
        ("F1 Score:  ", f1_score, {"zero_division": 0}),
    )
    for prefix, metric, extra_kwargs in metric_rows:
        print(f"{prefix}{metric(y_test, predictions, **extra_kwargs):.4f}")

    print("\nClassification Report:")
    print(classification_report(y_test, predictions, zero_division=0))

# Report test-set metrics for both tuned models on the scaled hold-out split.
for fitted_model, display_name in ((xgb_best, "XGBOOST"), (lgbm_best, "LIGHTGBM")):
    evaluate(fitted_model, X_test_scaled, y_test, display_name)

# Persist both tuned models, the fitted scaler and the feature ranking.
print(f"\n{'=' * 80}")
print("SAVING MODELS AND PREPROCESSORS")
print(f"{'=' * 80}")

# File name -> object, written in this order to the working directory.
artifacts = {
    "xgb_best_model.pkl": xgb_best,
    "lgbm_best_model.pkl": lgbm_best,
    "scaler.pkl": scaler,
    "feature_importance.pkl": feature_importance_df,
}
for filename, artifact in artifacts.items():
    joblib.dump(artifact, filename)

print("\nModels saved successfully!")
for filename in artifacts:
    print(f"- {filename}")

# Reload the artifacts written above from the working directory.
# NOTE: joblib.load unpickles its input; only use it on files you trust.
print(f"\n{'=' * 80}")
print("LOADING MODELS")
print(f"{'=' * 80}")

xgb_best, lgbm_best, scaler, feature_importance_df = (
    joblib.load(path)
    for path in (
        "xgb_best_model.pkl",
        "lgbm_best_model.pkl",
        "scaler.pkl",
        "feature_importance.pkl",
    )
)

print("\nModels loaded successfully!")
print("Top 5 most important features:")
print(feature_importance_df.head().to_string(index=False))

ImportError: cannot import name 'tarfile_extractall' from 'sklearn.utils.fixes' (c:\Users\kusha\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\utils\fixes.py)