In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
import warnings

# Silence every warning category for the rest of the process.
# NOTE(review): this also hides deprecation/future warnings raised by the
# imported libraries -- consider removing it while debugging.
warnings.filterwarnings("ignore")  # Optional: Suppress warnings

# ------------------------------
# 1. Data Loading & Cleaning
# ------------------------------
def load_data(filename):
    """Read a CSV file and return a DataFrame with gaps imputed.

    The 'Name' column is dropped when present. Infinite values are treated
    as missing, and every missing numeric value is filled with the median
    of its column. Non-numeric columns are left as they are.

    Args:
        filename: Path (or anything ``pandas.read_csv`` accepts) of the CSV.

    Returns:
        The cleaned ``pandas.DataFrame``.
    """
    raw = pd.read_csv(filename)
    without_name = raw.drop(columns=['Name'], errors='ignore')

    # +/-inf cannot be imputed directly, so convert them to NaN first.
    finite_only = without_name.replace([np.inf, -np.inf], np.nan)

    column_medians = finite_only.median(numeric_only=True)
    return finite_only.fillna(column_medians)

# ------------------------------
# 2. Feature Engineering: Altman Z‑Score
# ------------------------------
def compute_altman_z_score(df):
    """Add "Total Assets", "Market Value Equity" and "Altman_Z" columns to df.

    The frame is modified in place and also returned. All required input
    columns are validated before anything is written, so a failed call
    leaves ``df`` untouched.

    Args:
        df: DataFrame containing the columns "Sales",
            "Sales / Net fixed assets", "Net working capital",
            "Retained profits/losses during the year", "PBIT" and
            "Total liabilities".

    Returns:
        The same DataFrame object, with the three derived columns added.
        Rows whose score cannot be computed (zero or missing denominators)
        receive the median of the computable scores.

    Raises:
        ValueError: If any required input column is missing.
    """
    # Validate every input up front so an error never leaves df half-populated.
    if "Sales" not in df.columns or "Sales / Net fixed assets" not in df.columns:
        raise ValueError("Missing required column to compute Total Assets.")

    input_cols = (
        "Net working capital",
        "Retained profits/losses during the year",
        "PBIT",
        "Total liabilities",
    )
    for col in input_cols:
        if col not in df.columns:
            raise ValueError(f"Missing required column for Altman Z-Score: {col}")

    # Estimate Total Assets. A zero ratio becomes NaN to avoid dividing by zero.
    # NOTE(review): Sales / (Sales / Net fixed assets) algebraically recovers
    # net fixed assets, not total assets -- confirm this proxy is intended.
    total_assets = df["Sales"] / df["Sales / Net fixed assets"].replace(0, np.nan)
    df["Total Assets"] = total_assets

    # Market Value of Equity ≈ Net Working Capital + Total Assets
    df["Market Value Equity"] = df["Net working capital"] + total_assets

    # Total Assets can still be 0 (when Sales is 0); those divisions yield
    # inf/NaN here and are cleaned up just below.
    df["Altman_Z"] = (
        1.2 * df["Net working capital"] / total_assets +
        1.4 * df["Retained profits/losses during the year"] / total_assets +
        3.3 * df["PBIT"] / total_assets +
        0.6 * df["Market Value Equity"] / df["Total liabilities"].replace(0, np.nan) +
        1.0 * df["Sales"] / total_assets
    )

    # Replace resulting inf values across the frame with NaN.
    df.replace([np.inf, -np.inf], np.nan, inplace=True)

    # Assign the filled column back instead of calling fillna(inplace=True) on
    # the column selection: under pandas Copy-on-Write the in-place form only
    # updates a temporary object and leaves df unchanged.
    df["Altman_Z"] = df["Altman_Z"].fillna(df["Altman_Z"].median())

    return df

# ------------------------------
# 3. Feature Vector Construction
# ------------------------------
def build_feature_vector(df):
    """Split df into a cleaned feature matrix and the "Label" target.

    Only the candidate feature columns that actually exist in ``df`` are
    used; absent ones are skipped without error. Infinite values in the
    selected features are treated as missing, and missing numeric values
    are filled with the column median. ``df`` itself is not modified.

    Args:
        df: DataFrame holding a "Label" column plus any of the candidate
            feature columns.

    Returns:
        Tuple ``(X, y)``: the cleaned feature DataFrame and the "Label"
        Series.

    Raises:
        KeyError: If ``df`` has no "Label" column.
    """
    candidate_columns = (
        "Altman_Z",
        "Total Assets",
        "Net working capital",
        "Retained profits/losses during the year",
        "PBIT",
        "Market Value Equity",
        "Total liabilities",
        "Sales",
    )
    available = [name for name in candidate_columns if name in df.columns]

    # Final check: no inf or nan in X. replace() returns a new frame, so the
    # caller's df is left untouched.
    selected = df[available].replace([np.inf, -np.inf], np.nan)
    X = selected.fillna(selected.median(numeric_only=True))

    y = df["Label"]
    return X, y

# ------------------------------
# 4. Model Training & Tuning
# ------------------------------
def train_and_tune_model(X_train, y_train, model_type="rf"):
    """Grid-search a scaled, SMOTE-resampled classifier and return the best one.

    The pipeline standardises every column of ``X_train``, oversamples with
    SMOTE, and then fits the chosen classifier. Hyper-parameters are
    selected with 5-fold cross-validation scored by macro-averaged F1, and
    the winning parameter set is printed.

    Args:
        X_train: Training features; must expose ``.columns`` (a DataFrame).
        y_train: Training labels.
        model_type: "rf" for a random forest or "svm" for an RBF-kernel SVC.

    Returns:
        The best pipeline found by the grid search, refit on the training
        data.

    Raises:
        ValueError: If ``model_type`` is neither "rf" nor "svm".
    """
    # Every column goes through the same scaler.
    scaling = ColumnTransformer(transformers=[
        ("num", Pipeline(steps=[("scaler", StandardScaler())]), X_train.columns)
    ])

    if model_type not in ("rf", "svm"):
        raise ValueError("Invalid model type. Choose 'rf' or 'svm'.")

    if model_type == "svm":
        estimator = SVC(kernel="rbf", class_weight="balanced", probability=True, random_state=42)
        search_space = {
            "classifier__C": [0.1, 1],
            "classifier__gamma": [0.01, 0.1],
        }
    else:
        estimator = RandomForestClassifier(random_state=42)
        search_space = {
            "classifier__n_estimators": [100, 200],
            "classifier__max_depth": [None, 10],
            "classifier__min_samples_split": [2, 5],
            "classifier__class_weight": [None, "balanced"],
        }

    # SMOTE sits inside the imblearn pipeline, so it takes part in each
    # cross-validation fit together with the scaler and classifier.
    steps = [
        ("preprocessor", scaling),
        ("smote", SMOTE(random_state=42)),
        ("classifier", estimator),
    ]
    search = GridSearchCV(
        ImbPipeline(steps=steps),
        search_space,
        cv=5,
        scoring="f1_macro",
        n_jobs=-1,
        verbose=1,
    )
    search.fit(X_train, y_train)

    print(f"Best parameters for {model_type.upper()}:\n", search.best_params_)
    return search.best_estimator_

# ------------------------------
# 5. Main Entry Point
# ------------------------------
def main():
    """Run the pipeline: load data, engineer features, train and evaluate.

    Loads "cleaned_financial_data.csv", adds the Altman Z-Score features,
    makes a stratified 80/20 train/test split, then tunes and evaluates a
    random forest followed by an SVM. A classification report and confusion
    matrix are printed for each. If the Z-Score cannot be computed, the
    error is printed and the function returns early.
    """
    df = load_data("cleaned_financial_data.csv")

    try:
        df = compute_altman_z_score(df)
    except ValueError as e:
        print("Error computing Altman Z-Score:", e)
        return

    X, y = build_feature_vector(df)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, stratify=y, test_size=0.2, random_state=42
    )

    # Train and evaluate each model in turn: (model key, report heading).
    runs = (
        ("rf", "\nRandom Forest Classification Report:"),
        ("svm", "\nSVM Classification Report:"),
    )
    for model_key, heading in runs:
        fitted = train_and_tune_model(X_train, y_train, model_type=model_key)
        predictions = fitted.predict(X_test)
        print(heading)
        print(classification_report(y_test, predictions))
        print("Confusion Matrix:\n", confusion_matrix(y_test, predictions))

# Run the pipeline only when this file is executed directly, not on import.
if __name__ == "__main__":
    main()


Fitting 5 folds for each of 16 candidates, totalling 80 fits
Best parameters for RF:
 {'classifier__class_weight': None, 'classifier__max_depth': None, 'classifier__min_samples_split': 2, 'classifier__n_estimators': 100}

Random Forest Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.98      0.99     10867
           1       0.26      0.57      0.36       149

    accuracy                           0.97     11016
   macro avg       0.63      0.77      0.67     11016
weighted avg       0.98      0.97      0.98     11016

Confusion Matrix:
 [[10626   241]
 [   64    85]]
Fitting 5 folds for each of 4 candidates, totalling 20 fits
