In [4]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix

# =============================================================================
# 1. Data Loading & Preprocessing
# =============================================================================
def load_data(filepath):
    """Read the cleaned/pivoted dataset from a CSV file.

    Args:
        filepath: Path (or any source accepted by ``pandas.read_csv``) of the
            CSV file to read.

    Returns:
        The parsed contents as a ``pandas.DataFrame``.
    """
    return pd.read_csv(filepath)

def clean_data(df):
    """Return a cleaned copy of ``df`` with infinities and gaps imputed.

    Steps:
      1. Drop the identifier column ``Name`` if it is present.
      2. Turn every ``+inf`` / ``-inf`` into NaN.
      3. Replace NaNs in numeric columns with that column's median.

    Non-numeric columns are passed through unchanged, and the caller's frame
    is never modified.

    Args:
        df: Raw DataFrame as loaded from the CSV.

    Returns:
        A new DataFrame with the steps above applied.
    """
    # errors="ignore" keeps the function usable on frames that have already
    # been stripped of the identifier column.
    cleaned = df.drop(columns=["Name"], errors="ignore")

    # Infinities would distort the medians, so convert them to NaN first.
    cleaned = cleaned.replace([np.inf, -np.inf], np.nan)

    # numeric_only=True: without it, a leftover text column makes recent
    # pandas versions raise instead of silently skipping that column.
    return cleaned.fillna(cleaned.median(numeric_only=True))

# =============================================================================
# 2. Feature Engineering
# =============================================================================
def compute_altman_zscore(df):
    """Compute a proxy Altman Z-score for every row of ``df``.

    The classical Altman Z-score for manufacturing firms is

        Z = 1.2*(WC/TA) + 1.4*(RE/TA) + 3.3*(EBIT/TA) + 0.6*(MVE/TL) + 1.0*(Sales/TA)

    The exact inputs are approximated with these columns (adjust as needed):
      - Working Capital (WC): "Cash balance_11"
      - Total Assets (TA): "Net property, plant and equipment_11"
        + "Long term investments_11"
      - Retained Earnings (RE): "Profit after tax reported by company_11"
      - EBIT: "PBIT_11"
      - Market Value of Equity (MVE): "Total income_11"
      - Total Liabilities (TL): "Long term loans & advances_11"
      - Sales: "Sales_11"

    A column that is absent from ``df`` is treated as zero for every row.
    Rows whose score is undefined (TA or TL equal to zero, or a NaN input)
    receive a score of 0.

    Args:
        df: DataFrame holding (some of) the columns listed above.

    Returns:
        A ``pandas.Series`` of scores sharing ``df``'s index.
    """
    # The fallback must carry df's index: pandas aligns operands on index
    # labels, so a length-1 default would turn every other row into NaN.
    zeros = pd.Series(0.0, index=df.index)

    WC = df.get("Cash balance_11", zeros)
    TA = (
        df.get("Net property, plant and equipment_11", zeros)
        + df.get("Long term investments_11", zeros)
    )
    RE = df.get("Profit after tax reported by company_11", zeros)
    EBIT = df.get("PBIT_11", zeros)
    MVE = df.get("Total income_11", zeros)
    TL = df.get("Long term loans & advances_11", zeros)
    Sales = df.get("Sales_11", zeros)

    # Zero denominators become NaN so the division yields NaN, not +/-inf.
    TA = TA.replace(0, np.nan)
    TL = TL.replace(0, np.nan)

    z = (
        1.2 * (WC / TA)
        + 1.4 * (RE / TA)
        + 3.3 * (EBIT / TA)
        + 0.6 * (MVE / TL)
        + 1.0 * (Sales / TA)
    )
    # Undefined scores are reported as 0.
    return z.fillna(0)

def add_new_features(df):
    """Append engineered columns to ``df`` and return it.

    Columns added (``df`` is modified in place):
      - ``Income_Growth``: relative change from "Total income_1" to
        "Total income_11"; only added when both columns exist.
      - ``Profit_Growth``: relative change from "Profit after tax_1" to
        "Profit after tax_11"; only added when both columns exist.
      - ``Altman_Z``: result of ``compute_altman_zscore(df)``.

    A growth value is ``(last - first) / first``, or 0 where ``first`` is 0.

    Args:
        df: DataFrame holding the yearly financial columns.

    Returns:
        The same DataFrame object, with the new columns attached.
    """
    # (new column, first-year column, last-year column)
    growth_specs = (
        ("Income_Growth", "Total income_1", "Total income_11"),
        ("Profit_Growth", "Profit after tax_1", "Profit after tax_11"),
    )

    for target, first_col, last_col in growth_specs:
        if first_col not in df.columns or last_col not in df.columns:
            continue
        base = df[first_col]
        # A zero base has no meaningful growth rate; report 0 for those rows.
        df[target] = np.where(base == 0, 0, (df[last_col] - base) / base)

    df["Altman_Z"] = compute_altman_zscore(df)
    return df

# =============================================================================
# 3. Feature Vector Selection
# =============================================================================
def select_features(df):
    """Return the names of the model features that are present in ``df``.

    The candidate list is fixed below; tune it based on exploratory analysis.
    Candidates missing from ``df`` are skipped, and the candidate order is
    preserved in the result.

    Args:
        df: DataFrame whose columns are checked against the candidates.

    Returns:
        A list of column names.
    """
    candidates = (
        "Total income_11",
        "Sales_11",
        "Net sales_11",
        "Profit after tax_11",
        "Sales / Net fixed assets_11",
        "Current ratio_11",
        "Quick ratio_11",
        "Cash to current liabilities (times)_11",
        "Income_Growth",
        "Profit_Growth",
        "Altman_Z",
    )
    columns = df.columns
    return [name for name in candidates if name in columns]

# =============================================================================
# 4. Model Building and Hyperparameter Optimization
# =============================================================================
def build_and_optimize_models(X_train, y_train):
    """Tune a Random Forest and an RBF SVM with grid search.

    Each classifier sits at the end of an imbalanced-learn pipeline:
    ``StandardScaler`` -> ``SMOTE(random_state=42)`` -> classifier. Every
    pipeline is tuned with ``GridSearchCV`` (5-fold CV, ``"f1"`` scoring, all
    cores), and the best parameters found are printed.

    Args:
        X_train: Training feature matrix.
        y_train: Training labels.

    Returns:
        A tuple ``(best_rf, best_svm)`` holding the best estimator found by
        each grid search.
    """

    def _tune(message, classifier, grid):
        """Grid-search ``classifier`` inside the shared pipeline."""
        pipeline = ImbPipeline([
            ("scaler", StandardScaler()),
            ("smote", SMOTE(random_state=42)),
            ("classifier", classifier),
        ])
        search = GridSearchCV(
            pipeline, grid, cv=5, scoring="f1", n_jobs=-1, verbose=1
        )
        search.fit(X_train, y_train)
        print(message, search.best_params_)
        return search.best_estimator_

    # ---- Random Forest ----
    rf_grid = {
        "classifier__n_estimators": [100, 200, 300],
        "classifier__max_depth": [None, 10, 20],
        "classifier__min_samples_split": [2, 5, 10],
        "classifier__class_weight": [None, "balanced"],
    }
    best_rf = _tune(
        "Random Forest Best Params:",
        RandomForestClassifier(random_state=42, n_jobs=-1),
        rf_grid,
    )

    # ---- SVM ----
    svm_grid = {
        "classifier__C": [0.1, 1, 10],
        "classifier__gamma": [0.001, 0.01, 0.1],
    }
    best_svm = _tune(
        "SVM Best Params:",
        SVC(kernel="rbf", class_weight="balanced", probability=True, random_state=42),
        svm_grid,
    )

    return best_rf, best_svm

def evaluate_model(model, X_test, y_test, model_name="Model"):
    """Print test-set metrics for a fitted model.

    Prints a header, the classification report, and the confusion matrix for
    the model's predictions on ``X_test``.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_test: Test feature matrix.
        y_test: True labels for ``X_test``.
        model_name: Label used in the printed header.
    """
    predictions = model.predict(X_test)

    print(f"=== {model_name} Evaluation ===")
    report = classification_report(y_test, predictions)
    print(report)

    print("Confusion Matrix:")
    matrix = confusion_matrix(y_test, predictions)
    print(matrix)

    # Blank lines to separate consecutive evaluations in the output.
    print("\n")

# =============================================================================
# 5. Main Execution
# =============================================================================
def main(filepath="pivoted_financial_data.csv"):
    """Run the full pipeline: load, clean, engineer, train, evaluate.

    Args:
        filepath: Path of the pivoted financial CSV. The file must contain a
            ``Label`` column, which is used as the classification target.
    """
    # -- Load and Clean Data --
    df = load_data(filepath)
    df = clean_data(df)

    # -- Feature Engineering --
    df = add_new_features(df)
    features = select_features(df)

    # -- Prepare Feature Matrix and Target Vector --
    X = df[features]
    y = df["Label"]

    # -- Split Data with Stratification --
    # 80/20 split; stratify=y keeps the class ratio equal in both parts.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, stratify=y, random_state=42
    )

    # -- Build & Optimize Models --
    best_rf, best_svm = build_and_optimize_models(X_train, y_train)

    # -- Evaluate the models on the held-out split --
    evaluate_model(best_rf, X_test, y_test, model_name="Random Forest")
    evaluate_model(best_svm, X_test, y_test, model_name="SVM")
    
# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()


Fitting 5 folds for each of 54 candidates, totalling 270 fits
Random Forest Best Params: {'classifier__class_weight': None, 'classifier__max_depth': 10, 'classifier__min_samples_split': 5, 'classifier__n_estimators': 100}
Fitting 5 folds for each of 9 candidates, totalling 45 fits
SVM Best Params: {'classifier__C': 10, 'classifier__gamma': 0.1}
=== Random Forest Evaluation ===
              precision    recall  f1-score   support

           0       0.99      0.80      0.88      1058
           1       0.04      0.62      0.08        16

    accuracy                           0.79      1074
   macro avg       0.52      0.71      0.48      1074
weighted avg       0.98      0.79      0.87      1074

Confusion Matrix:
[[843 215]
 [  6  10]]


=== SVM Evaluation ===
              precision    recall  f1-score   support

           0       1.00      0.44      0.61      1058
           1       0.02      0.94      0.05        16

    accuracy                           0.45      1074
   macro 