In [17]:
import math
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import KNNImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

warnings.filterwarnings("ignore")

In [18]:
df1 = pd.read_csv('cleaned_financial_data.csv')
# Forward-fill missing values within each company.
# GroupBy.ffill() is used instead of groupby.apply(lambda g: g.ffill()): apply()
# operating on the grouping column is deprecated (pandas 2.2) and newer pandas
# drops 'Name' from the result, which would break the groupby('Name') below.
# The dropna + stable sort reproduce what the apply-based version did implicitly:
# rows without a Name were discarded and companies were emitted in sorted order,
# with the original row order kept inside each company.
df1 = (
    df1.dropna(subset=['Name'])
       .sort_values('Name', kind='stable')
       .reset_index(drop=True)
)
value_cols = [col for col in df1.columns if col != 'Name']
df1[value_cols] = df1.groupby('Name')[value_cols].ffill()
# 'Year' is a 0-based running index of each company's rows, not a calendar year.
df1['Year'] = df1.groupby('Name').cumcount()

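Imputing missing values. Note: the code in this notebook forward-fills gaps within each company (rows grouped by `Name`); it does not compute row-wise means.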

In [19]:
# Keep the company identifier, the per-company row index, the financial columns
# used downstream and the target label.
# .copy() makes df an independent frame: without it df is a slice of df1 and the
# column assignments below trigger pandas' SettingWithCopy hazard (the warning is
# hidden by the global warnings filter, so it would go unnoticed).
df = df1[['Name','Year','Total income','Net sales','Sales / Net fixed assets','Change in stock',
            'Total expenses','Profit after tax','PBDITA','PBIT','Cash profit','PAT as % of PBDITA','Total liabilities',
            'Total capital','Reserves and funds','Surplus/deficit as at the end of the year','Retained profits/losses during the year',
            'Non-current liabilities','Long term borrowings excl current portion','Current liabilities & provisions',
            'Debt','Debt to equity ratio (times)','Total assets','Net fixed assets','Net working capital (cost of sales method)',
            'Quick ratio (times)','Current ratio (times)','Cash to current liabilities (times)','Creditors turnover (times)',
            'Change in cash and bank balance','Change in sales','Change in total income','Change in working capital assets',
            'Change in working capital liabilities','Revenue','Label']].copy()
# creating useful features
# NOTE: a zero denominator yields +/-inf here (pandas does not raise on division by zero).
df['Debt/Assets'] = df['Debt'] / df['Total assets']
df['Debt/Equity'] = df['Debt'] / df['Total capital']  # equity is approximated by 'Total capital'
df['Debt/Net Income'] = df['Debt'] / df['Profit after tax']
df['Debt/Net Sales'] = df['Debt'] / df['Net sales']
df.head()

Unnamed: 0,Name,Year,Total income,Net sales,Sales / Net fixed assets,Change in stock,Total expenses,Profit after tax,PBDITA,PBIT,...,Change in sales,Change in total income,Change in working capital assets,Change in working capital liabilities,Revenue,Label,Debt/Assets,Debt/Equity,Debt/Net Income,Debt/Net Sales
0,20 MICRONS LTD.,0,3097.0,2888.8,204.9425,51.2,3146.9,1.3,322.1,217.8,...,154.6,195.1,240.9,210.4,,0,0.505315,9.585452,1246.846154,0.561098
1,20 MICRONS LTD.,1,3316.2,3116.7,227.7801,21.3,3378.2,-40.7,301.7,203.2,...,242.3,219.2,57.9,183.6,,0,0.470302,9.149616,-38.014742,0.496422
2,20 MICRONS LTD.,2,3527.6,3338.3,249.5431,8.2,3442.4,93.4,449.3,344.3,...,242.4,211.4,46.9,-145.9,,0,0.4645,8.80102,16.622056,0.465057
3,20 MICRONS LTD.,3,3741.8,3569.9,239.0117,-42.8,3567.4,131.6,519.6,421.4,...,234.2,214.2,-50.3,-11.5,,0,0.397061,8.241497,11.047112,0.407238
4,20 MICRONS LTD.,4,3922.0,3845.7,237.2036,7.2,3770.4,158.8,545.0,453.6,...,162.5,180.2,80.8,85.4,,0,0.339801,7.336735,8.149874,0.336532


In [20]:
# creating new features for Altman Z-Score
def compute_altman_z_score(df):
    """Add weighted Altman Z-Score components and the total score to ``df``.

    The frame is modified in place and also returned. Columns written:
    ``Market Value Equity``, ``NWC/TA``, ``Retained Earnings/TA``, ``PBIT/TA``,
    ``MVE/Debt``, ``Net sales/TA`` (each already multiplied by its weight
    1.2 / 1.4 / 3.3 / 0.6 / 1.0) and ``Altman_Z`` (their sum).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns
        ``Net working capital (cost of sales method)``,
        ``Retained profits/losses during the year``, ``PBIT``, ``Net sales``,
        ``Total assets`` and ``Debt``.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.

    Raises
    ------
    ValueError
        If any of the columns listed above is missing.

    Notes
    -----
    * Every +/-inf value anywhere in ``df`` (not only in the new columns) is
      replaced by NaN as a side effect.
    * Rows whose score cannot be computed (zero debt, zero total assets or
      missing inputs) receive the median of the computable scores.
    * Market value of equity is approximated as net working capital plus total
      assets, and the fourth ratio divides by ``Debt`` rather than by total
      liabilities, so the result is a proxy for the textbook Altman Z-Score.
    """
    # Validate every column the computation actually reads up front, so a
    # missing input surfaces as one clear ValueError instead of a KeyError
    # halfway through the calculation.
    input_cols = [
        "Net working capital (cost of sales method)",
        "Retained profits/losses during the year",
        "PBIT",
        "Net sales",
        "Total assets",
        "Debt",
    ]
    for col in input_cols:
        if col not in df.columns:
            raise ValueError(f"Missing required column for Altman Z-Score: {col}")

    total_assets = df["Total assets"]
    net_working_capital = df["Net working capital (cost of sales method)"]

    # Market Value of Equity ≈ Net Working Capital + Total Assets
    df["Market Value Equity"] = net_working_capital + total_assets

    # Weighted components, computed once (not once per validated column).
    df["NWC/TA"] = 1.2 * net_working_capital / total_assets
    df["Retained Earnings/TA"] = 1.4 * df["Retained profits/losses during the year"] / total_assets
    df["PBIT/TA"] = 3.3 * df["PBIT"] / total_assets
    # Zero debt would divide by zero; turn it into NaN so the row is median-filled below.
    df["MVE/Debt"] = 0.6 * df["Market Value Equity"] / df["Debt"].replace(0, np.nan)
    df["Net sales/TA"] = 1.0 * df["Net sales"] / total_assets

    df["Altman_Z"] = df["NWC/TA"] + df["Retained Earnings/TA"] + df["PBIT/TA"] + df["MVE/Debt"] + df["Net sales/TA"]

    # Replace resulting inf values (e.g. from zero total assets) with NaN,
    # then fill the scores that could not be computed with the median score.
    df.replace([np.inf, -np.inf], np.nan, inplace=True)
    # Assign back instead of Series.fillna(inplace=True): the chained in-place
    # form does not update df when pandas Copy-on-Write is active.
    df["Altman_Z"] = df["Altman_Z"].fillna(df["Altman_Z"].median())

    return df


In [21]:
# Add the Altman Z-Score component columns and the total score to df.
# compute_altman_z_score mutates its argument and returns that same object,
# so the rebinding only makes the data flow explicit.
df = compute_altman_z_score(df)
# map the value of Altman_Z to standard normal distribution to find the probability of default
def z_score_to_probability(z_score):
    """Return the upper-tail standard-normal probability ``1 - Phi(z_score)``.

    A higher score maps to a lower probability (0.5 at a score of 0).
    NaN input yields NaN.

    Uses the standard-library ``math`` module because ``np.math`` was removed
    in NumPy 2.0. ``0.5 * erfc(z / sqrt(2))`` is algebraically the same as
    ``1 - 0.5 * (1 + erf(z / sqrt(2)))`` but keeps precision for large scores.

    NOTE(review): treating an Altman Z-Score as a standard-normal deviate is a
    modelling assumption of this notebook, not part of the Altman model.
    """
    return 0.5 * math.erfc(z_score / math.sqrt(2))
# Convert every Altman Z-Score to its tail probability and keep three decimals.
df['P(Default)'] = (
    df['Altman_Z']
    .apply(z_score_to_probability)
    .round(3)
)

In [23]:
# Rank the features by their correlation with the default label.
# 'Name' is dropped because it is an identifier, not a numeric feature.
df_temp = df.drop('Name', axis=1)
corr_matrix = df_temp.corr()
label_corr = corr_matrix.loc[:, 'Label'].sort_values(ascending=False)
# The first entry is always Label itself (correlation 1.0).
print(label_corr.iloc[:10])

Label                                        1.000000
P(Default)                                   0.049926
Debt to equity ratio (times)                 0.028030
Current liabilities & provisions             0.023825
Debt                                         0.007666
Non-current liabilities                      0.006406
Long term borrowings excl current portion    0.006255
Net fixed assets                             0.002386
Total capital                                0.001867
Debt/Equity                                  0.000988
Name: Label, dtype: float64


In [25]:
# Persist the engineered feature table. The row index is written as the first
# (unnamed) column, which is the to_csv default made explicit here.
df.to_csv('important_financials.csv', index=True)