## Data prep.

In [1]:
# import packages
import pandas as pd
import numpy as np
from numpy import where
from sklearn.impute import KNNImputer
from collections import Counter
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from warnings import simplefilter
simplefilter(action='ignore', category=FutureWarning)
from sklearn.model_selection import train_test_split



In [2]:
# connect the drive
df = pd.read_excel('/content/drive/MyDrive/MMAI/Finance/Bankruptcy_data_Final.xlsx')

In [3]:
# look at shape
print(df.shape)
df.head(3)

(92872, 13)


Unnamed: 0,EPS,Liquidity,Profitability,Productivity,Leverage Ratio,Asset Turnover,Operational Margin,Return on Equity,Market Book Ratio,Assets Growth,Sales Growth,Employee Growth,BK
0,1.58,0.36,0.18,0.13,1.33,1.77,0.07,0.15,2.22,,,,0
1,1.41,0.36,0.19,0.12,1.31,1.59,0.07,0.13,2.41,0.126319,0.014278,0.040179,0
2,0.31,0.32,0.13,0.08,1.03,1.55,0.05,0.04,2.56,0.368077,0.327909,0.566524,0


In [4]:
print('Full dataset target distribution...')
print('1:', df['BK'][df['BK']==1].count(),',',df['BK'][df['BK']==1].count()/df['BK'].count() )
print('0:', df['BK'][df['BK']==0].count(),',',df['BK'][df['BK']==0].count()/df['BK'].count() )

Full dataset target distribution...
1: 558 , 0.00600826944611939
0: 92314 , 0.9939917305538806


In [5]:
# check out the columns
print('Understanding columns...')
for col in df.columns:
  print(col,':')
  print('Total nulls:',df[col].isnull().values.sum())
  print('Min:',df[col].min())
  print('Max:',df[col].max())
  if df[col].isnull().values.sum() == 0:
    print('All good for', col)
  elif df[col].isnull().values.sum() <= 30:
    print('Drop the nulls for', col,'\n')
  else:
    print('Use imputation for', col,'\n')

Understanding columns...
EPS :
Total nulls: 5
Min: -384000.0
Max: 55339.0
Drop the nulls for EPS 

Liquidity :
Total nulls: 247
Min: -25968.52
Max: 1.0
Use imputation for Liquidity 

Profitability :
Total nulls: 247
Min: -79682.0
Max: 140.58
Use imputation for Profitability 

Productivity :
Total nulls: 247
Min: -5093.0
Max: 1102.0
Use imputation for Productivity 

Leverage Ratio :
Total nulls: 26
Min: -7811.0
Max: 75970.38
Drop the nulls for Leverage Ratio 

Asset Turnover :
Total nulls: 247
Min: -31.59
Max: 276.38
Use imputation for Asset Turnover 

Operational Margin :
Total nulls: 5557
Min: -30175.7
Max: 394.47
Use imputation for Operational Margin 

Return on Equity :
Total nulls: 8
Min: -88875.14
Max: 39500.0
Drop the nulls for Return on Equity 

Market Book Ratio :
Total nulls: 57
Min: -3151500.0
Max: 3455419.33
Use imputation for Market Book Ratio 

Assets Growth :
Total nulls: 6701
Min: -1.0
Max: 14231.0
Use imputation for Assets Growth 

Sales Growth :
Total nulls: 6701
Min: 

In [6]:
# drop nulls where only a few missing on entire dataset
# same thing with outliers
print('All:',df.shape)
df = df.dropna(subset=['Return on Equity','Leverage Ratio','EPS'])
print('Nulls dropped where very few:',df.shape)

# Remove outliers that are not 1 target anyways
df_a = df[(df['Employee Growth']<=1000) | (df['Employee Growth'].isnull())] 
df_a = df_a[(df_a['Sales Growth']<=1000) | (df_a['Sales Growth'].isnull())] 
df_a = df_a[(df_a['Sales Growth']>=-10) | (df_a['Sales Growth'].isnull())] 
df_a = df_a[(df_a['Return on Equity']<=10000) | (df_a['Return on Equity'].isnull())] 
df_a = df_a[(df_a['Return on Equity']>=-25000) | (df_a['Return on Equity'].isnull())] 
df_a = df_a[(df_a['Operational Margin']<=11) | (df_a['Operational Margin'].isnull())] 
df_a = df_a[(df_a['Asset Turnover']>=-2) | (df_a['Asset Turnover'].isnull())]  
df_a = df_a[(df_a['Leverage Ratio']<=3500) | (df_a['Leverage Ratio'].isnull())]
df_a = df_a[(df_a['Leverage Ratio']>=-3500) | (df_a['Leverage Ratio'].isnull())]
df_a = df_a[(df_a['Productivity']<=50) |  (df_a['Productivity'].isnull())]
df_a = df_a[(df_a['Profitability']<=6) |  (df_a['Profitability'].isnull())]  
df_a = df_a[(df_a['Liquidity']>=-15000)  |  (df_a['Liquidity'].isnull())]  
print('Outliers dropped, none were target or null = 1:',df_a.shape)

All: (92872, 13)
Nulls dropped where very few: (92839, 13)
Outliers dropped, none were target or null = 1: (92805, 13)


In [7]:
# split training data
y = df['BK']
X = df.drop(columns=['BK'])

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.15,
                                                    random_state=42)
print('Training:',X_train.shape)
print('Training:',X_test.shape)

Training: (78913, 12)
Training: (13926, 12)


In [8]:
# step 1: impute the training only with KNN
nan = np.nan
imputer = KNNImputer(n_neighbors=4, weights="distance")
array_imputed = imputer.fit_transform(X_train)
X_train_imp = pd.DataFrame(array_imputed, columns = X_train.columns)
print('Any nulls left?:',X_train_imp.isnull().values.sum())

Any nulls left?: 0


In [9]:
# look at the imputed version
X_train_imp.head(3)

Unnamed: 0,EPS,Liquidity,Profitability,Productivity,Leverage Ratio,Asset Turnover,Operational Margin,Return on Equity,Market Book Ratio,Assets Growth,Sales Growth,Employee Growth
0,0.0,-0.15,-0.83,0.02,0.05,0.26,0.09,0.0,87.53,-0.640843,0.172894,-0.057485
1,1.45,0.22,0.36,0.06,0.0,1.41,0.04,0.07,19.13,0.187419,0.152865,0.205692
2,0.4,0.24,-0.01,0.05,0.76,1.9,0.03,0.04,21.8,0.131678,0.160317,0.072963


In [10]:
# see shapes
counter = Counter(y_train)
print('Training distribution:',counter)
print('1:', y_train[y_train==1].count()/y_train.count() )
print('0:', y_train[y_train==0].count()/y_train.count() )

Training distribution: Counter({0: 78438, 1: 475})
1: 0.006019287062968079
0: 0.9939807129370319


In [11]:
# transform the training dataset with smote /and under sampling
oversample = SMOTE(sampling_strategy=0.2)
X_smote, y_smote = oversample.fit_resample(X_train_imp, y_train)
counter = Counter(y_smote)
print('New distribution:',counter)
#undersample = RandomUnderSampler(sampling_strategy=0.2)
#X_smote_under, y_smote_under = undersample.fit_resample(X_smote, y_smote)
#counter = Counter(y_smote_under)
#print('New distribution:',counter)

New distribution: Counter({0: 78438, 1: 15687})


## Modeling

In [12]:
# import packages
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PowerTransformer
# from sklearn.feature_selection import RFE

In [13]:
# try a model to make sure cleaned ok for next team member
#stnd_scaler = StandardScaler()
#mm_scaler = MinMaxScaler()
johnpt = PowerTransformer(method='yeo-johnson')
rf = RandomForestClassifier(n_estimators=200)

Pipe= Pipeline([('scale',johnpt),
                   ('rf', rf)
                   ])
param_grid = {'rf__criterion': ('gini','entropy')
              }
model = GridSearchCV(Pipe,param_grid,cv=5,scoring='f1_micro')
model.fit(X_smote,y_smote) #X_smote_under, y_smote_under
print('\n\nBest Score:', model.best_score_,'\n')
print('Best parameters:', model.best_params_)



Best Score: 0.9859442231075697 

Best parameters: {'rf__criterion': 'entropy'}


In [14]:
#impute
nan = np.nan
imputer = KNNImputer(n_neighbors=4, weights="distance") 
v_array_imputed = imputer.fit_transform(X_test)
X_test_imp = pd.DataFrame(v_array_imputed, columns = X_test.columns)
print('Any nulls left?:', X_test_imp.isnull().values.sum(),'\n')
print('Testing shape:',X_test_imp.shape,'\n')

#predict
pred = model.predict(X_test_imp)
print('Confusion matrix:\n',confusion_matrix(y_test, pred))
print('\nClassification report:\n',classification_report(y_test, pred))

Any nulls left?: 0 

Testing shape: (13926, 12) 

Confusion matrix:
 [[13754    93]
 [   63    16]]

Classification report:
               precision    recall  f1-score   support

           0       1.00      0.99      0.99     13847
           1       0.15      0.20      0.17        79

    accuracy                           0.99     13926
   macro avg       0.57      0.60      0.58     13926
weighted avg       0.99      0.99      0.99     13926

