## Data prep.

In [1]:
# import packages
import pandas as pd
import numpy as np
from numpy import where
from sklearn.impute import KNNImputer
from collections import Counter
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from warnings import simplefilter
simplefilter(action='ignore', category=FutureWarning)
from sklearn.model_selection import train_test_split



In [2]:
# connect the drive
df = pd.read_excel('/content/drive/MyDrive/MMAI/Finance/Bankruptcy_data_Final.xlsx')

In [38]:
# look at shape
print(df.shape)
df.head(3)

(92872, 13)


Unnamed: 0,EPS,Liquidity,Profitability,Productivity,Leverage Ratio,Asset Turnover,Operational Margin,Return on Equity,Market Book Ratio,Assets Growth,Sales Growth,Employee Growth,BK
0,1.58,0.36,0.18,0.13,1.33,1.77,0.07,0.15,2.22,,,,0
1,1.41,0.36,0.19,0.12,1.31,1.59,0.07,0.13,2.41,0.126319,0.014278,0.040179,0
2,0.31,0.32,0.13,0.08,1.03,1.55,0.05,0.04,2.56,0.368077,0.327909,0.566524,0


In [39]:
print('Full dataset target distribution...')
print('1:', df['BK'][df['BK']==1].count(),',',df['BK'][df['BK']==1].count()/df['BK'].count() )
print('0:', df['BK'][df['BK']==0].count(),',',df['BK'][df['BK']==0].count()/df['BK'].count() )

Full dataset target distribution...
1: 558 , 0.00600826944611939
0: 92314 , 0.9939917305538806


In [40]:
# check out the columns
print('Understanding columns...')
for col in df.columns:
  print(col,':')
  print('Total nulls:',df[col].isnull().values.sum())
  print('Min:',df[col].min())
  print('Max:',df[col].max())
  if df[col].isnull().values.sum() == 0:
    print('All good for', col)
  elif df[col].isnull().values.sum() <= 30:
    print('Drop the nulls for', col,'\n')
  else:
    print('Use imputation for', col,'\n')

Understanding columns...
EPS :
Total nulls: 5
Min: -384000.0
Max: 55339.0
Drop the nulls for EPS 

Liquidity :
Total nulls: 247
Min: -25968.52
Max: 1.0
Use imputation for Liquidity 

Profitability :
Total nulls: 247
Min: -79682.0
Max: 140.58
Use imputation for Profitability 

Productivity :
Total nulls: 247
Min: -5093.0
Max: 1102.0
Use imputation for Productivity 

Leverage Ratio :
Total nulls: 26
Min: -7811.0
Max: 75970.38
Drop the nulls for Leverage Ratio 

Asset Turnover :
Total nulls: 247
Min: -31.59
Max: 276.38
Use imputation for Asset Turnover 

Operational Margin :
Total nulls: 5557
Min: -30175.7
Max: 394.47
Use imputation for Operational Margin 

Return on Equity :
Total nulls: 8
Min: -88875.14
Max: 39500.0
Drop the nulls for Return on Equity 

Market Book Ratio :
Total nulls: 57
Min: -3151500.0
Max: 3455419.33
Use imputation for Market Book Ratio 

Assets Growth :
Total nulls: 6701
Min: -1.0
Max: 14231.0
Use imputation for Assets Growth 

Sales Growth :
Total nulls: 6701
Min: 

In [54]:
# drop nulls where only a few missing on entire dataset
df_a = df.dropna(subset=['Return on Equity','Leverage Ratio','EPS'])
print(df_a.shape)

(92839, 13)


In [42]:
# randomized train and test split early on
validation_test = df_a.sample(frac = 0.2)
training = df_a.drop(validation_test.index)

In [51]:
# impute the training only
nan = np.nan
imputer = KNNImputer(n_neighbors=7, weights="uniform")
array_imputed = imputer.fit_transform(training)
training_imputed = pd.DataFrame(array_imputed, columns = training.columns)
print('Any nulls left?:',training_imputed.isnull().values.sum())

Any nulls left?: 0


In [44]:
# look at the imputed version
training_imputed.head(3)

Unnamed: 0,EPS,Liquidity,Profitability,Productivity,Leverage Ratio,Asset Turnover,Operational Margin,Return on Equity,Market Book Ratio,Assets Growth,Sales Growth,Employee Growth,BK
0,1.58,0.36,0.18,0.13,1.33,1.77,0.07,0.15,2.22,0.094558,-0.002294,0.02061,0.0
1,1.41,0.36,0.19,0.12,1.31,1.59,0.07,0.13,2.41,0.126319,0.014278,0.040179,0.0
2,0.31,0.32,0.13,0.08,1.03,1.55,0.05,0.04,2.56,0.368077,0.327909,0.566524,0.0


In [45]:
# split training data
y_train = training_imputed['BK']
X_train = training_imputed.drop(columns=[#'Operational Margin','Assets Growth',\
                                 # 'Employee Growth','Sales Growth',
                                 'BK'])
print('Training:',X_train.shape)

Training: (74271, 12)
Testing: (9284, 12)
Validation: (9284, 12)


In [46]:
# see shapes
counter = Counter(y_train)
print('Training distribution:',counter)
print('1:', y_train[y_train==1].count()/y_train.count() )
print('0:', y_train[y_train==0].count()/y_train.count() )

Training distribution: Counter({0.0: 73833, 1.0: 438})
1: 0.005897321969543967
0: 0.994102678030456


In [47]:
# transform the dataset with smote /and under sampling
oversample = SMOTE(sampling_strategy=0.1)
X_smote, y_smote = oversample.fit_resample(X_train, y_train)
counter = Counter(y_smote)
print('New distribution:',counter)
undersample = RandomUnderSampler(sampling_strategy=0.3)
X_smote_under, y_smote_under = undersample.fit_resample(X_smote, y_smote)
counter = Counter(y_smote_under)
print('New distribution:',counter)

New distribution: Counter({0.0: 73833, 1.0: 7383})
New distribution: Counter({0.0: 24610, 1.0: 7383})


## Modeling

In [48]:
# import packages
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.pipeline import Pipeline
#from sklearn.feature_selection import RFE

In [49]:
# try a model to make sure cleaned ok for next team member
stnd_scaler = StandardScaler()
mm_scaler = MinMaxScaler()
rf = RandomForestClassifier()

Pipe= Pipeline([('scale',mm_scaler),
                   ('rf', rf)
                   ])
param_grid = {'rf__criterion': ('gini','entropy')
              }
model = GridSearchCV(Pipe,param_grid,cv=10,scoring='f1_micro')
model.fit(X_smote_under,y_smote_under)
print('\n\nBest Score:', model.best_score_,'\n')
print('Best parameters:', model.best_params_)



Best Score: 0.9718689434198187 

Best parameters: {'rf__criterion': 'entropy'}


In [61]:
# check on test data and impute test data, too

#impute
nan = np.nan
imputer = KNNImputer(n_neighbors=5, weights="distance") #distance or uniform
v_array_imputed = imputer.fit_transform(validation_test)
validation_imputed = pd.DataFrame(v_array_imputed, columns = validation_test.columns)
print('Any nulls left?:',training_imputed.isnull().values.sum(),'\n')

#set variables
y_test = validation_imputed['BK']
X_test = validation_imputed.drop(columns=[#'Operational Margin','Assets Growth',\
                                  #'Employee Growth','Sales Growth',
                                  'BK'])
print('Testing shape:',X_test.shape,'\n')

#predict
pred = model.predict(X_test)
print('Confusion matrix:\n',confusion_matrix(y_test, pred))
print('\nClassification report:\n',classification_report(y_test, pred))

Any nulls left?: 0 

Testing shape: (18568, 12) 

Confusion matrix:
 [[18075   377]
 [   66    50]]

Classification report:
               precision    recall  f1-score   support

         0.0       1.00      0.98      0.99     18452
         1.0       0.12      0.43      0.18       116

    accuracy                           0.98     18568
   macro avg       0.56      0.71      0.59     18568
weighted avg       0.99      0.98      0.98     18568

