## Data prep.

In [None]:
# import packages
import pandas as pd
import numpy as np
from numpy import where
from sklearn.impute import KNNImputer
from collections import Counter
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from warnings import simplefilter
simplefilter(action='ignore', category=FutureWarning)



In [None]:
# connect the drive
df = pd.read_excel('/content/drive/MyDrive/MMAI/Finance/Bankruptcy_data_Final.xlsx')

In [None]:
# look at shape
print(df.shape)
df.head(3)

(92872, 13)


Unnamed: 0,EPS,Liquidity,Profitability,Productivity,Leverage Ratio,Asset Turnover,Operational Margin,Return on Equity,Market Book Ratio,Assets Growth,Sales Growth,Employee Growth,BK
0,1.58,0.36,0.18,0.13,1.33,1.77,0.07,0.15,2.22,,,,0
1,1.41,0.36,0.19,0.12,1.31,1.59,0.07,0.13,2.41,0.126319,0.014278,0.040179,0
2,0.31,0.32,0.13,0.08,1.03,1.55,0.05,0.04,2.56,0.368077,0.327909,0.566524,0


In [None]:
print('Full dataset target distribution...')
print('1:', df['BK'][df['BK']==1].count(),',',df['BK'][df['BK']==1].count()/df['BK'].count() )
print('0:', df['BK'][df['BK']==0].count(),',',df['BK'][df['BK']==0].count()/df['BK'].count() )

Full dataset target distribution...
1: 558 , 0.00600826944611939
0: 92314 , 0.9939917305538806


In [None]:
# check out the columns
print('Understanding columns...')
for col in df.columns:
  print(col,':')
  print('Total nulls:',df[col].isnull().values.sum())
  print('Min:',df[col].min())
  print('Max:',df[col].max())
  if df[col].isnull().values.sum() == 0:
    print('All good for', col)
  elif df[col].isnull().values.sum() <= 30:
    print('Drop the nulls for', col,'\n')
  else:
    print('Use imputation for', col,'\n')

Understanding columns...
EPS :
Total nulls: 5
Min: -384000.0
Max: 55339.0
Drop the nulls for EPS 

Liquidity :
Total nulls: 247
Min: -25968.52
Max: 1.0
Use imputation for Liquidity 

Profitability :
Total nulls: 247
Min: -79682.0
Max: 140.58
Use imputation for Profitability 

Productivity :
Total nulls: 247
Min: -5093.0
Max: 1102.0
Use imputation for Productivity 

Leverage Ratio :
Total nulls: 26
Min: -7811.0
Max: 75970.38
Drop the nulls for Leverage Ratio 

Asset Turnover :
Total nulls: 247
Min: -31.59
Max: 276.38
Use imputation for Asset Turnover 

Operational Margin :
Total nulls: 5557
Min: -30175.7
Max: 394.47
Use imputation for Operational Margin 

Return on Equity :
Total nulls: 8
Min: -88875.14
Max: 39500.0
Drop the nulls for Return on Equity 

Market Book Ratio :
Total nulls: 57
Min: -3151500.0
Max: 3455419.33
Use imputation for Market Book Ratio 

Assets Growth :
Total nulls: 6701
Min: -1.0
Max: 14231.0
Use imputation for Assets Growth 

Sales Growth :
Total nulls: 6701
Min: 

In [None]:
# drop nulls where only a few missing
df_a = df.dropna(subset=['Return on Equity','Leverage Ratio','EPS'])
print(df_a.shape)

(92839, 13)


In [None]:
# impute the rest
nan = np.nan
imputer = KNNImputer(n_neighbors=4, weights="uniform")
array_imputed = imputer.fit_transform(df_a)
df_imputed = pd.DataFrame(array_imputed, columns = df_a.columns)
print('Any nulls left?:',df_imputed.isnull().values.sum())

Any nulls left?: 0


In [None]:
# look at the imputed version
df_imputed.head(3)

Unnamed: 0,EPS,Liquidity,Profitability,Productivity,Leverage Ratio,Asset Turnover,Operational Margin,Return on Equity,Market Book Ratio,Assets Growth,Sales Growth,Employee Growth,BK
0,1.58,0.36,0.18,0.13,1.33,1.77,0.07,0.15,2.22,0.133543,0.051761,0.03389,0.0
1,1.41,0.36,0.19,0.12,1.31,1.59,0.07,0.13,2.41,0.126319,0.014278,0.040179,0.0
2,0.31,0.32,0.13,0.08,1.03,1.55,0.05,0.04,2.56,0.368077,0.327909,0.566524,0.0


In [None]:
# pull out 15% of data for validate after train and test before using smote
validation = df_imputed.sample(frac = 0.2)
training = df_imputed.drop(validation.index)
print('Validate shape:',validation.shape)
print('Train and test shape:',training.shape)

Validate shape: (18568, 13)
Train and test shape: (74271, 13)


In [None]:
# set features and target
y = training['BK']
X = training.drop(columns=['BK'])
counter = Counter(y)
print('Existing distribution for training (75% of entire dataset):',counter)
print('1:', training['BK'][training['BK']==1].count()/training['BK'].count() )
print('0:', training['BK'][training['BK']==0].count()/training['BK'].count() )

Existing distribution for training (75% of entire dataset): Counter({0.0: 73836, 1.0: 435})
1: 0.005856929353314214
0: 0.9941430706466858


In [None]:
# transform the dataset with smote /and under sampling
oversample = SMOTE()#sampling_strategy=0.5)
X_smote, y_smote = oversample.fit_resample(X, y)
counter = Counter(y_smote)
print('New distribution:',counter)
#undersample = RandomUnderSampler(sampling_strategy=0.8)
#X_smote_under, y_smote_under = undersample.fit_resample(X_smote, y_smote)
#counter = Counter(y_smote_under)
#print('New distribution:',counter)

New distribution: Counter({0.0: 73836, 1.0: 73836})


## Modeling

In [None]:
# import packages
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_selection import SelectKBest
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import RFE

In [None]:
# train test split
X_train, X_test, y_train, y_test = train_test_split(X_smote,y_smote, #_under
                                                    test_size=0.2,
                                                    random_state=42)

In [None]:
# try KNN for validation that cleaned okay
stnd_scaler = StandardScaler()
skb = SelectKBest()
KNN = KNeighborsClassifier()

Pipe= Pipeline([('scale',stnd_scaler),
                   ('skb',skb),
                   ('KNN',KNN)
                   ])
param_grid = {'skb__k':(5,6,7,8),     
              'KNN__weights': ('uniform','distance'),
              'KNN__metric': ('euclidean','manhattan')
              }
model = GridSearchCV(Pipe,param_grid,cv=10,scoring='f1_micro')
model.fit(X_train,y_train)
print('\n\nBest Score:', model.best_score_,'\n')
print('Best parameters:', model.best_params_)



Best Score: 0.9299457865718546 

Best parameters: {'KNN__metric': 'manhattan', 'KNN__weights': 'distance', 'skb__k': 8}


In [None]:
# check on test data
pred = model.predict(X_test)
print('Confusion matrix:\n',confusion_matrix(y_test, pred))
print('\nClassification report:\n',classification_report(y_test, pred))

Confusion matrix:
 [[13188  1505]
 [  531 14311]]

Classification report:
               precision    recall  f1-score   support

         0.0       0.96      0.90      0.93     14693
         1.0       0.90      0.96      0.93     14842

    accuracy                           0.93     29535
   macro avg       0.93      0.93      0.93     29535
weighted avg       0.93      0.93      0.93     29535



In [None]:
# now, be absolutely sure! check on validation data 
y_val = validation['BK']
X_val = validation.drop(columns=['BK'])
pred = model.predict(X_val)
print('Confusion matrix:\n',confusion_matrix(y_val, pred))
print('\nClassification report:\n',classification_report(y_val, pred))

Confusion matrix:
 [[16611  1838]
 [   46    73]]

Classification report:
               precision    recall  f1-score   support

         0.0       1.00      0.90      0.95     18449
         1.0       0.04      0.61      0.07       119

    accuracy                           0.90     18568
   macro avg       0.52      0.76      0.51     18568
weighted avg       0.99      0.90      0.94     18568

