# Statistical Learning: Company Bankruptcy Classifiers

In [1]:
import pandas as pd
import dash
from dash.dependencies import Input, Output
import dash_core_components as dcc
import dash_html_components as html
import plotly.graph_objs as go
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import svm

df = pd.read_csv('Credit.csv')
df.shape, df.head(2)



((6859, 100),
    Default  EBITDA08  EBITDA09  EBITDA10  EBITDA11  EBITDA12  EBITDA13  \
 0        1     250.9     151.5     177.2     191.5     190.8     127.1   
 1        1     303.8     296.4     261.7     288.1     270.4     205.1   
 
    EBITDA14  EBITDA15  EBITDA16    ...      UnlFC17  EBITDA_avg  IntExp_avg  \
 0     122.5     147.2     136.7    ...       114.54     164.494    -106.821   
 1     176.9     227.4     211.5    ...        65.70     243.030    -186.860   
 
    DEratio_avg  TotalAsset_avg  Op_due_avg  FCFDebt_avg  CurrentR_avg  \
 0      216.125         1341.81      81.220      0.21090        27.398   
 1        0.000         1231.82      74.436     -0.04883        15.846   
 
    DAratio_avg  UnlCF_avg  
 0       164.61    125.994  
 1       374.75    148.676  
 
 [2 rows x 100 columns])

In [2]:
Y = df['Default']
X = df.iloc[:,1:]
X.head(3), Y.head(3)



(   EBITDA08  EBITDA09  EBITDA10  EBITDA11  EBITDA12  EBITDA13  EBITDA14  \
 0     250.9     151.5     177.2     191.5     190.8     127.1     122.5   
 1     303.8     296.4     261.7     288.1     270.4     205.1     176.9   
 2      67.8      23.4      38.6      64.8      74.9     114.9     218.4   
 
    EBITDA15  EBITDA16  EBITDA17    ...      UnlFC17  EBITDA_avg  IntExp_avg  \
 0     147.2     136.7    149.54    ...      114.540     164.494   -106.8210   
 1     227.4     211.5    189.00    ...       65.700     243.030   -186.8600   
 2      77.5      28.7     70.90    ...      -87.742      77.990    -11.2299   
 
    DEratio_avg  TotalAsset_avg  Op_due_avg  FCFDebt_avg  CurrentR_avg  \
 0      216.125         1341.81     81.2200      0.21090        27.398   
 1        0.000         1231.82     74.4360     -0.04883        15.846   
 2       41.642          760.03      2.2415      0.00000         0.000   
 
    DAratio_avg  UnlCF_avg  
 0      164.610   125.9940  
 1      374.750 

In [3]:
X_train, X_test, y_train, y_test = train_test_split(df, Y, test_size=0.4, random_state=0)

In [4]:
## Linear SVM classifier
clSVM = svm.SVC(kernel='linear', C=1).fit(X_train, y_train)
clSVM.score(X_test, y_test)     



0.9857871720116618

In [5]:
## Xgboost
from xgboost.sklearn import XGBClassifier  
#from xgboost.sklearn import XGBRegressor

clXg = XGBClassifier()  # and for classifier  
clXg.fit(X_train, y_train)  
clXg.score(X_test, y_test)  




The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.



1.0

In [6]:
## Random Forest
from sklearn.ensemble import RandomForestClassifier
clRF = RandomForestClassifier(max_depth=2)#, random_state=0)
clRF.fit(X_train, y_train)
from sklearn.model_selection import cross_val_score
score_RF = cross_val_score(clRF, df, Y, cv=5)
clRF.score(X_test,y_test), print("Accuracy: %0.2f (+/- %0.2f)" % (score_RF.mean(), score_RF.std() * 2))



Accuracy: 0.99 (+/- 0.00)


(0.9919825072886297, None)

In [7]:
## Logistic Regression
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

logi = LogisticRegression()
logi.fit(X_train, y_train)
logi.score(X_test,y_test)



0.8633381924198251

# Confusion Matrices

In [8]:
## Confusion Matrices
from sklearn.metrics import confusion_matrix
Y_true = y_test
Y_pred_RF = clRF.predict(X_test)
Y_pred_logi = logi.predict(X_test)
Y_pred_SVM = clSVM.predict(X_test)
Y_pred_XG = clXg.predict(X_test)

confusion_matrix(Y_true,Y_pred_RF)


The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.



array([[2717,    0],
       [  22,    5]])

In [9]:
confusion_matrix(Y_true,Y_pred_logi)

array([[2360,  357],
       [  18,    9]])

In [10]:
confusion_matrix(Y_true,Y_pred_SVM)

array([[2690,   27],
       [  12,   15]])

In [11]:
confusion_matrix(Y_true,Y_pred_XG)

array([[2717,    0],
       [   0,   27]])