# Notice
This workbook aims to examine the bankruptcy prediction model at t-1. 
The comprehensive workbook to predict bankruptcy at t-1,t-2,t-3 will be created based on this workbook

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split

In [2]:
from sklearn import preprocessing

In [3]:
bankrupt_1 = pd.read_csv("bankrupt_t1.csv")
non_bankrupt_1 = pd.read_csv("non_bankrupt_t1.csv")

In [4]:
bankrupt_1 = bankrupt_1.fillna(value = 1)
non_bankrupt_1 = non_bankrupt_1.fillna(value = 1)

In [5]:
data_t1 = pd.concat([bankrupt_1, non_bankrupt_1], ignore_index = True)

# Create predicting variables
* X1 = w.cap/t.ass
* X2 = r.earn/t.ass
* X3 = ebit/t.ass
* X4 = t.equi/t.ass
* X5 = net.inc/t.ass
* X6 = t.liab/t.ass
* X7 = cf.oper/t.liab

In [6]:
data_t1["X1"] = preprocessing.scale(data_t1["WoCap"] / data_t1["ToAsset"])

data_t1["X2"] = preprocessing.scale(data_t1["ReEarns"] / data_t1["ToAsset"])

data_t1["X3"] = preprocessing.scale(data_t1["EBIT"] / data_t1["ToAsset"])

data_t1["X4"] = preprocessing.scale(data_t1["ToEqui"] / data_t1["ToAsset"])

data_t1["X5"] = preprocessing.scale(data_t1["NetInc"] / data_t1["ToAsset"])

data_t1["X6"] = preprocessing.scale(data_t1["ToLia"] / data_t1["ToAsset"])

data_t1["X7"] = preprocessing.scale(data_t1["CFOper"] / data_t1["ToLia"])

In [7]:
data_t1.head()

Unnamed: 0,ToAsset,CuAsset,CuLia,WoCap,ReEarns,EBIT,ToEqui,ToLia,CFOper,NetInc,Status,X1,X2,X3,X4,X5,X6,X7
0,2047379,400804,1939305,1538501,465919,108231.0,779,2046600,297,26248,0,0.06529,-0.226413,-0.282061,-0.465459,-0.277945,-0.134007,-0.150012
1,1654915,967259,865011,102248,2212650,128493.0,546231,2201146,63858,32072,0,-0.305964,-0.117413,-0.263001,-0.278809,-0.273621,0.037484,-0.105844
2,1242642,226446,388667,162221,510057,56724.0,362713,879929,107869,113759,0,-0.268949,-0.208444,-0.287611,-0.300423,-0.226051,-0.285291,0.037337
3,934248,131681,1260267,1128586,332154,93515.0,328155,1262403,2498,209426,0,0.311065,-0.213841,-0.245731,-0.266816,-0.138631,0.048478,-0.147206
4,2436005,567830,593095,25265,65659,36709.0,414523,1995504,77127,27893,0,-0.33364,-0.246122,-0.31113,-0.369336,-0.278848,-0.227655,-0.091095


### Scale the data. 
Using MinMaxScale instead of standard scale because dont know whether the data is normally distributed or not

### Split data for train and test

In [8]:
X = data_t1[["X1", "X2", "X3", "X4","X5","X6", "X7"]]
y = data_t1['Status'] 

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=101)

In [10]:
from sklearn.metrics import classification_report,confusion_matrix

## Supported vector Machine model

In [11]:
from sklearn.svm import SVC

In [12]:
model = SVC()

In [13]:
model.fit(X_train,y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [14]:
predictions = model.predict(X_test)

In [15]:
print(confusion_matrix(y_test,predictions))

[[6 3]
 [7 4]]


In [16]:
print(classification_report(y_test,predictions))

             precision    recall  f1-score   support

          0       0.46      0.67      0.55         9
          1       0.57      0.36      0.44        11

avg / total       0.52      0.50      0.49        20



### Grid Search


In [17]:
param_grid = {'C': [0.1,1, 10, 100, 1000], 'gamma': [1,0.1,0.01,0.001,0.0001], 'kernel': ['rbf']} 

In [18]:
from sklearn.model_selection import GridSearchCV

In [19]:
grid = GridSearchCV(SVC(),param_grid,refit=True,verbose=3)

In [20]:
grid.fit(X_train,y_train)

Fitting 3 folds for each of 25 candidates, totalling 75 fits
[CV] C=0.1, gamma=1, kernel=rbf ......................................
[CV]  C=0.1, gamma=1, kernel=rbf, score=0.5333333333333333, total=   0.0s
[CV] C=0.1, gamma=1, kernel=rbf ......................................
[CV]  C=0.1, gamma=1, kernel=rbf, score=0.5333333333333333, total=   0.0s
[CV] C=0.1, gamma=1, kernel=rbf ......................................
[CV] ............ C=0.1, gamma=1, kernel=rbf, score=0.5, total=   0.0s
[CV] C=0.1, gamma=0.1, kernel=rbf ....................................
[CV]  C=0.1, gamma=0.1, kernel=rbf, score=0.5333333333333333, total=   0.0s
[CV] C=0.1, gamma=0.1, kernel=rbf ....................................
[CV]  C=0.1, gamma=0.1, kernel=rbf, score=0.5333333333333333, total=   0.0s
[CV] C=0.1, gamma=0.1, kernel=rbf ....................................
[CV] .......... C=0.1, gamma=0.1, kernel=rbf, score=0.5, total=   0.0s
[CV] C=0.1, gamma=0.01, kernel=rbf ...................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s


[CV]  C=100, gamma=0.01, kernel=rbf, score=0.5714285714285714, total=   0.0s
[CV] C=100, gamma=0.001, kernel=rbf ..................................
[CV]  C=100, gamma=0.001, kernel=rbf, score=0.5333333333333333, total=   0.0s
[CV] C=100, gamma=0.001, kernel=rbf ..................................
[CV]  C=100, gamma=0.001, kernel=rbf, score=0.5333333333333333, total=   0.0s
[CV] C=100, gamma=0.001, kernel=rbf ..................................
[CV] ........ C=100, gamma=0.001, kernel=rbf, score=0.5, total=   0.0s
[CV] C=100, gamma=0.0001, kernel=rbf .................................
[CV]  C=100, gamma=0.0001, kernel=rbf, score=0.5333333333333333, total=   0.0s
[CV] C=100, gamma=0.0001, kernel=rbf .................................
[CV]  C=100, gamma=0.0001, kernel=rbf, score=0.5333333333333333, total=   0.0s
[CV] C=100, gamma=0.0001, kernel=rbf .................................
[CV] ....... C=100, gamma=0.0001, kernel=rbf, score=0.5, total=   0.0s
[CV] C=1000, gamma=1, kernel=rbf ........

[Parallel(n_jobs=1)]: Done  75 out of  75 | elapsed:    0.3s finished


GridSearchCV(cv=None, error_score='raise',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'C': [0.1, 1, 10, 100, 1000], 'gamma': [1, 0.1, 0.01, 0.001, 0.0001], 'kernel': ['rbf']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=3)

In [21]:
grid.best_params_

{'C': 10, 'gamma': 0.1, 'kernel': 'rbf'}

In [22]:
grid.best_estimator_

SVC(C=10, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.1, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [23]:
grid_predictions = grid.predict(X_test)

In [24]:
print(confusion_matrix(y_test,grid_predictions))

[[ 3  6]
 [ 0 11]]


In [25]:
print(classification_report(y_test,grid_predictions))

             precision    recall  f1-score   support

          0       1.00      0.33      0.50         9
          1       0.65      1.00      0.79        11

avg / total       0.81      0.70      0.66        20



# Decision Tree

In [26]:
from sklearn.tree import DecisionTreeClassifier

In [27]:
dtree = DecisionTreeClassifier()

In [28]:
dtree.fit(X_train,y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [29]:
predictions = dtree.predict(X_test)

In [30]:
print(classification_report(y_test,predictions))

             precision    recall  f1-score   support

          0       0.67      0.67      0.67         9
          1       0.73      0.73      0.73        11

avg / total       0.70      0.70      0.70        20



In [31]:
print(confusion_matrix(y_test,predictions))

[[6 3]
 [3 8]]


# Random Forest

In [32]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(n_estimators=100)
rfc.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [33]:
rfc_pred = rfc.predict(X_test)

In [34]:
print(classification_report(y_test,predictions))

             precision    recall  f1-score   support

          0       0.67      0.67      0.67         9
          1       0.73      0.73      0.73        11

avg / total       0.70      0.70      0.70        20



In [35]:
print(confusion_matrix(y_test,predictions))

[[6 3]
 [3 8]]


# K nearest neighbor

In [36]:
from sklearn.neighbors import KNeighborsClassifier

In [37]:
knn = KNeighborsClassifier(n_neighbors=1)
knn.fit(X_train,y_train)
pred = knn.predict(X_test)
print(confusion_matrix(y_test,pred))
print(classification_report(y_test,pred))

[[ 7  2]
 [ 1 10]]
             precision    recall  f1-score   support

          0       0.88      0.78      0.82         9
          1       0.83      0.91      0.87        11

avg / total       0.85      0.85      0.85        20

