In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# import machine learning models
from sklearn import preprocessing

# train test split
from sklearn.model_selection import train_test_split

# results matrices
from sklearn.metrics import classification_report,confusion_matrix

#Supported vector machine model and grid search for the model
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

# Decision tree
from sklearn.tree import DecisionTreeClassifier

# Random forest
from sklearn.ensemble import RandomForestClassifier

# K-nearest neighbor
from sklearn.neighbors import KNeighborsClassifier

In [2]:
def bankrupt_prediction(bankrupt_data, non_bankrupt_data):
    """
    The function contain methods for calculate machine learning models
    based on the feed-in data of bankrupt and non-bankrupt companies.
    
    The function aims to give out result for data of data in 1,2,3 years
    before the bankruptcy.
    -----------------------------------------------------------------
    
    Variables are:
        X1 = working capital/total assets; 
        X2 = retained earnings/total asssets
        X3 = EBIT/total assets
        X4 = total equity(book)/total assets
        X5 = net income/total assets
        X6 = total liabilities/total assets
        X7 = cash flow from operation/total liabilities
        
        
    """

    #Concatanate the data together
    data_full = pd.concat([bankrupt_data, non_bankrupt_data], ignore_index = True)
    
    # Add and scale predictive variables in the dataset (X1,X2,X3,X4,X5,X6,X7)
    data_full["X1"] = preprocessing.scale(data_full["WoCap"] / data_full["ToAsset"])
    data_full["X2"] = preprocessing.scale(data_full["ReEarns"] / data_full["ToAsset"])
    data_full["X3"] = preprocessing.scale(data_full["EBIT"] / data_full["ToAsset"])
    data_full["X4"] = preprocessing.scale(data_full["ToEqui"] / data_full["ToAsset"])
    data_full["X5"] = preprocessing.scale(data_full["NetInc"] / data_full["ToAsset"])
    data_full["X6"] = preprocessing.scale(data_full["ToLia"] / data_full["ToAsset"])
    data_full["X7"] = preprocessing.scale(data_full["CFOper"] / data_full["ToLia"])
        
    # Split data for training and testing
    X = data_full[["X1", "X2", "X3", "X4","X5","X6", "X7"]]
    y = data_full['Status'] 
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=101)
    
    # Supported Vector Machine model
    model = SVC()
    model.fit(X_train,y_train)
    predictions = model.predict(X_test)
    param_grid = {'C': [0.1,1, 10, 100, 1000], 'gamma': [1,0.1,0.01,0.001,0.0001], 'kernel': ['rbf']} 
    grid = GridSearchCV(SVC(),param_grid,refit=True,verbose=3)
    grid.fit(X_train,y_train)
    grid.best_params_
    grid.best_estimator_
    grid_predictions = grid.predict(X_test)
    print("Confusion Matrix using Supported Vector Machine: \n", confusion_matrix(y_test,grid_predictions))
    print("Classification Report using Supported Vector Machine: \n", classification_report(y_test,grid_predictions))
    
    #Decision Tree
    dtree = DecisionTreeClassifier()
    dtree.fit(X_train,y_train)
    predict_dtree = dtree.predict(X_test)
    print("Confusion Matrix using Decision Tree: \n", confusion_matrix(y_test,predict_dtree))
    print("Classification Report using Decision Tree: \n", classification_report(y_test,predict_dtree))
    
    #Random Forest
    rfc = RandomForestClassifier(n_estimators=100)
    rfc.fit(X_train, y_train)
    rfc_pred = rfc.predict(X_test)
    print("Confusion Matrix using Random Forest: \n", confusion_matrix(y_test,rfc_pred))
    print("Classification Report using Random Forest: \n", classification_report(y_test,rfc_pred))
    
    # K-nearest neighbor
    knn = KNeighborsClassifier(n_neighbors=1)
    knn.fit(X_train,y_train)
    k_pred = knn.predict(X_test)
    print("Confusion Matrix using K-nearest neighbor: \n", confusion_matrix(y_test,k_pred))
    print("Classification Report using K-nearest neighbor: \n", classification_report(y_test,k_pred))
    
    

# Bankruptcy prediction at t-1
The models will prediction bankruptcy 1 year before the real bankruptcy

In [3]:
bankrupt_t1 = pd.read_csv("bankrupt_t1.csv").fillna(value = 1)
non_bankrupt_t1 = pd.read_csv("non_bankrupt_t1.csv").fillna(value = 1)
t1 = bankrupt_prediction(bankrupt_t1, non_bankrupt_t1)

Fitting 3 folds for each of 25 candidates, totalling 75 fits
[CV] C=0.1, gamma=1, kernel=rbf ......................................
[CV]  C=0.1, gamma=1, kernel=rbf, score=0.8571428571428571, total=   0.0s
[CV] C=0.1, gamma=1, kernel=rbf ......................................
[CV]  C=0.1, gamma=1, kernel=rbf, score=0.6666666666666666, total=   0.0s
[CV] C=0.1, gamma=1, kernel=rbf ......................................
[CV]  C=0.1, gamma=1, kernel=rbf, score=0.6666666666666666, total=   0.0s
[CV] C=0.1, gamma=0.1, kernel=rbf ....................................
[CV]  C=0.1, gamma=0.1, kernel=rbf, score=0.6428571428571429, total=   0.0s
[CV] C=0.1, gamma=0.1, kernel=rbf ....................................
[CV] .......... C=0.1, gamma=0.1, kernel=rbf, score=0.5, total=   0.0s
[CV] C=0.1, gamma=0.1, kernel=rbf ....................................
[CV]  C=0.1, gamma=0.1, kernel=rbf, score=0.6666666666666666, total=   0.0s
[CV] C=0.1, gamma=0.01, kernel=rbf .................................

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s


[CV]  C=1000, gamma=0.1, kernel=rbf, score=0.7857142857142857, total=   0.0s
[CV] C=1000, gamma=0.1, kernel=rbf ...................................
[CV] ........ C=1000, gamma=0.1, kernel=rbf, score=0.75, total=   0.0s
[CV] C=1000, gamma=0.1, kernel=rbf ...................................
[CV]  C=1000, gamma=0.1, kernel=rbf, score=0.6666666666666666, total=   0.0s
[CV] C=1000, gamma=0.01, kernel=rbf ..................................
[CV]  C=1000, gamma=0.01, kernel=rbf, score=0.9285714285714286, total=   0.0s
[CV] C=1000, gamma=0.01, kernel=rbf ..................................
[CV] ....... C=1000, gamma=0.01, kernel=rbf, score=0.75, total=   0.0s
[CV] C=1000, gamma=0.01, kernel=rbf ..................................
[CV]  C=1000, gamma=0.01, kernel=rbf, score=0.6666666666666666, total=   0.0s
[CV] C=1000, gamma=0.001, kernel=rbf .................................
[CV]  C=1000, gamma=0.001, kernel=rbf, score=0.7857142857142857, total=   0.0s
[CV] C=1000, gamma=0.001, kernel=rbf ......

[Parallel(n_jobs=1)]: Done  75 out of  75 | elapsed:    0.3s finished


# Bankruptcy prediction at t-2
The models will prediction bankruptcy 2 years before the real bankruptcy

In [4]:
bankrupt_t2 = pd.read_csv("bankrupt_t2.csv").fillna(value = 1)
non_bankrupt_t2 = pd.read_csv("non_bankrupt_t2.csv").fillna(value = 1)
t2 = bankrupt_prediction(bankrupt_t2, non_bankrupt_t2)

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s


Fitting 3 folds for each of 25 candidates, totalling 75 fits
[CV] C=0.1, gamma=1, kernel=rbf ......................................
[CV]  C=0.1, gamma=1, kernel=rbf, score=0.7857142857142857, total=   0.0s
[CV] C=0.1, gamma=1, kernel=rbf ......................................
[CV]  C=0.1, gamma=1, kernel=rbf, score=0.5833333333333334, total=   0.0s
[CV] C=0.1, gamma=1, kernel=rbf ......................................
[CV]  C=0.1, gamma=1, kernel=rbf, score=0.6666666666666666, total=   0.0s
[CV] C=0.1, gamma=0.1, kernel=rbf ....................................
[CV]  C=0.1, gamma=0.1, kernel=rbf, score=0.7857142857142857, total=   0.0s
[CV] C=0.1, gamma=0.1, kernel=rbf ....................................
[CV]  C=0.1, gamma=0.1, kernel=rbf, score=0.5833333333333334, total=   0.0s
[CV] C=0.1, gamma=0.1, kernel=rbf ....................................
[CV] ......... C=0.1, gamma=0.1, kernel=rbf, score=0.75, total=   0.0s
[CV] C=0.1, gamma=0.01, kernel=rbf .................................

Confusion Matrix using Random Forest: 
 [[11  2]
 [ 0 13]]
Classification Report using Random Forest: 
              precision    recall  f1-score   support

        0.0       1.00      0.85      0.92        13
        1.0       0.87      1.00      0.93        13

avg / total       0.93      0.92      0.92        26

Confusion Matrix using K-nearest neighbor: 
 [[10  3]
 [ 2 11]]
Classification Report using K-nearest neighbor: 
              precision    recall  f1-score   support

        0.0       0.83      0.77      0.80        13
        1.0       0.79      0.85      0.81        13

avg / total       0.81      0.81      0.81        26



[Parallel(n_jobs=1)]: Done  75 out of  75 | elapsed:    0.3s finished


# Bankruptcy prediction at t-3
The models will prediction bankruptcy 3 years before the real bankruptcy

In [5]:
bankrupt_t3 = pd.read_csv("bankrupt_t3.csv").fillna(value = 1)
non_bankrupt_t3 = pd.read_csv("non_bankrupt_t3.csv").fillna(value = 1)
t3 = bankrupt_prediction(bankrupt_t3, non_bankrupt_t3)

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s


Fitting 3 folds for each of 25 candidates, totalling 75 fits
[CV] C=0.1, gamma=1, kernel=rbf ......................................
[CV]  C=0.1, gamma=1, kernel=rbf, score=0.7142857142857143, total=   0.0s
[CV] C=0.1, gamma=1, kernel=rbf ......................................
[CV] ............ C=0.1, gamma=1, kernel=rbf, score=0.5, total=   0.0s
[CV] C=0.1, gamma=1, kernel=rbf ......................................
[CV]  C=0.1, gamma=1, kernel=rbf, score=0.6666666666666666, total=   0.0s
[CV] C=0.1, gamma=0.1, kernel=rbf ....................................
[CV]  C=0.1, gamma=0.1, kernel=rbf, score=0.7142857142857143, total=   0.0s
[CV] C=0.1, gamma=0.1, kernel=rbf ....................................
[CV] .......... C=0.1, gamma=0.1, kernel=rbf, score=0.5, total=   0.0s
[CV] C=0.1, gamma=0.1, kernel=rbf ....................................
[CV]  C=0.1, gamma=0.1, kernel=rbf, score=0.5833333333333334, total=   0.0s
[CV] C=0.1, gamma=0.01, kernel=rbf ...................................


[CV] ....... C=100, gamma=0.0001, kernel=rbf, score=0.5, total=   0.0s
[CV] C=100, gamma=0.0001, kernel=rbf .................................
[CV]  C=100, gamma=0.0001, kernel=rbf, score=0.5833333333333334, total=   0.0s
[CV] C=1000, gamma=1, kernel=rbf .....................................
[CV]  C=1000, gamma=1, kernel=rbf, score=0.7857142857142857, total=   0.0s
[CV] C=1000, gamma=1, kernel=rbf .....................................
[CV]  C=1000, gamma=1, kernel=rbf, score=0.5833333333333334, total=   0.0s
[CV] C=1000, gamma=1, kernel=rbf .....................................
[CV]  C=1000, gamma=1, kernel=rbf, score=0.6666666666666666, total=   0.0s
[CV] C=1000, gamma=0.1, kernel=rbf ...................................
[CV]  C=1000, gamma=0.1, kernel=rbf, score=0.7857142857142857, total=   0.0s
[CV] C=1000, gamma=0.1, kernel=rbf ...................................
[CV]  C=1000, gamma=0.1, kernel=rbf, score=0.5833333333333334, total=   0.0s
[CV] C=1000, gamma=0.1, kernel=rbf ..........

[Parallel(n_jobs=1)]: Done  75 out of  75 | elapsed:    0.3s finished


Confusion Matrix using Random Forest: 
 [[7 6]
 [5 8]]
Classification Report using Random Forest: 
              precision    recall  f1-score   support

          0       0.58      0.54      0.56        13
          1       0.57      0.62      0.59        13

avg / total       0.58      0.58      0.58        26

Confusion Matrix using K-nearest neighbor: 
 [[5 8]
 [5 8]]
Classification Report using K-nearest neighbor: 
              precision    recall  f1-score   support

          0       0.50      0.38      0.43        13
          1       0.50      0.62      0.55        13

avg / total       0.50      0.50      0.49        26

