# Bankruptcy Prediction

## Imports

In [None]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC, LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier, ExtraTreesClassifier
from sklearn.neural_network import MLPClassifier
from sklearn import metrics
import matplotlib.pyplot as plt

## Load data

In [None]:
# Location of the Excel workbook holding the company data.
folder = "./"
filename = "company_info.xlsx"
file_path = os.path.join(folder, filename)

# Read the first two sheets (positions 0 and 1) of the workbook.
df1, df2 = (
    pd.read_excel(file_path, engine="openpyxl", sheet_name=sheet_position)
    for sheet_position in (0, 1)
)

# Stack both sheets into a single frame; the original row labels are kept.
df = pd.concat([df1, df2])

## Data Exploration

Typical data exploration tasks:
- dataset size (samples, features)
- variables without values
- description, column statistics

Data size

In [None]:
# (rows, columns) of the combined frame.
print(f"Shape:{df.shape}")

Description

In [None]:
# Summary statistics for every column, numeric or not.
summary_all = df.describe(include="all")
print(f"DESCRIBE:{summary_all}")

Info

In [None]:
# DataFrame.info() writes its summary (index, columns, non-null counts,
# dtypes, memory usage) straight to stdout and returns None, so it has to be
# called as a statement. Formatting the uncalled `df.info` attribute only
# printed the bound method's repr instead of the summary.
print("INFO:")
df.info()

Columns

In [None]:
# Column labels of the combined frame.
print(f"Columns:{df.columns}")

HEAD

In [None]:
# First rows of the frame (DataFrame.head() default count).
first_rows = df.head()
print(f"HEAD:{first_rows}")

Data types

In [None]:
# Data type of each column.
print(f"DTYPES:{df.dtypes}")

## Data cleanup 

Typical columns to drop:
- semantically meaningless columns
- columns with little data
- columns with very low variance

Drop useless columns and columns with little data

In [None]:
# Columns treated as carrying no useful information for the models.
useless_columns = [
    "Unnamed: 0",
    "NACE Rev. 2, core code (4 digits)",
]

# Feature columns discarded as unusable (per the notebook text: too little data).
bad_columns_to_drop = [
    "X2=Equity/liabilities",
    "X7=Current Liabilities /Inventory",
    "X26=Financing Charge / Sales",
]

# Remove both groups from the frame in place with a single call.
df.drop(columns=useless_columns + bad_columns_to_drop, inplace=True)

Convert country ISO codes to numbers (ES -> 0, PT -> 1)

In [None]:
# Map the two country codes to integers. The replacement is applied to every
# cell of the frame that equals one of the keys, not only to one column.
df.replace({"ES": 0, "PT": 1}, inplace=True)

Check what data looks like

In [None]:
# Re-inspect statistics and column types after dropping/encoding columns.
print(f"DESCRIBE:{df.describe(include='all')}")
print(f"DTYPES:{df.dtypes}")

Drop rows with NaNs

In [None]:
# Discard every row that has at least one missing value, then renumber the
# remaining rows 0..n-1 (the old index is thrown away, not kept as a column).
df = df.dropna(axis=0).reset_index(drop=True)

Convert country code column to int

In [None]:
# Cast the country column to integer, but only if the frame actually has it.
country_column = "Country ISO code"
if country_column in df.columns:
    df[country_column] = df[country_column].astype("int")

Check what data looks like

In [None]:
# Re-inspect statistics and column types after removing rows and casting.
print(f"DESCRIBE:{df.describe(include='all')}")
print(f"DTYPES:{df.dtypes}")

Separate features from class and convert to numpy

In [None]:
# Take the "Situation" column out of the frame as the target vector; after
# this, `df` holds only the feature columns.
Y = df.pop("Situation").to_numpy(copy=True)

# Feature matrix as an independent float64 array (one column per remaining
# df column, in df column order).
X = df.to_numpy(dtype="float64", copy=True)

# Split

In [None]:
# Hold out 20% of the samples for testing. The split is stratified on Y so
# both parts keep the class proportions, and the fixed seed makes it repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=1,
    stratify=Y,
)

# Normalize data

In [None]:
# Standardize each feature column to zero mean / unit variance.
# The statistics are computed on the training split only and then applied to
# both splits, so no information from the test set leaks into the scaling.
# `means` / `stds` keep the per-column statistics; entries of columns that are
# skipped stay at 0.
means = np.zeros(len(df.columns))
stds = np.zeros(len(df.columns))

# Columns left untouched (the 0/1 country code is categorical, not a magnitude).
columns_without_normalization = ["Country ISO code"]

# `df` now holds only the feature columns, in the same order as the columns
# of X_train / X_test.
for col_index, col_name in enumerate(df.columns[:X_train.shape[1]]):
    if col_name in columns_without_normalization:
        print("Not normalizing", col_name)
        continue

    print("Normalizing", col_name)
    col_mean = np.mean(X_train[:, col_index])
    col_std = np.std(X_train[:, col_index])
    means[col_index] = col_mean
    stds[col_index] = col_std

    # A column that is constant in the training split has zero spread.
    # Dividing by it would fill the column with NaN/inf and make every
    # estimator fail, so such a column is only centered.
    if col_std == 0:
        print("Constant column, centering only:", col_name)
        scale = 1.0
    else:
        scale = col_std

    X_train[:, col_index] = (X_train[:, col_index] - col_mean) / scale
    X_test[:, col_index] = (X_test[:, col_index] - col_mean) / scale

# Train Models

List models to run

In [None]:
# Names of the classifiers to evaluate; the position in this list is the row
# index used for each model in the result arrays.
models = [
    "LinearSVC",
    "SVC",
    "KNN",
    "LogisticRegression",
    "DecisionTree",
    "RandomForest",
    "ExtremeForest",
    "AdaBoost",
    "MLP",
]

Create data structures for results

In [None]:
# Number of times each model is trained and evaluated.
runs = 10

# One independent (model, run) matrix per metric, all initialised to zero.
# "C0" / "C1" refer to the per-class metrics of class 0 and class 1.
results_shape = (len(models), runs)
(
    accuracies,
    C1_precisions,
    C1_recalls,
    C1_fscores,
    C0_precisions,
    C0_recalls,
    C0_fscores,
) = (np.zeros(results_shape) for _ in range(7))

Train Models

In [None]:
# Seed NumPy's global random generator. The estimators below are built
# without an explicit random_state, in which case scikit-learn draws from this
# global generator; seeding it makes the whole experiment repeatable while the
# individual repetitions still see different random streams.
# (`np.random.RandomState(1)` only constructs a separate generator object; when
# its result is discarded nothing is seeded.)
np.random.seed(1)

# One zero-argument factory per model name, so every repetition trains a
# freshly constructed, unfitted estimator.
model_factories = {
    "LinearSVC": lambda: LinearSVC(C=1.0,
                                   class_weight=None,
                                   max_iter=1000000,
                                   dual=True,
                                   loss='squared_hinge'),
    "SVC": lambda: SVC(C=1.0,
                       kernel='rbf',
                       class_weight=None,
                       gamma='scale'),
    "KNN": lambda: KNeighborsClassifier(n_neighbors=5,
                                        algorithm='auto',
                                        weights='uniform',
                                        metric='minkowski'),
    "LogisticRegression": lambda: LogisticRegression(solver='lbfgs',
                                                     class_weight=None,
                                                     penalty='l2',
                                                     max_iter=100000),
    "DecisionTree": lambda: DecisionTreeClassifier(max_depth=None,
                                                   max_features=None,
                                                   criterion='gini',
                                                   class_weight=None),
    "RandomForest": lambda: RandomForestClassifier(n_estimators=1000,
                                                   criterion='gini',
                                                   max_features='sqrt',
                                                   class_weight=None),
    "ExtremeForest": lambda: ExtraTreesClassifier(n_estimators=1000,
                                                  criterion='gini',
                                                  max_features='sqrt',
                                                  class_weight=None),
    "AdaBoost": lambda: AdaBoostClassifier(n_estimators=100,
                                           learning_rate=1.0),
    "MLP": lambda: MLPClassifier(hidden_layer_sizes=(100, 100),
                                 activation='relu',
                                 solver='adam',
                                 learning_rate='constant',
                                 max_iter=1000),
}

# Fail before any (slow) training starts if a requested name has no factory,
# instead of silently re-using the previous model or hitting a NameError.
unknown_models = [name for name in models if name not in model_factories]
if unknown_models:
    raise ValueError("Unknown model name(s): {}".format(unknown_models))

for model_to_try_i, model_to_try in enumerate(models):
    print("Model to try: ", model_to_try)

    for try_i in range(runs):
        model = model_factories[model_to_try]()
        print("Model:", model, "try ", str(try_i), "/", str(runs))

        # Fit on the training split, predict the held-out split.
        model.fit(X_train, y_train)
        y_test_predict = model.predict(X_test)

        # Evaluate on the test split.
        # `prfs` is (precision, recall, f-score, support), each an array with
        # one entry per class label in sorted order; index 0 / 1 below are
        # reported as class 0 / class 1 (assumes the labels are exactly 0 and 1,
        # as the explicit labels=[0, 1] of the confusion matrix suggests).
        accuracy = metrics.accuracy_score(y_test, y_test_predict)
        prfs = metrics.precision_recall_fscore_support(y_test, y_test_predict)
        confusion_matrix = metrics.confusion_matrix(y_test, y_test_predict, labels=[0, 1])

        print("Test set Accuracy:", accuracy)
        print("Test set Class 0 Precision, Recall, F-score:", prfs[0][0], prfs[1][0], prfs[2][0])
        print("Test set Class 1 Precision, Recall, F-score:", prfs[0][1], prfs[1][1], prfs[2][1])
        print("Confusion matrix:")
        print(confusion_matrix)

        print("Classification Report:")
        print(metrics.classification_report(y_test, y_test_predict))

        # Store this repetition's metrics at (model row, run column).
        accuracies[model_to_try_i, try_i] = accuracy
        C0_precisions[model_to_try_i, try_i] = prfs[0][0]
        C0_recalls[model_to_try_i, try_i] = prfs[1][0]
        C0_fscores[model_to_try_i, try_i] = prfs[2][0]
        C1_precisions[model_to_try_i, try_i] = prfs[0][1]
        C1_recalls[model_to_try_i, try_i] = prfs[1][1]
        C1_fscores[model_to_try_i, try_i] = prfs[2][1]

Save results to file

In [None]:
# Save every per-run metric matrix as "<results_folder>/<name>.npy".
results_folder = "results"

# exist_ok avoids the check-then-create race of testing os.path.exists first.
os.makedirs(results_folder, exist_ok=True)

# File stem -> array to store, in the order the files are written.
results_to_save = {
    "accuracies": accuracies,
    "C1_precisions": C1_precisions,
    "C1_recalls": C1_recalls,
    "C1_fscores": C1_fscores,
    "C0_precisions": C0_precisions,
    "C0_recalls": C0_recalls,
    "C0_fscores": C0_fscores,
}

for result_name, result_values in results_to_save.items():
    # np.save opens the target for writing and replaces any previous content,
    # so an existing file does not have to be removed first.
    np.save(os.path.join(results_folder, result_name + ".npy"), result_values)


Load results files

Compute means and standard deviations

In [None]:
# Reload the per-run metric matrices written to `results_folder`.
accuracies = np.load(os.path.join(results_folder, "accuracies.npy"))
C0_fscores = np.load(os.path.join(results_folder, "C0_fscores.npy"))
C0_precisions = np.load(os.path.join(results_folder, "C0_precisions.npy"))
C0_recalls = np.load(os.path.join(results_folder, "C0_recalls.npy"))
C1_fscores = np.load(os.path.join(results_folder, "C1_fscores.npy"))
C1_precisions = np.load(os.path.join(results_folder, "C1_precisions.npy"))
C1_recalls = np.load(os.path.join(results_folder, "C1_recalls.npy"))

# Each matrix has one row per model and one column per run, so reducing along
# axis=1 yields one mean / (population) standard deviation per model. This
# replaces preallocating 14 zero vectors and filling them row by row.
accuracies_mean = accuracies.mean(axis=1)
accuracies_std = accuracies.std(axis=1)

C0_precisions_mean = C0_precisions.mean(axis=1)
C0_precisions_std = C0_precisions.std(axis=1)
C0_recalls_mean = C0_recalls.mean(axis=1)
C0_recalls_std = C0_recalls.std(axis=1)
C0_fscores_mean = C0_fscores.mean(axis=1)
C0_fscores_std = C0_fscores.std(axis=1)

C1_precisions_mean = C1_precisions.mean(axis=1)
C1_precisions_std = C1_precisions.std(axis=1)
C1_recalls_mean = C1_recalls.mean(axis=1)
C1_recalls_std = C1_recalls.std(axis=1)
C1_fscores_mean = C1_fscores.mean(axis=1)
C1_fscores_std = C1_fscores.std(axis=1)


Plot accuracy

In [None]:
# Short tick labels for the charts, in the same order as `models`:
# LinearSVC, SVC, KNN, LogisticRegression, DecisionTree, RandomForest,
# ExtremeForest, AdaBoost, MLP.
models_charts = ["L-SVM", "K-SVM", "KNN", "LR", "DT", "RF", "ERF", "AdaBoost", "MLP"]

print("Accuracy")
# boxplot draws one box per column, so transpose to (run, model).
accuracies_t = accuracies.T

fig, axes = plt.subplots()
axes.boxplot(accuracies_t)
axes.set_ylim([0.8, 1.0])
axes.set_xticklabels(models_charts)
axes.set_title('Model Accuracies')
axes.set_ylabel('Accuracy ')
axes.set_xlabel('Model')
fig.tight_layout()
fig.savefig(fname=results_folder+"/accuracy.png")
plt.show()
plt.close(fig)


Plot Precision-Recall-FScore C0

In [None]:
# boxplot draws one box per column, so each metric is transposed to (run, model).
precisions_t = C0_precisions.T
recalls_t = C0_recalls.T
fscores_t = C0_fscores.T

# One entry per chart:
# (console heading, data, y-limits, title, y-label, x-label, file name suffix)
c0_charts = [
    ("Precision C0", precisions_t, [0.5, 1.0], 'Model Precisions C0',
     'Precision C0', 'Method', "/C0_precision.png"),
    ("Recall C0", recalls_t, [0, 1.0], 'Model Recalls C0',
     'Recall', 'Model', "/C0_recall.png"),
    ("F-score C0", fscores_t, [0.0, 1.0], 'Model F-scores C0',
     'F-score', 'Model', "/C0_f-score.png"),
]

for heading, samples, y_limits, chart_title, y_label, x_label, file_suffix in c0_charts:
    print(heading)
    plt.boxplot(samples)
    axes = plt.gca()
    axes.set_ylim(y_limits)
    axes.set_xticklabels(models_charts)
    plt.title(chart_title)
    plt.ylabel(y_label)
    plt.xlabel(x_label)
    plt.tight_layout()
    # Save before show(): the figure is written to disk, displayed, then closed.
    plt.savefig(fname=results_folder + file_suffix)
    plt.show()
    plt.close()

Plot Precision-Recall-FScore C1

In [None]:
# boxplot draws one box per column, so each metric is transposed to (run, model).
precisions_t = C1_precisions.T
recalls_t = C1_recalls.T
fscores_t = C1_fscores.T

# All three class-1 charts share the same y-range.
c1_y_limits = [0.85, 1.0]

# One entry per chart:
# (console heading, data, title, y-label, x-label, file name suffix)
c1_charts = [
    ("Precision C1", precisions_t, 'Model Precisions C1',
     'Precision C1', 'Method', "/C1_precision.png"),
    ("Recall C1", recalls_t, 'Model Recalls C1',
     'Recall', 'Model', "/C1_recall.png"),
    ("F-score C1", fscores_t, 'Model F-scores C1',
     'F-score', 'Model', "/C1_f-score.png"),
]

for heading, samples, chart_title, y_label, x_label, file_suffix in c1_charts:
    print(heading)
    plt.boxplot(samples)
    axes = plt.gca()
    axes.set_ylim(c1_y_limits)
    axes.set_xticklabels(models_charts)
    plt.title(chart_title)
    plt.ylabel(y_label)
    plt.xlabel(x_label)
    plt.tight_layout()
    # Save before show(): the figure is written to disk, displayed, then closed.
    plt.savefig(fname=results_folder + file_suffix)
    plt.show()
    plt.close()

Print results table (tab-separated, for conversion to LaTeX)

In [None]:
# Header row; each metric below is printed as two tab-separated cells
# ("mean%" and "+-std%"), both expressed in percent with one decimal.
print("Model \tAccuracy \tPrecision C0 \tRecall C0 \tF-score C0 \tPrecision C1 \tRecall C1 \tF-score C1")

# (per-model mean, per-model std) for each metric, in header order.
metric_summaries = [
    (accuracies_mean, accuracies_std),
    (C0_precisions_mean, C0_precisions_std),
    (C0_recalls_mean, C0_recalls_std),
    (C0_fscores_mean, C0_fscores_std),
    (C1_precisions_mean, C1_precisions_std),
    (C1_recalls_mean, C1_recalls_std),
    (C1_fscores_mean, C1_fscores_std),
]

for row_index, model_name in enumerate(models):
    cells = [model_name]
    for mean_values, std_values in metric_summaries:
        cells.append("{:.1f}%".format(mean_values[row_index] * 100))
        cells.append("+-{:.1f}%".format(std_values[row_index] * 100))
    print("\t".join(cells))