https://www.kaggle.com/shebrahimi/financial-distress

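In this notebook we duplicate every continuous feature and then compare CIBer with several competing classifiers. In theory, the duplication should have no influence on CIBer, because of the special way it computes joint probabilities. Naive Bayes, however, multiplies the class-conditional likelihoods of all features, so a duplicated feature $X_i$ contributes $\mathbb{P}(X_i=x|Y=y)^2$ instead of $\mathbb{P}(X_i=x|Y=y)$. Since in general $\mathbb{P}(X_i=x|Y=0) \neq \mathbb{P}(X_i=x|Y=1)$, the sign of $\mathbb{P}(\mathbf{X}|Y=0)\cdot\mathbb{P}(Y=0)-\mathbb{P}(\mathbf{X}|Y=1)\cdot\mathbb{P}(Y=1)$, and hence the predicted class, may change after variable duplication.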

In [1]:
import numpy as np
import pandas as pd
import sys
sys.path.insert(1, '/home/r7user5/Desktop/STAT')
import comonotonic as cm
import os
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
import copy
import utils
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import classification_report, roc_auc_score
from imblearn.over_sampling import SMOTE
import ensemble_ciber as ec
import conditional_ciber as cc
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
import lightgbm as lgb
from sklearn import tree
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from imblearn.over_sampling import SMOTENC
import xlsxwriter

  import pandas.util.testing as tm


In [2]:
# Load the Financial Distress data set and derive a binary target from the
# continuous "Financial Distress" score.
df = pd.read_csv("Financial Distress.csv")

# Label rule: score <= -0.5 -> 1, anything above -0.5 -> 0.
# The label is built with a single boolean expression rather than by writing
# through `Series.values[...]`: that in-place write only takes effect when
# pandas hands back a writable view of the column, and it raises once
# copy-on-write makes the array read-only.
# NOTE(review): a missing score would be labelled 0 here (the previous code
# would have failed in the int cast instead) -- assumes the column has no NaN.
distress = (df['Financial Distress'] <= -0.5).astype(int)

# Shift x80 down by one.
# NOTE(review): presumably this makes the categorical codes zero-based -- confirm.
df['x80'] = df['x80'] - 1

# Drop the identifier columns and move the label to the last position.
df = df.drop(columns=['Company', 'Time', 'Financial Distress'])
df['Financial Distress'] = distress

In [3]:
# Build the "redundant" design matrix: every feature column except the one at
# position 79 (the column later listed as categorical) is appended a second
# time under the name Add<position>; the label is re-attached as last column.
redundant_df = df.drop(columns=['Financial Distress'])
n_original = redundant_df.shape[1]
for col_pos in (pos for pos in range(n_original) if pos != 79):
    redundant_df['Add{}'.format(col_pos)] = df.iloc[:, col_pos]
redundant_df['Financial Distress'] = distress

In [4]:
# Release the original frame; the cells below only work with redundant_df.
del df

In [5]:
# Rename the columns positionally: features become X0, X1, ... and the last
# column (the label) becomes Y.
n_features = redundant_df.shape[1] - 1
colnames = ['X{}'.format(pos) for pos in range(n_features)] + ['Y']
redundant_df.columns = colnames

In [6]:
# Column bookkeeping handed to CIBer and SMOTENC further down.
# Position 79 is the only feature treated as categorical; every other feature
# position (originals and their duplicates) is treated as continuous.
categorical = [79]
# NOTE(review): 37 is presumably the number of distinct levels of feature 79 -- confirm.
discrete_feature_val = {79: 37}
cont_col = [pos for pos in range(redundant_df.shape[1] - 1) if pos != 79]

In [7]:
# Min-max scale every continuous column (originals and duplicates) in place.
scale_col_name = ['X{}'.format(pos) for pos in cont_col]
scaler = preprocessing.MinMaxScaler()
scaled_values = scaler.fit_transform(redundant_df[scale_col_name])
redundant_df[scale_col_name] = scaled_values

# Outlier removal is driven by the 82 original continuous positions only
# (0..82 without 79), not by the appended duplicates.
# NOTE(review): utils.outlier_removal is not visible here; presumably it drops
# rows with outlying values in the given columns -- confirm.
original_cont_col = [pos for pos in range(83) if pos != 79]
reduced_df = utils.outlier_removal(redundant_df, original_cont_col)

In [8]:
# Split the cleaned frame into predictors (all but the last column) and the
# label (last column), both as NumPy arrays.
feature_frame, label_column = reduced_df.iloc[:, :-1], reduced_df.iloc[:, -1]
x = feature_frame.to_numpy()
y = label_column.to_numpy()

# Oversample with SMOTENC, declaring position 79 as the categorical feature.
# NOTE(review): resampling happens before any train/test split, so the rows
# later used for testing include synthetic samples generated from the same
# pool as the training rows; reported scores are likely optimistic.
smote_nc = SMOTENC(random_state=0, categorical_features=categorical)
x_resample, y_resample = smote_nc.fit_resample(x, y)

In [6]:
# 5-fold splitter with shuffling. The seed is fixed so that the folds -- and
# therefore the per-fold accuracies written out later -- are the same on every
# Restart & Run All; without it each run shuffles differently.
k_fold = KFold(n_splits=5, shuffle=True, random_state=0)

# One list of per-fold accuracies per method, filled by the CV loop.
ciber_record = []
ada_record = []
rf_record = []
xgb_record = []
lgb_record = []
DT_record = []
svm_record = []
lr_record = []
nb_record = []
gaussian_nb_record = []

In [7]:
# 5-fold cross-validation: fit every method on the training part of each fold
# and append its test accuracy to the matching *_record list.
#
# NOTE(review): x_resample / y_resample were oversampled before this split, so
# each test fold contains synthetic rows drawn from the same pool as the
# training folds; consider resampling inside the loop on x_train only.
for itr, (train_idx, test_idx) in enumerate(k_fold.split(x_resample), start=1):
    x_train, x_test = x_resample[train_idx, :], x_resample[test_idx, :]
    y_train, y_test = y_resample[train_idx], y_resample[test_idx]

    # CIBer.
    # NOTE(review): 0.98 is presumably the clustering threshold on the
    # mutual-information dependence measure -- confirm against comonotonic.py.
    ciber = cm.clustered_comonotonic(x_train, y_train, discrete_feature_val, cont_col,
                                     categorical, 0.98, None, corrtype='mutual_info',
                                     discrete_method='mdlp')
    ciber.run()
    ciber_record.append(accuracy_score(y_test, ciber.predict(x_test)))

    # Baselines sharing the fit/predict interface, in their original order.
    # The estimators that draw random numbers are seeded so a re-run gives
    # the same fold accuracies. LGBMClassifier.predict already returns class
    # labels, so the former round/astype step was a no-op and is dropped.
    baselines = [
        (ada_record, AdaBoostClassifier(random_state=0)),
        (rf_record, RandomForestClassifier(random_state=0)),
        (xgb_record, xgb.XGBClassifier()),
        (lgb_record, lgb.LGBMClassifier()),
        (DT_record, tree.DecisionTreeClassifier(random_state=0)),
        (svm_record, SVC()),
        (lr_record, LogisticRegression()),
    ]
    for fold_record, clf in baselines:
        clf.fit(x_train, y_train)
        fold_record.append(accuracy_score(y_test, clf.predict(x_test)))

    # Naive Bayes obtained from the CIBer implementation (threshold 1,
    # Spearman dependence measure).
    nb_clf = cm.clustered_comonotonic(x_train, y_train, discrete_feature_val, cont_col,
                                      categorical, 1, None, corrtype='spearman',
                                      discrete_method='mdlp')
    nb_clf.run()
    nb_record.append(accuracy_score(y_test, nb_clf.predict(x_test)))

    # Gaussian Naive Bayes.
    gaussian_nb = GaussianNB()
    gaussian_nb.fit(x_train, y_train)
    gaussian_nb_record.append(accuracy_score(y_test, gaussian_nb.predict(x_test)))

    # Progress marker: number of the fold just finished.
    print(itr)
    # Drop the fold arrays so they cannot leak into later cells.
    del x_train, x_test, y_train, y_test



1




2




3




4




5


In [8]:
# Map each method's display name to its list of per-fold accuracies.
# Insertion order is the row order of the exported sheet.
record = {
    'ciber': ciber_record,
    'adaboost': ada_record,
    'random forest': rf_record,
    'xgboost': xgb_record,
    'light GBM': lgb_record,
    'decision tree': DT_record,
    'svm': svm_record,
    'logistic regress': lr_record,
    'ciber nb': nb_record,
    'gaussian nb': gaussian_nb_record,
}

In [9]:
# Export the cross-validation accuracies: one row per method, with the method
# name in column 0 and one accuracy per fold in the following columns.
# The context manager closes (and thereby saves) the workbook even if one of
# the writes raises, which the explicit close() at the end did not guarantee.
with xlsxwriter.Workbook('5-fold CV.xlsx') as workbook:
    worksheet = workbook.add_worksheet()
    for row, (method, accuracies) in enumerate(record.items()):
        worksheet.write(row, 0, method)
        for column, acc in enumerate(accuracies, start=1):
            worksheet.write(row, column, acc)

In [9]:
# Single 80/20 hold-out split of the resampled data (fixed seed); the
# per-model report cells below all train and evaluate on this split.
x_train,x_test,y_train,y_test = train_test_split(x_resample,y_resample,test_size=0.2,random_state=14)

In [10]:
# CIBer on the hold-out split (threshold 0.98, mutual-information measure,
# MDLP discretisation).
# NOTE(review): the AUC below is computed from hard class predictions rather
# than from scores, so it reflects a single operating point.
ciber_positional_args = (x_train, y_train, discrete_feature_val, cont_col,
                         categorical, 0.98, None)
c_como_demo = cm.clustered_comonotonic(*ciber_positional_args,
                                       corrtype='mutual_info',
                                       discrete_method='mdlp')
c_como_demo.run()
c_como_predict = c_como_demo.predict(x_test)
print(classification_report(y_test, c_como_predict),
      roc_auc_score(y_test, c_como_predict), sep='\n')

              precision    recall  f1-score   support

           0       0.95      0.99      0.97       714
           1       0.99      0.95      0.97       681

    accuracy                           0.97      1395
   macro avg       0.97      0.97      0.97      1395
weighted avg       0.97      0.97      0.97      1395

0.9693666012660571


In [12]:
# Same CIBer implementation with the threshold set to 1.
# NOTE(review): presumably no features get clustered at this threshold, which
# would make it plain Naive Bayes on the discretised features -- confirm.
# NOTE(review): the AUC below is computed from hard class predictions.
nb_positional_args = (x_train, y_train, discrete_feature_val, cont_col,
                      categorical, 1, None)
NB = cm.clustered_comonotonic(*nb_positional_args,
                              corrtype='mutual_info',
                              discrete_method='mdlp')
NB.run()
NB_predict = NB.predict(x_test)
print(classification_report(y_test, NB_predict),
      roc_auc_score(y_test, NB_predict), sep='\n')

              precision    recall  f1-score   support

           0       0.97      0.91      0.94       714
           1       0.91      0.97      0.94       681

    accuracy                           0.94      1395
   macro avg       0.94      0.94      0.94      1395
weighted avg       0.94      0.94      0.94      1395



In [11]:
# Baseline: XGBoost with library defaults on the hold-out split.
# NOTE(review): the AUC is computed from hard class predictions, not scores.
xgb_clf = xgb.XGBClassifier()
xgb_clf.fit(x_train, y_train)
xgb_predict = xgb_clf.predict(x_test)
xgb_report = classification_report(y_test, xgb_predict)
xgb_auc = roc_auc_score(y_test, xgb_predict)
print(xgb_report, xgb_auc, sep='\n')

              precision    recall  f1-score   support

           0       1.00      0.96      0.98       714
           1       0.96      1.00      0.98       681

    accuracy                           0.98      1395
   macro avg       0.98      0.98      0.98      1395
weighted avg       0.98      0.98      0.98      1395

0.9817587828082776


In [12]:
# Baseline: random forest on the hold-out split.
# The forest is seeded so the report below is reproducible; the split itself
# is already seeded, but an unseeded forest still changes from run to run.
# NOTE(review): the AUC is computed from hard class predictions, not scores.
rf_clf = RandomForestClassifier(random_state=0)
rf_clf.fit(x_train, y_train)
rf_predict = rf_clf.predict(x_test)
print(classification_report(y_test, rf_predict))
print(roc_auc_score(y_test, rf_predict))



              precision    recall  f1-score   support

           0       0.99      0.97      0.98       714
           1       0.97      0.99      0.98       681

    accuracy                           0.98      1395
   macro avg       0.98      0.98      0.98      1395
weighted avg       0.98      0.98      0.98      1395

0.9772516936289933


In [13]:
# Baseline: AdaBoost on the hold-out split.
# Seeded for reproducibility, in line with the seeded split and resampler.
# NOTE(review): the AUC is computed from hard class predictions, not scores.
ada_clf = AdaBoostClassifier(random_state=0)
ada_clf.fit(x_train, y_train)
ada_predict = ada_clf.predict(x_test)
print(classification_report(y_test, ada_predict))
print(roc_auc_score(y_test, ada_predict))

              precision    recall  f1-score   support

           0       0.96      0.93      0.94       714
           1       0.92      0.96      0.94       681

    accuracy                           0.94      1395
   macro avg       0.94      0.94      0.94      1395
weighted avg       0.94      0.94      0.94      1395

0.941592936734165


In [14]:
# Baseline: LightGBM with library defaults on the hold-out split.
# Predictions are rounded and cast to int before scoring, as before.
# NOTE(review): the AUC is computed from hard class predictions, not scores.
lgb_clf = lgb.LGBMClassifier()
lgb_clf.fit(x_train, y_train)
lgb_predict = lgb_clf.predict(x_test).round(0).astype('int')
print(classification_report(y_test, lgb_predict),
      roc_auc_score(y_test, lgb_predict), sep='\n')

              precision    recall  f1-score   support

           0       1.00      0.97      0.98       714
           1       0.97      1.00      0.98       681

    accuracy                           0.98      1395
   macro avg       0.98      0.98      0.98      1395
weighted avg       0.98      0.98      0.98      1395

0.981690914251163


In [15]:
# Baseline: single decision tree on the hold-out split.
# Seeded because the tree builder draws random numbers while choosing splits,
# so an unseeded tree can differ from run to run.
# NOTE(review): the AUC is computed from hard class predictions, not scores.
DT_clf = tree.DecisionTreeClassifier(random_state=0)
DT_clf.fit(x_train, y_train)
DT_predict = DT_clf.predict(x_test)
print(classification_report(y_test, DT_predict))
print(roc_auc_score(y_test, DT_predict))

              precision    recall  f1-score   support

           0       0.97      0.93      0.95       714
           1       0.93      0.97      0.95       681

    accuracy                           0.95      1395
   macro avg       0.95      0.95      0.95      1395
weighted avg       0.95      0.95      0.95      1395

0.9502338380286035


In [16]:
# Baseline: support vector classifier with default settings.
# NOTE(review): the AUC is computed from hard class predictions, not scores.
svm = SVC().fit(x_train, y_train)
svm_predict = svm.predict(x_test)
print(classification_report(y_test, svm_predict),
      roc_auc_score(y_test, svm_predict), sep='\n')



              precision    recall  f1-score   support

           0       0.97      0.85      0.90       714
           1       0.86      0.97      0.91       681

    accuracy                           0.91      1395
   macro avg       0.91      0.91      0.91      1395
weighted avg       0.91      0.91      0.91      1395

0.9089512456965165


In [17]:
# Baseline: logistic regression with default settings.
# NOTE(review): the AUC is computed from hard class predictions, not scores.
lr = LogisticRegression().fit(x_train, y_train)
lr_predict = lr.predict(x_test)
print(classification_report(y_test, lr_predict),
      roc_auc_score(y_test, lr_predict), sep='\n')



              precision    recall  f1-score   support

           0       0.95      0.88      0.92       714
           1       0.89      0.95      0.92       681

    accuracy                           0.92      1395
   macro avg       0.92      0.92      0.92      1395
weighted avg       0.92      0.92      0.92      1395

0.9169134614198102


In [18]:
# Baseline: Gaussian Naive Bayes fitted on the duplicated feature set.
# NOTE(review): the AUC is computed from hard class predictions, not scores.
gaussian_nb = GaussianNB().fit(x_train, y_train)
gaussian_predict = gaussian_nb.predict(x_test)
print(classification_report(y_test, gaussian_predict),
      roc_auc_score(y_test, gaussian_predict), sep='\n')

              precision    recall  f1-score   support

           0       0.94      0.60      0.73       714
           1       0.70      0.96      0.81       681

    accuracy                           0.78      1395
   macro avg       0.82      0.78      0.77      1395
weighted avg       0.82      0.78      0.77      1395

0.7798621651303693
