https://www.kaggle.com/shebrahimi/financial-distress

Apply MinMaxScaler to the continuous features, then run 5-fold cross-validation on the full dataset.

In [1]:
import numpy as np
import pandas as pd
import sys
sys.path.insert(1, '/home/r7user5/Desktop/STAT')
import comonotonic as cm
import os
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
import copy
import utils
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE
import ensemble_ciber as ec
import conditional_ciber as cc
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
import lightgbm as lgb
from sklearn import tree
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from imblearn.over_sampling import SMOTENC
import xlsxwriter
from sklearn.decomposition import PCA

  import pandas.util.testing as tm


In [2]:
# Load the raw Kaggle table.
df = pd.read_csv("Financial Distress.csv")

# Binarise the target in one vectorised step: scores <= -0.5 become class 1,
# everything else class 0. Writing through `Series.values` is avoided because
# that array is not guaranteed to be a writable view of the frame (it is
# read-only under pandas Copy-on-Write).
distress = (df['Financial Distress'] <= -0.5).astype(int)

# Shift x80 down by one. After the drop/rename below it becomes column X79;
# presumably this makes its category codes zero-based -- TODO confirm.
df['x80'] = df['x80'] - 1

# Drop the identifier columns and move the target to the last position.
df = df.drop(columns=['Company', 'Time', 'Financial Distress'])
df['Financial Distress'] = distress

# Rename the features positionally to X0..X{n-1} and the target to Y.
colnames = ['X' + str(i) for i in range(df.shape[1] - 1)]
colnames.append('Y')
df.columns = colnames

In [3]:
# Positional indices of the feature columns, by type.
# Column 79 is the only categorical feature; every other index in 0..82 is
# treated as continuous.
categorical = [79]
cont_col = list(range(79)) + list(range(80, 83))
# Maps a discrete feature's index to 37 -- presumably its number of distinct
# values; confirm against the comonotonic module.
discrete_feature_val = {79: 37}

In [4]:
# Min-max scale the continuous columns (named X<i> for each i in cont_col).
# The scaler is fitted on the whole table, i.e. before any train/test split.
scale_col_name = [f"X{i}" for i in cont_col]
scaler = preprocessing.MinMaxScaler()
df[scale_col_name] = scaler.fit_transform(df[scale_col_name])

# Project helper; going by its name it drops outlier rows based on the
# continuous columns -- see utils.outlier_removal for the exact rule.
reduced_df = utils.outlier_removal(df, cont_col)

In [5]:
# Split the cleaned frame into a feature matrix (all but the last column)
# and a label vector (the last column, Y).
feature_frame = reduced_df.iloc[:, :-1]
label_series = reduced_df.iloc[:, -1]
x = feature_frame.to_numpy()
y = label_series.to_numpy()

# Oversample with SMOTENC, declaring column 79 as the categorical feature.
# NOTE(review): resampling happens here, before any train/test split, so
# the evaluation folds later in the notebook contain synthetic samples.
smote_nc = SMOTENC(categorical_features=[79], random_state=0)
x_resample, y_resample = smote_nc.fit_resample(x, y)

In [6]:
# Keep 90% as many principal components as there are input features
# (truncated to an integer), then project the resampled data.
n_features = x_resample.shape[1]
n_components = int(0.9 * n_features)
pca = PCA(n_components=n_components)
x_new = pca.fit_transform(x_resample)

In [7]:
# 5-fold cross-validation of the baseline classifiers and the CIBer model on
# the PCA-projected, resampled data. Each model appends one accuracy per fold
# to its own record list.

# Seed for the fold assignment and the stochastic estimators, so that a
# Restart & Run All reproduces the same per-fold accuracies.
CV_SEED = 0
k_fold = KFold(n_splits=5, shuffle=True, random_state=CV_SEED)

ciber_record = list()  # never appended to in this cell; kept in case other cells expect it
ada_record = list()
rf_record = list()
xgb_record = list()
lgb_record = list()
DT_record = list()
svm_record = list()
lr_record = list()
nb_record = list()
gaussian_nb_record = list()

for itr, (train_idx, test_idx) in enumerate(k_fold.split(x_new), start=1):
    x_train, x_test = x_new[train_idx, :], x_new[test_idx, :]
    y_train, y_test = y_resample[train_idx], y_resample[test_idx]

    # adaboost
    ada_clf = AdaBoostClassifier(random_state=CV_SEED)
    ada_clf.fit(x_train, y_train)
    ada_predict = ada_clf.predict(x_test)
    ada_record.append(accuracy_score(y_test, ada_predict))

    # random forest
    rf_clf = RandomForestClassifier(random_state=CV_SEED)
    rf_clf.fit(x_train, y_train)
    rf_predict = rf_clf.predict(x_test)
    rf_record.append(accuracy_score(y_test, rf_predict))

    # xgboost
    xgb_clf = xgb.XGBClassifier()
    xgb_clf.fit(x_train, y_train)
    xgb_predict = xgb_clf.predict(x_test)
    xgb_record.append(accuracy_score(y_test, xgb_predict))

    # light GBM
    lgb_clf = lgb.LGBMClassifier()
    lgb_clf.fit(x_train, y_train)
    lgb_predict = lgb_clf.predict(x_test)
    # Coerce predictions to integer labels before scoring.
    lgb_predict = lgb_predict.round(0).astype('int')
    lgb_record.append(accuracy_score(y_test, lgb_predict))

    # decision tree
    DT_clf = tree.DecisionTreeClassifier(random_state=CV_SEED)
    DT_clf.fit(x_train, y_train)
    DT_predict = DT_clf.predict(x_test)
    DT_record.append(accuracy_score(y_test, DT_predict))

    # svm
    svm = SVC()
    svm.fit(x_train, y_train)
    svm_predict = svm.predict(x_test)
    svm_record.append(accuracy_score(y_test, svm_predict))

    # logistic regression
    lr_clf = LogisticRegression()
    lr_clf.fit(x_train, y_train)
    lr_predict = lr_clf.predict(x_test)
    lr_record.append(accuracy_score(y_test, lr_predict))

    # nb by ciber
    # The sixth positional argument is 1 here (0.1 in the stand-alone demo
    # cell) -- presumably the correlation threshold for clustering features;
    # confirm against the comonotonic module.
    # NOTE(review): discrete_feature_val / cont_col / categorical index the
    # 83 original feature columns (up to index 82, with 79 categorical), but
    # x_train holds PCA components at this point. Confirm these indices are
    # still meaningful for the projected data.
    nb_clf = cm.clustered_comonotonic(x_train, y_train, discrete_feature_val, cont_col,
                                      categorical, 1, None, corrtype='spearman',
                                      discrete_method='mdlp')
    nb_clf.run()
    nb_predict = nb_clf.predict(x_test)
    nb_record.append(accuracy_score(y_test, nb_predict))

    # gaussian nb
    gaussian_nb = GaussianNB()
    gaussian_nb.fit(x_train, y_train)
    gaussian_predict = gaussian_nb.predict(x_test)
    gaussian_nb_record.append(accuracy_score(y_test, gaussian_predict))

    # Progress marker: 1-based index of the fold just finished.
    print(itr)
    # Drop the per-fold splits so they cannot leak into later cells.
    del x_train, x_test, y_train, y_test



1




2




3




4




5


In [8]:
# Gather the per-fold accuracies of every model under its display name.
# Insertion order determines the row order in the spreadsheet.
record = {
    'adaboost': ada_record,
    'random forest': rf_record,
    'xgboost': xgb_record,
    'light GBM': lgb_record,
    'decision tree': DT_record,
    'svm': svm_record,
    'logistic regress': lr_record,
    'ciber nb': nb_record,
    'gaussian nb': gaussian_nb_record,
}

# One row per model: name in column 0, fold accuracies from column 1 onward.
# The workbook is used as a context manager so the file is closed (and
# therefore written) even if one of the writes raises.
with xlsxwriter.Workbook('pca 0.9 5fold cv.xlsx') as workbook:
    worksheet = workbook.add_worksheet()
    for row, (method, accuracies) in enumerate(record.items()):
        worksheet.write(row, 0, method)
        worksheet.write_row(row, 1, accuracies)

In [14]:
# Single 80/20 hold-out split of the PCA-projected, resampled data, with a
# fixed seed; used by the per-model report cells.
x_train, x_test, y_train, y_test = train_test_split(
    x_new,
    y_resample,
    test_size=0.2,
    random_state=14,
)

In [48]:
# Fit the clustered-comonotonic (CIBer) model on the hold-out split and
# report its per-class metrics. The sixth positional argument is 0.1 here --
# presumably the correlation threshold for clustering; confirm against the
# comonotonic module.
# NOTE(review): discrete_feature_val / cont_col / categorical index the
# original feature columns, while x_train holds PCA components here. Confirm
# the indices still apply.
c_como_demo = cm.clustered_comonotonic(
    x_train, y_train,
    discrete_feature_val, cont_col, categorical,
    0.1, None,
    corrtype='spearman',
    discrete_method='mdlp',
)
c_como_demo.run()
c_como_predict = c_como_demo.predict(x_test)
print(classification_report(y_test, c_como_predict))

              precision    recall  f1-score   support

           0       0.90      0.87      0.88       714
           1       0.87      0.90      0.88       681

    accuracy                           0.88      1395
   macro avg       0.88      0.88      0.88      1395
weighted avg       0.88      0.88      0.88      1395



In [50]:
# Print the clusters of feature indices held by the fitted CIBer demo model.
c_como_demo.print_cluster()

[[0], [1, 6], [2], [3], [4, 26], [5], [7, 14], [8, 16], [9], [10], [11], [12], [13], [15], [17], [18], [19], [20], [21], [22], [23], [24], [25], [27, 28, 29]]


In [51]:
# XGBoost baseline (default hyper-parameters) on the hold-out split.
# `xgb` is already imported in the notebook's first cell.
xgb_clf = xgb.XGBClassifier().fit(x_train, y_train)
xgb_predict = xgb_clf.predict(x_test)
print(classification_report(y_test, xgb_predict))

              precision    recall  f1-score   support

           0       1.00      0.97      0.98       714
           1       0.97      1.00      0.98       681

    accuracy                           0.98      1395
   macro avg       0.98      0.98      0.98      1395
weighted avg       0.98      0.98      0.98      1395



In [10]:
# Score XGBoost's predicted class probabilities on the hold-out set with the
# project's cross-entropy helper.
xgb_proba = xgb_clf.predict_proba(x_test)
xgb_cross_entropy = utils.cross_entropy(xgb_proba, y_test)
print(xgb_cross_entropy)

0.09233894895789105


In [52]:
# Random-forest baseline (default hyper-parameters) on the hold-out split.
# RandomForestClassifier is already imported in the notebook's first cell.
rf_clf = RandomForestClassifier().fit(x_train, y_train)
rf_predict = rf_clf.predict(x_test)
print(classification_report(y_test, rf_predict))



              precision    recall  f1-score   support

           0       0.99      0.97      0.98       714
           1       0.97      0.99      0.98       681

    accuracy                           0.98      1395
   macro avg       0.98      0.98      0.98      1395
weighted avg       0.98      0.98      0.98      1395



In [12]:
# Score the random forest's predicted class probabilities on the hold-out
# set with the project's cross-entropy helper.
rf_proba = rf_clf.predict_proba(x_test)
rf_cross_entropy = utils.cross_entropy(rf_proba, y_test)
print(rf_cross_entropy)

0.12500483664301854


In [13]:
# AdaBoost baseline (default hyper-parameters) on the hold-out split.
# AdaBoostClassifier is already imported in the notebook's first cell.
ada_clf = AdaBoostClassifier().fit(x_train, y_train)
ada_predict = ada_clf.predict(x_test)
print(classification_report(y_test, ada_predict))

              precision    recall  f1-score   support

           0       0.96      0.93      0.94       714
           1       0.92      0.96      0.94       681

    accuracy                           0.94      1395
   macro avg       0.94      0.94      0.94      1395
weighted avg       0.94      0.94      0.94      1395



In [14]:
# Score AdaBoost's predicted class probabilities on the hold-out set with
# the project's cross-entropy helper.
ada_proba = ada_clf.predict_proba(x_test)
ada_cross_entropy = utils.cross_entropy(ada_proba, y_test)
print(ada_cross_entropy)

0.8413441349424778


In [21]:
# LightGBM baseline (default hyper-parameters) on the hold-out split.
# `lgb` is already imported in the notebook's first cell.
lgb_clf = lgb.LGBMClassifier()
lgb_clf.fit(x_train, y_train)

# Predict with the classifier fitted above. The cell used to call `gbm_clf`,
# a name that is not defined anywhere in the visible notebook, so it would
# raise NameError on a fresh kernel.
gbm_predict = lgb_clf.predict(x_test)
# Coerce predictions to integer labels before reporting.
gbm_predict = gbm_predict.round(0).astype('int')
print(classification_report(y_test, gbm_predict))

              precision    recall  f1-score   support

           0       0.99      0.95      0.97       714
           1       0.95      0.99      0.97       681

    accuracy                           0.97      1395
   macro avg       0.97      0.97      0.97      1395
weighted avg       0.97      0.97      0.97      1395



In [24]:
# Score LightGBM's predicted class probabilities on the hold-out set with
# the project's cross-entropy helper.
lgb_proba = lgb_clf.predict_proba(x_test)
lgb_cross_entropy = utils.cross_entropy(lgb_proba, y_test)
print(lgb_cross_entropy)

0.0777683570831202


In [54]:
# Support-vector classifier baseline (default hyper-parameters) on the
# hold-out split.
svm = SVC().fit(x_train, y_train)
svm_predict = svm.predict(x_test)
print(classification_report(y_test, svm_predict))



              precision    recall  f1-score   support

           0       0.97      0.87      0.92       714
           1       0.88      0.97      0.92       681

    accuracy                           0.92      1395
   macro avg       0.93      0.92      0.92      1395
weighted avg       0.93      0.92      0.92      1395

