https://www.kaggle.com/shebrahimi/financial-distress

In this notebook, we change the minimum correlation used in the cluster construction process and collect the test accuracy (out-of-sample). For each setting of minimum correlation, we repeat the experiment for 5 times.

In [1]:
import numpy as np
import pandas as pd
import sys
sys.path.insert(1, '/home/r7user5/Desktop/STAT')
import comonotonic as cm
import os
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.metrics import accuracy_score
import copy
import utils
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import classification_report, roc_auc_score
from imblearn.over_sampling import SMOTE
import ensemble_ciber as ec
import conditional_ciber as cc
from imblearn.over_sampling import SMOTENC
import xlsxwriter
from sklearn.metrics import confusion_matrix
from multiprocessing import Pool
import statistics
import math

In [2]:
df = pd.read_csv("Financial Distress.csv")
df['Financial Distress'].values[df['Financial Distress'].values > -0.5] = 0
df['Financial Distress'].values[df['Financial Distress'].values <= -0.5] = 1
df['x80'] = df['x80']-1
df['Financial Distress'] = df['Financial Distress'].astype(int)
distress = df['Financial Distress']
df = df.drop(columns=['Company','Time','Financial Distress'])
df['Financial Distress'] = distress
colnames = [('X'+str(i)) for i in range(df.shape[1]-1)]
colnames.append('Y')
df.columns = colnames

In [3]:
def change_min_corr(df, min_corr, n_repeat):
    categorical = [79]
    cont_col = [i for i in range(79)]+[i for i in range(80,83)]
    discrete_feature_val = {79:37}
    acc_result = list()
    auc_result = list()
    for i in range(n_repeat):
        df_train, df_test = train_test_split(df, test_size=0.3, stratify=df[['Y']])
        scaler = preprocessing.MinMaxScaler()
        scale_col_name = ["X"+str(i) for i in cont_col]
        df_train[scale_col_name] = scaler.fit_transform(df_train[scale_col_name])
        reduced_df_train = utils.outlier_removal(df_train, cont_col)
        df_test[scale_col_name] = scaler.transform(df_test[scale_col_name])
        x_train = reduced_df_train.iloc[:,:-1].to_numpy()
        y_train = reduced_df_train.iloc[:,-1].to_numpy()
        smote_nc = SMOTENC(categorical_features=[79], random_state=0)
        x_train_synthetic, y_train_synthetic = smote_nc.fit_resample(x_train, y_train)
        x_test = df_test.iloc[:,:-1].to_numpy()
        y_test = df_test.iloc[:,-1].to_numpy()
        ciber = cm.clustered_comonotonic(x_train_synthetic,y_train_synthetic,
                                         discrete_feature_val,cont_col,categorical, 
                                         min_corr, None, corrtype='spearman',
                                         discrete_method='mdlp')
        ciber.run()
        ciber_predict = ciber.predict(x_test)
        ciber_proba = ciber.predict_proba(x_test)[:,1]
        acc_result.append(accuracy_score(y_test, ciber_predict))
        auc_result.append(roc_auc_score(y_test, ciber_proba))
        del df_train, df_test, scaler, reduced_df_train, x_train, x_train_synthetic
        del y_train, y_train_synthetic, x_test, y_test, ciber, ciber_predict, ciber_proba
    print("Completed min_corr = "+str(min_corr))
    return min_corr, acc_result, auc_result

In [4]:
def experiment(df, corr_range, n_repeat):
    acc_record = dict()
    auc_record = dict()
    param_list = [(df, min_corr, n_repeat) for min_corr in corr_range]
    pool = Pool()
    results = pool.starmap(change_min_corr, param_list)
    for result in results:
        min_corr = result[0]
        acc_result = result[1]
        auc_result = result[2]
        acc_record[min_corr] = acc_result
        auc_record[min_corr] = auc_result
        del min_corr, acc_result, auc_result
    return acc_record, auc_record

In [5]:
def compute_CI(ll):
    mean = statistics.mean(ll)
    std = statistics.stdev(ll)
    t_value = 2.262
    lower = mean - t_value * std / math.sqrt(9)
    upper = mean + t_value * std / math.sqrt(9)
    return lower, upper

In [6]:
corr_range = [0.98,1]
acc_record, auc_record = experiment(df, corr_range, 10)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value[:, i].tolist())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()
A value is trying to be set on a copy of a slice from a Da

  File "/home/r7user5/anaconda3/lib/python3.7/multiprocessing/pool.py", line 110, in worker
    task = get()
  File "/home/r7user5/anaconda3/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
  File "/home/r7user5/anaconda3/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
  File "/home/r7user5/anaconda3/lib/python3.7/multiprocessing/queues.py", line 351, in get
    with self._rlock:
  File "/home/r7user5/anaconda3/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
  File "/home/r7user5/anaconda3/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
  File "/home/r7user5/anaconda3/lib/python3.7/multiprocessing/process.py", line 99, in run
    self._target(*self._args, **self._kwargs)
  File "/home/r7user5/anaconda3/lib/python3.7/multiprocessing/pool.py", line 110, in worker
    task = get()
  File "/home/r7user5/anaconda3/lib/python3.7/multiprocessing/pool.py", line

  File "/home/r7user5/anaconda3/lib/python3.7/multiprocessing/synchronize.py", line 95, in __enter__
    return self._semlock.__enter__()
KeyboardInterrupt
  File "/home/r7user5/anaconda3/lib/python3.7/multiprocessing/synchronize.py", line 95, in __enter__
    return self._semlock.__enter__()
  File "/home/r7user5/anaconda3/lib/python3.7/multiprocessing/synchronize.py", line 95, in __enter__
    return self._semlock.__enter__()
KeyboardInterrupt
  File "/home/r7user5/anaconda3/lib/python3.7/multiprocessing/synchronize.py", line 95, in __enter__
    return self._semlock.__enter__()
KeyboardInterrupt
KeyboardInterrupt
KeyboardInterrupt
  File "/home/r7user5/anaconda3/lib/python3.7/multiprocessing/synchronize.py", line 95, in __enter__
    return self._semlock.__enter__()
KeyboardInterrupt
KeyboardInterrupt
KeyboardInterrupt


KeyboardInterrupt: 

In [None]:
workbook1 = xlsxwriter.Workbook("supp acc.xlsx")
workbook2 = xlsxwriter.Workbook("supp auc.xlsx")
worksheet1 = workbook1.add_worksheet()
worksheet2 = workbook2.add_worksheet()
title1 = ['Min_corr','Acc1','Acc2','Acc3','Acc4','Acc5','Acc6','Acc7','Acc8','Acc9','Acc10','T-Lower','T-Upper']
title2 = ['Min_corr','auc1','auc2','auc3','auc4','auc5','auc6','auc7','auc8','auc9','auc10','T-Lower','T-Upper']
row = 0
for col in range(len(title1)):
    worksheet1.write(row, col, title1[col])
    worksheet2.write(row, col, title2[col])
row += 1    
for min_corr in acc_record.keys():
    acc = acc_record[min_corr]
    low1, up1 = compute_CI(acc)
    auc = auc_record[min_corr]
    low2, up2 = compute_CI(auc)
    for col in range(11):
        if col == 0:
            worksheet1.write(row, col, min_corr)
            worksheet2.write(row, col, min_corr)
        elif col in [1,2,3,4,5,6,7,8,9,10]:
            worksheet1.write(row, col, acc_record[min_corr][col-1])
            worksheet2.write(row, col, auc_record[min_corr][col-1])
    worksheet1.write(row, 11, low1)
    worksheet1.write(row, 12, up1)
    worksheet2.write(row, 11, low2)
    worksheet2.write(row, 12, up2)
    row += 1
workbook1.close()
workbook2.close()