# Taiwan Company Bankruptcy Prediction

## 1. Introduction

The data were collected from the Taiwan Economic Journal for the years 1999–2009. Company bankruptcy is defined according to the business regulations of the Taiwan Stock Exchange.

### Source

Deron Liang and Chih-Fong Tsai, deronliang '@' gmail.com; cftsai '@' mgt.ncu.edu.tw, National Central University, Taiwan
The data was obtained from UCI Machine Learning Repository: https://archive.ics.uci.edu/ml/datasets/Taiwanese+Bankruptcy+Prediction

### Relevant Papers

Liang, D., Lu, C.-C., Tsai, C.-F., and Shih, G.-A. (2016) Financial Ratios and Corporate Governance Indicators in Bankruptcy Prediction: A Comprehensive Study. European Journal of Operational Research, vol. 252, no. 2, pp. 561-572.
https://www.sciencedirect.com/science/article/pii/S0377221716000412

## 2. Technical Setup

### 2.1 Matplotlib

This section loads the `matplotlib` theme and configuration provided for the course at http://web.ist.utl.pt/~claudia.antunes/DSLabs/config.py

In [None]:
import config
import pandas as pd
from numpy import log
from pandas.plotting import register_matplotlib_converters
from matplotlib.pyplot import figure, savefig, show, show, subplots, Axes
from ds_charts import bar_chart, get_variable_types, choose_grid, HEIGHT, multiple_line_chart
from pandas import DataFrame, Series    
from seaborn import distplot
from scipy.stats import norm, expon, lognorm

# Register pandas' formatters and converters with matplotlib.
register_matplotlib_converters()

### 2.2 Loading data with `Pandas`

In [None]:
# Tokens in the CSV that must be read as missing values.
missing_values = ["NA", "n/a", "na", "?", "--"]

# Load the dataset; the last expression shows the summary statistics in the notebook.
data = pd.read_csv(
    './data/taiwan.csv',
    sep=',',
    decimal='.',
    parse_dates=True,
    infer_datetime_format=True,
    na_values=missing_values,
)
data.describe()

## 3. Data Profiling

TODO: We must gather some insights about the dataset we have

### 3.1 Data dimensionality

Simple analysis of the data dimensionality and of the types of variables in this dataset.

In [None]:
# Bar chart comparing the number of rows with the number of columns.
nr_records, nr_variables = data.shape
values = {'# records': nr_records, '# variables': nr_variables}

figure(figsize=(4, 2))
bar_chart(
    list(values),
    list(values.values()),
    title='# of records vs # variables',
)
savefig('./images/records_variables.png')
show()

#### 3.1.1 Variable types

Inferring the nature of the variables in our dataset.

In [None]:
# Display the dtype pandas inferred for each column.
data.dtypes

In [None]:
def get_variable_types(df: DataFrame) -> dict:
    """Classify the columns of ``df`` by the kind of variable they hold.

    A column is classified, in this order of precedence, as:

    * ``'Binary'``   - exactly two distinct non-null values, whatever the dtype;
    * ``'Date'``     - any datetime64 dtype (any unit, with or without timezone);
    * ``'Numeric'``  - any integer or floating dtype of any width;
    * ``'Symbolic'`` - everything else.

    The frame is only inspected, never modified.

    Args:
        df: the frame whose columns are classified.

    Returns:
        A dict with the keys ``'Numeric'``, ``'Binary'``, ``'Date'`` and
        ``'Symbolic'``, each mapping to the list of column names of that kind,
        in the order the columns appear in ``df``.
    """
    # Local import keeps the function self-contained inside the notebook.
    from pandas.api.types import (
        is_bool_dtype,
        is_datetime64_any_dtype,
        is_numeric_dtype,
    )

    variable_types: dict = {
        'Numeric': [],
        'Binary': [],
        'Date': [],
        'Symbolic': []
    }
    for c in df.columns:
        column = df[c]
        uniques = column.dropna().unique()
        if len(uniques) == 2:
            variable_types['Binary'].append(c)
        elif is_datetime64_any_dtype(column):
            # Comparing the dtype with the string 'datetime64' misses the
            # unit-qualified dtypes pandas actually uses (e.g. datetime64[ns]).
            variable_types['Date'].append(c)
        elif is_numeric_dtype(column) and not is_bool_dtype(column):
            # Covers every int/float width, not only the platform default
            # that the strings 'int' and 'float' stand for. Booleans are
            # excluded so they stay Symbolic, as they were before.
            variable_types['Numeric'].append(c)
        else:
            variable_types['Symbolic'].append(c)

    return variable_types

# Classify the columns and plot how many fall into each variable type.
variable_types = get_variable_types(data)
print(variable_types)

counts = {tp: len(names) for tp, names in variable_types.items()}

figure(figsize=(4, 2))
bar_chart(
    list(counts),
    list(counts.values()),
    title='Nr of variables per type',
)
savefig('./images/variable_types.png')
show()
    

#### 3.1.2 Missing values

Simple analysis of the missing values.

In [None]:
# Report every column that contains nulls, keeping the counts in `mv`.
mv = {}
for var in data.columns:
    nr = data[var].isnull().sum()
    if nr <= 0:
        continue
    print(f'[!] Found {nr} missing values in {var}')
    mv[var] = nr

if not mv:
    print("No missing values found in dataset.")

### 3.2 Data distribution

Simple analysis of the data distribution.


In [None]:
# One boxplot per numeric variable, laid out on a grid of subplots.
numeric_vars = get_variable_types(data)['Numeric']
if not numeric_vars:
    raise ValueError('There are no numeric variables.')

rows, cols = choose_grid(len(numeric_vars))
fig, axs = subplots(rows, cols, figsize=(cols*HEIGHT, rows*HEIGHT), squeeze=False)
for n, var_name in enumerate(numeric_vars):
    # Fill the grid row by row: variable n goes to (n // cols, n % cols).
    ax = axs[divmod(n, cols)]
    ax.set_title('Boxplot for %s' % var_name)
    ax.boxplot(data[var_name].dropna().values)
savefig('images/single_boxplots.png')
show()

Outliers using both the IQR and the standard-deviation criteria:


In [None]:
from matplotlib.pyplot import figure, savefig, show
from ds_charts import get_variable_types, multiple_bar_chart, HEIGHT

# Number of standard deviations from the mean beyond which a value is an outlier.
NR_STDEV: int = 2

numeric_vars = get_variable_types(data)['Numeric']
if [] == numeric_vars:
    raise ValueError('There are no numeric variables.')

outliers_iqr = []
outliers_stdev = []
summary5 = data.describe(include='number')

for var in numeric_vars:
    column = data[var]
    var_summary = summary5[var]

    # IQR criterion: values more than 1.5 * IQR outside the quartiles.
    # The boolean masks are summed directly on the column instead of filtering
    # the whole frame and counting every column for each variable.
    iqr = 1.5 * (var_summary['75%'] - var_summary['25%'])
    outliers_iqr.append(
        (column > var_summary['75%'] + iqr).sum() +
        (column < var_summary['25%'] - iqr).sum())

    # Standard-deviation criterion: values more than NR_STDEV * std from the mean.
    std = NR_STDEV * var_summary['std']
    outliers_stdev.append(
        (column > var_summary['mean'] + std).sum() +
        (column < var_summary['mean'] - std).sum())

outliers = {'iqr': outliers_iqr, 'stdev': outliers_stdev}

# One subplot per variable with the two outlier counts side by side.
rows, cols = choose_grid(len(numeric_vars))
fig, axs = subplots(rows, cols, figsize=(cols*HEIGHT, rows*HEIGHT), squeeze=False)
width = 0.3
for n, var in enumerate(numeric_vars):
    ax = axs[divmod(n, cols)]  # grid is filled row by row
    ax.set_title('Outliers for %s' % var)
    ax.bar(1 + width/2, outliers['iqr'][n], width, label='iqr')
    ax.bar(1 - width/2, outliers['stdev'][n], width, label='stdev')
    ax.legend()
savefig('images/single_outliers.png')
show()



Finding a possible matching distribution:

In [None]:
# Histogram with a density trend line for every numeric variable.
numeric_vars = get_variable_types(data)['Numeric']
if [] == numeric_vars:
    raise ValueError('There are no numeric variables.')

# Compute the grid for this cell's own variables instead of relying on the
# `rows`/`cols` left behind by whichever cell happened to run before.
rows, cols = choose_grid(len(numeric_vars))
fig, axs = subplots(rows, cols, figsize=(cols*HEIGHT, rows*HEIGHT), squeeze=False)
for n, var in enumerate(numeric_vars):
    ax = axs[divmod(n, cols)]  # grid is filled row by row
    ax.set_title('Histogram with trend for %s' % var)
    # NOTE(review): seaborn has deprecated `distplot` (since 0.11) in favour of
    # `histplot`/`displot` - confirm against the installed version before upgrading.
    distplot(data[var].dropna().values, norm_hist=True, ax=ax, axlabel=var)
savefig('images/histograms_trend_numeric.png')
show()

Let us also try to match well-known distributions:

In [None]:
def compute_known_distributions(x_values: list) -> dict:
    """Fit a normal, an exponential and a log-normal distribution to ``x_values``.

    Args:
        x_values: the sample the distributions are fitted to and evaluated on.

    Returns:
        A dict with one entry per distribution, in the order normal,
        exponential, log-normal. Each key is a label built from the fitted
        parameters and each value is that distribution's density evaluated
        at ``x_values``.
    """
    # Fit all three families first; the fits are independent of each other.
    gauss_mean, gauss_sigma = norm.fit(x_values)
    exp_loc, exp_scale = expon.fit(x_values)
    ln_sigma, ln_loc, ln_scale = lognorm.fit(x_values)

    gauss_label = 'Normal(%.1f,%.2f)' % (gauss_mean, gauss_sigma)
    exp_label = 'Exp(%.2f)' % (1/exp_scale)
    ln_label = 'LogNor(%.1f,%.2f)' % (log(ln_scale), ln_sigma)

    return {
        gauss_label: norm.pdf(x_values, gauss_mean, gauss_sigma),
        exp_label: expon.pdf(x_values, exp_loc, exp_scale),
        ln_label: lognorm.pdf(x_values, ln_sigma, ln_loc, ln_scale),
    }

def histogram_with_distributions(ax: Axes, series: Series, var: str):
    """Draw a density histogram of ``series`` on ``ax`` with the fitted distributions on top.

    Args:
        ax: the axes to draw on.
        series: the values of the variable; they are sorted before plotting.
        var: the variable name, used in the title and as the x-axis label.
    """
    ordered = series.sort_values().values
    ax.hist(ordered, 20, density=True)
    fitted = compute_known_distributions(ordered)
    multiple_line_chart(
        ordered,
        fitted,
        ax=ax,
        title='Best fit for %s' % var,
        xlabel=var,
        ylabel='',
    )

# Stop here when the dataset has no numeric variables to fit.
numeric_vars = get_variable_types(data)['Numeric']
if not numeric_vars:
    raise ValueError('There are no numeric variables.')


#fig, axs = subplots(rows, cols, figsize=(cols*HEIGHT, rows*HEIGHT), squeeze=False)
#i, j = 0, 0
#for n in range(len(numeric_vars)):
#    histogram_with_distributions(axs[i, j], data[numeric_vars[n]].dropna(), numeric_vars[n])
#    i, j = (i + 1, 0) if (n+1) % cols == 0 else (i, j + 1)
#savefig('images/histogram_numeric_distribution.png')
#show()

### 3.3 Symbolic Values

In [13]:
# Bar chart of the value counts of every symbolic variable, if there are any.
symbolic_vars = get_variable_types(data)['Symbolic']
if symbolic_vars:
    rows, cols = choose_grid(len(symbolic_vars))
    fig, axs = subplots(rows, cols, figsize=(cols*HEIGHT, rows*HEIGHT), squeeze=False)
    for n, var_name in enumerate(symbolic_vars):
        counts = data[var_name].value_counts()
        bar_chart(
            counts.index.to_list(),
            counts.values,
            ax=axs[divmod(n, cols)],  # grid is filled row by row
            title='Histogram for %s' % var_name,
            xlabel=var_name,
            ylabel='nr records',
            percentage=False,
        )
    savefig('images/histograms_symbolic.png')
    show()
else:
    print('There are no symbolic variables.')

There are no symbolic variables.


ValueError: Number of columns must be a positive integer, not 0

<Figure size 0x600 with 0 Axes>