# Libraries and versions

In [19]:
#Basic libraries
# IPython shell capture: the command's output lines come back as a list of strings;
# the version cell below reads element 0 and strips its first 7 characters.
python_version = !python -V #version 3.7.9
import pandas as pd #version 1.2.4
import numpy as np #version 1.20.1
import seaborn as sns #version 0.11.0
import matplotlib as plt #version 3.3.3
from matplotlib import pyplot #plotting API (subplots, ...); `plt` above is the top-level package

#for analyse multiple features
from sklearn import __version__ as skn #version 0.24.1
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.manifold import TSNE

#for model classifier
from catboost import __version__ as ct #0.25.1
from catboost import CatBoostClassifier, Pool

import plotly.express as px

In [21]:
# Report the interpreter and library versions this notebook ran with, one per line.
print(
    f'Python version - {python_version[0][7:]}',
    f'Pandas version - {pd.__version__}',
    f'Numpy version - {np.__version__}',
    f'Seaborn version - {sns.__version__}',
    f'Sklearn version - {skn}',
    f'CatBoost version - {ct}',
    f'Matplotlib version - {plt.__version__}',
    sep='\n',
)

Python version - 3.7.9
Pandas version - 1.2.4
Numpy version - 1.20.1
Seaborn version - 0.11.0
Sklearn version - 0.24.1
CatBoost version - 0.25.1
Matplotlib version - 3.3.3


# Configurations

## Seaborn

In [4]:
# Session-wide seaborn defaults: 'Set1' colour palette and the 'darkgrid' style.
sns.set_palette('Set1')
sns.set_style('darkgrid')

def configuration(graphic_object, title=None, xlabel=None, ylabel=None, colors='black'):
    """Apply the notebook's standard size, title and axis labels to a plot.

    Parameters
    ----------
    graphic_object : object
        Anything exposing ``figure.set_size_inches``, ``set_title``,
        ``set_xlabel`` and ``set_ylabel`` (e.g. the object returned by a
        seaborn plotting call). It is modified in place.
    title, xlabel, ylabel : str or None
        Texts passed straight to the corresponding setter.
    colors : str
        Colour used for the title and both axis labels.

    Returns
    -------
    None
    """
    # Fixed 15x8 inch canvas for every figure in the notebook.
    graphic_object.figure.set_size_inches(15, 8)
    graphic_object.set_title(title, color=colors, fontsize=16)
    graphic_object.set_xlabel(xlabel, color=colors, fontsize=14)
    graphic_object.set_ylabel(ylabel, color=colors, fontsize=14)

## Warning

In [5]:
import warnings
# NOTE(review): the "ignore" action with no category/module filter silences every
# warning from here on, including pandas copy warnings and library deprecation
# notices raised by later cells - consider narrowing it.
warnings.filterwarnings("ignore")

# Dataset

In [6]:
# Read the feature table from a CSV path relative to the working directory,
# then preview its first rows.
dataset = pd.read_csv('18_feature_selected.csv')
dataset.head()

Unnamed: 0,Net Income to Total Assets,Net worth/Assets,Persistent EPS in the Last Four Seasons,Retained Earnings to Total Assets,Net profit before tax/Paid-in capital,Per Share Net profit before tax (Yuan ¥),Working Capital to Total Assets,Net Income to Stockholder's Equity,Net Value Per Share (A),CFO to Assets,Cash/Total Assets,Gross Profit to Sales,Equity to Long-term Liability,Current Liability to Equity,Current Liability to Current Assets,Borrowing dependency,Current Liability to Assets,Debt ratio %,Bankrupt
0,0.716845,0.792424,0.169141,0.903225,0.137757,0.138736,0.672775,0.82789,0.14795,0.520382,0.004094,0.601453,0.126549,0.339077,0.11825,0.390284,0.147308,0.207576,1
1,0.795297,0.828824,0.208944,0.931065,0.168962,0.169918,0.751111,0.839969,0.182251,0.567101,0.014948,0.610237,0.120916,0.32974,0.047775,0.37676,0.056963,0.171176,1
2,0.77467,0.792484,0.180581,0.909903,0.148036,0.142803,0.829502,0.836774,0.177911,0.538491,0.000991,0.601449,0.117922,0.334777,0.025346,0.379093,0.098162,0.207516,1
3,0.739555,0.848535,0.193722,0.906902,0.147561,0.148603,0.725754,0.834697,0.154187,0.604105,0.018851,0.583538,0.12076,0.331509,0.06725,0.379743,0.098715,0.151465,1
4,0.795016,0.893491,0.212537,0.91385,0.167461,0.168412,0.751822,0.839973,0.167502,0.578469,0.014161,0.598782,0.110933,0.330726,0.047725,0.375025,0.110195,0.106509,1


## Select features

In [7]:
# (number of rows, number of columns) of the loaded dataset
dataset.shape

(6819, 19)

### X and y

In [8]:
# 'Bankrupt' is the target; every remaining column is used as a feature.
y = dataset['Bankrupt']
X = dataset.drop(columns=['Bankrupt'])

print(f'Number of features for X = {X.shape[1]}')

Number of features for X = 18


## CatBoostClassifier

In [11]:
# 80/20 hold-out split, stratified on the target, with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.8, stratify=y, random_state=666
)

# fit a default CatBoost classifier and predict the hold-out rows
cbc = CatBoostClassifier()
cbc.fit(X_train, y_train, silent=True)
y_predict = cbc.predict(X_test)

# overall accuracy as a percentage
total_accuracy = 100 * accuracy_score(y_test, y_predict)

# row-by-row comparison table; 'OK' marks predictions that match the true label
results = pd.DataFrame({'Test': y_test, 'Predict': y_predict})
results['OK'] = results['Test'] == results['Predict']

# correctly predicted bankruptcies and their share of all actual bankruptcies
bankrupt_accuracy = len(results.query('Predict==1 & OK==True'))
bankrupt_total = results['Test'].sum()
bankrupt_score = bankrupt_accuracy / bankrupt_total * 100

# report
print(f'Total Accuracy - {total_accuracy:.2f}\nBanrupt correct predicted - {bankrupt_accuracy}')
print(f'Bankrupt Score - {bankrupt_score:.2f}')

Total Accuracy - 97.07
Banrupt correct predicted - 10
Bankrupt Score - 22.73


## Analyse error

#### Create a new dataframe with test, predicted and check columns

In [None]:
# Build the analysis frame as a NEW DataFrame instead of adding columns to X_test:
# X_test is the feature matrix handed to the model and should keep exactly the
# columns it was predicted with (the original wrote the labels into it in place).
df_analysis = X_test.assign(
    y_test=y_test,                 # true label
    y_predict=y_predict,           # model prediction
    check=y_test == y_predict,     # True where the prediction is correct
)

df_analysis.shape

#### Create a t-SNE object for the 18 variables

In [None]:
# t-SNE is stochastic: pin the seed (same value as the train/test split) so the
# embedding, and every plot built from it, is reproducible on a fresh run.
tsne = TSNE(n_components=2, random_state=666)
tsne_features = df_analysis.drop(columns=['y_test', 'y_predict', 'check'])#exclude target variables
# X_tsne is only ever the 2-column embedding array (one row per test sample).
X_tsne = tsne.fit_transform(tsne_features)

In [None]:
# 2-D embedding coloured by the model's predicted class
graphic1 = sns.scatterplot(
    x=X_tsne[:, 0],
    y=X_tsne[:, 1],
    hue=df_analysis['y_predict'],
)
configuration(graphic1, title='TSNE with predicted values')

In [None]:
# 2-D embedding coloured by the true class
graphic2 = sns.scatterplot(
    x=X_tsne[:, 0],
    y=X_tsne[:, 1],
    hue=df_analysis['y_test'],
)
configuration(graphic2, title='TSNE with test values')

In [None]:
# 2-D embedding coloured by whether the prediction matched the true class
graphic3 = sns.scatterplot(
    x=X_tsne[:, 0],
    y=X_tsne[:, 1],
    hue=df_analysis['check'],
)
configuration(graphic3, title='TSNE with incorrect classification')

### 3D visualization

In [None]:
# embedding on the x/y axes, true label on z, predicted label as the colour
fig = px.scatter_3d(
    x=X_tsne[:, 0],
    y=X_tsne[:, 1],
    z=df_analysis['y_test'],
    color=df_analysis['y_predict'],
)
fig.show()

## Maximize differences between 1 and 0

#### First attempt

- I will create a new variable holding the quantile position (cut points at 0.25, 0.50 and 0.75) of the best variable chosen by CatBoost

In [None]:
# Compute the three quartile thresholds once; the original recomputed them for
# every row, which made this cell quadratic in the number of rows.
eps = dataset['Persistent EPS in the Last Four Seasons']
q25, q50, q75 = eps.quantile([0.25, 0.5, 0.75])

# Code 1..4: start at 1 and add one for each threshold the value reaches (>=),
# so 4 = at/above the 75th percentile, 1 = below the 25th. A missing value
# fails every comparison and therefore gets code 1, as in the original loop.
dataset['Q - Persistent EPS in the Last Four Seasons'] = (
    1
    + (eps >= q25).astype('int64')
    + (eps >= q50).astype('int64')
    + (eps >= q75).astype('int64')
)

In [None]:
# share of each quantile code among rows with Bankrupt == 0
(
    dataset
    .query('Bankrupt==0')
    ['Q - Persistent EPS in the Last Four Seasons']
    .value_counts(normalize=True)
)

In [None]:
# share of each quantile code among rows with Bankrupt == 1
(
    dataset
    .query('Bankrupt==1')
    ['Q - Persistent EPS in the Last Four Seasons']
    .value_counts(normalize=True)
)

- 85 percent of the bankrupt companies have this variable in the first quantile
- for the "good" companies, the variable is spread much more evenly across the quantiles

### Test model with this new variable

#### X and y

In [None]:
# Rebuild target and feature matrix; X now also carries the new quantile column.
y = dataset['Bankrupt']
X = dataset.drop(columns=['Bankrupt'])

#### Test one

In [None]:
# same 80/20 stratified, seeded split as the baseline run
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.8, stratify=y, random_state=666
)

# default CatBoost classifier, fitted on the training part
cbc1 = CatBoostClassifier()
cbc1.fit(X_train, y_train, silent=True)
y_predict = cbc1.predict(X_test)

# overall accuracy as a percentage
total_accuracy = 100 * accuracy_score(y_test, y_predict)

# comparison table: true label, prediction, and whether they agree
results = (
    pd.DataFrame({'Test': y_test, 'Predict': y_predict})
    .assign(OK=lambda frame: frame['Test'] == frame['Predict'])
)

# correctly predicted bankruptcies and their share of all actual bankruptcies
bankrupt_accuracy = len(results.query('Predict==1 & OK==True'))
bankrupt_total = results['Test'].sum()
bankrupt_score = bankrupt_accuracy / bankrupt_total * 100

# report
print(f'Total Accuracy - {total_accuracy:.2f}\nBanrupt correct predicted - {bankrupt_accuracy}')
print(f'Bankrupt Score - {bankrupt_score:.2f}')

- after this transformation the model's result was worse than the previous one, but I think that if I transform all the variables into a new dataset the results should be better

### Transform all variables

#### Function

In [None]:
def quantile_column(dataset, column, name_column_output, quantiles=(0.25, 0.50, 0.75)):
    """Add an integer quantile-bin column derived from ``dataset[column]``.

    Each value receives ``1 + (number of quantile thresholds it reaches)``,
    using ``>=`` for the comparison. With the default quartiles the codes are
    1 (below the 25th percentile) up to 4 (at or above the 75th percentile).
    Missing values fail every comparison and therefore get code 1.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame to read from; it is modified in place (the new column is added
        or overwritten).
    column : str
        Name of the numeric source column.
    name_column_output : str
        Name of the column that receives the codes.
    quantiles : iterable of float, optional
        Quantile levels in [0, 1] used as thresholds. Defaults to the three
        quartile cut points.

    Returns
    -------
    pandas.DataFrame
        The same ``dataset`` object, for convenience.
    """
    values = dataset[column]
    # Thresholds are computed once for the whole column.
    thresholds = values.quantile(list(quantiles))

    # Vectorised replacement for the per-row if/elif chain: counting how many
    # thresholds a value reaches gives the same code and does not depend on the
    # order in which the quantile levels were supplied.
    codes = pd.Series(1, index=values.index, dtype='int64')
    for threshold in thresholds:
        codes += (values >= threshold).astype('int64')

    dataset[name_column_output] = codes
    return dataset

#### Create the new variables

In [None]:
#list with variables to convert: every raw feature, i.e. all columns except the
#target and any quantile column ('Q - ...') that already exists. Selecting by name
#replaces the positional slice [:18], which silently depended on column order/count.
columns = dataset.columns[
    (dataset.columns != 'Bankrupt') & ~dataset.columns.str.startswith('Q - ')
]
for column in columns:
    quantile_column(dataset, column=column, name_column_output=f'Q - {column}')

In [None]:
#check results: list the column labels now present in `dataset`
dataset.columns

In [None]:
#create a new DataFrame only with quantile variables
# (drops the raw feature columns listed in `columns`; every other column of `dataset` is kept)
q_dataset = dataset.drop(columns=columns)   #columns is an object above for iteration
q_dataset

In [None]:
#drop columns in the old dataset
# Everything in q_dataset after its first column is removed from `dataset`
# (in run-all order that first column is the 'Bankrupt' target - verify if cells are reordered).
columns_to_drop = q_dataset.columns[1:]
# Rebind instead of inplace=True and tolerate already-removed columns, so that
# re-running this cell is a no-op rather than a KeyError.
dataset = dataset.drop(columns=columns_to_drop, errors='ignore')

### Test model with quantiles

#### function for model

In [None]:
def model_function(dataset, target='Bankrupt', train_size=0.8, random_state=666):
    """Train a default CatBoostClassifier on ``dataset`` and print hold-out scores.

    The frame is split into features/target, then into a stratified train/test
    partition. The fitted model is evaluated on the test part and two figures
    are printed: overall accuracy (percent) and the share of actual positive
    rows (target == 1) that were predicted as positive.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must contain the ``target`` column; all other columns are used as features.
    target : str, optional
        Name of the binary target column (positives encoded as 1).
    train_size : float, optional
        Fraction of rows used for training.
    random_state : int, optional
        Seed for the train/test split.

    Returns
    -------
    CatBoostClassifier
        The fitted model.
    """
    #split dataset in X and Y
    X = dataset.drop(columns=[target])
    y = dataset[target]

    #split data in train and test, keeping the class ratio in both parts
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, stratify=y, train_size=train_size, random_state=random_state
    )

    #instance and train model
    cbc = CatBoostClassifier()
    cbc.fit(X_train, y_train, silent=True)
    y_predict = cbc.predict(X_test)

    #check accuracy
    total_accuracy = accuracy_score(y_test, y_predict) * 100

    #create a DataFrame to count correct value predict for 1 in y_predict
    results = pd.DataFrame({'Test': y_test, 'Predict': y_predict})
    results['OK'] = results['Test'] == results['Predict']
    bankrupt_accuracy = results.query('Predict==1 & OK==True').shape[0]
    bankrupt_total = results['Test'].sum()
    # With no positive rows in the test part the ratio is undefined: report NaN
    # explicitly instead of relying on a 0/0 division.
    if bankrupt_total:
        bankrupt_score = bankrupt_accuracy / bankrupt_total * 100
    else:
        bankrupt_score = float('nan')

    #see results
    print(f'Total Accuracy - {total_accuracy:.2f}\nBanrupt correct predicted - {bankrupt_accuracy}')
    print(f'Bankrupt Score - {bankrupt_score:.2f}')

    return cbc

In [None]:
# Train and score on the quantile-only frame; the returned model is this cell's displayed value.
model_function(q_dataset)

In [None]:
#see the distribution for each features compare Bankrupt 1 and 0
# Filter each class once instead of re-querying inside the loop.
bankrupt_rows = q_dataset.query('Bankrupt==1')
non_bankrupt_rows = q_dataset.query('Bankrupt==0')

# `plt` is bound to the top-level matplotlib package in this notebook, which has
# no subplots(); the figure must be created through the pyplot module.
# squeeze=False keeps `axes` two-dimensional even when there is a single row.
fig, axes = pyplot.subplots(len(columns_to_drop), 2, figsize=(18,80), squeeze=False)

# One row per quantile variable: left = Bankrupt 1, right = Bankrupt 0.
for plot_line, column in enumerate(columns_to_drop):
    sns.distplot(bankrupt_rows[column], bins=4, ax=axes[plot_line, 0])
    axes[plot_line, 0].set_title('Bankrupt = 1')
    sns.distplot(non_bankrupt_rows[column], bins=4, ax=axes[plot_line, 1])
    axes[plot_line, 1].set_title('Bankrupt = 0')

- the distribution of these quantile variables differs between the two classes, but the model did not pick that up
- I think the model is overfitted on the Y variable equal to 1, because there are a lot of those results in the training data
- let's check for overfitting