# Libraries and versions

In [1]:
#Basic libraries
python_version = !python -V #version 3.7.9
import pandas as pd #version 1.2.4
import numpy as np #version 1.20.1
import seaborn as sns #version 0.11.0
import matplotlib as plt #version 3.3.3

#for analyse multiple features
from sklearn import __version__ as skn #version 0.24.1
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.manifold import TSNE

#for model classifier
from catboost import __version__ as ct #0.25.1
from catboost import CatBoostClassifier, Pool

In [2]:
# Report the interpreter and library versions used to run this notebook.
# python_version holds the captured lines of `python -V`; the [7:] slice drops
# the leading "Python " so only the version number is printed.
print(
    f'Python version - {python_version[0][7:]}',
    f'Pandas version - {pd.__version__}',
    f'Numpy version - {np.__version__}',
    f'Seaborn version - {sns.__version__}',
    f'Sklearn version - {skn}',
    f'CatBoost version - {ct}',
    f'Matplotlib version - {plt.__version__}',
    sep='\n',
)

Python version - 3.7.9
Pandas version - 1.2.4
Numpy version - 1.20.1
Seaborn version - 0.11.0
Sklearn version - 0.24.1
CatBoost version - 0.25.1
Matplotlib version - 3.3.3


# Configurations

## Seaborn

In [3]:
# Global seaborn look for every plot in this notebook:
# 'Set1' as the default colour palette and the 'darkgrid' axes style.
sns.set_palette('Set1')
sns.set_style('darkgrid')

def configuration(graphic_object, title=None, xlabel=None, ylabel=None, colors='black', figsize=(15, 8)):
    """Apply a common size, title and axis labels to a plot.

    Parameters
    ----------
    graphic_object
        Object exposing ``figure.set_size_inches``, ``set_title``,
        ``set_xlabel`` and ``set_ylabel`` (presumably a matplotlib Axes
        returned by a seaborn plotting call -- confirm against callers).
    title, xlabel, ylabel
        Texts for the title and the two axis labels; ``None`` is passed
        through unchanged to the corresponding setter.
    colors
        Text colour used for the title and both axis labels.
    figsize
        ``(width, height)`` of the figure in inches. Defaults to the
        ``(15, 8)`` size this function always used.

    Returns
    -------
    None
        The object is modified in place.
    """
    width, height = figsize
    graphic_object.figure.set_size_inches(width, height)
    graphic_object.set_title(title, color=colors, fontsize=16)
    graphic_object.set_xlabel(xlabel, color=colors, fontsize=14)
    graphic_object.set_ylabel(ylabel, color=colors, fontsize=14)

## Warning

In [4]:
# Silence every warning for the rest of the session.
# NOTE: this also hides deprecation notices raised by the libraries used below.
import warnings
warnings.filterwarnings("ignore")

# Dataset

In [5]:
# Load the dataset from a CSV file in the working directory and preview the first rows
dataset = pd.read_csv('18_feature_selected.csv')
dataset.head()

Unnamed: 0,Net Income to Total Assets,Net worth/Assets,Persistent EPS in the Last Four Seasons,Retained Earnings to Total Assets,Net profit before tax/Paid-in capital,Per Share Net profit before tax (Yuan ¥),Working Capital to Total Assets,Net Income to Stockholder's Equity,Net Value Per Share (A),CFO to Assets,Cash/Total Assets,Gross Profit to Sales,Equity to Long-term Liability,Current Liability to Equity,Current Liability to Current Assets,Borrowing dependency,Current Liability to Assets,Debt ratio %,Bankrupt
0,0.716845,0.792424,0.169141,0.903225,0.137757,0.138736,0.672775,0.82789,0.14795,0.520382,0.004094,0.601453,0.126549,0.339077,0.11825,0.390284,0.147308,0.207576,1
1,0.795297,0.828824,0.208944,0.931065,0.168962,0.169918,0.751111,0.839969,0.182251,0.567101,0.014948,0.610237,0.120916,0.32974,0.047775,0.37676,0.056963,0.171176,1
2,0.77467,0.792484,0.180581,0.909903,0.148036,0.142803,0.829502,0.836774,0.177911,0.538491,0.000991,0.601449,0.117922,0.334777,0.025346,0.379093,0.098162,0.207516,1
3,0.739555,0.848535,0.193722,0.906902,0.147561,0.148603,0.725754,0.834697,0.154187,0.604105,0.018851,0.583538,0.12076,0.331509,0.06725,0.379743,0.098715,0.151465,1
4,0.795016,0.893491,0.212537,0.91385,0.167461,0.168412,0.751822,0.839973,0.167502,0.578469,0.014161,0.598782,0.110933,0.330726,0.047725,0.375025,0.110195,0.106509,1


## How much bigger is class 0 than class 1 in `Bankrupt`?

I think the model learned the majority class (0) very well but did not learn the minority class (1).

Because of this, I think the model is overfitted to the majority class.

### Check difference between classes

In [6]:
# Number of rows per value of the 'Bankrupt' column
class_counts = dataset['Bankrupt'].value_counts()
print(class_counts.to_frame())

# Imbalance ratio: rows labelled 0 per row labelled 1.
# The message names class 0 as the bigger one because the ratio is count(0) / count(1).
difference = class_counts[0] / class_counts[1]
print('')
print(f'Class 0 is {difference:.0f} times bigger than class 1.')

   Bankrupt
0      6599
1       220

Class 1 is 30 times bigger then class 0.


**I will undersample class 0 to about 1/3 of its size** (class 0 will then be about 10 times bigger than class 1)

### Create a new dataframe with random sample for class 0

In [7]:
# Undersample class 0: keep a reproducible random sample of 2220 rows
# (random_state fixes the draw) and keep every class 1 row.
dataset_0 = dataset.query('Bankrupt==0').sample(2220, random_state=666)
dataset_1 = dataset.query('Bankrupt==1')

# Stack both parts into a more balanced dataset.
# pd.concat replaces DataFrame.append (deprecated, removed in pandas 2.0);
# ignore_index=True renumbers the rows 0..n-1, as reset_index(drop=True) did.
new_dataset = pd.concat([dataset_0, dataset_1], ignore_index=True)
new_dataset

Unnamed: 0,Net Income to Total Assets,Net worth/Assets,Persistent EPS in the Last Four Seasons,Retained Earnings to Total Assets,Net profit before tax/Paid-in capital,Per Share Net profit before tax (Yuan ¥),Working Capital to Total Assets,Net Income to Stockholder's Equity,Net Value Per Share (A),CFO to Assets,Cash/Total Assets,Gross Profit to Sales,Equity to Long-term Liability,Current Liability to Equity,Current Liability to Current Assets,Borrowing dependency,Current Liability to Assets,Debt ratio %,Bankrupt
0,0.801688,0.841310,0.219722,0.939054,0.174844,0.175793,0.760356,0.840612,0.186044,0.573751,0.175991,0.605126,0.110933,0.335579,0.045340,0.376300,0.165602,0.158690,0
1,0.841585,0.883907,0.247235,0.948432,0.198748,0.199669,0.764457,0.843523,0.187097,0.600745,0.118248,0.606364,0.110933,0.331334,0.042721,0.369637,0.118646,0.116093,0
2,0.840415,0.980530,0.235038,0.956747,0.191720,0.192664,0.898402,0.842256,0.187687,0.682726,0.260377,0.640119,0.110933,0.326794,0.005467,0.369637,0.021822,0.019470,0
3,0.822981,0.917496,0.227758,0.941010,0.179549,0.180538,0.916609,0.841782,0.174076,0.656720,0.045642,0.616989,0.112289,0.329026,0.014090,0.370148,0.077692,0.082504,0
4,0.762014,0.870514,0.200246,0.920653,0.151263,0.152293,0.758259,0.837245,0.173360,0.557877,0.027304,0.589990,0.116508,0.327735,0.038323,0.372784,0.033091,0.129486,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2435,0.725750,0.783445,0.197977,0.873937,0.148481,0.149507,0.764816,0.828067,0.139354,0.534595,0.025171,0.612747,0.110933,0.346780,0.044960,0.389886,0.211979,0.216555,1
2436,0.519388,0.474590,0.079512,0.777637,0.057130,0.054304,0.723172,0.856906,0.069656,0.738860,0.024764,0.598051,0.090263,0.320395,0.059374,0.357056,0.165715,0.525410,1
2437,0.557733,0.731294,0.164792,0.852516,0.090634,0.091738,0.740426,0.726888,0.131010,0.649732,0.108758,0.590838,0.144985,0.391128,0.050780,0.402534,0.239915,0.268706,1
2438,0.641804,0.730961,0.154297,0.879445,0.106018,0.106274,0.720006,0.765967,0.132401,0.511451,0.019116,0.581461,0.216878,0.372218,0.060766,0.458819,0.167973,0.269039,1


In [8]:
# Number of rows per value of the 'Bankrupt' column after undersampling
new_class_counts = new_dataset['Bankrupt'].value_counts()
print(new_class_counts.to_frame())

# Imbalance ratio: rows labelled 0 per row labelled 1.
# The message names class 0 as the bigger one because the ratio is count(0) / count(1).
new_difference = new_class_counts[0] / new_class_counts[1]
print('')
print(f'Class 0 is {new_difference:.0f} times bigger than class 1.')

   Bankrupt
0      2220
1       220

Class 1 is 10 times bigger then class 0.


## Select features

In [9]:
# (number of rows, number of columns) of the rebalanced dataset
new_dataset.shape

(2440, 19)

### X and y

In [10]:
# Target is the 'Bankrupt' column; the features are every other column
y = new_dataset['Bankrupt']
X = new_dataset.drop(columns='Bankrupt')

print(f'Number of features for X = {X.shape[1]}')

Number of features for X = 18


## CatBoostClassifier

In [11]:
# Hold out 20% of the rows for testing; stratify keeps the class proportions
# the same in both parts, and random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    train_size=0.8,
    random_state=666,
)

# Fit a CatBoost classifier with its default settings, then predict the held-out rows
cbc = CatBoostClassifier()
cbc.fit(X_train, y_train, silent=True)
y_predict = cbc.predict(X_test)

# Overall accuracy as a percentage
total_accuracy = 100 * accuracy_score(y_test, y_predict)

# Put true and predicted labels side by side and flag the matching rows
results = pd.DataFrame({'Test': y_test, 'Predict': y_predict})
results['OK'] = results['Test'] == results['Predict']

# Class 1 rows predicted correctly, as a count and as a share of all class 1 test rows
bankrupt_accuracy = len(results.query('Predict==1 & OK==True'))
bankrupt_total = results['Test'].sum()
bankrupt_score = bankrupt_accuracy / bankrupt_total * 100

# Report
print(f'Total Accuracy - {total_accuracy:.2f}\nBanrupt correct predicted - {bankrupt_accuracy}')
print(f'Bankrupt Score - {bankrupt_score:.2f}')

Total Accuracy - 92.83
Banrupt correct predicted - 21
Bankrupt Score - 47.73


- **Previous results:** 10 hits (22,4%)
- **New results:** 21 hits (47,7%)

The overall accuracy of the model declined by almost 4 pp (this was to be expected, because class 0 now makes up only about 90% of the data)

***But the accuracy for class 1 increased by more than 100%***

Is it possible to increase the accuracy for class 1 by removing more class 0 observations?

### Test the minimum number of observations in the dataset

- the idea is to randomly remove an observation at each iteration with the classifier, in order to obtain as few cases as possible with better accuracy for class 1

In [12]:
# Draw 5 distinct row labels of new_dataset, taken only from the class 0 rows.
# - Sampling from the actual class 0 labels (instead of the range 0..len) avoids the
#   inclusive upper bound of the deprecated np.random.random_integers, which could
#   return a label that belongs to a class 1 row.
# - replace=False prevents duplicates, which would make a later drop of an
#   already-removed label raise KeyError.
# - The seeded generator makes the draw repeatable across runs.
random_indexes = np.random.default_rng(666).choice(
    new_dataset.query('Bankrupt==0').index.to_numpy(),
    size=5,
    replace=False,
)

In [13]:
# Work on a copy so that new_dataset itself is left untouched
minimum_dataset = new_dataset.copy()

# One entry per iteration, collected for the summary table built after the loop
observation = []
total_score = []
bankrupt_predict = []
bankrupt_predict_score = []

# Each iteration removes one more row, then retrains and re-evaluates the classifier
for index in random_indexes:
    minimum_dataset = minimum_dataset.drop(index=index)

    # Features and target from the rows that remain
    y = minimum_dataset['Bankrupt']
    X = minimum_dataset.drop(columns=['Bankrupt'])

    # Same 80/20 stratified split settings in every iteration
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        stratify=y,
        train_size=0.8,
        random_state=666,
    )

    # The model configuration must be the same in all iterations
    # (fixed seed; training is requested on GPU with the given device spec)
    cbc = CatBoostClassifier(random_state=666, task_type='GPU', devices='0:2')

    # Train, then predict the held-out rows
    cbc.fit(X_train, y_train, silent=True)
    y_predict = cbc.predict(X_test)

    # Overall accuracy as a percentage
    total_accuracy = 100 * accuracy_score(y_test, y_predict)

    # Put true and predicted labels side by side and flag the matching rows
    results = pd.DataFrame({'Test': y_test, 'Predict': y_predict})
    results['OK'] = results['Test'] == results['Predict']

    # Class 1 rows predicted correctly, as a count and as a share of all class 1 test rows
    bankrupt_accuracy = len(results.query('Predict==1 & OK==True'))
    bankrupt_total = results['Test'].sum()
    bankrupt_score = bankrupt_accuracy / bankrupt_total * 100

    # Record this iteration's figures
    observation.append(minimum_dataset.shape[0])
    total_score.append(total_accuracy)
    bankrupt_predict.append(bankrupt_accuracy)
    bankrupt_predict_score.append(bankrupt_score)

# Summary table: one row per iteration
dict_results = dict([
    ('Observations', observation),
    ('Total Score', total_score),
    ('Bankrupt Predict', bankrupt_predict),
    ('Bankrupt Score', bankrupt_predict_score),
])
cbc_results = pd.DataFrame(data=dict_results)
cbc_results

Unnamed: 0,Observations,Total Score,Bankrupt Predict,Bankrupt Score
0,2439,92.622951,22,50.0
1,2438,92.213115,22,50.0
2,2437,91.803279,19,43.181818
3,2436,91.803279,17,38.636364
4,2435,93.839836,23,52.272727
