# Company Bankruptcy Prediction

### Cel biznesowy
Predykcja bankructwa firm na podstawie parametrów ekonomicznych

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.inspection import permutation_importance
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    auc,
    f1_score,
    mean_squared_error,
    precision_score,
    r2_score,
    recall_score,
    roc_auc_score,
    roc_curve,
)
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Read the raw dataset from the project-relative CSV and preview the first rows.
db_raw = pd.read_csv('./db/data.csv')
db_raw.head()

## EDA

Sprawdzenie wartości null w danych kolumnach

**Wniosek**: brak wartości null

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for every numeric column.
db_raw.describe()

In [None]:
# Column dtypes and non-null counts.
db_raw.info()

# Explicit total of missing cells, so the "no nulls" conclusion is backed by a
# single number rather than by reading the per-column counts above.
db_raw.isna().sum().sum()

Sprawdzenie ile firm zbankrutowało, a ile nie

In [None]:
# Number of rows per value of the target column.
db_raw['Bankrupt?'].value_counts()

Wyrzucenie kolumny ' Net Income Flag' – ma wartość 0 dla każdego rekordu, więc nie niesie informacji

In [None]:
# Work on a copy without the ' Net Income Flag' column; db_raw itself is left untouched.
db = db_raw.drop(columns=[' Net Income Flag'])

Podzielenie na zbiór treningowy, testowy i walidacyjny do sprawdzenia poprawności założeń


In [None]:
# Separate the label from the features.
# The label is read from db_raw and the drop tolerates an already-removed column,
# so re-running this cell does not raise a KeyError once 'Bankrupt?' has been
# taken out of db.
target = db_raw['Bankrupt?']
db = db.drop(columns=['Bankrupt?'], errors='ignore')

In [None]:
# First hold out 30% of all rows as the validation set ...
x_temp, x_valid, y_temp, y_valid = train_test_split(
    db,
    target,
    test_size=0.3,
    stratify=target,
    random_state=42,
)
# ... then split the remaining rows 70/30 into training and test sets.
# Both splits are stratified on the label and use a fixed seed.
x_train, x_test, y_train, y_test = train_test_split(
    x_temp,
    y_temp,
    test_size=0.3,
    stratify=y_temp,
    random_state=42,
)

Sprawdzenie zależności między bankructwem a pozostałymi parametrami celem wyszczególnienia tych najbardziej istotnych

Za pomocą boxplot

In [None]:
# One box plot per feature for training-set columns 10..19, grouped by the label.
# Slicing the column index (instead of indexing by position in a range) also
# copes with a frame that has fewer than 20 columns.
for column in x_train.columns[10:20]:
    fig, ax = plt.subplots(figsize=(25, 8))
    sns.boxplot(x=y_train, y=x_train[column], ax=ax)
    ax.set_title(f'Boxplot of {column}')
    plt.show()
    # Release the figure so a long loop does not accumulate open figures.
    plt.close(fig)

oraz za pomocą heatmapy korelacji

In [None]:
# 'Bankrupt?' was removed from db when the label was split off, so it is
# re-attached here; otherwise the correlation matrix has no target row/column
# and looking up spearman_corr['Bankrupt?'] fails with a KeyError.
corr_input = db.assign(**{'Bankrupt?': target})
spearman_corr = corr_input.corr(method='spearman')

# Hide the upper triangle (including the diagonal): the matrix is symmetric.
mask = np.triu(np.ones_like(spearman_corr, dtype=bool))

fig, ax = plt.subplots(figsize=(25, 20))
sns.heatmap(spearman_corr, mask=mask, square=True, linewidths=0.5, cmap='coolwarm', ax=ax)
ax.set_title('Spearman correlation matrix')
plt.show()

Wybór kolumn z najbardziej widocznymi zależnościami do dalszych badań korelacji

In [None]:
# Spearman correlation of every feature with the label, strongest positive first.
# Computed directly against `target` because db holds features only, so a
# 'Bankrupt?' lookup in a feature-only correlation matrix would raise a KeyError.
target_corr = db.corrwith(target, method='spearman').sort_values(ascending=False)

print(target_corr.head(15))
print(target_corr.tail(30))

Wizualizacja kilku ciekawych parametrów za pomocą boxplotów

In [None]:
# Features to compare between the two label groups.
boxplot_columns = [
    " Persistent EPS in the Last Four Seasons",
    " Net Income to Total Assets",
    " ROA(B) before interest and depreciation after tax",
    " Net Value Per Share (B)",
    " Equity to Long-term Liability",
]

# db no longer contains the label, so build a plotting frame that has it;
# passing db directly fails because 'Bankrupt?' is not one of its columns.
plot_data = db.assign(**{'Bankrupt?': target})

# Use the axes returned by plt.subplots instead of re-creating each panel
# with plt.subplot on top of the existing grid.
fig, axes = plt.subplots(ncols=len(boxplot_columns), figsize=(25, 8))
for ax, column in zip(axes, boxplot_columns):
    sns.boxplot(x='Bankrupt?', y=column, data=plot_data, ax=ax)

fig.tight_layout()
plt.show()

Obsługa outlierów

In [None]:
# Cap every feature at its 2nd and 98th percentile.
# The limits are taken from the training split only and then reused for the
# test and validation splits, so held-out rows never influence the thresholds.
lower_lim = x_train.quantile(.02)
upper_lim = x_train.quantile(.98)

# axis=1 aligns the per-column limits with the frame's columns. clip returns
# new frames, which avoids masked .loc writes into the objects produced by
# train_test_split.
x_train = x_train.clip(lower=lower_lim, upper=upper_lim, axis=1)
x_test = x_test.clip(lower=lower_lim, upper=upper_lim, axis=1)
x_valid = x_valid.clip(lower=lower_lim, upper=upper_lim, axis=1)

Logarytmowanie (log1p) kolumn o skośnym rozkładzie (|skośność| > 0.5 na zbiorze treningowym)

In [None]:
# Columns whose training-set skewness lies outside [-0.5, 0.5] get a log1p
# transform; the decision is made on the training split and applied to all three.
skewed_columns = [
    name for name in x_train.columns
    if abs(x_train[name].skew()) > 0.5
]

for frame in (x_train, x_test, x_valid):
    for name in skewed_columns:
        frame[name] = np.log1p(frame[name])

Normalizacja

In [None]:
# Learn the min/max of each feature on the training split only.
# fit() is enough here: the transformed output of fit_transform() was discarded.
scaler = MinMaxScaler()
scaler.fit(x_train)

# scaler.transform returns a bare array; rebuild the frames with the original
# column names AND the original row index so the features stay aligned with
# y_train / y_test / y_valid by label, not just by position.
x_train = pd.DataFrame(scaler.transform(x_train), columns=x_train.columns, index=x_train.index)
x_test = pd.DataFrame(scaler.transform(x_test), columns=x_test.columns, index=x_test.index)
x_valid = pd.DataFrame(scaler.transform(x_valid), columns=x_valid.columns, index=x_valid.index)

Wybór najbardziej istotnych cech

In [None]:
# Fit a random forest on the training split and measure how much its score
# drops when each feature is shuffled (100 shuffles per feature).
rf_model = RandomForestClassifier(random_state=69)
rf_model.fit(x_train, y_train)

# n_jobs=-1 spreads the per-feature shuffling over all cores; the fixed
# random_state keeps the result reproducible.
# NOTE(review): importance is measured on the same rows the forest was trained
# on - consider a held-out split if generalisation is what matters here.
perm_importance = permutation_importance(
    rf_model,
    x_train,
    y_train,
    n_repeats=100,
    random_state=69,
    n_jobs=-1,
)

In [None]:
# Rank features by mean permutation importance (ascending) and keep the top ones.
sorted_idx = perm_importance.importances_mean.argsort()
top_vars = min(40, len(sorted_idx))

# Take the names from the fitted forest rather than from x_train.columns:
# x_train is overwritten below, so indexing its (already reduced) columns with
# sorted_idx would raise an IndexError when this cell is run a second time.
selected_columns = list(rf_model.feature_names_in_[sorted_idx][-top_vars:])
selected_importances = perm_importance.importances_mean[sorted_idx][-top_vars:]

x_train = x_train[selected_columns]
x_test = x_test[selected_columns]
x_valid = x_valid[selected_columns]

fig, ax = plt.subplots(figsize=(10, 10))
ax.barh(range(top_vars), selected_importances, color='skyblue')
ax.set_yticks(range(top_vars))
ax.set_yticklabels(selected_columns)
ax.set_xlabel('Permutation Importance')
ax.set_title('Variable Importance')
plt.show()

In [None]:
# Kernel density estimate of every column of db, one panel per column,
# laid out row by row on a 10 x 10 grid (at most 100 columns are drawn).
fig, axs = plt.subplots(10, 10, figsize=(25, 25))
flat_axes = axs.ravel()

for ax, column in zip(flat_axes, db.columns):
    sns.kdeplot(db[column], ax=ax)
    ax.set_title(f'Density plot of {column}')

# Hide panels that have no column assigned instead of leaving empty frames.
for ax in flat_axes[len(db.columns):]:
    ax.set_visible(False)

plt.tight_layout()
plt.show()

In [None]:
# Density plot of each db column on a 10 x 10 grid, filled row by row.
# NOTE(review): this cell plots the same frame (db) on the same grid as the
# density-plot cell directly before it - confirm whether it was meant to show
# different data (e.g. the transformed features) or can be removed.
fig, axs = plt.subplots(10, 10, figsize=(25, 25))

for position, column_name in enumerate(db.columns[:axs.size]):
    row, col = divmod(position, 10)
    panel = axs[row, col]
    sns.kdeplot(db[column_name], ax=panel)
    panel.set_title(f'Density plot of {column_name}')

plt.tight_layout()
plt.show()

Pierwsze testowanie modeli: LogisticRegression

In [None]:
# Hyper-parameter search for logistic regression.
# Values shared by every solver/penalty combination.
shared_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
    'max_iter': list(range(100, 800, 100)),
}

# Not every solver supports every penalty: 'newton-cg', 'lbfgs' and 'sag' accept
# only 'l2', while 'liblinear' and 'saga' also accept 'l1'. A single flat grid
# would schedule the unsupported pairs too, and each of those fits just fails.
param_grid = [
    {**shared_grid, 'penalty': ['l2'],
     'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']},
    {**shared_grid, 'penalty': ['l1'],
     'solver': ['liblinear', 'saga']},
]

# random_state makes the stochastic solvers (liblinear, sag, saga) repeatable.
# NOTE(review): scoring is left at the default (accuracy); if the positive class
# is rare, consider scoring='roc_auc' so the search optimises the metric that is
# reported afterwards.
grid = GridSearchCV(
    LogisticRegression(random_state=42),
    param_grid,
    refit=True,
    cv=5,
    n_jobs=-1,   # candidate fits are independent of each other
    verbose=1,   # progress summary only; per-fit logging floods the output
)

grid.fit(x_train, y_train)
best_estimator = grid.best_estimator_

y_pred = best_estimator.predict(x_test)

In [None]:
# Evaluate the tuned logistic regression on the test split.
best_estimator = grid.best_estimator_

y_pred = best_estimator.predict(x_test)
# Predicted probability of the second class in classes_ (label 1 for a 0/1
# target). ROC AUC measures how well samples are ranked, so it must be fed
# these scores - hard 0/1 predictions collapse the curve to a single point.
y_score = best_estimator.predict_proba(x_test)[:, 1]

# Regression-style metrics computed on the predicted labels.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

# Classification metrics computed on the predicted labels.
precision = precision_score(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Ranking metrics computed on the predicted probabilities.
roc_auc = roc_auc_score(y_test, y_score)
gini = 2 * roc_auc - 1

print(f'MSE: {mse}')
print(f'RMSE: {rmse}')
print(f'R^2: {r2}')
print('-'*30)
print(f'accuracy: {accuracy}')
print(f'precision: {precision}')
print(f'recall: {recall}')
print(f'f1: {f1}')
print('-'*30)
print(f'ROC_AUC: {roc_auc}')
print(f'GINI: {gini}')

In [None]:
# Baseline: logistic regression with default hyper-parameters, evaluated on
# the test split with the same metrics as the tuned model.
model = LogisticRegression()
model.fit(x_train, y_train)

y_pred = model.predict(x_test)
# Predicted probability of the second class in classes_ (label 1 for a 0/1
# target). ROC AUC needs these scores; hard 0/1 predictions collapse the
# curve to a single point.
y_score = model.predict_proba(x_test)[:, 1]

# Regression-style metrics computed on the predicted labels.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

# Classification metrics computed on the predicted labels.
precision = precision_score(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Ranking metrics computed on the predicted probabilities.
roc_auc = roc_auc_score(y_test, y_score)
gini = 2 * roc_auc - 1

print(f'MSE: {mse}')
print(f'RMSE: {rmse}')
print(f'R^2: {r2}')
print('-'*30)
print(f'accuracy: {accuracy}')
print(f'precision: {precision}')
print(f'recall: {recall}')
print(f'f1: {f1}')
print('-'*30)
print(f'ROC_AUC: {roc_auc}')
print(f'GINI: {gini}')

In [None]:
# roc_curve expects one continuous score per sample, not the estimator object,
# so compute the predicted probability of the second class in classes_ first.
log_scores = best_estimator.predict_proba(x_test)[:, 1]
log_fpr, log_tpr, log_thresold = roc_curve(y_test, log_scores)


def graph_roc_curve_multiple(log_fpr, log_tpr, log_auc=None):
    """Plot a ROC curve together with the chance-level diagonal.

    Parameters
    ----------
    log_fpr : array-like
        False positive rates, as returned by ``roc_curve``.
    log_tpr : array-like
        True positive rates, as returned by ``roc_curve``.
    log_auc : float, optional
        Area under the curve shown in the legend. When omitted it is
        computed from ``log_fpr`` / ``log_tpr``, so the function does not
        depend on notebook globals.
    """
    if log_auc is None:
        log_auc = auc(log_fpr, log_tpr)

    plt.figure(figsize=(20,8))
    plt.title('ROC Curve', fontsize=14)
    plt.plot(log_fpr, log_tpr, label='Logistic Regression Classifier Score: {:.4f}'.format(log_auc))
    # Diagonal = performance of a random classifier.
    plt.plot([0, 1], [0, 1], 'k--')
    plt.axis([-0.01, 1, 0, 1])
    plt.xlabel('False Positive Rate', fontsize=13)
    plt.ylabel('True Positive Rate', fontsize=13)
    plt.annotate('Minimum ROC Score of 50% \n (This is the minimum score to get)', xy=(0.5, 0.5), xytext=(0.6, 0.3),
                arrowprops=dict(facecolor='#6E726D', shrink=0.05),
                )
    plt.legend()


graph_roc_curve_multiple(log_fpr, log_tpr, roc_auc_score(y_test, log_scores))
plt.show()