# Company Bankruptcy Prediction

### Cel biznesowy
Predykcja bankructwa firm na podstawie parametrów ekonomicznych.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, precision_score, recall_score, f1_score

# Read the raw dataset from disk and preview its first rows.
DATA_PATH = './db/data.csv'
db_raw = pd.read_csv(DATA_PATH)
db_raw.head()

## EDA

Sprawdzenie wartości null w danych kolumnach

**Wniosek**: brak wartości null

In [None]:
# Display descriptive statistics of the raw frame's columns.
db_raw.describe()

In [None]:
# Print the frame summary (index range, column dtypes, memory usage).
db_raw.info()

Sprawdzenie ile firm zbankrutowało, a ile nie

In [None]:
# Count the rows for each value of the 'Bankrupt?' label to show the class balance.
db_raw['Bankrupt?'].value_counts()

Wyrzucenie kolumny ' Net Income Flag' – 0 dla każdego rekordu

In [None]:
# Build the working frame without ' Net Income Flag'; db_raw itself is left unchanged.
db = db_raw.drop(columns=[' Net Income Flag'])

Sprawdzenie zależności między bankructwem a pozostałymi parametrami celem wyszczególnienia tych najbardziej istotnych

Za pomocą boxplot

In [None]:
# One box plot per column among the first 20 columns of `db`, grouped by the
# 'Bankrupt?' label. Slicing (instead of range(20)) also works when the frame
# has fewer than 20 columns.
for column in db.columns[:20]:
    if column == 'Bankrupt?':
        # Plotting the label against itself carries no information.
        continue
    fig, ax = plt.subplots(figsize=(25, 8))
    sns.boxplot(x='Bankrupt?', y=column, data=db, ax=ax)
    ax.set_title(f'Boxplot of {column}')
    plt.show()
    # Release the figure so the loop does not accumulate open figures.
    plt.close(fig)

oraz za pomocą heatmapy korelacji

In [None]:
# Pairwise Spearman rank correlation between all columns of `db`.
spearman_corr = db.corr(method='spearman')
# Boolean mask covering the upper triangle (diagonal included) so each pair
# is drawn only once in the heatmap.
mask = np.triu(np.ones(spearman_corr.shape, dtype=bool))

fig, ax = plt.subplots(figsize=(25, 20))
sns.heatmap(
    spearman_corr,
    mask=mask,
    square=True,
    linewidths=0.5,
    cmap='coolwarm',
    ax=ax,
)
plt.show()

Wybór kolumn z najbardziej widocznymi zależnościami do dalszych badań korelacji

In [None]:
# Correlation of every column with the label, ordered from most positive to
# most negative; sorted once and reused for both ends of the ranking.
bankrupt_corr_sorted = spearman_corr['Bankrupt?'].sort_values(ascending=False)
print(bankrupt_corr_sorted.head(15))
print(bankrupt_corr_sorted.tail(30))

Wizualizacja kilku ciekawych parametrów za pomocą boxplotów

In [None]:
# Features picked for a side-by-side comparison between the two label groups.
selected_features = [
    " Persistent EPS in the Last Four Seasons",
    " Net Income to Total Assets",
    " ROA(B) before interest and depreciation after tax",
    " Net Value Per Share (B)",
    " Equity to Long-term Liability",
]

# Keep the axes returned by plt.subplots and draw on them directly instead of
# looking each panel up again through plt.subplot(...).
fig, axes = plt.subplots(ncols=len(selected_features), figsize=(25, 8))
for ax, feature in zip(axes, selected_features):
    sns.boxplot(x='Bankrupt?', y=feature, data=db, ax=ax)

fig.tight_layout()
# Explicit show() so the cell does not echo the repr of the last Axes object.
plt.show()

Podmiana wybranych wartości odstających na mediany

In [None]:
# Positional column indices whose current minimum is overwritten with the
# column median (one row per listed index).
min_column_idx = [4, 5, 6, 7, 8, 10, 13, 16, 17, 18, 25, 26, 27, 28, 35, 38, 66, 89]
# Positional column indices whose current maximum is overwritten with the
# column median. An index listed several times is processed several times,
# each pass targeting whatever the maximum is at that moment.
max_column_idx = [9, 13, 19, 21, 22, 23, 30, 34, 34, 34, 34, 34, 36, 36, 37, 40, 44, 58, 58, 60, 65, 65, 66, 69, 73, 75, 76, 79, 85, 85, 92]

# All minimum replacements run first, then all maximum replacements.
for positions, locate_extreme in (
    (min_column_idx, pd.Series.idxmin),
    (max_column_idx, pd.Series.idxmax),
):
    for position in positions:
        values = db.iloc[:, position]
        extreme_row = locate_extreme(values)
        # The median is taken before the write, i.e. with the extreme value
        # still included.
        replacement = values.median()
        db.at[extreme_row, db.columns[position]] = replacement

In [None]:
# Separate the label column from the feature columns.
target = db['Bankrupt?']
db = db.drop(columns=['Bankrupt?'])

Normalizacja danych

In [None]:
# Rescale every feature column to the [0, 1] range.
# NOTE(review): the scaler is fit on the full feature frame, before the
# train/test split performed later in this notebook, so statistics of the
# held-out rows influence the scaling. Consider fitting on the training
# rows only.
scaler = MinMaxScaler()
# Pass the original index through: fit_transform returns a bare array, and
# rebuilding the frame without `index=` would silently replace the row labels
# with 0..n-1, breaking label-based alignment with `target`.
db = pd.DataFrame(scaler.fit_transform(db), columns=db.columns, index=db.index)

In [None]:
# 10 x 10 grid of kernel density plots, one panel per column of `db`.
fig, axs = plt.subplots(10, 10, figsize=(25, 25))

# axs.flat walks the grid row by row; zip stops as soon as either the panels
# or the columns run out, so surplus panels stay empty.
for panel, feature in zip(axs.flat, db.columns):
    sns.kdeplot(db[feature], ax=panel)
    panel.set_title(f'Density plot of {feature}')

plt.tight_layout()
plt.show()

Zastosowanie logarytmu w celu poprawy rozkładu

In [None]:
# Columns whose sample skewness lies outside [-0.5, 0.5].
# NOTE(review): log1p compresses the right tail, so it helps positively
# skewed columns; confirm it is also intended for negatively skewed ones.
skewed_columns = [name for name in db.columns if abs(db[name].skew()) > 0.5]

for name in skewed_columns:
    db[name] = np.log1p(db[name])

In [None]:
# Same 10 x 10 density grid as before, drawn again on the current `db`.
fig, axs = plt.subplots(10, 10, figsize=(25, 25))
n_rows, n_cols = axs.shape

# Only as many columns as there are panels are drawn; each column's position
# is converted to a (row, column) cell of the grid in row-major order.
for position, feature in enumerate(db.columns[: n_rows * n_cols]):
    grid_row, grid_col = divmod(position, n_cols)
    sns.kdeplot(db[feature], ax=axs[grid_row, grid_col])
    axs[grid_row, grid_col].set_title(f'Density plot of {feature}')

plt.tight_layout()
plt.show()

Podzielenie na zbiór treningowy i walidacyjny do sprawdzenia poprawności założeń


In [None]:
# Settings shared by both splits: 30% held out each time, fixed seed.
split_options = {'test_size': 0.3, 'random_state': 42}

# First split: hold out 30% of all rows as x_valid / y_valid, stratified on the label.
x_temp, x_valid, y_temp, y_valid = train_test_split(
    db, target, stratify=target, **split_options
)
# Second split: divide the remaining rows 70/30 into train and test,
# again stratified on the label.
x_train, x_test, y_train, y_test = train_test_split(
    x_temp, y_temp, stratify=y_temp, **split_options
)

Pierwsze testowanie modeli: LogisticRegression

In [None]:
# Baseline classifier: logistic regression with default settings, fit on the
# training split and evaluated on the test split.
model = LogisticRegression()
model.fit(x_train, y_train)

y_pred = model.predict(x_test)

# Regression-style metrics computed on the predicted class labels (not
# probabilities); kept for continuity with the original output. The
# classification metrics below are the more informative ones here.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

# Results are stored under names distinct from the imported metric functions.
# Assigning e.g. `precision_score = precision_score(...)` would rebind the
# function to a float and make this cell fail with a TypeError when re-run.
precision = precision_score(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f'MSE: {mse}')
print(f'RMSE: {rmse}')
print(f'R^2: {r2}')
print(f'accuracy: {accuracy}')
print(f'precision: {precision}')
print(f'recall: {recall}')
print(f'f1: {f1}')