# Loading Data
- The dataset contains 6819 observations and 96 columns: the `Bankrupt?` target plus 95 features.
- All of the features are numerical (int64 or float64)
- There are no missing values (NaN) among the data

In [7]:
import pandas as pd
import warnings
warnings.filterwarnings(action="ignore")
import os
# "/Users/DaynoJa/Desktop/BankruptData/archive/data.csv"


In [13]:
import numpy as np

# Location of the bankruptcy CSV. Set the BANKRUPT_DATA_CSV environment variable
# to run the notebook on another machine; the fallback is the original local path.
DATA_PATH = os.environ.get(
    "BANKRUPT_DATA_CSV",
    "/Users/DaynoJa/Desktop/BankruptData/archive/data.csv",
)
data = pd.read_csv(DATA_PATH)

# True if any column was parsed with the generic `object` dtype,
# i.e. if any column is not purely numeric.
np.any(data.dtypes == object)

False

In [14]:
# Summarise the frame: row count, column names, non-null counts and dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6819 entries, 0 to 6818
Data columns (total 96 columns):
 #   Column                                                    Non-Null Count  Dtype  
---  ------                                                    --------------  -----  
 0   Bankrupt?                                                 6819 non-null   int64  
 1    ROA(C) before interest and depreciation before interest  6819 non-null   float64
 2    ROA(A) before interest and % after tax                   6819 non-null   float64
 3    ROA(B) before interest and depreciation after tax        6819 non-null   float64
 4    Operating Gross Margin                                   6819 non-null   float64
 5    Realized Sales Gross Margin                              6819 non-null   float64
 6    Operating Profit Rate                                    6819 non-null   float64
 7    Pre-tax net Interest Rate                                6819 non-null   float64
 8    After-tax net Int

In [15]:
# Check for null values: the result is True if any cell in the frame is missing.
np.any(data.isnull())

False

## Removing Outliers
- Goal: increase model accuracy by dropping rows that contain extreme feature values.

In [16]:
# Despite its name, this holds the labels of every feature column (all columns
# except the 'Bankrupt?' target), i.e. the columns outlier removal is applied to.
outliers = data.columns.drop('Bankrupt?')

In [17]:
def remove_outliers(data, lower_q=0.10, upper_q=0.90, whisker=1.5):
    """Drop every row that has an extreme value in at least one column.

    For each column the `lower_q` and `upper_q` quantiles are computed, and a
    value counts as an outlier when it lies more than `whisker` times the
    distance between those two quantiles below the lower one or above the
    upper one. Note that the defaults use the 10th and 90th percentiles, not
    the quartiles of the classic IQR rule, so the accepted band is wider.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame of numeric columns to filter. It is not modified.
    lower_q, upper_q : float, default 0.10 and 0.90
        Quantiles (between 0 and 1) that define the reference spread.
    whisker : float, default 1.5
        Multiplier applied to the spread to obtain the accepted band.

    Returns
    -------
    pandas.DataFrame
        The rows of `data` with no outlier in any column. The original index
        labels of the surviving rows are kept.

    Raises
    ------
    ValueError
        If the quantiles are not ordered as 0 <= lower_q <= upper_q <= 1.
    """
    if not 0 <= lower_q <= upper_q <= 1:
        raise ValueError("expected 0 <= lower_q <= upper_q <= 1")

    lower = data.quantile(lower_q)  # per-column lower reference quantile
    upper = data.quantile(upper_q)  # per-column upper reference quantile
    spread = upper - lower

    too_low = data < (lower - whisker * spread)
    too_high = data > (upper + whisker * spread)

    # A single out-of-band value is enough to reject the whole row.
    has_outlier = (too_low | too_high).any(axis=1)

    return data[~has_outlier]

In [18]:
# Filter the feature columns only (every column except 'Bankrupt?') and assign the
# result back. remove_outliers returns fewer rows than it was given, so the
# index-aligned assignment leaves NaN in the feature columns of every rejected row
# (visible in the output below). Those rows still exist here and are only removed
# by the dropna() call in the following cell.
data[outliers] = remove_outliers(data[outliers])
data.shape  # not displayed: only the last expression of a cell is shown
data

Unnamed: 0,Bankrupt?,ROA(C) before interest and depreciation before interest,ROA(A) before interest and % after tax,ROA(B) before interest and depreciation after tax,Operating Gross Margin,Realized Sales Gross Margin,Operating Profit Rate,Pre-tax net Interest Rate,After-tax net Interest Rate,Non-industry income and expenditure/revenue,...,Net Income to Total Assets,Total assets to GNP price,No-credit Interval,Gross Profit to Sales,Net Income to Stockholder's Equity,Liability to Equity,Degree of Financial Leverage (DFL),Interest Coverage Ratio (Interest expense to EBIT),Net Income Flag,Equity to Liability
0,1,,,,,,,,,,...,,,,,,,,,,
1,1,,,,,,,,,,...,,,,,,,,,,
2,1,,,,,,,,,,...,,,,,,,,,,
3,1,0.399844,0.451265,0.457733,0.583541,0.583541,0.998700,0.796967,0.808966,0.30335,...,0.739555,0.003252,0.622929,0.583538,0.834697,0.281721,0.026697,0.564663,1.0,0.023982
4,1,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6814,0,0.493687,0.539468,0.543230,0.604455,0.604462,0.998992,0.797409,0.809331,0.30351,...,0.799927,0.000466,0.623620,0.604455,0.840359,0.279606,0.027064,0.566193,1.0,0.029890
6815,0,,,,,,,,,,...,,,,,,,,,,
6816,0,,,,,,,,,,...,,,,,,,,,,
6817,0,,,,,,,,,,...,,,,,,,,,,


In [19]:
# The outlier step left NaN in the feature columns of every row it rejected,
# so dropping rows with missing values removes exactly those outlier rows.
# Rebinding instead of inplace=True keeps the step an ordinary expression.
data = data.dropna()
data.shape

(3226, 96)

In [20]:
# Target vector: the bankruptcy label.
y = data['Bankrupt?']
# Feature matrix: every remaining column.
X = data.drop(columns='Bankrupt?')

 ## Scaling Data

In [21]:
import mlflow
from sklearn.preprocessing import StandardScaler

# Standardise the features: fit the scaler on X, then apply it to the same data.
scaler = StandardScaler()
scaler.fit(X)
X_ = scaler.transform(X)

# Autologging is switched on only after the scaler has been fitted,
# so nothing has been logged up to this point.
mlflow.sklearn.autolog()

In [22]:
from sklearn.decomposition import PCA

# PCA is created with its default arguments (no n_components given).
pca = PCA()
# NOTE(review): this overwrites the scaled matrix in X_, so re-running the cell
# applies PCA to its own output; restart and run all to reproduce.
X_ = pca.fit_transform(X_)

2021/02/22 16:12:02 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '92612f439dd54e4797320ce2106bf037', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow


In [23]:
# Wrap the transformed array back into a DataFrame. Reusing X's index keeps the
# rows aligned with y, which still carries the labels of the rows that survived
# outlier removal; without it the frame would get a fresh 0..n-1 index.
# NOTE(review): after PCA the columns are principal components, not the original
# features. The original names are kept so existing column references still work.
X = pd.DataFrame(X_, columns=X.columns, index=X.index)

In [24]:
import seaborn as sns  # must be installed in the kernel's environment

# Class balance of the target. The Series is passed as x= explicitly, because
# newer seaborn releases no longer treat the first positional argument as x.
sns.countplot(x=y)

ModuleNotFoundError: No module named 'seaborn'