In [None]:
!pip install pandas
!pip install matplotlib
!pip install seaborn
!pip install numpy
!pip install scipy
!pip install scikit-learn
!pip install statsmodels

In [None]:
import pandas as pd

import matplotlib.pyplot as plt

import seaborn as sns

import numpy as np

from scipy.stats import skew

import scipy

from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler

import statsmodels
import statsmodels.api
from statsmodels.stats.diagnostic import kstest_normal
from statsmodels.stats.nonparametric import rank_compare_2indep

from utils import remove_top_quantile

In [None]:
# Read the training CSV into the working frame and print its column summary.
training_csv_path = '../data/GiveMeSomeCredit-training.csv'
df = pd.read_csv(training_csv_path)
df.info()

In [None]:
# Remove the unnamed leading CSV column (presumably a saved row index — confirm).
df = df.drop(columns=["Unnamed: 0"])

In [None]:
# Show the remaining column names as a plain Python list.
list(df.columns)

In [None]:
# Summary statistics for every numeric column (quartiles spelled out; these are
# the pandas defaults).
df.describe(percentiles=[0.25, 0.5, 0.75])

In [None]:
# Preview the first five rows.
df.head(n=5)

## EDA start

### Target variable SeriousDlqin2yrs

We can see that the training data has a significant class imbalance in the target variable (roughly 14:1).

In [None]:
# Count how many rows fall into each class of the target variable.
target_counts = df["SeriousDlqin2yrs"].value_counts()
target_counts

### Feature "RevolvingUtilizationOfUnsecuredLines"

Huge outliers present.

In [None]:
# Box plot (top panel) and 50-bin histogram (bottom panel) of one feature.
feature = "RevolvingUtilizationOfUnsecuredLines"
values = df[feature]

f, axes = plt.subplots(2, 1, figsize=(7, 9))
box_ax, hist_ax = axes
f.suptitle('RevolvingUtilizationOfUnsecuredLines', fontsize=16)

sns.boxplot(values, ax=box_ax)
sns.histplot(values, bins=50, ax=hist_ax)

plt.tight_layout()
plt.show()


### Age

In [None]:
# Box plot (top panel) and 50-bin histogram (bottom panel) of customer age.
feature = "age"
values = df[feature]

f, axes = plt.subplots(2, 1, figsize=(7, 9))
box_ax, hist_ax = axes
f.suptitle('Age', fontsize=16)

sns.boxplot(values, ax=box_ax)
sns.histplot(values, bins=50, ax=hist_ax)

plt.tight_layout()
plt.show()

### DebtRatio

Massive outliers again. We will have to remove them later.

In [None]:
# Box plot (top panel) and 50-bin histogram (bottom panel) of the debt ratio.
feature = "DebtRatio"
values = df[feature]

f, axes = plt.subplots(2, 1, figsize=(7, 9))
box_ax, hist_ax = axes
f.suptitle('DebtRatio', fontsize=16)

sns.boxplot(values, ax=box_ax)
sns.histplot(values, bins=50, ax=hist_ax)

plt.tight_layout()
plt.show()

### MonthlyIncome

In [None]:
# Box plot (top panel) and 50-bin histogram (bottom panel) of monthly income.
feature = "MonthlyIncome"
values = df[feature]

f, axes = plt.subplots(2, 1, figsize=(7, 9))
box_ax, hist_ax = axes
f.suptitle('MonthlyIncome', fontsize=16)

sns.boxplot(values, ax=box_ax)
sns.histplot(values, bins=50, ax=hist_ax)

plt.tight_layout()
plt.show()

### NumberOfOpenCreditLinesAndLoans

In [None]:
# Box plot (top panel) and 50-bin histogram (bottom panel) of the number of
# open credit lines and loans.
feature = "NumberOfOpenCreditLinesAndLoans"
values = df[feature]

f, axes = plt.subplots(2, 1, figsize=(7, 9))
box_ax, hist_ax = axes
f.suptitle('NumberOfOpenCreditLinesAndLoans', fontsize=16)

sns.boxplot(values, ax=box_ax)
sns.histplot(values, bins=50, ax=hist_ax)

plt.tight_layout()
plt.show()

### NumberRealEstateLoansOrLines

In [None]:
# Box plot (top panel) and 50-bin histogram (bottom panel) of the number of
# real-estate loans or lines.
feature = "NumberRealEstateLoansOrLines"
values = df[feature]

f, axes = plt.subplots(2, 1, figsize=(7, 9))
box_ax, hist_ax = axes
f.suptitle('NumberRealEstateLoansOrLines', fontsize=16)

sns.boxplot(values, ax=box_ax)
sns.histplot(values, bins=50, ax=hist_ax)

plt.tight_layout()
plt.show()

### NumberOfDependents

In [None]:
# Box plot (top panel) and 50-bin histogram (bottom panel) of the number of
# dependents.
feature = "NumberOfDependents"
values = df[feature]

f, axes = plt.subplots(2, 1, figsize=(7, 9))
box_ax, hist_ax = axes
f.suptitle('NumberOfDependents', fontsize=16)

sns.boxplot(values, ax=box_ax)
sns.histplot(values, bins=50, ax=hist_ax)

plt.tight_layout()
plt.show()

## Outlier removal

Since large outliers are present, I am going to trim them. For more info about outliers, see notebook part 2.

In [None]:
# Apply the project's `remove_top_quantile` helper with the 0.95 argument to
# each outlier-heavy column in turn. Order matters: every call receives the
# frame returned by the previous one.
# NOTE(review): this cell reassigns `df`, so re-running it trims again — reload
# the data first when re-executing.
for column in ("RevolvingUtilizationOfUnsecuredLines", "DebtRatio", "MonthlyIncome"):
    df = remove_top_quantile(df, column, 0.95)

## Heatmap and correlation

In [None]:
# Compute the pairwise correlation matrix once and reuse it for both the
# heatmap and the tabular display (previously it was computed twice).
corr_matrix = df.corr()
sns.heatmap(corr_matrix)
corr_matrix

## Hypothesis testing

### Debt ratio t-test

First, let's test the hypothesis that defaulted and non-defaulted customers have the same debt ratio.

H0: Defaulted and non defaulted customers have the same debt ratio (mean).

H1: Defaulted and non-defaulted customers have a different debt ratio (mean).

After conducting the U-test shown below, we can reject H0 in favor of H1 at the 5% significance level.

In [None]:
# Kolmogorov-Smirnov (Lilliefors) normality check on the debt ratio.
test_for_normality = kstest_normal(df["DebtRatio"])

print(f"{test_for_normality=}")

In [None]:
# Split the debt ratio by target class and compare the two samples with
# statsmodels' two-independent-sample rank test.
debt_ratio_nondefault = df.loc[df['SeriousDlqin2yrs'] == 0, 'DebtRatio']
debt_ratio_default = df.loc[df['SeriousDlqin2yrs'] == 1, 'DebtRatio']

rank_result = rank_compare_2indep(debt_ratio_nondefault, debt_ratio_default)
tstat, pvalue = rank_result

print(f'{tstat=}, {pvalue=}')

In [None]:
# Debt ratio distribution per target class.
df_plot = df.loc[:, ["DebtRatio", "SeriousDlqin2yrs"]]

sns.boxplot(
    data=df_plot,
    x="SeriousDlqin2yrs",
    y="DebtRatio",
    hue="SeriousDlqin2yrs",
)

plt.show()

### Monthly income t-test

Next up we can test hypothesis that defaulted and non defaulted customers have the same monthly income.

First we perform KS-test for normality.

In [None]:
# Kolmogorov-Smirnov (Lilliefors) normality check on monthly income.
test_for_normality = kstest_normal(df["MonthlyIncome"])

print(f"{test_for_normality=}")

We can reject hypothesis that distribution is normal or normal-like.

H0: Defaulted and non-defaulted customers have the same monthly income.

H1: Non defaulted customers have higher income than defaulted customers.

After conducting the Mann-Whitney U-test below, we can reject H0 in favor of H1.

In [None]:
# Split monthly income by target class.
monthly_income_nondefault = df.loc[df['SeriousDlqin2yrs'] == 0, 'MonthlyIncome']
monthly_income_default = df.loc[df['SeriousDlqin2yrs'] == 1, 'MonthlyIncome']

# H1 is directional (non-defaulted customers have the *higher* income), so the
# rank test must be one-sided. With x1 = non-defaulted and x2 = defaulted,
# alternative="larger" tests whether P(x1 > x2) exceeds 0.5. Unpacking the
# result of `rank_compare_2indep` directly yields the two-sided p-value, which
# does not match the stated H1.
rank_result = rank_compare_2indep(monthly_income_nondefault, monthly_income_default)
tstat, pvalue = rank_result.test_prob_superior(alternative="larger")

print(f'{tstat=}, {pvalue=}')

In [None]:
# Monthly income distribution per target class.
df_plot = df.loc[:, ["MonthlyIncome", "SeriousDlqin2yrs"]]

sns.boxplot(
    data=df_plot,
    x="SeriousDlqin2yrs",
    y="MonthlyIncome",
    hue="SeriousDlqin2yrs",
)

plt.show()

### Number of opened credit lines and loans
Next up we can test hypothesis that defaulted and non defaulted customers have the same number of opened credit lines and loans.

H0: Defaulted and non defaulted customers have the same number of opened credit lines and loans.

H1: Defaulted and non-defaulted customers have a different number of opened credit lines and loans.

Based on the results of a U-test, we can reject H0 in favor of H1.

In [None]:
# Kolmogorov-Smirnov (Lilliefors) normality check on the number of open credit
# lines and loans.
test_for_normality = kstest_normal(df["NumberOfOpenCreditLinesAndLoans"])

print(f"{test_for_normality=}")

In [None]:
# Split the open-credit-line counts by target class and compare the two samples
# with statsmodels' two-independent-sample rank test.
number_of_open_credit_lines_and_loans_nondefault = df.loc[
    df['SeriousDlqin2yrs'] == 0, 'NumberOfOpenCreditLinesAndLoans'
]
number_of_open_credit_lines_and_loans_default = df.loc[
    df['SeriousDlqin2yrs'] == 1, 'NumberOfOpenCreditLinesAndLoans'
]

rank_result = rank_compare_2indep(
    number_of_open_credit_lines_and_loans_nondefault,
    number_of_open_credit_lines_and_loans_default,
)
tstat, pvalue = rank_result

print(f'{tstat=}, {pvalue=}')

In [None]:
# Distribution of open credit lines and loans per target class.
df_plot = df.loc[:, ["NumberOfOpenCreditLinesAndLoans", "SeriousDlqin2yrs"]]

sns.boxplot(
    data=df_plot,
    x="SeriousDlqin2yrs",
    y="NumberOfOpenCreditLinesAndLoans",
    hue="SeriousDlqin2yrs",
)

plt.show()

## PCA (analysis)

Let's perform a principal component analysis (PCA) on the current data.

In [None]:
# Columns fed into the PCA; everything except the target variable.
feature_columns = [
    'RevolvingUtilizationOfUnsecuredLines',
    'age',
    'NumberOfTime30-59DaysPastDueNotWorse',
    'DebtRatio',
    'MonthlyIncome',
    'NumberOfOpenCreditLinesAndLoans',
    'NumberOfTimes90DaysLate',
    'NumberRealEstateLoansOrLines',
    'NumberOfTime60-89DaysPastDueNotWorse',
    'NumberOfDependents',
]

input_features = df[feature_columns]
target_feature = df['SeriousDlqin2yrs']

# NOTE(review): not referenced by any cell visible here (the PCA cell uses a
# literal 5) — confirm whether it is still needed.
n_PCA_features = 10

In [None]:
# Fill missing values with the column mean (SimpleImputer's default strategy,
# spelled out here). The result is a NumPy array, not a DataFrame.
imputer = SimpleImputer(strategy="mean")
input_features = imputer.fit_transform(input_features)

In [None]:
# Rescale every feature to the [0, 1] range (MinMaxScaler's default range,
# spelled out here) so no feature dominates the PCA through its units alone.
scaler = MinMaxScaler(feature_range=(0, 1))
input_features = scaler.fit_transform(input_features)

5 PCA components will be calculated and their impact shown.

In [None]:
# Project the scaled features onto five principal components.
pca = PCA(n_components=5)
component_scores = pca.fit_transform(input_features)

# Wrap the scores in a frame aligned with `df` so they can later be joined with
# the target; columns are named COL_1 ... COL_5.
component_names = [f"COL_{position}" for position in range(1, pca.n_components + 1)]
input_features = pd.DataFrame(
    component_scores,
    columns=component_names,
    index=df.index,
)

In [None]:
# Share of total variance captured by each principal component.
explained_variance = pca.explained_variance_ratio_
print(explained_variance)

# Pass x and y explicitly: a bare positional 1-D array is interpreted
# differently across seaborn versions and can collapse into a single aggregated
# bar instead of one bar per component.
component_labels = [f"COL_{position}" for position in range(1, len(explained_variance) + 1)]
ax = sns.barplot(x=component_labels, y=explained_variance)
ax.set(
    xlabel="Principal component",
    ylabel="Explained variance ratio",
    title="Explained variance per PCA component",
)
plt.show()

In [None]:
# Put the PCA scores and the target side by side (aligned on the shared index).
df_pca = pd.concat(objs=[input_features, target_feature], axis=1)

df_pca.head(n=5)

The following graph shows the relationship between the two most impactful PCA components and the target variable.

In [None]:
# Scatter of the first two principal components, one colour per target class.
targets = [0, 1]
colors = ['orange', 'blue']

f, ax = plt.subplots(1, 1)

for target, color in zip(targets, colors):
    class_rows = df_pca.loc[df_pca['SeriousDlqin2yrs'] == target]
    sns.scatterplot(
        x=class_rows['COL_1'],
        y=class_rows['COL_2'],
        color=color,
        # Attach the label to the artist itself so the legend cannot drift out
        # of sync with the draw order (as the positional `ax.legend(labels)`
        # form could).
        label=str(target),
        ax=ax,
    )

ax.set_title('First two PCA components by SeriousDlqin2yrs')
ax.legend(title='SeriousDlqin2yrs')
ax.grid()
plt.show()