# 02 - Exploratory data analysis

With help from https://www.analyticsvidhya.com/blog/2022/07/step-by-step-exploratory-data-analysis-eda-using-python/.

In [None]:
import os

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from scipy.stats import chi2_contingency
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

# Seed for NumPy's global random number generator, so reruns are reproducible.
SEED = 42
np.random.seed(SEED)

In [None]:
# Read the preprocessed German credit data from disk.
df = pd.read_csv("./data/processed/german.csv")

# "credit_risk" is the label; the covariates are every other column except "id".
y = df["credit_risk"]
X = df.drop(["credit_risk", "id"], axis=1)

# Hold out 20% of the rows, stratified on the label so that both splits keep
# the same class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=SEED,
    test_size=0.2,
)

## Statistics summary

In [None]:
# Summary statistics, transposed so that each described column is one row.
df.describe().transpose()

In [None]:
# Number of missing entries in each column.
df.isnull().sum()

There are no missing values in the data.

In [None]:
## check final counts of observations and covariates after preprocessing
# expected: 1000 observations (rows) and 20 covariates (columns)

X.shape  # (1000, 20)

In [None]:
## check for data imbalance

print(y_train.value_counts())   # 560/240
print(y_test.value_counts())    # 140/60

# class proportions over the full data set make the balance explicit
print(y.value_counts(normalize=True))   # 700/300 overall, i.e. 0.7 / 0.3

# NOTE(review): the classes are split 70/30, so the data is mildly imbalanced:
# always predicting the majority class already reaches 0.70 accuracy.
# Accuracy is kept as the model refit metric, but it should be read against
# that baseline (ideally alongside balanced accuracy or F1).

## Plotting

In [None]:
# name of the label column, reused by the plotting cells below
target = "credit_risk"

In [None]:
# Partition the covariates by dtype: numeric columns versus everything else.
numeric_features = X.select_dtypes(include=["number"]).columns
categorical_features = X.select_dtypes(exclude=["number"]).columns

# Show both column lists so the partition can be checked by eye.
print(f'Numeric features: {numeric_features}')
print(f'Categorical features: {categorical_features}')

In [None]:
# One histogram (with KDE overlay) per numeric column, coloured by credit risk.
for feature in numeric_features:
    sns.histplot(x=feature, hue="credit_risk", kde=True, data=df)
    plt.show()

In [None]:
# One count plot per categorical column, split by credit risk.
# NOTE(review): a later cell draws these same count plots with titles and a
# fixed figure size, so this cell may be redundant.
for feature in categorical_features:
    sns.countplot(x=feature, hue="credit_risk", data=df)
    plt.xticks(rotation=45)  # category labels overlap when left horizontal
    plt.show()

In [None]:
# covariates correlation heatmap (numeric covariates only)
fig, ax = plt.subplots()
sns.heatmap(df[numeric_features].corr(), annot=True, cmap="coolwarm", center=0, ax=ax)
ax.set_title("Correlation Heatmap for Numeric Features")

# savefig raises FileNotFoundError if the output directory does not exist yet
os.makedirs("imgs", exist_ok=True)
fig.savefig("imgs/corr_heatmap.png", dpi=300, bbox_inches='tight')
plt.show()

Notable correlation between duration_months and credit_amount (+0.62). This makes sense since if you want to borrow a higher amount of credit, you need a longer time to pay it off.

In [None]:
# Pairs observed to be somewhat correlated:
#   - credit_amount and duration_months
#   - residence_duration and age
#   - credit_amount and installment_rate_percent
# Only the first pair is plotted in this cell.

ax = sns.regplot(
    data=df,
    x='credit_amount',
    y='duration_months',
    scatter_kws={'alpha': 0.5},  # semi-transparent points keep overlaps visible
)
ax.set_title('Duration vs Credit Amount with Fit Line')
plt.show()

In [None]:
# Distribution of credit_amount at each installment rate, split by credit risk.
# Alternative view that was tried:
# sns.boxplot(x='residence_duration', y='credit_amount', hue='credit_risk', data=df)
ax = sns.violinplot(x='installment_rate_percent', y='credit_amount', hue='credit_risk', data=df)
ax.set_title('Credit amount by installment rate and credit risk')
plt.show()  # render the figure instead of echoing the Axes repr


In [None]:
## boxplots: numeric features vs target
# One figure per numeric column, with the target classes on the x-axis.
for feature in numeric_features:
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.boxplot(x=target, y=feature, data=df, ax=ax)
    ax.set_title(f"{feature} by Credit Risk")
    plt.show()

In [None]:
## categorical features: countplots by target
# One figure per categorical column, with bars split by the target class.
for feature in categorical_features:
    fig, ax = plt.subplots(figsize=(8, 4))
    sns.countplot(x=feature, hue=target, data=df, ax=ax)
    plt.xticks(rotation=45)  # category labels overlap when left horizontal
    ax.set_title(f"{feature} distribution by Credit Risk")
    plt.show()

In [None]:
## proportions of target per category (stacked bar chart)
for feature in categorical_features:
    # normalize="index" makes every row (category level) sum to 1
    risk_share = pd.crosstab(df[feature], df[target], normalize="index")
    ax = risk_share.plot.bar(stacked=True, figsize=(8, 4))
    ax.set_ylabel("Proportion")
    ax.set_title(f"Proportion of Credit Risk by {feature}")
    plt.show()

In [None]:
## NOTE: no longer used; superseded by the logistic regression interpretation.

## feature importance preview from a shallow decision tree
# One-hot encode once and reuse the result for both fitting and labelling.
X_encoded = pd.get_dummies(X, drop_first=True)

tree = DecisionTreeClassifier(max_depth=3, random_state=SEED)
tree.fit(X_encoded, y)

importances = pd.Series(tree.feature_importances_, index=X_encoded.columns)

# Draw on an explicit axes so the title and label do not depend on which
# figure happens to be current.
fig, ax = plt.subplots()
importances.sort_values(ascending=False).head(10).plot(kind="barh", ax=ax)
ax.set_title("Top 10 Feature Importances")
ax.set_xlabel("Feature Importance")
# plt.savefig("imgs/feature_importance.png", dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# asks: how does the credit loan amount differ based on credit risk?
# One box-plot grid per categorical column; levels are sorted so the x-axis
# order is stable between runs.
for feature in categorical_features:
    levels = sorted(df[feature].unique())
    grid = sns.catplot(
        data=df,
        kind='box',
        x=feature,
        y='credit_amount',
        hue='credit_risk',
        order=levels,
    )
    # y=1.02 lifts the title just above the plotting area
    grid.figure.suptitle(f'Credit amount by {feature} and credit risk', y=1.02)

These boxplots show how loan amounts (credit_amount) vary across different categories of each categorical feature, with credit risk (credit_risk) as the hue (0 = good, 1 = bad). Each plot highlights:

- The spread and median of loan amounts for each category.
- Differences in distributions between good vs. bad credit risk, which may indicate higher risk for certain categories.
- Outliers, representing unusually small or large loans that could be riskier or need further investigation.

Overall, these plots help identify which categorical variables and specific categories are potential predictors of credit risk based on their associated loan amounts.

Two plots that stand out:
1. Credit amount by purpose and credit risk
2. Credit amount by foreign worker and credit risk

We see that if purpose = A410 (other), the requested loan amount (credit_amount) is much higher for applicants who are a bad credit risk than for those who are a good credit risk.

From the second plot, we see that for foreign_worker=A202, the loan amount varies very little among applicants with good credit risk, with a median of about \$2500. Among applicants with bad credit risk, however, the loan amount is much more variable, with a median of about \$6000.

In [None]:
# sns.catplot(
#     x='foreign_worker',
#     y='credit_amount',
#     hue='purpose',
#     kind='box',
#     data=df,
#     order=sorted(df['foreign_worker'].unique()),
#     hue_order=sorted(df['purpose'].unique())
# )


In [None]:
# the built-in correlation heatmap only works for numerical features,
# so I'm hand-making one for the categorical features using Cramér's V!

def cramers_v(x, y):
    """Return Cramér's V, a 0-1 measure of association between two categoricals.

    Parameters
    ----------
    x, y : array-like
        Categorical observations of equal length; they are cross-tabulated
        with ``pd.crosstab``.

    Returns
    -------
    float
        ``sqrt((chi2 / n) / min(r - 1, k - 1))`` for the ``r x k`` contingency
        table, or ``nan`` when either variable has fewer than two observed
        levels (the statistic is undefined there).
    """
    contingency = pd.crosstab(x, y)
    n_rows, n_cols = contingency.shape
    min_dim = min(n_rows - 1, n_cols - 1)
    if min_dim < 1:
        # A single-level (or empty) variable would otherwise divide by zero.
        return np.nan

    # Yates' continuity correction (scipy's default for 2x2 tables) deflates
    # chi2, so a binary variable compared with itself would score below 1.
    chi2 = chi2_contingency(contingency, correction=False)[0]
    n = contingency.to_numpy().sum()
    return np.sqrt((chi2 / n) / min_dim)

# Pairwise Cramér's V between every pair of categorical features.
# The frame is created as float up front so no object-to-float cast is needed.
cramers_matrix = pd.DataFrame(
    index=categorical_features, columns=categorical_features, dtype=float
)

for col1 in categorical_features:
    for col2 in categorical_features:
        cramers_matrix.loc[col1, col2] = cramers_v(df[col1], df[col2])

fig, ax = plt.subplots(figsize=(10, 8))
# Cramér's V lies in [0, 1], so the colour scale is pinned to that range.
sns.heatmap(cramers_matrix, annot=True, cmap='Blues', vmin=0, vmax=1, ax=ax)
ax.set_title("Correlation Heatmap for Categorical Features")

# savefig raises FileNotFoundError if the output directory does not exist yet
os.makedirs("imgs", exist_ok=True)
fig.savefig("imgs/cramers_v_heatmap.png", dpi=300, bbox_inches='tight')
plt.show()