# 02 - Exploratory data analysis

With help from https://www.analyticsvidhya.com/blog/2022/07/step-by-step-exploratory-data-analysis-eda-using-python/.

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

# One seed for the whole notebook: it seeds NumPy's global RNG here and is
# passed as `random_state` wherever a seeded call is made further down.
SEED = 42
np.random.seed(SEED)

In [None]:
# Read the processed data file into a single frame.
df = pd.read_csv("./data/processed/german.csv")

# X holds every column except the label and the "id" column; y is the label.
X = df.drop(columns=["credit_risk", "id"])
y = df["credit_risk"]

# 80/20 hold-out split, stratified on the label and seeded so that it is
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=SEED,
)

## Statistics summary

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max), transposed so
# that every described column becomes one row.
# NOTE(review): this runs on df, which still contains "id" and "credit_risk"
# (they are only dropped from X), so those columns may appear here as well.
df.describe().T

`duration_months`

Interpretation:
- Most loans last 12–24 months (IQR).
- A few loans are very long (up to 72 months).
- Distribution is likely right-skewed (mean > median), meaning a few very long loans increase the mean.


**TODO:** add interpretations for the remaining features.

In [None]:
# Number of missing (NaN) entries in each column of df.
df.isna().sum()

There are no missing values in the data.

In [None]:
## Check the number of observations and covariates left after preprocessing.
## X is df without the "credit_risk" and "id" columns.

X.shape  # (1000, 20)

# That is:
# - 1000 observations (rows)
# - 20 covariates (columns)

In [None]:
## Check the class balance of the target in both splits.

# Print absolute counts and relative frequencies, so the class ratio can be
# read off directly instead of being derived by hand from the counts.
for split_name, labels in (("train", y_train), ("test", y_test)):
    print(f"--- {split_name} ---")
    print(labels.value_counts())                # recorded: 560/240 (train), 140/60 (test)
    print(labels.value_counts(normalize=True))

# 560/240 and 140/60 are both a 70/30 split, i.e. the classes are moderately
# imbalanced rather than balanced. Accuracy can still serve as the model refit
# metric, but a model that always predicts the majority class already reaches
# 0.70 accuracy, so scores must be judged against that baseline (and ideally
# next to a metric that is sensitive to the minority class, such as balanced
# accuracy or F1).

## Plotting

In [None]:
# Name of the label column in df; the plotting cells below refer to it.
target = "credit_risk"

In [None]:
# Partition the covariates in X by dtype: numeric columns on one side,
# every remaining column on the other.
categorical_features = X.select_dtypes(exclude=["number"]).columns
numeric_features = X.select_dtypes(include=["number"]).columns

print(f'Numeric features: {numeric_features}')
print(f'Categorical features: {categorical_features}')

In [None]:
# Histogram (with KDE overlay) of every numeric covariate, coloured by the
# target class. Uses the shared `target` name, a fixed figure size and a
# title, in line with the other per-feature plotting cells in this notebook.
for col in numeric_features:
    plt.figure(figsize=(6,4))
    sns.histplot(data=df, x=col, hue=target, kde=True)
    plt.title(f"{col} distribution by Credit Risk")
    plt.show()

In [None]:
# Quick look: per-class counts for each categorical covariate.
# NOTE(review): a later cell draws the same count plots again with a fixed
# figure size and a title; one of the two cells is redundant.
for feature in categorical_features:
    sns.countplot(x=feature, hue="credit_risk", data=df)
    plt.xticks(rotation=45)  # tilt the category labels
    plt.show()

In [None]:
# Pairwise correlations between the numeric covariates, shown as an annotated
# heatmap whose colour scale is centred on zero.
corr_matrix = df[numeric_features].corr()

plt.figure(figsize=(8, 6))
sns.heatmap(corr_matrix, cmap="coolwarm", center=0, annot=True)
plt.title("Correlation Heatmap (Numeric Features)")
plt.show()

In [None]:
## Box plots: one figure per numeric feature, one box per target class.
for feature in numeric_features:
    plt.figure(figsize=(6, 4))
    sns.boxplot(x=target, y=feature, data=df)
    plt.title(f"{feature} by Credit Risk")
    plt.show()

In [None]:
## Count plots: one figure per categorical feature, bars split by target class.
for feature in categorical_features:
    plt.figure(figsize=(8, 4))
    sns.countplot(x=feature, hue=target, data=df)
    plt.xticks(rotation=45)  # tilt the category labels
    plt.title(f"{feature} distribution by Credit Risk")
    plt.show()

In [None]:
## Share of each target class within every category level (stacked bar chart).
for feature in categorical_features:
    # Row-normalised contingency table: the shares of each category level sum to 1.
    prop_df = pd.crosstab(index=df[feature], columns=df[target], normalize="index")
    prop_df.plot.bar(stacked=True, figsize=(8, 4))
    plt.ylabel("Proportion")
    plt.title(f"Proportion of Credit Risk by {feature}")
    plt.show()

In [None]:
## Feature importance preview: fit a shallow decision tree on the one-hot
## encoded covariates and plot the ten largest impurity-based importances.
from sklearn.tree import DecisionTreeClassifier

# Reuse X and y from the data-loading cell instead of rebuilding them from df.
# Rebuilding with df.drop(columns=[target]) kept the "id" column as a feature
# and silently overwrote the X/y that the train/test split was derived from.
# Encode once and reuse the result for both fitting and labelling.
X_encoded = pd.get_dummies(X, drop_first=True)

# Depth 3 keeps the tree small: this is a rough preview, not a tuned model.
tree = DecisionTreeClassifier(max_depth=3, random_state=SEED)
tree.fit(X_encoded, y)

importances = pd.Series(tree.feature_importances_, index=X_encoded.columns)
importances.sort_values(ascending=False).head(10).plot(kind="barh", figsize=(8,4))
plt.xlabel("Importance")
plt.title("Top 10 Feature Importances (Shallow Decision Tree)")
plt.show()