# 05 - Interpreting logistic regression model

In [None]:
import numpy as np
import pandas as pd
import joblib

from sklearn.model_selection import train_test_split

# Single seed shared by NumPy's global RNG and the train/test split below,
# so that re-running the notebook reproduces the same results.
SEED = 42
np.random.seed(SEED)

In [None]:
# Read the processed dataset, then separate the predictors from the target.
df = pd.read_csv("./data/processed/german.csv")

# "id" is dropped together with the target so it is not used as a predictor.
X = df.drop(columns=["credit_risk", "id"])
y = df["credit_risk"]

# Hold out 20% of the rows; stratifying on y keeps the class proportions
# the same in both parts, and SEED makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=SEED,
    stratify=y,
)

In [None]:
# Load the persisted estimators from disk, keyed by model name.
# NOTE: joblib.load unpickles the file, which can execute arbitrary code,
# so only load model files that come from a trusted source.
model_names = ["logistic_regression"]
loaded_models = {
    model_name: joblib.load(f"saved_models/{model_name}.pkl")
    for model_name in model_names
}

lr_model = loaded_models["logistic_regression"]

## Interpretation

In [None]:
## interpretation

# Unpack the best pipeline selected by the loaded search object
# (presumably a fitted GridSearchCV-style object — it exposes best_estimator_)
# into its fitted preprocessing step and its fitted classifier.
best_pipeline = lr_model.best_estimator_
preprocessor = best_pipeline.named_steps["preprocess"]
model = best_pipeline.named_steps["model"]

# Rebuild the feature names seen by the classifier: the numeric columns
# followed by the encoded categorical columns.
# NOTE(review): this assumes the preprocessor outputs the numeric columns
# first and the "cat" transformer's columns second, and that it selects
# columns with the same dtype rule used here — confirm against the pipeline
# definition, otherwise names would be attached to the wrong coefficients.
numeric_features = X.select_dtypes(include="number").columns
categorical_features = X.select_dtypes(exclude="number").columns
cat_features_encoded = (
    preprocessor.named_transformers_["cat"]
    .get_feature_names_out(categorical_features)
)
all_feature_names = np.concatenate([numeric_features, cat_features_encoded])

# First row of coef_ (for a binary target this is presumably the only row).
coef = model.coef_[0]

# Fail early, with a readable message, if the reconstructed names cannot be
# paired one-to-one with the fitted coefficients.
assert len(all_feature_names) == len(coef), (
    f"feature-name/coefficient mismatch: {len(all_feature_names)} names "
    f"vs {len(coef)} coefficients"
)

# Coefficients are on the log-odds scale; exp() turns them into odds ratios.
# Sorted so the features with the largest odds ratio come first.
coef_df = pd.DataFrame({
    "feature": all_feature_names,
    "coefficient": coef,
    "odds_ratio": np.exp(coef),
}).sort_values(by="odds_ratio", ascending=False)

print(coef_df)

In [None]:
#                          feature  coefficient  odds_ratio
# 38                 property_A124     0.778540    2.178290
# 20                   purpose_A46     0.503262    1.654108
# 2       installment_rate_percent     0.352686    1.422884
# 0                duration_months     0.325862    1.385225
# 1                  credit_amount     0.299864    1.349676
# 34            other_debtors_A102     0.293565    1.341201
# 37                 property_A123     0.182374    1.200063
# 27       employment_duration_A72     0.166939    1.181682
# 44                      job_A173     0.138598    1.148663
# 43                      job_A172     0.118969    1.126335
# 19                   purpose_A45     0.109644    1.115880
# 6                     dependents     0.109456    1.115670
# 5               existing_credits     0.105305    1.111050
# 36                 property_A122     0.100998    1.106275
# 10            credit_history_A31     0.081256    1.084649
# 45                      job_A174     0.069202    1.071653
# 3             residence_duration     0.017244    1.017394
# 4                            age    -0.051939    0.949387
# 31       personal_status_sex_A92    -0.134229    0.874390
# 30       employment_duration_A75    -0.174347    0.840005
# 23            savings_status_A62    -0.181331    0.834159
# 11            credit_history_A32    -0.194593    0.823170
# 28       employment_duration_A73    -0.209255    0.811188
# 18                   purpose_A44    -0.219844    0.802644
# 33       personal_status_sex_A94    -0.239679    0.786880
# 16                   purpose_A42    -0.276227    0.758641
# 39  other_installment_plans_A142    -0.295640    0.744055
# 12            credit_history_A33    -0.296046    0.743753
# 7            checking_status_A12    -0.300267    0.740621
# 46                telephone_A192    -0.316243    0.728882
# 21                   purpose_A48    -0.354349    0.701630
# 22                   purpose_A49    -0.376726    0.686104
# 41                  housing_A152    -0.421097    0.656327
# 24            savings_status_A63    -0.434718    0.647447
# 35            other_debtors_A103    -0.449359    0.638037
# 15                  purpose_A410    -0.541870    0.581660
# 8            checking_status_A13    -0.551553    0.576054
# 17                   purpose_A43    -0.583763    0.557796
# 29       employment_duration_A74    -0.634736    0.530075
# 32       personal_status_sex_A93    -0.699264    0.496951
# 40  other_installment_plans_A143    -0.708471    0.492397
# 42                  housing_A153    -0.761912    0.466773
# 47           foreign_worker_A202    -0.764323    0.465649
# 26            savings_status_A65    -0.933211    0.393289
# 25            savings_status_A64    -0.966711    0.380332
# 13            credit_history_A34    -0.984557    0.373605
# 14                   purpose_A41    -0.988848    0.372005
# 9            checking_status_A14    -1.446513    0.235390

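An odds ratio above 1 raises the predicted odds of failing to repay, and an odds ratio below 1 lowers them; for the encoded categorical features, the comparison is against the category of that feature that does not appear in the table. From the coefficients and odds ratios above, the logistic regression model indicates that: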

- Customers with longer loan durations, higher credit amounts, or higher installment rates are more likely to fail to repay their loans. 
- Some features are associated with a lower risk of default (customers with these features are less likely to fail to repay):
  - Having checking account status A14
  - Taking a loan for purpose A41
- On the other hand, certain features increase risk:
  - Owning property type A124 makes a customer more likely to fail to repay.
- Among the numeric features, age has only a small protective effect (odds ratio ≈ 0.95, so older customers are slightly less likely to default), whereas longer loan durations raise the risk more noticeably (odds ratio ≈ 1.39).

In [None]:
## aside: total number of features

# Collect the counts in one mapping so that all four are shown; written as
# separate bare expressions, only the final len(...) would be rendered by
# the notebook and the first three would be silently discarded.
# The trailing comments record the values from the original run
# (48 features in total after encoding).
feature_counts = {
    "numeric": len(numeric_features),                            # 7
    "categorical (before encoding)": len(categorical_features),  # 13
    "categorical (after encoding)": len(cat_features_encoded),   # 41
    "total (after encoding)": len(all_feature_names),            # 48
}

# The total must equal the numeric columns plus the encoded categorical columns.
assert feature_counts["total (after encoding)"] == (
    feature_counts["numeric"] + feature_counts["categorical (after encoding)"]
)

feature_counts