In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [77]:
from sklearn.ensemble import HistGradientBoostingClassifier

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, RocCurveDisplay
from sklearn.metrics import PrecisionRecallDisplay
from sklearn import metrics
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.compose import (
    make_column_transformer,
    make_column_selector,
    ColumnTransformer,
)

from sklearn.metrics import classification_report

from sklearn.dummy import DummyClassifier
from sklearn.feature_selection import chi2
from sklearn.impute import SimpleImputer

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier, plot_tree

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Input, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.utils import to_categorical
from tensorflow import keras

from sklearn import set_config

set_config(display="diagram")

#### Auxiliary variables

In [3]:
filename_input = "data-hdma-eastbay-2019-2022-processed"
filename_output = "data-hdma-eastbay-2019-2022-sklearn"

#### Loading the processed data

In [4]:
df = pd.read_csv(f"../../{filename_input}.csv", index_col=0)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 222872 entries, 0 to 222871
Data columns (total 37 columns):
 #   Column                             Non-Null Count   Dtype  
---  ------                             --------------   -----  
 0   debt_to_income_ratio_df            222872 non-null  float64
 1   property_value                     222872 non-null  float64
 2   loan_term                          222872 non-null  float64
 3   loan_to_value_ratio                222872 non-null  float64
 4   loan_amount                        222872 non-null  float64
 5   income                             222872 non-null  float64
 6   tract_minority_population_percent  222872 non-null  float64
 7   tract_to_msa_income_percentage     222872 non-null  float64
 8   tract_population                   222872 non-null  float64
 9   tract_one_to_four_family_homes     222872 non-null  float64
 10  tract_median_age_of_housing_units  222872 non-null  float64
 11  tract_owner_occupied_units         222872 no

## 3. Preparing for modeling

### 3.1. Lists of feature names

In [6]:
tract_features = [x for x in df.columns if x.startswith("tract_")]

numerical_loan_features = [
    "debt_to_income_ratio",
    "debt_to_income_ratio_df",
    "loan_term",
    "loan_to_value_ratio",
]
numerical_currency_features = ["property_value", "income", "loan_amount"]
numerical_sqrt_currency_features = [
    "sqrt_income",
    "sqrt_property_value",
    "sqrt_loan_amount",
]
categorical_applicant_features = [
    "applicant_sex",
    "applicant_race",
    "applicant_ethnicity",
    "applicant_age",
]
categorical_coapplicant_features = [
    "coapplicant_sex",
    "coapplicant_race",
    "coapplicant_ethnicity",
    "coapplicant_age",
]
categorical_loan_features = [
    "conforming_loan_limit",
    "occupancy_type",
    # "hoepa_status",
    "lien_status",
    "loan_purpose",
    "loan_type",
]
binary_features = [
    "applicant_age_above_62",
    "interest_only_payment",
    "business_or_commercial_purpose",
    "open-end_line_of_credit",
]

derived_features = [
    "derived_sex",
    "derived_race",
    "derived_ethnicity",
    "derived_loan_product_type",
]

target_related_features = ["denial_reason"]

target_feature = "application_outcome"

### 3.3. X,y

In [15]:
X = df[
    numerical_loan_features
    + numerical_currency_features
    # + numerical_sqrt_currency_features
    + categorical_applicant_features
    + categorical_coapplicant_features
    + categorical_loan_features
    + binary_features
    # + derived_features
]
y = df[target_feature]
y = y.map({'Approved': 1, 'Denied': 0})

categorical_columns = X.select_dtypes(include=object).columns
numerical_columns = X.select_dtypes(include=float).columns

print(categorical_columns)
print(numerical_columns)

X.loc[:, categorical_columns] = X.loc[:, categorical_columns].astype('category')

Index(['applicant_sex', 'applicant_race', 'applicant_ethnicity',
       'applicant_age', 'coapplicant_sex', 'coapplicant_race',
       'coapplicant_ethnicity', 'coapplicant_age', 'conforming_loan_limit',
       'occupancy_type', 'lien_status', 'loan_purpose', 'loan_type',
       'applicant_age_above_62', 'interest_only_payment',
       'business_or_commercial_purpose', 'open-end_line_of_credit'],
      dtype='object')
Index(['debt_to_income_ratio', 'debt_to_income_ratio_df', 'loan_term',
       'loan_to_value_ratio', 'property_value', 'income', 'loan_amount'],
      dtype='object')


In [76]:
to_categorical(y)

array([[1., 0.],
       [1., 0.],
       [0., 1.],
       ...,
       [0., 1.],
       [0., 1.],
       [0., 1.]])

In [16]:
X.info()

<class 'pandas.core.frame.DataFrame'>
Index: 222872 entries, 0 to 222871
Data columns (total 24 columns):
 #   Column                          Non-Null Count   Dtype  
---  ------                          --------------   -----  
 0   debt_to_income_ratio            222872 non-null  float64
 1   debt_to_income_ratio_df         222872 non-null  float64
 2   loan_term                       222872 non-null  float64
 3   loan_to_value_ratio             222872 non-null  float64
 4   property_value                  222872 non-null  float64
 5   income                          222872 non-null  float64
 6   loan_amount                     222872 non-null  float64
 7   applicant_sex                   222872 non-null  object 
 8   applicant_race                  222872 non-null  object 
 9   applicant_ethnicity             222872 non-null  object 
 10  applicant_age                   222872 non-null  object 
 11  coapplicant_sex                 222872 non-null  object 
 12  coapplicant_race     

In [17]:
def total_column_transformer():
    return ColumnTransformer(
        [
            ("ohe", OneHotEncoder(sparse_output=False, drop = "if_binary"), categorical_columns),
        ],
        remainder=StandardScaler(),
        verbose_feature_names_out=False,
    )
    
total_preprocessor = total_column_transformer().set_output(transform="pandas")
X_transformed = total_preprocessor.fit_transform(X)

### 3.4. train/test split

In [18]:
X_train, X_test, y_train, y_test = train_test_split(
    X_transformed, y, test_size=0.3, stratify=y, random_state=42
)

In [81]:
y.value_counts(normalize=True)

application_outcome
1    0.659244
0    0.340756
Name: proportion, dtype: float64

## 4. Modeling

In [87]:
model = Sequential([
    Dense(200, activation='relu'),
    Dense(1, activation='sigmoid'),
])
# model.compile(optimizer="adam", loss = 'bce', metrics = ['acc', 'precision', 'recall'])
model.compile(optimizer="adam", loss = keras.losses.BinaryFocalCrossentropy(
    apply_class_balancing=True,
    alpha=0.4,
    name="binary_focal_crossentropy",
), metrics = ['acc', 'precision', 'recall'])
model.summary()

In [88]:
history = model.fit(X_train, y_train, epochs=500, validation_data=(X_test, y_test))

Epoch 1/500
[1m4876/4876[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 2ms/step - acc: 0.7709 - loss: 0.0597 - precision: 0.8178 - recall: 0.8377 - val_acc: 0.7935 - val_loss: 0.0561 - val_precision: 0.8291 - val_recall: 0.8652
Epoch 2/500
[1m4876/4876[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 2ms/step - acc: 0.7967 - loss: 0.0558 - precision: 0.8301 - recall: 0.8711 - val_acc: 0.7961 - val_loss: 0.0554 - val_precision: 0.8316 - val_recall: 0.8662
Epoch 3/500
[1m4876/4876[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 2ms/step - acc: 0.7980 - loss: 0.0552 - precision: 0.8319 - recall: 0.8700 - val_acc: 0.7856 - val_loss: 0.0556 - val_precision: 0.8452 - val_recall: 0.8261
Epoch 4/500
[1m4876/4876[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 2ms/step - acc: 0.8006 - loss: 0.0547 - precision: 0.8344 - recall: 0.8707 - val_acc: 0.7961 - val_loss: 0.0551 - val_precision: 0.8335 - val_recall: 0.8630
Epoch 5/500
[1m4876/4876[0m [32m━━━━━━━━━━━━━━━━

In [89]:
model.summary()

In [None]:
plt.figure(figsize=(4,3))
plt.plot(model.validation_score_, label='validation')
plt.plot(model.train_score_, label='training')
plt.legend()
plt.show()

In [72]:
predictions = np.where(model.predict(X_test) > 0.5, 1, 0)
print(classification_report(y_test, predictions))

[1m2090/2090[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step


              precision    recall  f1-score   support

           0       0.79      0.60      0.68     22784
           1       0.82      0.92      0.86     44078

    accuracy                           0.81     66862
   macro avg       0.80      0.76      0.77     66862
weighted avg       0.81      0.81      0.80     66862

