In [1]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import glob

from sklearn.linear_model import LogisticRegression
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [2]:
path = 'data/coded_data/*.csv'

frames = []

# Process each file in chunks
for fname in glob.glob(path):
    chunks = pd.read_csv(fname, chunksize=500000)  
    for chunk in chunks:
        frames.append(chunk)

all_data = pd.concat(frames)

print("Data successfully loaded in chunks.")



Data successfully loaded in chunks.


In [3]:
all_data.columns

Index(['as_of_year', 'respondent_id', 'agency_code', 'loan_type',
       'property_type', 'loan_purpose', 'owner_occupancy', 'loan_amount_000s',
       'preapproval', 'action_taken', 'msamd', 'state_code', 'county_code',
       'census_tract_number', 'applicant_ethnicity', 'co_applicant_ethnicity',
       'applicant_race_1', 'applicant_race_2', 'applicant_race_3',
       'applicant_race_4', 'applicant_race_5', 'co_applicant_race_1',
       'co_applicant_race_2', 'co_applicant_race_3', 'co_applicant_race_4',
       'co_applicant_race_5', 'applicant_sex', 'co_applicant_sex',
       'applicant_income_000s', 'purchaser_type', 'denial_reason_1',
       'denial_reason_2', 'denial_reason_3', 'rate_spread', 'hoepa_status',
       'lien_status', 'edit_status', 'sequence_number', 'population',
       'minority_population', 'hud_median_family_income',
       'tract_to_msamd_income', 'number_of_owner_occupied_units',
       'number_of_1_to_4_family_units', 'application_date_indicator'],
      dtyp

In [4]:
def map_loan_approval(action):
    if action == 1:
        return 1
    elif action in [2, 4, 5]:
        return 2
    elif action == 3:
        return 0
    else:
        return None  # All values that are dealing w/ preapproval and other inconclusive outcomes go here

filtered_data = all_data
filtered_data['loan_approval'] = filtered_data['action_taken'].apply(map_loan_approval)

In [5]:
# Prune data

# Prune all property types except for 'One-to-four family dwelling'
filtered_data = filtered_data[filtered_data['property_type'] == 1]

# Only include loans that are either approved or denied
filtered_data = filtered_data[(filtered_data['loan_approval'] == 0) | (filtered_data['loan_approval'] == 1)]

# Only inlude home purchases (exclude refi)
filtered_data = filtered_data[filtered_data['loan_purpose'] == 1]

In [6]:
print(all_data.shape, filtered_data.shape)

(516205, 46) (70395, 46)


In [16]:
# Column Filtering

X_fair = filtered_data[['as_of_year', 'loan_type',
       'owner_occupancy', 'loan_amount_000s',
       'preapproval', 'applicant_income_000s', 'rate_spread', 'hud_median_family_income',
       'tract_to_msamd_income', 'number_of_owner_occupied_units',
       'number_of_1_to_4_family_units']].fillna(0)

y = filtered_data['loan_approval']

In [18]:
print(X_fair.isnull().any())

as_of_year                        False
loan_type                         False
owner_occupancy                   False
loan_amount_000s                  False
preapproval                       False
applicant_income_000s             False
rate_spread                       False
hud_median_family_income          False
tract_to_msamd_income             False
number_of_owner_occupied_units    False
number_of_1_to_4_family_units     False
dtype: bool


In [20]:
X_train, X_test, y_train, y_test = train_test_split(X_fair, y, test_size=0.3, random_state=42, stratify=y)
model = LogisticRegression(max_iter=1000, class_weight='balanced')

model.fit(X_train, y_train)
y_pred = model.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         0.0       0.18      0.61      0.27      2668
         1.0       0.91      0.58      0.71     18451

    accuracy                           0.59     21119
   macro avg       0.54      0.60      0.49     21119
weighted avg       0.82      0.59      0.66     21119



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
