In [21]:
import pandas as pd

import numpy as np

from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC


from utils import remove_top_quantile


import pickle

Below is a data loading/transformation pipeline which will be also used in notebook part 4.

In [22]:
# load data
df = pd.read_csv('../data/GiveMeSomeCredit-training.csv')
df = df.drop(["Unnamed: 0"], axis=1)

# impute missing values
imputer = SimpleImputer(strategy='median')

columns = df.columns
index = df.index    
df = pd.DataFrame(imputer.fit_transform(df))
df.columns = columns
df.index = index

# outlier removal
df = remove_top_quantile(df, "RevolvingUtilizationOfUnsecuredLines", 0.99)
df = remove_top_quantile(df, "DebtRatio", 0.99)
df = remove_top_quantile(df, "MonthlyIncome", 0.99)

# feature engineering
def f(a):
    # never late
    if a["NumberOfTime30-59DaysPastDueNotWorse"] == 0 and \
        a["NumberOfTime60-89DaysPastDueNotWorse"] == 0 and \
        a["NumberOfTimes90DaysLate"] == 0:
            return 0
    # 30-59 late
    if a["NumberOfTime30-59DaysPastDueNotWorse"] != 0 and \
        a["NumberOfTime60-89DaysPastDueNotWorse"] == 0 and \
        a["NumberOfTimes90DaysLate"] == 0:
            return 1
    # 60-89 late
    if a["NumberOfTime30-59DaysPastDueNotWorse"] != 0 and \
        a["NumberOfTime60-89DaysPastDueNotWorse"] != 0 and \
        a["NumberOfTimes90DaysLate"] == 0:
            return 2
    # 90+ late
    return 3


df["PastDueSevereness"] = df.apply(f, axis=1)

# drop
df = df.drop(["NumberOfTime30-59DaysPastDueNotWorse", "NumberOfTime30-59DaysPastDueNotWorse", "NumberOfTime30-59DaysPastDueNotWorse"], axis=1)

# scaling
scaler = StandardScaler()
df[[
    'age',
    'NumberOfDependents',
    'MonthlyIncome',
    'DebtRatio', 
    'RevolvingUtilizationOfUnsecuredLines', 
    'NumberOfOpenCreditLinesAndLoans', 
    'NumberRealEstateLoansOrLines'
]] = scaler.fit_transform(
    df[[
        'age',
        'NumberOfDependents',
        'MonthlyIncome',
        'DebtRatio', 
        'RevolvingUtilizationOfUnsecuredLines', 
        'NumberOfOpenCreditLinesAndLoans', 
        'NumberRealEstateLoansOrLines'
    ]])


In [23]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 135840 entries, 0 to 139998
Data columns (total 11 columns):
 #   Column                                Non-Null Count   Dtype  
---  ------                                --------------   -----  
 0   SeriousDlqin2yrs                      135840 non-null  float64
 1   RevolvingUtilizationOfUnsecuredLines  135840 non-null  float64
 2   age                                   135840 non-null  float64
 3   DebtRatio                             135840 non-null  float64
 4   MonthlyIncome                         135840 non-null  float64
 5   NumberOfOpenCreditLinesAndLoans       135840 non-null  float64
 6   NumberOfTimes90DaysLate               135840 non-null  float64
 7   NumberRealEstateLoansOrLines          135840 non-null  float64
 8   NumberOfTime60-89DaysPastDueNotWorse  135840 non-null  float64
 9   NumberOfDependents                    135840 non-null  float64
 10  PastDueSevereness                     135840 non-null  int64  
dtypes: fl

In [24]:
# load data
df_test = pd.read_csv('../data/GiveMeSomeCredit-testing.csv')
df_test = df_test.drop(["Unnamed: 0"], axis=1)

# impute
columns = df_test.columns
index = df_test.index    
df_test = pd.DataFrame(imputer.transform(df_test))
df_test.columns = columns
df_test.index = index

# apply new feature
df_test["PastDueSevereness"] = df_test.apply(f, axis=1)

# drop
df_test = df_test.drop(["NumberOfTime30-59DaysPastDueNotWorse", "NumberOfTime30-59DaysPastDueNotWorse", "NumberOfTime30-59DaysPastDueNotWorse"], axis=1)

# scale
df_test[[
    'age',
    'NumberOfDependents',
    'MonthlyIncome',
    'DebtRatio', 
    'RevolvingUtilizationOfUnsecuredLines', 
    'NumberOfOpenCreditLinesAndLoans', 
    'NumberRealEstateLoansOrLines'
]] = scaler.transform(
    df_test[[
        'age',
        'NumberOfDependents',
        'MonthlyIncome',
        'DebtRatio', 
        'RevolvingUtilizationOfUnsecuredLines', 
        'NumberOfOpenCreditLinesAndLoans', 
        'NumberRealEstateLoansOrLines'
    ]])

In [25]:
X_train = df[[
    "RevolvingUtilizationOfUnsecuredLines",
    "age",	
    "DebtRatio",	
    "MonthlyIncome",	
    "NumberOfOpenCreditLinesAndLoans",	
    "NumberRealEstateLoansOrLines", 
    "NumberOfDependents", 	
    "PastDueSevereness"
]]

y_train = df["SeriousDlqin2yrs"]

X_test = df_test[[
    "RevolvingUtilizationOfUnsecuredLines",
    "age",	
    "DebtRatio",	
    "MonthlyIncome",	
    "NumberOfOpenCreditLinesAndLoans",	
    "NumberRealEstateLoansOrLines", 
    "NumberOfDependents", 	
    "PastDueSevereness"
]]

y_test = df_test["SeriousDlqin2yrs"]

### Baseline logistic regression model

In [26]:
logreg = LogisticRegression()

logreg.fit(X_train, y=y_train)

y_pred = logreg.predict(X_train)

accuracy = accuracy_score(y_train, y_pred)
precision = precision_score(y_train, y_pred)
recall = recall_score(y_train, y_pred)
f1 = f1_score(y_train, y_pred)

print(f"{accuracy=}, {precision=}, {recall=}, {f1=}")

accuracy=0.9357479387514723, precision=0.471047227926078, recall=0.13357400722021662, f1=0.20812919615314826


We can see very low recall rate, it stems from class imbalance. Lets try to make a model with "balanced" class_weight parameter in an attempt to tackle class imbalance.

In [27]:
logreg = LogisticRegression(class_weight="balanced")

logreg.fit(X_train, y=y_train)

y_pred = logreg.predict(X_train)

accuracy = accuracy_score(y_train, y_pred)
precision = precision_score(y_train, y_pred)
recall = recall_score(y_train, y_pred)
f1 = f1_score(y_train, y_pred)

print(f"{accuracy=}, {precision=}, {recall=}, {f1=}")

accuracy=0.7970995288574794, precision=0.2017353579175705, recall=0.7472924187725631, f1=0.31770472323992477


With balanced class weights we have much more restrictive model. Recall is significantly higher while precision is halved. F1 score still remains quite low.