# Home Credit Default Risk Dataset

In [None]:
# imports

import pandas as pd
import numpy as np
import altair as alt
import re
import gc
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split # need to do this from scratch
from sklearn.preprocessing import StandardScaler
from sklearn import svm

from google.colab import drive
# Mount Google Drive so the /content/drive/... paths used by the read_csv calls below resolve.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Income Bracket Distribution by Loan Status Visualization

In [None]:
# Load the application_train table from the mounted Drive folder.
app_train = pd.read_csv("/content/drive/MyDrive/home-credit-default-risk/application_train.csv")

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,100002,1,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
1,100003,0,Cash loans,F,N,N,0,270000.0,1293502.5,35698.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
2,100004,0,Revolving loans,M,Y,Y,0,67500.0,135000.0,6750.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
3,100006,0,Cash loans,F,N,Y,0,135000.0,312682.5,29686.5,...,0,0,0,0,,,,,,
4,100007,0,Cash loans,M,N,Y,0,121500.0,513000.0,21865.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# define income brackets and labels
# NOTE: the last bin edge is the observed maximum income, so this assumes that
# maximum is above 500000 (pd.cut needs strictly increasing bin edges).
income_brackets = [0, 50000, 100000, 150000, 200000, 250000, 300000, 400000, 500000, app_train['AMT_INCOME_TOTAL'].max()]
income_labels = ['$0-50k', '$50k-100k', '$100k-150k', '$150k-200k', '$200k-250k', '$250k-300k', '$300k-400k', '$400k-500k', '$500k+']
app_train['Income Bracket'] = pd.cut(app_train['AMT_INCOME_TOTAL'], bins=income_brackets, labels=income_labels)

# count applicants for every (Income Bracket, TARGET) pair.
# 'Income Bracket' is categorical, so observed=False is passed explicitly: it keeps
# empty brackets in the table and avoids the pandas FutureWarning about the default.
income_dist = (
    app_train
    .groupby(['Income Bracket', 'TARGET'], observed=False)['AMT_INCOME_TOTAL']
    .count()
    .reset_index()
)
income_dist.columns = ['Income Bracket', 'TARGET', 'Count']

# map TARGET values to descriptive labels
income_dist['TARGET'] = income_dist['TARGET'].map({0: 'Repayer', 1: 'Defaulter'})

# share of each loan-status group that falls in each bracket
# (normalised within TARGET, so every stacked bar adds up to 100%)
income_dist['Percentage'] = income_dist.groupby('TARGET')['Count'].transform(lambda x: x / x.sum() * 100)

# plot as stacked bar chart: one bar per loan status, stacked by income bracket
stacked_bar_chart = alt.Chart(income_dist).mark_bar().encode(
    x=alt.X('TARGET:N', title='Loan Status', axis=alt.Axis(labelAngle=0)),  # keep x labels horizontal
    y=alt.Y('Percentage:Q', title='Income Distribution (%)', scale=alt.Scale(domain=[0, 100])),  # fix y-axis to [0, 100]
    color=alt.Color('Income Bracket:N', title='Income Bracket', scale=alt.Scale(domain=income_labels)),
    tooltip=['Income Bracket', 'Percentage']
).properties(
    title=alt.TitleParams(text="Income Bracket Distribution by Loan Status", fontSize=20),
    width=400,
    height=400
).configure_axis(
    labelFontSize=12,
    titleFontSize=14
).configure_legend(
    titleFontSize=14,
    labelFontSize=12
)

stacked_bar_chart

  income_dist = app_train.groupby(['Income Bracket', 'TARGET'])['AMT_INCOME_TOTAL'].count().reset_index()
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)


This visualization was created for our proposal to show that the income bracket distributions are very similar across loan statuses.

## METRICS

In [None]:
# accuracy

def accuracy(y_true, y_pred):
  """Return the fraction of predictions that equal the true labels.

  Both inputs are converted to flat NumPy arrays first, so lists, arrays and
  pandas Series are compared by position (Series index labels are ignored).

  Args:
    y_true: array-like of true labels.
    y_pred: array-like of predicted labels, same length as y_true.

  Returns:
    float in [0, 1].

  Raises:
    ValueError: if the inputs differ in length or are empty.
  """
  y_true = np.asarray(y_true).ravel()
  y_pred = np.asarray(y_pred).ravel()
  if y_true.shape != y_pred.shape:
    raise ValueError(f"y_true and y_pred must have the same length, got {y_true.size} and {y_pred.size}")
  if y_true.size == 0:
    raise ValueError("accuracy is undefined for empty input")
  # mean of a boolean array == proportion of True values
  return float(np.mean(y_true == y_pred))

In [None]:
# f1

def f1(y_true, y_pred):
  """Return the F1 score for binary labels, treating 1 as the positive class.

  Inputs are converted to flat NumPy arrays first so that lists, arrays and
  pandas Series are all compared element by element, by position.

  Args:
    y_true: array-like of true labels (0 or 1).
    y_pred: array-like of predicted labels (0 or 1), same length as y_true.

  Returns:
    The harmonic mean of precision and recall; 0 when precision or recall is
    undefined (no predicted positives / no true positives) or both are 0.

  Raises:
    ValueError: if the inputs differ in length.
  """
  y_true = np.asarray(y_true).ravel()
  y_pred = np.asarray(y_pred).ravel()
  if y_true.shape != y_pred.shape:
    raise ValueError(f"y_true and y_pred must have the same length, got {y_true.size} and {y_pred.size}")

  tp = np.sum((y_true == 1) & (y_pred == 1))
  fp = np.sum((y_true == 0) & (y_pred == 1))
  fn = np.sum((y_true == 1) & (y_pred == 0))

  precision = tp / (tp + fp) if (tp + fp) > 0 else 0 # no division by 0
  recall = tp / (tp + fn) if (tp + fn) > 0 else 0
  if (precision + recall) > 0:
    return float(2 * (precision * recall) / (precision + recall))
  return 0

In [None]:
# roc-auc

def roc_auc(y_true, y_proba):
  """Return the area under the ROC curve for binary labels (1 = positive).

  The curve is built by sweeping the decision threshold from the highest score
  down to the lowest. It starts at (FPR, TPR) = (0, 0) and adds one point per
  distinct score, so tied scores share a single point. The area is computed
  with the trapezoidal rule.

  Args:
    y_true: array-like of true labels (0 or 1).
    y_proba: array-like of scores for the positive class, same length as
      y_true; higher means "more likely positive".

  Returns:
    float in [0, 1]. Returns 0.0 when y_true contains only one class, because
    the curve is undefined there.

  Raises:
    ValueError: if the inputs differ in length.
  """
  y_true = np.asarray(y_true).ravel()
  y_proba = np.asarray(y_proba, dtype=float).ravel()
  if y_true.shape != y_proba.shape:
    raise ValueError(f"y_true and y_proba must have the same length, got {y_true.size} and {y_proba.size}")

  # Total positives and negatives
  P = np.sum(y_true == 1)
  N = np.sum(y_true == 0)
  if P == 0 or N == 0:
    return 0.0

  # Sort by descending score; a stable sort keeps the result deterministic
  order = np.argsort(-y_proba, kind="mergesort")
  scores_sorted = y_proba[order]
  labels_sorted = y_true[order]

  # Running TP / FP counts when the top-k scored samples are called positive
  tp_cum = np.cumsum(labels_sorted == 1)
  fp_cum = np.cumsum(labels_sorted == 0)

  # A threshold can only fall between distinct scores, so keep the last index
  # of every run of tied scores (the final index always closes the last run)
  group_ends = np.r_[np.flatnonzero(np.diff(scores_sorted)), scores_sorted.size - 1]

  # Prepend the (0, 0) corner: a threshold above every score predicts no positives
  tpr_values = np.r_[0.0, tp_cum[group_ends] / P]
  fpr_values = np.r_[0.0, fp_cum[group_ends] / N]

  # Trapezoidal rule written out by hand so it does not depend on np.trapz
  auc = np.sum(np.diff(fpr_values) * (tpr_values[1:] + tpr_values[:-1]) / 2.0)
  return float(auc)

In [None]:
# brier score

def brier_score(y_true, y_prob):
  """Return the Brier score: the mean squared difference between y_prob and y_true.

  Args:
    y_true: array-like of true labels (0 or 1).
    y_prob: array-like of predicted probabilities for the positive class,
      same length as y_true.

  Returns:
    float; 0 is a perfect score and lower is better.

  Raises:
    ValueError: if the inputs differ in length or are empty.
  """
  y_true = np.asarray(y_true, dtype=float).ravel()
  y_prob = np.asarray(y_prob, dtype=float).ravel()
  if y_true.shape != y_prob.shape:
    raise ValueError(f"y_true and y_prob must have the same length, got {y_true.size} and {y_prob.size}")
  if y_true.size == 0:
    raise ValueError("brier score is undefined for empty input")
  # return the computed value (not the function object itself)
  return float(np.mean((y_true - y_prob) ** 2))

## Reading in Data & Split 80-20
- need to try with the 2 other splits

In [None]:
merged_df = pd.read_csv('/content/drive/MyDrive/college/year 4/24 q1 (fall)/data/403/data403project2/merged_df_train_raw.csv')

# Features are everything except the label and the applicant id
X = merged_df.drop(columns=['TARGET', 'SK_ID_CURR'])
y = merged_df['TARGET']

# Release the full frame now that X and y hold what is needed
del merged_df
gc.collect()

# These columns use placeholder values that the code treats as missing; turn them into NaN.
# The result is assigned back to the column instead of using a chained
# `X[col].replace(..., inplace=True)`, which recent pandas warns about and which
# has no effect on X under copy-on-write.
X['CODE_GENDER'] = X['CODE_GENDER'].replace('XNA', np.nan)
X['DAYS_EMPLOYED'] = X['DAYS_EMPLOYED'].replace(365243, np.nan)
X['NAME_FAMILY_STATUS'] = X['NAME_FAMILY_STATUS'].replace('Unknown', np.nan)
X['ORGANIZATION_TYPE'] = X['ORGANIZATION_TYPE'].replace('XNA', np.nan)

# Strip every character other than letters, digits and underscores from column names
X = X.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '', x))
# Store string columns as pandas categoricals
for col in X.select_dtypes(include='object').columns:
    X[col] = X[col].astype('category')

NameError: name 'pd' is not defined

In [None]:
# The linear models below need a purely numeric, NaN-free matrix; fitting on the raw
# frame fails with "could not convert string to float".
# One-hot encode on the full frame so train and test share identical columns
# (a missing category becomes an all-zero group of dummies).
X_encoded = pd.get_dummies(X, dtype=float)

X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

# Impute and scale with statistics taken from the training split only, so that
# nothing about the test rows leaks into the fitted preprocessing.
train_medians = X_train.median().fillna(0)  # a column that is entirely NaN falls back to 0
X_train = X_train.fillna(train_medians)
X_test = X_test.fillna(train_medians)

scaler = StandardScaler()
X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns, index=X_test.index)

## Logistic Regression (unpenalized & penalized)

### Logistic Regression (unpenalized)

In [None]:
# LogisticRegression applies an L2 penalty by default, so the penalty has to be
# switched off explicitly for this to be the unpenalized baseline.
log = LogisticRegression(penalty=None)
log.fit(X_train, y_train) # X must be numeric and NaN-free (categoricals one-hot encoded)

test_pred = log.predict(X_test)

ValueError: could not convert string to float: 'Cash loans'

In [None]:
# Accuracy of the unpenalized model on the held-out split
test_accuracy = accuracy(y_true=y_test, y_pred=test_pred)
print('TEST Accuracy: ', test_accuracy)

In [None]:
# F1 of the unpenalized model on the held-out split
test_f1 = f1(y_true=y_test, y_pred=test_pred)
print('TEST F1 Score: ', test_f1)

In [None]:
# ROC-AUC of the unpenalized model: score with the positive-class probability column
test_proba = log.predict_proba(X_test)[:, 1]
test_roc_auc = roc_auc(y_true=y_test.values, y_proba=test_proba)
print('TEST ROC-AUC Score: ', test_roc_auc)

In [None]:
# brier score!
# Score the unpenalized logistic model fitted in this section; svm_classifier is
# only created later in the notebook and is a different model.
y_prob = log.predict_proba(X_test)[:, 1]
test_brier = brier_score(y_test, y_prob)
print('TEST Brier Score: ', test_brier)

### Logistic Regression (L1 Regularization)

In [None]:
# L1 (Lasso) penalized logistic regression, fitted with the saga solver
log_l1 = LogisticRegression(solver='saga', penalty='l1', C=1.0).fit(X_train, y_train)

# hard class predictions on the held-out split
y_pred = log_l1.predict(X_test)

In [None]:
# Accuracy of the L1 model on the held-out split
test_accuracy = accuracy(y_true=y_test, y_pred=y_pred)
print('TEST ACCURACY: ', test_accuracy)

In [None]:
# F1 of the L1 model on the held-out split
test_f1 = f1(y_true=y_test, y_pred=y_pred)
print('TEST F1 Score: ', test_f1)

In [None]:
# roc-auc
# Use the L1 model fitted in this section (no `log_pen` is defined anywhere in the notebook)
test_proba = log_l1.predict_proba(X_test)[:, 1]  # Predicted probabilities for the positive class
test_roc_auc = roc_auc(y_test.values, test_proba)
print('TEST ROC-AUC Score:', test_roc_auc)

In [None]:
# brier score!
# Score the L1 model fitted in this section, not the SVM that is defined later
y_prob = log_l1.predict_proba(X_test)[:, 1]
test_brier = brier_score(y_test, y_prob)
print('TEST Brier Score: ', test_brier)

### Logistic Regression (L2 Regularization)

In [None]:
# L2 (Ridge) penalized logistic regression, fitted with the liblinear solver
log_l2 = LogisticRegression(solver='liblinear', penalty='l2', C=1.0).fit(X_train, y_train)

# hard class predictions on the held-out split
y_pred = log_l2.predict(X_test)

In [None]:
# Accuracy of the L2 model on the held-out split
test_accuracy = accuracy(y_true=y_test, y_pred=y_pred)
print('TEST ACCURACY: ', test_accuracy)

In [None]:
# F1 of the L2 model on the held-out split
test_f1 = f1(y_true=y_test, y_pred=y_pred)
print('TEST F1 Score: ', test_f1)

In [None]:
# roc-auc
# Use the L2 model fitted in this section (no `log_pen` is defined anywhere in the notebook)
test_proba = log_l2.predict_proba(X_test)[:, 1]  # Predicted probabilities for the positive class
test_roc_auc = roc_auc(y_test.values, test_proba)
print('TEST ROC-AUC Score:', test_roc_auc)

In [None]:
# brier score!
# Score the L2 model fitted in this section, not the SVM that is defined later
y_prob = log_l2.predict_proba(X_test)[:, 1]
test_brier = brier_score(y_test, y_prob)
print('TEST Brier Score: ', test_brier)

## Support Vector Machine (SVM)

In [None]:
# Linear support vector classifier. Only the `svm` module is imported at the top
# of the notebook, so the class is reached through it.
svm_classifier = svm.LinearSVC(C=1.0)
svm_classifier.fit(X_train, y_train)

y_pred = svm_classifier.predict(X_test)

In [None]:
# Accuracy of the SVM on the held-out split
test_accuracy = accuracy(y_true=y_test, y_pred=y_pred)
print('TEST ACCURACY: ', test_accuracy)

In [None]:
# F1 of the SVM on the held-out split
test_f1 = f1(y_true=y_test, y_pred=y_pred)
print('TEST F1 Score: ', test_f1)

In [None]:
# roc-auc
# LinearSVC has no predict_proba; its signed decision_function scores rank the
# samples, and a ranking is all the ROC curve needs.
test_scores = svm_classifier.decision_function(X_test)
test_roc_auc = roc_auc(y_test.values, test_scores)
print('TEST ROC-AUC Score:', test_roc_auc)

In [None]:
# brier
# LinearSVC does not expose predict_proba. As a stand-in, squash the signed
# decision scores through a logistic sigmoid to get values in (0, 1).
# NOTE(review): these are uncalibrated pseudo-probabilities, so this Brier score
# is only a rough comparison against the logistic models; fit a calibrated
# wrapper on the training split if a proper probability estimate is needed.
svm_scores = svm_classifier.decision_function(X_test)
y_prob = 1.0 / (1.0 + np.exp(-svm_scores))
test_brier = brier_score(y_test, y_prob)
print('TEST Brier Score: ', test_brier)