# Loan Default Prediction
### Hubert, Kevin, Joseph, Ankita
### About
This is our final project for ECS111. Our goal is to create a model that can accurately and efficiently predict whether someone will default their loan, utilizing information from their credit history and other metrics

### Dataset
The data is collected through an online Kaggle Competition [shown here](https://www.kaggle.com/datasets/rikdifos/credit-card-approval-prediction)

In [2]:
# import packages
# processing
import pandas as pd
import seaborn as sns
import numpy as np

#modeling
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

#system
import os

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [3]:
# load data
application_records = pd.read_csv('application_record.csv.zip',compression='zip')
credit_records = pd.read_csv('credit_record.csv.zip',compression='zip')


In [5]:
application_records.head()
#'ID': client number
#'CODE_GENDER': gender 
#'FLAG_OWN_CAR': has car? 
#'FLAG_OWN_REALTY': property? 
#'CNT_CHILDREN': number of children
#'AMT_INCOME_TOTAL': annual income 
#'NAME_INCOME_TYPE': type of income 
#'NAME_EDUCATION_TYPE': education level
#'NAME_FAMILY_STATUS': marital status 
#'NAME_HOUSING_TYPE': way of living
#'DAYS_BIRTH': birthday?????
#'DAYS_EMPLOYED': start date?????
#'FLAG_MOBIL': has mobile phone? 
#'FLAG_WORK_PHONE': has work phone?
#'FLAG_PHONE': has phone?
# 'FLAG_EMAIL': has email? 
#'OCCUPATION_TYPE': occupation 
#'CNT_FAM_MEMBERS':family size

Unnamed: 0,ID,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,DAYS_BIRTH,DAYS_EMPLOYED,FLAG_MOBIL,FLAG_WORK_PHONE,FLAG_PHONE,FLAG_EMAIL,OCCUPATION_TYPE,CNT_FAM_MEMBERS
0,5008804,M,Y,Y,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,-12005,-4542,1,1,0,0,,2.0
1,5008805,M,Y,Y,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,-12005,-4542,1,1,0,0,,2.0
2,5008806,M,Y,Y,0,112500.0,Working,Secondary / secondary special,Married,House / apartment,-21474,-1134,1,0,0,0,Security staff,2.0
3,5008808,F,N,Y,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,-19110,-3051,1,0,1,1,Sales staff,1.0
4,5008809,F,N,Y,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,-19110,-3051,1,0,1,1,Sales staff,1.0


In [6]:
credit_records.head()

#'ID': client number
#'STATUS':
# C means loan is closed or paid off
# X means no loan for the month
# 0: 1-29 days past due 
# 1: 30-59 days past due 
# 2: 60-89 days overdue 
# 3: 90-119 days overdue 
# 4: 120-149 days overdue 
# 5: Overdue or bad debts, write-offs for more than 150 days

Unnamed: 0,ID,MONTHS_BALANCE,STATUS
0,5001711,0,X
1,5001711,-1,0
2,5001711,-2,0
3,5001711,-3,0
4,5001712,0,C


In [None]:

merged_data = pd.merge(application_records, credit_records, on='ID', how='inner')

'''
# columns
print(application_records.columns)
print(credit_records.columns)
'''

'''
print(merged_data.head())
'''

'''
# 777715 rows, 20 columns
print(merged_data.shape)
'''

In [11]:
## Data Preprocessing, One-Hot Encoding for Categorical Variables,  Scale Numerical Features ##



# Selecting features for the model (simplifying for this example)
features = ['CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'CNT_CHILDREN',
            'AMT_INCOME_TOTAL', 'NAME_EDUCATION_TYPE', 'NAME_FAMILY_STATUS', 
            'NAME_HOUSING_TYPE', 'DAYS_BIRTH', 'DAYS_EMPLOYED', 
            'FLAG_MOBIL', 'FLAG_WORK_PHONE', 'FLAG_PHONE', 'FLAG_EMAIL',
            'CNT_FAM_MEMBERS']
X = merged_data[features]

# Preprocessing for numerical data
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())])

# Preprocessing for categorical data
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

# Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, ['CNT_CHILDREN', 'AMT_INCOME_TOTAL', 'DAYS_BIRTH', 
                                        'DAYS_EMPLOYED', 'CNT_FAM_MEMBERS']),
        ('cat', categorical_transformer, ['CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 
                                          'NAME_EDUCATION_TYPE', 'NAME_FAMILY_STATUS', 
                                          'NAME_HOUSING_TYPE', 'FLAG_MOBIL', 'FLAG_WORK_PHONE', 
                                          'FLAG_PHONE', 'FLAG_EMAIL'])
    ])

In [15]:
## Target Variable, 'STATUS' column ## (easier to identify overdue payments)

# Simplify the target variable: 1 for overdue payments, 0 for paid off loans
# C means loan is closed or paid off
# X means no loan for the month
# 0: 1-29 days past due 
# 1: 30-59 days past due 
# 2: 60-89 days overdue 
# 3: 90-119 days overdue 
# 4: 120-149 days overdue 
# 5: Overdue or bad debts, write-offs for more than 150 days
merged_data['Target'] = merged_data['STATUS'].apply(lambda x: 0 if x in ['C', 'X'] else 1)

Y = merged_data['Target']

'''
print(merged_data.head())
'''

'\nprint(merged_data.head())\n'

In [16]:
## Logistic Model ##
from sklearn.model_selection import train_test_split

# split into training and testing sets
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline

# Create the modeling pipeline
model = make_pipeline(preprocessor, LogisticRegression())

# Train the model
model.fit(X_train, Y_train)


In [17]:
from sklearn.metrics import accuracy_score, classification_report

# Predict on the testing set
Y_pred = model.predict(X_test)

# Evaluate the model
print("Accuracy:", accuracy_score(Y_test, Y_pred))
print(classification_report(Y_test, Y_pred))


Accuracy: 0.6124480047318105
              precision    recall  f1-score   support

           0       0.61      1.00      0.76     95255
           1       0.59      0.00      0.00     60288

    accuracy                           0.61    155543
   macro avg       0.60      0.50      0.38    155543
weighted avg       0.61      0.61      0.47    155543

