In [4]:
import pandas as pd
import os

'''
# working directory fewi afq[hiw89t v39phctq8-]
print("Current Working Directory:", os.getcwd())
'''

# load data
application_records = pd.read_csv('application_record.csv')
credit_records = pd.read_csv('credit_record.csv')

merged_data = pd.merge(application_records, credit_records, on='ID', how='inner')

'''
# columns
print(application_records.columns)
print(credit_records.columns)
'''

'''
print(merged_data.head())
'''

'''
# 777715 rows, 20 columns
print(merged_data.shape)
'''

Index(['ID', 'CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'CNT_CHILDREN',
       'AMT_INCOME_TOTAL', 'NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE',
       'NAME_FAMILY_STATUS', 'NAME_HOUSING_TYPE', 'DAYS_BIRTH',
       'DAYS_EMPLOYED', 'FLAG_MOBIL', 'FLAG_WORK_PHONE', 'FLAG_PHONE',
       'FLAG_EMAIL', 'OCCUPATION_TYPE', 'CNT_FAM_MEMBERS'],
      dtype='object')
Index(['ID', 'MONTHS_BALANCE', 'STATUS'], dtype='object')


In [11]:
## Data Preprocessing, One-Hot Encoding for Categorical Variables,  Scale Numerical Features ##

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# Selecting features for the model (simplifying for this example)
features = ['CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'CNT_CHILDREN',
            'AMT_INCOME_TOTAL', 'NAME_EDUCATION_TYPE', 'NAME_FAMILY_STATUS', 
            'NAME_HOUSING_TYPE', 'DAYS_BIRTH', 'DAYS_EMPLOYED', 
            'FLAG_MOBIL', 'FLAG_WORK_PHONE', 'FLAG_PHONE', 'FLAG_EMAIL',
            'CNT_FAM_MEMBERS']
X = merged_data[features]

# Preprocessing for numerical data
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())])

# Preprocessing for categorical data
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

# Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, ['CNT_CHILDREN', 'AMT_INCOME_TOTAL', 'DAYS_BIRTH', 
                                        'DAYS_EMPLOYED', 'CNT_FAM_MEMBERS']),
        ('cat', categorical_transformer, ['CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 
                                          'NAME_EDUCATION_TYPE', 'NAME_FAMILY_STATUS', 
                                          'NAME_HOUSING_TYPE', 'FLAG_MOBIL', 'FLAG_WORK_PHONE', 
                                          'FLAG_PHONE', 'FLAG_EMAIL'])
    ])

In [15]:
## Target Variable, 'STATUS' column ## (easier to identify overdue payments)

# Simplify the target variable: 1 for overdue payments, 0 for paid off loans
# C means loan is closed or paid off
# X means no loan for the month
# 0: 1-29 days past due 
# 1: 30-59 days past due 
# 2: 60-89 days overdue 
# 3: 90-119 days overdue 
# 4: 120-149 days overdue 
# 5: Overdue or bad debts, write-offs for more than 150 days
merged_data['Target'] = merged_data['STATUS'].apply(lambda x: 0 if x in ['C', 'X'] else 1)

Y = merged_data['Target']

'''
print(merged_data.head())
'''

'\nprint(merged_data.head())\n'

In [16]:
## Logistic Model ##
from sklearn.model_selection import train_test_split

# split into training and testing sets
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline

# Create the modeling pipeline
model = make_pipeline(preprocessor, LogisticRegression())

# Train the model
model.fit(X_train, Y_train)


In [17]:
from sklearn.metrics import accuracy_score, classification_report

# Predict on the testing set
Y_pred = model.predict(X_test)

# Evaluate the model
print("Accuracy:", accuracy_score(Y_test, Y_pred))
print(classification_report(Y_test, Y_pred))


Accuracy: 0.6124480047318105
              precision    recall  f1-score   support

           0       0.61      1.00      0.76     95255
           1       0.59      0.00      0.00     60288

    accuracy                           0.61    155543
   macro avg       0.60      0.50      0.38    155543
weighted avg       0.61      0.61      0.47    155543

