In [42]:
import warnings
warnings.filterwarnings('ignore')

In [43]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [44]:
# Load the data
file_path = Path('application_train.csv')
application_train = pd.read_csv(file_path)
application_train.head()


Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,100002,1,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
1,100003,0,Cash loans,F,N,N,0,270000.0,1293502.5,35698.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
2,100004,0,Revolving loans,M,Y,Y,0,67500.0,135000.0,6750.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
3,100006,0,Cash loans,F,N,Y,0,135000.0,312682.5,29686.5,...,0,0,0,0,,,,,,
4,100007,0,Cash loans,M,N,Y,0,121500.0,513000.0,21865.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0


In [45]:
# Select relevant columns and create a new data frame
df = application_train.loc[:,["SK_ID_CURR",
               "AMT_ANNUITY",
               "TARGET",
               "DAYS_EMPLOYED",
               "AMT_CREDIT",
               "AMT_INCOME_TOTAL",
               "AMT_GOODS_PRICE",
               "AMT_REQ_CREDIT_BUREAU_YEAR",
               "CNT_FAM_MEMBERS",
               "CNT_CHILDREN",
               "NAME_EDUCATION_TYPE",
               "DAYS_BIRTH"]]
df.head()

Unnamed: 0,SK_ID_CURR,AMT_ANNUITY,TARGET,DAYS_EMPLOYED,AMT_CREDIT,AMT_INCOME_TOTAL,AMT_GOODS_PRICE,AMT_REQ_CREDIT_BUREAU_YEAR,CNT_FAM_MEMBERS,CNT_CHILDREN,NAME_EDUCATION_TYPE,DAYS_BIRTH
0,100002,24700.5,1,-637,406597.5,202500.0,351000.0,1.0,1.0,0,Secondary / secondary special,-9461
1,100003,35698.5,0,-1188,1293502.5,270000.0,1129500.0,0.0,2.0,0,Higher education,-16765
2,100004,6750.0,0,-225,135000.0,67500.0,135000.0,0.0,1.0,0,Secondary / secondary special,-19046
3,100006,29686.5,0,-3039,312682.5,135000.0,297000.0,,2.0,0,Secondary / secondary special,-19005
4,100007,21865.5,0,-3038,513000.0,121500.0,513000.0,0.0,1.0,0,Secondary / secondary special,-19932


In [46]:
# Enahnce the data with an addtional set of data and load the data
file_path = Path('credit_card_balance.csv')
credit_card_balance = pd.read_csv(file_path)
credit_card_balance.head()

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,MONTHS_BALANCE,AMT_BALANCE,AMT_CREDIT_LIMIT_ACTUAL,AMT_DRAWINGS_ATM_CURRENT,AMT_DRAWINGS_CURRENT,AMT_DRAWINGS_OTHER_CURRENT,AMT_DRAWINGS_POS_CURRENT,AMT_INST_MIN_REGULARITY,...,AMT_RECIVABLE,AMT_TOTAL_RECEIVABLE,CNT_DRAWINGS_ATM_CURRENT,CNT_DRAWINGS_CURRENT,CNT_DRAWINGS_OTHER_CURRENT,CNT_DRAWINGS_POS_CURRENT,CNT_INSTALMENT_MATURE_CUM,NAME_CONTRACT_STATUS,SK_DPD,SK_DPD_DEF
0,2562384,378907,-6,56.97,135000,0.0,877.5,0.0,877.5,1700.325,...,0.0,0.0,0.0,1,0.0,1.0,35.0,Active,0,0
1,2582071,363914,-1,63975.555,45000,2250.0,2250.0,0.0,0.0,2250.0,...,64875.555,64875.555,1.0,1,0.0,0.0,69.0,Active,0,0
2,1740877,371185,-7,31815.225,450000,0.0,0.0,0.0,0.0,2250.0,...,31460.085,31460.085,0.0,0,0.0,0.0,30.0,Active,0,0
3,1389973,337855,-4,236572.11,225000,2250.0,2250.0,0.0,0.0,11795.76,...,233048.97,233048.97,1.0,1,0.0,0.0,10.0,Active,0,0
4,1891521,126868,-1,453919.455,450000,0.0,11547.0,0.0,11547.0,22924.89,...,453919.455,453919.455,0.0,1,0.0,1.0,101.0,Active,0,0


In [47]:
# check for nulls
df.isna().sum()

SK_ID_CURR                        0
AMT_ANNUITY                      12
TARGET                            0
DAYS_EMPLOYED                     0
AMT_CREDIT                        0
AMT_INCOME_TOTAL                  0
AMT_GOODS_PRICE                 278
AMT_REQ_CREDIT_BUREAU_YEAR    41519
CNT_FAM_MEMBERS                   2
CNT_CHILDREN                      0
NAME_EDUCATION_TYPE               0
DAYS_BIRTH                        0
dtype: int64

In [48]:
# create a function to run through data set and remove nulls 
def fill_in(data):
    for i in data.columns:
        if data[i].dtype == "object":
            data[i] = data[i].fillna("")
        elif (data[i].dtype == "int64" or data[i].dtype == "float64"):
             data[i] = data[i].fillna(data[i].mean())
        else:
            pass
    return data

In [49]:
# remove nulls using function
df=fill_in(df)

In [50]:
# check that nulls are removed
df.isna().sum()

SK_ID_CURR                    0
AMT_ANNUITY                   0
TARGET                        0
DAYS_EMPLOYED                 0
AMT_CREDIT                    0
AMT_INCOME_TOTAL              0
AMT_GOODS_PRICE               0
AMT_REQ_CREDIT_BUREAU_YEAR    0
CNT_FAM_MEMBERS               0
CNT_CHILDREN                  0
NAME_EDUCATION_TYPE           0
DAYS_BIRTH                    0
dtype: int64

In [51]:
# transform the AGE column from days to years
df["AGE"] = round(abs(df["DAYS_BIRTH"])/365)
df.drop(columns="DAYS_BIRTH", inplace = True)

In [52]:
df.head()

Unnamed: 0,SK_ID_CURR,AMT_ANNUITY,TARGET,DAYS_EMPLOYED,AMT_CREDIT,AMT_INCOME_TOTAL,AMT_GOODS_PRICE,AMT_REQ_CREDIT_BUREAU_YEAR,CNT_FAM_MEMBERS,CNT_CHILDREN,NAME_EDUCATION_TYPE,AGE
0,100002,24700.5,1,-637,406597.5,202500.0,351000.0,1.0,1.0,0,Secondary / secondary special,26.0
1,100003,35698.5,0,-1188,1293502.5,270000.0,1129500.0,0.0,2.0,0,Higher education,46.0
2,100004,6750.0,0,-225,135000.0,67500.0,135000.0,0.0,1.0,0,Secondary / secondary special,52.0
3,100006,29686.5,0,-3039,312682.5,135000.0,297000.0,1.899974,2.0,0,Secondary / secondary special,52.0
4,100007,21865.5,0,-3038,513000.0,121500.0,513000.0,0.0,1.0,0,Secondary / secondary special,55.0


In [53]:
# merge the two data sets in prep for the model
merged = pd.merge(left=df, right=credit_card_balance, how='left', left_on='SK_ID_CURR', right_on='SK_ID_CURR')

In [54]:
merged.head()

Unnamed: 0,SK_ID_CURR,AMT_ANNUITY,TARGET,DAYS_EMPLOYED,AMT_CREDIT,AMT_INCOME_TOTAL,AMT_GOODS_PRICE,AMT_REQ_CREDIT_BUREAU_YEAR,CNT_FAM_MEMBERS,CNT_CHILDREN,...,AMT_RECIVABLE,AMT_TOTAL_RECEIVABLE,CNT_DRAWINGS_ATM_CURRENT,CNT_DRAWINGS_CURRENT,CNT_DRAWINGS_OTHER_CURRENT,CNT_DRAWINGS_POS_CURRENT,CNT_INSTALMENT_MATURE_CUM,NAME_CONTRACT_STATUS,SK_DPD,SK_DPD_DEF
0,100002,24700.5,1,-637,406597.5,202500.0,351000.0,1.0,1.0,0,...,,,,,,,,,,
1,100003,35698.5,0,-1188,1293502.5,270000.0,1129500.0,0.0,2.0,0,...,,,,,,,,,,
2,100004,6750.0,0,-225,135000.0,67500.0,135000.0,0.0,1.0,0,...,,,,,,,,,,
3,100006,29686.5,0,-3039,312682.5,135000.0,297000.0,1.899974,2.0,0,...,0.0,0.0,,0.0,,,0.0,Active,0.0,0.0
4,100006,29686.5,0,-3039,312682.5,135000.0,297000.0,1.899974,2.0,0,...,0.0,0.0,,0.0,,,0.0,Active,0.0,0.0


In [55]:
# remove nulls from the merged data set using function
merged = fill_in(merged)

In [56]:
# verify nulls are removed
merged.isnull().sum()

SK_ID_CURR                    0
AMT_ANNUITY                   0
TARGET                        0
DAYS_EMPLOYED                 0
AMT_CREDIT                    0
AMT_INCOME_TOTAL              0
AMT_GOODS_PRICE               0
AMT_REQ_CREDIT_BUREAU_YEAR    0
CNT_FAM_MEMBERS               0
CNT_CHILDREN                  0
NAME_EDUCATION_TYPE           0
AGE                           0
SK_ID_PREV                    0
MONTHS_BALANCE                0
AMT_BALANCE                   0
AMT_CREDIT_LIMIT_ACTUAL       0
AMT_DRAWINGS_ATM_CURRENT      0
AMT_DRAWINGS_CURRENT          0
AMT_DRAWINGS_OTHER_CURRENT    0
AMT_DRAWINGS_POS_CURRENT      0
AMT_INST_MIN_REGULARITY       0
AMT_PAYMENT_CURRENT           0
AMT_PAYMENT_TOTAL_CURRENT     0
AMT_RECEIVABLE_PRINCIPAL      0
AMT_RECIVABLE                 0
AMT_TOTAL_RECEIVABLE          0
CNT_DRAWINGS_ATM_CURRENT      0
CNT_DRAWINGS_CURRENT          0
CNT_DRAWINGS_OTHER_CURRENT    0
CNT_DRAWINGS_POS_CURRENT      0
CNT_INSTALMENT_MATURE_CUM     0
NAME_CON

In [57]:
label_encoder = LabelEncoder()

In [58]:
# Binary encoding using Pandas to transform the following columns
df_encoded = pd.get_dummies(merged, columns=["NAME_EDUCATION_TYPE" , "NAME_CONTRACT_STATUS"])
df_encoded.head()

Unnamed: 0,SK_ID_CURR,AMT_ANNUITY,TARGET,DAYS_EMPLOYED,AMT_CREDIT,AMT_INCOME_TOTAL,AMT_GOODS_PRICE,AMT_REQ_CREDIT_BUREAU_YEAR,CNT_FAM_MEMBERS,CNT_CHILDREN,...,NAME_EDUCATION_TYPE_Lower secondary,NAME_EDUCATION_TYPE_Secondary / secondary special,NAME_CONTRACT_STATUS_,NAME_CONTRACT_STATUS_Active,NAME_CONTRACT_STATUS_Approved,NAME_CONTRACT_STATUS_Completed,NAME_CONTRACT_STATUS_Demand,NAME_CONTRACT_STATUS_Refused,NAME_CONTRACT_STATUS_Sent proposal,NAME_CONTRACT_STATUS_Signed
0,100002,24700.5,1,-637,406597.5,202500.0,351000.0,1.0,1.0,0,...,0,1,1,0,0,0,0,0,0,0
1,100003,35698.5,0,-1188,1293502.5,270000.0,1129500.0,0.0,2.0,0,...,0,0,1,0,0,0,0,0,0,0
2,100004,6750.0,0,-225,135000.0,67500.0,135000.0,0.0,1.0,0,...,0,1,1,0,0,0,0,0,0,0
3,100006,29686.5,0,-3039,312682.5,135000.0,297000.0,1.899974,2.0,0,...,0,1,0,1,0,0,0,0,0,0
4,100006,29686.5,0,-3039,312682.5,135000.0,297000.0,1.899974,2.0,0,...,0,1,0,1,0,0,0,0,0,0


In [59]:
# check nulls are no nulls in data set 
df_encoded.isna().sum()

SK_ID_CURR                                           0
AMT_ANNUITY                                          0
TARGET                                               0
DAYS_EMPLOYED                                        0
AMT_CREDIT                                           0
AMT_INCOME_TOTAL                                     0
AMT_GOODS_PRICE                                      0
AMT_REQ_CREDIT_BUREAU_YEAR                           0
CNT_FAM_MEMBERS                                      0
CNT_CHILDREN                                         0
AGE                                                  0
SK_ID_PREV                                           0
MONTHS_BALANCE                                       0
AMT_BALANCE                                          0
AMT_CREDIT_LIMIT_ACTUAL                              0
AMT_DRAWINGS_ATM_CURRENT                             0
AMT_DRAWINGS_CURRENT                                 0
AMT_DRAWINGS_OTHER_CURRENT                           0
AMT_DRAWIN

In [60]:
# Create our target
y = df_encoded.loc[:,'TARGET'].copy().to_frame()
# Cresate our features
X = df_encoded.drop(columns="TARGET")


In [61]:
# review counts
y['TARGET'].value_counts()

0    3201279
1     247292
Name: TARGET, dtype: int64

In [62]:
# Create X_train, X_test, y_train, y_test
# YOUR CODE HERE
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test= train_test_split(X, 
                                                   y, 
                                                   random_state=1, 
                                                   stratify=y)
X_train.shape

(2586428, 44)

Data Pre-Processing

In [63]:
# Create the StandardScaler instance
from sklearn.preprocessing import StandardScaler
# YOUR CODE HERE
datascaler = StandardScaler()

In [64]:
# Fit the Standard Scaler with the training data
# When fitting scaling functions, only train on the training dataset
# YOUR CODE HERE
X_scaler = datascaler.fit(X_train)

In [65]:
# Scale the training and testing data
# YOUR CODE HERE
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

Simple Logistic Regression

In [66]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_train_scaled, y_train)

LogisticRegression(random_state=1)

In [67]:
# Calculated the balanced accuracy score
from sklearn.metrics import balanced_accuracy_score
y_pred = model.predict(X_test_scaled)
balanced_accuracy_score(y_test, y_pred)

0.5000273182725842

In [68]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
confusion_matrix(y_test, y_pred)

array([[800299,     21],
       [ 61818,      5]])

In [69]:
# Create model performance stats to make it easier to read
cm = confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(
    cm, index=["Actual Approve", "Actual Deny"], columns=["Predicted Approve", "Predicted Deny"]
)
# Calculating the accuracy score
acc_score = accuracy_score(y_test, y_pred)

In [70]:
# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, y_pred))

Confusion Matrix


Unnamed: 0,Predicted Approve,Predicted Deny
Actual Approve,800299,21
Actual Deny,61818,5


Accuracy Score : 0.9282729199216372
Classification Report
              precision    recall  f1-score   support

           0       0.93      1.00      0.96    800320
           1       0.19      0.00      0.00     61823

    accuracy                           0.93    862143
   macro avg       0.56      0.50      0.48    862143
weighted avg       0.88      0.93      0.89    862143



In [71]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.93      1.00      0.00      0.96      0.01      0.00    800320
          1       0.19      0.00      1.00      0.00      0.01      0.00     61823

avg / total       0.88      0.93      0.07      0.89      0.01      0.00    862143

