Purpose:
Predict if a person will default their credit card next month

In [1]:
import pandas as pd
df=pd.read_csv('D:/Dropbox/GitHub/Credit Card Default Prediction/UCI_Credit_Card.csv')

In [2]:
# rename two columns
df=df.rename(columns={'default.payment.next.month':'def_pay','PAY_0':'PAY_1'})

# dealing with categorical variables

In [3]:
# SEX: replace 1 with male and 2 with female
df['SEX'] = df['SEX'].replace(1,'male').replace(2,'female')
# EDUCATION: 
df['EDUCATION'] = df['EDUCATION'].replace(0,5).replace(6,5).replace(1,'graduate school').replace(2,'university').replace(3,'high school').replace(4,'others').replace(5,'unknown')
# MARRIAGE
df['MARRIAGE'] = df['MARRIAGE'].replace(0,3).replace(1,'married').replace(2,'single').replace(3,'others')

In [4]:
# Calculate default rate
df.def_pay.sum()/len(df.def_pay)

0.2212

In [5]:
# prepare X and y for machine learning
X = df.drop(['def_pay','ID'],axis=1)
y=df['def_pay'].copy()

In [6]:
# split data into training and test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3,random_state=42)

In [7]:
# Transofrmation pipeline
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

# determine categorical and numerical features
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns
categorical_cols = X.select_dtypes(include=['object', 'bool']).columns


# Preprocessing for numerical data
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('standardize', StandardScaler())])

# Preprocessing for categorical data
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])


# Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])


# Logistic Regression

In [8]:
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV
param_dist={
    'solver':['lbfgs', 'liblinear', 'sag', 'saga'],
    'C':np.logspace(-9, 9, num=50, base=10),
    'tol':np.logspace(-9, 9, num=50, base=10)
}

lr_cv = RandomizedSearchCV(LogisticRegression(),param_dist,cv=5,scoring='f1',verbose=1, n_jobs=-1, n_iter=1000)

# Bundle preprocessing and modeling code in a pipeline
my_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                              ('random_search', lr_cv)
                             ])

my_pipeline.fit(X_train, y_train)


# Print the tuned parameters and score
print("Tuned Decision Tree Parameters: {}".format(lr_cv.best_params_))
print("Best score is {}".format(lr_cv.best_score_))

# save grid search cross validation results to file
results = pd.DataFrame(lr_cv.cv_results_)
results.to_csv('credit default logistic regression random search result.csv',index=False)
print('random search cross validation results saved')

Fitting 5 folds for each of 1000 candidates, totalling 5000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    4.7s
[Parallel(n_jobs=-1)]: Done 200 tasks      | elapsed:   11.9s
[Parallel(n_jobs=-1)]: Done 572 tasks      | elapsed:   24.4s
[Parallel(n_jobs=-1)]: Done 1008 tasks      | elapsed:   42.1s
[Parallel(n_jobs=-1)]: Done 1667 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 2448 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 3186 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done 4520 tasks      | elapsed:  3.2min


Tuned Decision Tree Parameters: {'tol': 1000.0, 'solver': 'lbfgs', 'C': 177827941.00389227}
Best score is 0.40835104624954505
random search cross validation results saved


[Parallel(n_jobs=-1)]: Done 5000 out of 5000 | elapsed:  3.6min finished


In [9]:
# use the best hyper parameters to build the model
logistic_regression_clf = lr_cv.best_estimator_

# Bundle preprocessing and modeling code in a pipeline
my_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                              ('clf', logistic_regression_clf)
                             ])

# Preprocessing of training data, fit model 
my_pipeline.fit(X_train, y_train)
y_train_pred = my_pipeline.predict(X_train)
y_pred = my_pipeline.predict(X_test)

from sklearn.metrics import accuracy_score
print('training set model accuracy = ', accuracy_score(y_train, y_train_pred))

from sklearn.metrics import f1_score
print('training set model f1 score = ', f1_score(y_train, y_train_pred))

from sklearn.metrics import accuracy_score
print('testing set model accuracy = ', accuracy_score(y_test, y_pred))

from sklearn.metrics import f1_score
print('testing set model f1 score = ', f1_score(y_test, y_pred))


from sklearn.metrics import classification_report
print('testing set model classification report : \n ', classification_report(y_test, y_pred))

training set model accuracy =  0.7972857142857143
training set model f1 score =  0.2434689888039808
testing set model accuracy =  0.7972222222222223
testing set model f1 score =  0.2302825811893716
testing set model classification report : 
                precision    recall  f1-score   support

           0       0.80      0.98      0.88      7040
           1       0.66      0.14      0.23      1960

    accuracy                           0.80      9000
   macro avg       0.73      0.56      0.56      9000
weighted avg       0.77      0.80      0.74      9000

