In [6]:
# Importing useful libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import rankdata
%matplotlib inline

In [7]:
#Data Preparation

default = pd.read_csv('default_of_credit_card_clients.csv')
default.reset_index(level = 0)
default.rename(columns=lambda x:x.lower(), inplace=True)
# Base values: female, other_education, not_married
default['grad school'] = (default['x3'] == 1).astype('int')
default['university'] = (default['x3'] == 2).astype('int')
default['high_school'] = (default['x3'] == 3).astype('int')
default.drop('x3', axis=1, inplace=True)

default['male'] = (default['x2']==1).astype('int')
default.drop('x2', axis=1, inplace=True)

default['married'] = (default['x4']==1).astype('int')
default.drop('x4', axis=1, inplace=True)

# For pay features if the <=0, then it means it was not delayed
pay_features = ['x6','x7','x8','x9','x10','x11']
for x in pay_features:
    default.loc[default[x]<=0, x] = 0
    
default.rename(columns={'default payment next month':'default'}, inplace=True)

In [8]:
default = pd.read_csv('default_of_credit_card_clients.csv')
default.head()

Unnamed: 0.1,Unnamed: 0,X1,X2,X3,X4,X5,X6,X7,X8,X9,...,X15,X16,X17,X18,X19,X20,X21,X22,X23,Y
0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default payment next month
1,1,20000,2,2,1,24,2,2,-1,-1,...,0,0,0,0,689,0,0,0,0,1
2,2,120000,2,2,2,26,-1,2,0,0,...,3272,3455,3261,0,1000,1000,1000,0,2000,1
3,3,90000,2,2,2,34,0,0,0,0,...,14331,14948,15549,1518,1500,1000,1000,1000,5000,0
4,4,50000,2,2,1,37,0,0,0,0,...,28314,28959,29547,2000,2019,1200,1100,1069,1000,0


In [9]:
# Building models using all features

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, precision_recall_curve
from sklearn.preprocessing import RobustScaler

In [24]:
target_name = 'default'
X = default.drop('default',axis=1)
robust_scaler = RobustScaler()
X = robust_scaler.fit_transform(X)
y = default[target_name]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=123, stratify=y)

ValueError: labels ['default'] not contained in axis

In [9]:
def CMatrix(CM, labels=['pay','default']):
    df = pd.DataFrame(data=CM, index=labels, columns=labels)
    df.index.name='TRUE'
    df.columns.name='PREDICTION'
    df.loc['Total'] = df.sum()
    df['Total'] = df.sun(axis=1)
    return df

In [10]:
# Preparing a DataFrame for model analysis

#DataFrame for evaluation metrics
metrics = pd.DataFrame(index=['accuracy', 'precision', 'recall'],
                      columns=['NULL', 'LogisticReg', 'ClassTree', 'NaiveBayes'])

In [11]:
# The NULL model: always predict the most common category

y_pred_test = np.repeat(y_train.value_counts().idxmax(), y_test.size)
metrics.loc['accuracy','NULL'] = accuracy_score(y_pred=y_pred_test, y_true=y_test)
metrics.loc['precision','NULL'] = precision_score(y_pred=y_pred_test, y_true=y_test)
metrics.loc['recall','NULL'] = recall_score(y_pred=y_pred_test, y_true=y_test)

CM = confusion_matrix(y_pred=y_pred_test, y_true=y_test)
CMatrix(CM)

NameError: name 'y_train' is not defined

In [None]:
# Logistic Regression

# 1. Import the estimator object(model)
from sklearn.linear_model import LogisticRegression

# 2. Create an instance of the estimator
logistic_regression = LogisticRegression(n_jobs=-1 , random_state=15)

# 3. Use the training data to train the estimator
logistic_regression.fit(X_train, y_train)

# 4. Evaluate the model
y_pred_test = logistic_regression.predict(X_test)
metrics.loc['accuracy','LogisticReg'] = accuracy_score(y_pred=y_pred_test, y_true=y_test)
metrics.loc['precision','LogisticReg'] = precision_score(y_pred=y_pred_test, y_true=y_test)
metrics.loc['recall','LogisticReg'] = recall_score(y_pred=y_pred_test, y_true=y_test)
#confusion matrix
CM = confusion_matrix(y_pred = y_pred_test, y_true=y_test)
CMatrix(CM)

In [None]:
# Classification Trees

# 1. Import the estimator object(model)
from sklearn.tree import DecisionTreeClassifier

# 2. Create an instance of the estimator
class_tree = DecisionTreeClassifier(min_samples_split=30, min_samples_leaf=10, random_state=10)

# 3. Use the training data to train the estimator
class_tree.fit(X_train, y_train)

# 4. Evaluate the model
y_pred_test = class_tree.predict(X_test)
metrics.loc['accuracy','ClassTree'] = accuracy_score(y_pred=y_pred_test, y_true=y_test)
metrics.loc['precision','ClassTree'] = precision_score(y_pred=y_pred_test, y_true=y_test)
metrics.loc['recall','ClassTree'] = recall_score(y_pred=y_pred_test, y_true=y_test)
#confusion matrix
CM = confusion_matrix(y_pred = y_pred_test, y_true=y_test)
CMatrix(CM)

In [None]:
# Naive Bayes Classifier

# 1. Import the estimator object(model)
from sklearn.naive_bayes import GaussianNB

# 2. Create an instance of the estimator
NBC = GaussianNB()

# 3. Use the training data to train the estimator
NBC.fit(X_train , y_train)

# 4. Evaluate the model
y_pred_test = NBC.predict(X_test)
metrics.loc['accuracy','NaiveBayes'] = accuracy_score(y_pred=y_pred_test, y_true=y_test)
metrics.loc['precision','NaiveBayes'] = precision_score(y_pred=y_pred_test, y_true=y_test)
metrics.loc['recall','NaiveBayes'] = recall_score(y_pred=y_pred_test, y_true=y_test)
#confusion matrix
CM = confusion_matrix(y_pred = y_pred_test, y_true=y_test)
CMatrix(CM)

In [None]:
100*metrics

In [None]:
fig, ax = plt.subplots(figsize=(8,5))
metrics.plot(kind='barh', ax=ax)
ax.grid();

In [None]:
precision_nb, recall_nb, threshold_nb = precision_recall_curve(y_true=y_test, probas_pred=NBC.predict_proba(X_test)[:,1])
precision_lr, recall_lr, threshold_lr = precision_recall_curve(y_true=y_test, probas_pred=logistic_regression.predict_proba(X_test)[:,1])

In [None]:
fig, ax = plt.subplots(figsize=(8,5))
ax.plot(precision_nb, recall_nb, label='NaiveBayes')
ax.plot(precision_lr, recall_lr, label='LogisticReg')
ax.set_xlabel('Precision')
ax.set_ylabel('Recall')
ax.set_title('Precision-Recall curve')
#ax.hlines(y=0.5, xmin=0, xmax=1, color='red')
ax.legend()
ax,grid();

In [None]:
# Confusion matrix for modified Logistic Regression Classifier

fig, ax = plt.subplots(figsize=(8,5))
ax.plot(thresholds_lr, precision_lr[1:], label='Precision')
ax.plot(thresholds_lr, recall_lr[1:], label='Recall')
ax.set_xlabel('Classificatiob Threshold')
ax.set_ylabel('Precision, Recall')
ax.set_title('Logistic Regression Classifier: Precision-Recall')
ax.hlines(y=0.6, xmin=1, color='red')
ax.legend()
ax.grid()

In [None]:
# Classifier with threshold of 0.2

y_pred_proba = logistic_regression.predict_proba(X_test)[:,1]
y_pred_test = (y_pred_proba >=0.2).astype('int')
#Confusion matrix
CM = confusion_matrix(y_pred=y_pred_test, y_true=y_test)
print("Recall: ", 100*recall_score(y_pred=y_pred_test, y_true=y_test))
print("Precision: ", 100*precision_score(y_pred=y_pred_test, y_true=y_test))
CMatrix(CM)

In [None]:
# Making Individual Predictions

def make_ind_prediction(new_data):
    data = new_data.values.reshape(1, -1)
    data = robust_scalar.transform(data)
    prob = logistic_regression.predict_proba(data)[0][1]
    if prob >= 0.2:
        return 'Will Default'
    else:
        return 'Will Pay'

In [None]:
pay = default[default['default']==0]
pay.head()

In [None]:
from collections import OrderedDict
new_customer = OrderedDict([('limit_bal',4000),('age',50),('bill_amt1',500),('bill_amt2',35509),('bill_amt3',689),('bill_amt4',0),('bill_amt5',0),('bill_amt6',0),('pay_amt1',0),('pay_amt2',35509),('pay_amt3',0),('pay_amt4',0),('pay_amt5',0),('pay_amt6',0),('male',1),('female',2),('grad_school',0),('university',1),('high_school',0),('married',1),('x6',-1),('x7',-1),('x8',-1),('x9',0),('x10',-1),('x11',0)])
new_customer = pd.Series(new_customer)
make_ind_prediction(new_customer)

In [None]:
for x in negative_index[0:100]:
    print(make_ind_prediction(negative.loc[x].drop('default')))