Attribute Information:

default: (Yes = 1, No = 0)

limit_bal: Amount of the given credit (NT dollar): it includes both the individual consumer credit and his/her family (supplementary) credit.

Gender: (1 = male; 2 = female).

Education: (1 = graduate school; 2 = university; 3 = high school; 4 = others).

Marriage: (1 = married; 2 = single; 3 = others).

Age

pay_1 = the repayment status in September, 2005

pay_2 = the repayment status in August, 2005

pay_3 = the repayment status in July, 2005

pay_4 = the repayment status in June, 2005

pay_5 = the repayment status in May, 2005

pay_6 = the repayment status in April, 2005

(-1 = pay duly; 1 = payment delay for one month; 2 = payment delay for two months; . . .; 8 = payment delay for eight months; 9 = payment delay for nine months and above.)

bill_amt1: amount of bill statement in September, 2005

bill_amt2: amount of bill statement in August, 2005

bill_amt3: amount of bill statement in July, 2005

bill_amt4: amount of bill statement in June, 2005

bill_amt5: amount of bill statement in May, 2005

bill_amt6: amount of bill statement in April, 2005

pay_amt1: amount paid in September, 2005

pay_amt2: amount paid in August, 2005

pay_amt3: amount paid in July, 2005

pay_amt4: amount paid in June, 2005

pay_amt5: amount paid in May, 2005

pay_amt6: amount paid in April, 2005

(NT dollar)

In [None]:
%autosave 60

In [3]:
from pandas_profiling import ProfileReport

In [2]:
import numpy as np
import pandas as pd
seed = 69 # Set the random seed for the entire document

from sklearn.linear_model import LinearRegression, Lasso, LassoCV, Ridge, RidgeCV, ElasticNet, ElasticNetCV
import statsmodels.api as sm
import statsmodels.formula.api as smf
import patsy

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC, SVC

from sklearn import metrics
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, log_loss
from sklearn.metrics import auc, roc_curve, roc_auc_score, precision_recall_curve
from sklearn.metrics import fbeta_score, cohen_kappa_score

from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import KFold, cross_val_score, GridSearchCV
from sklearn.model_selection import train_test_split


import time
import pickle

import seaborn as sns
import matplotlib.pyplot as plt

from collections import Counter 

%matplotlib inline

In [None]:
import psycopg2 as pg
import pandas as pd
import pandas.io.sql as pd_sql
# We are also going to do some basic viz
import matplotlib.pyplot as plt
%matplotlib inline

# Load File from Sql Database

In [None]:
connection_args = {
    'host': 'localhost',  # We are connecting to our _local_ version of psql
    'dbname': 'card',    # DB that we are connecting to
    'port': 5432 }         
connection = pg.connect(**connection_args)

In [None]:
query = 'SELECT * FROM "credit_card_clients";'
df=pd_sql.read_sql(query, connection)
df.head()

# Data Cleaning

In [None]:
# create the header with proper names
new_header = df.iloc[0]
df = df[1:]
df.columns = new_header

In [None]:
# convert the entire dataframe to numeric
df = df.apply(pd.to_numeric)

In [None]:
# change column names
df = df.rename(columns={"SEX": "gender", "PAY_0": "PAY_1", "default payment next month": "default"})

In [None]:
df = df.drop("ID", axis = 1)

In [None]:
# lowercase the column names
df.columns = map(str.lower, df.columns)

In [None]:
df = df.reset_index(drop=True)

In [None]:
df.head(10)

# Features Engineering for Categorical Data
## Education

In [None]:
def assign_education(num):
    """
    replace the number with the level of education
    """
    if num == 1:
        return 'graduate school'
    elif num == 2:
        return 'university'
    elif num == 3:
        return 'high school'
    else:
        return 'other'

In [None]:
df.education =  df.education.apply(lambda x: assign_education(x))

In [None]:
# look at the distribution of education levels
sns.set(style="whitegrid")
sns.barplot(x=df.education.value_counts().index, y=df.education.value_counts())
plt.title('Distribution of Education')

## Gender

In [None]:
# set female as 1 and male as 0
df['gender'] =  [ 0 if x==1 else 1 for x in df['gender']]

In [None]:
# distribution of gender
sns.set(style="whitegrid")
sns.barplot(x=df.gender.value_counts().index, y=df.gender.value_counts())
plt.title('Distribution of Gender')

## Marriage

In [None]:
df['marriage'].value_counts()

In [None]:
def assign_marriage(num):
    """
    replace the number with the martial status
    """
    if num == 1:
        return 'married'
    elif num == 2:
        return 'single'
    else:
        return 'other'

In [None]:
df.marriage = df.marriage.apply(lambda x: assign_marriage(x))

In [None]:
sns.set(style="whitegrid")
sns.barplot(x=df.marriage.value_counts().index, y=df.marriage.value_counts())
plt.title('Distribution of Marital Status')

## pay_1, pay_2, pay_3, pay_4, pay_5, pay_6,

In [None]:
select_df = df[['pay_1','pay_2','pay_3','pay_4','pay_5','pay_6']]

count_pay_duly = []
for i in range(select_df.shape[0]):
    count_pay_duly.append(select_df.iloc[i].tolist().count(-1)) # total count of the payments on time in 6 months


In [None]:
df_no_pay_i = df.drop(['pay_1','pay_2','pay_3','pay_4','pay_5','pay_6'],axis = 1)


In [None]:
# add count_pay_duly into the df and drop pay_1...
df_no_pay_i['count_pay_duly'] = count_pay_duly

## Dummies

In [None]:
# get dummies of education and marriage
ed_dummies = pd.get_dummies(df_no_pay_i.education).drop(['other'], axis = 1)
ma_dummies = pd.get_dummies(df_no_pay_i.marriage).drop(['other'], axis = 1)

In [None]:
# set up a new df with all dummies 
dum_df  = df_no_pay_i.merge(ed_dummies, left_index=True, right_index=True).merge(
    ma_dummies, left_index=True, right_index=True)


In [None]:
unordered_df = dum_df.drop(['education', 'marriage'], 1)

In [None]:
unordered_df.columns

In [None]:
clean_df = unordered_df[['gender', 'graduate school', 'high school', 'university',
       'married', 'single' , 'age', 'count_pay_duly','limit_bal','bill_amt1',
       'bill_amt2', 'bill_amt3', 'bill_amt4', 'bill_amt5', 'bill_amt6',
       'pay_amt1', 'pay_amt2', 'pay_amt3', 'pay_amt4', 'pay_amt5', 'pay_amt6','default']]

In [None]:
clean_df.columns

In [None]:
import pickle

In [None]:
with open("clean_df.pickle", "wb") as f:
    pickle.dump(clean_df, f)

In [4]:
#############################
### pick up what you left ###
#############################

with open("clean_df.pickle", "rb") as f:
    clean_df = pickle.load(f)

In [None]:
#%config InlineBackend.figure_format = 'svg'
# plt.figure(figsize = (15,8))
# ax = sns.heatmap(dum_df.corr(),cmap = "YlOrRd",annot = True, vmin = -1, vmax = 1,linewidths = 0.5);
# bottom, top = ax.get_ylim()
# ax.set_ylim(bottom + 0.5, top - 0.5)

# EDA

In [None]:
clean_df.info()

In [None]:
Counter(clean_df.default)

In [None]:
from matplotlib.pyplot import show
import seaborn as sns
plt.title('Distribution of Credict Card Default')
sns.set(style="whitegrid")
total = float(len(clean_df.default)) 
ax = sns.countplot(x= clean_df.default, data=clean_df) # for Seaborn version 0.7 and more
for p in ax.patches:
    height = p.get_height()
    ax.text(p.get_x()+p.get_width()/2.,
            height + 3,
            '{:1.2f}'.format(height/total),
            ha="center") 
show()

In [None]:
clean_df.corr().iloc[:,-1]

In [None]:
clean_df.shape

In [None]:
clean_df.head()

# Modeling (no scaling)

In [None]:
# set X and y

X = clean_df.iloc[:,:-1]
y = clean_df.iloc[:,-1:]

In [None]:

# Data splitting for 80% Train/Val and 20% Test 
X_train_fe, X_test_fe, y_train_val_fe, y_test_fe = train_test_split(X, y, test_size = 0.2, random_state=42) # 20% holdout 

## This line instantiates the model. 
rf = RandomForestClassifier() 

## Fit the model on your training data.
rf.fit(X_train_fe.values, y_train_val_fe.values.ravel()) 

# Obtain the feature importance
feature_importance = pd.DataFrame(rf.feature_importances_,
                                   index = X_train_fe.columns,
                                   columns=['Variable_Importance']).sort_values('Variable_Importance',ascending=True)

# Set seaborn contexts 
sns.set(style="whitegrid")

feature_importance.plot.barh(figsize=(15,10))
plt.savefig('feature_importance.png')

In [5]:
from imblearn.over_sampling import SMOTE 
from imblearn import under_sampling, over_sampling
from sklearn.model_selection import StratifiedKFold
import xgboost as xgb
from collections import defaultdict
from xgboost import XGBClassifier
from sklearn import preprocessing
from sklearn import svm


## Hyperparameter Tuning KNN

In [None]:
knn = KNeighborsClassifier(n_neighbors=5)

scores = cross_val_score(knn, X_smote_train, y_smote_train, cv=10, scoring='recall')

In [None]:
print(scores.mean())

In [None]:
k_range = [i for i in range(1, 31) if i%2 != 0]
k_scores = []
for k in k_range:
    knn = KNeighborsClassifier(n_neighbors=k)
    scores = cross_val_score(knn, X_smote_train, y_smote_train, cv=10, scoring='recall')
    k_scores.append(scores.mean())
print(k_scores)

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

# plot the value of K for KNN (x-axis) versus the cross-validated accuracy (y-axis)
plt.plot(k_range, k_scores)
plt.xlabel('Value of K for KNN')
plt.ylabel('Cross-Validated Recall')

In [None]:
# choose k = 3

## Compare the models

Precision is a good measure to determine, when the costs of False Positive is high. Recall shall be the model metric we use to select our best model when there is a high cost associated with False Negative.

TP: defalut identified as default. we correctly identify the unreliable customes. 
FP: not default identified as default. Customers who are reliable are identified as default. 
TN: not default identified as not default. 
FN: default identified as not default.The cutomers who are reliable are correcly identified. 

Identifying accurately which customers are most probable to default represents significant business opportunity for all banks.

Our clients: banks

Recall shall be the model metric we use to select our best model when there is a high cost associated with False Negative.Our obejective is to help the bank identify and take action on customers with high probability of defaulting to improve their bottom line.


In [6]:
# set X and y

X = clean_df.iloc[:,:-1]
y = clean_df.iloc[:,-1:]

In [7]:
from sklearn.model_selection import StratifiedKFold
from collections import defaultdict
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE 


In [8]:
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=42,
                                                    stratify=y)

In [9]:
kf = StratifiedKFold(n_splits = 5, shuffle = True, random_state = 42)

#collect results for each model
cv_results_recall = defaultdict(list) 
cv_results_precision = defaultdict(list) 
cv_results_f1 = defaultdict(list) 
cv_results_accuracy = defaultdict(list) 
cv_results_fbeta =  defaultdict(list)

# generate indies for kf
y_train = y_train.values
X_train = X_train.values

# choose n_neighbors as
models = {'Xgboost':XGBClassifier(),
          'GNB': GaussianNB(),
          'Logistic Regression': LogisticRegression(),
          'Random Forest':RandomForestClassifier(),
          'Linearsvc': LinearSVC(),
          'Knn':KNeighborsClassifier()}
          
for model_name, model in models.items():
    for train_ind, val_ind in kf.split(X_train, y_train):
        X_tr, y_tr = X_train[train_ind], y_train[train_ind]
    
        X_smote_train, y_smote_train = SMOTE(random_state=42).fit_sample(X_tr, y_tr)

        X_val,y_val = X_train[val_ind], y_train[val_ind]

        model.fit(X_smote_train, y_smote_train)

        y_pred = model.predict(X_val)

        # calculate recall score
        recall = recall_score(y_val, y_pred) 

        cv_results_recall[model_name].append(recall)
        
        # calculate precision score
        
        precision = precision_score(y_val, y_pred) 

        cv_results_precision[model_name].append(precision)
        
        # calculate f1 score
        f1 = f1_score(y_val,y_pred)
        
        cv_results_f1[model_name].append(f1)
        
        #calculate fbeta score
        fbeta = fbeta_score(y_val, y_pred, beta=2)
        
        cv_results_fbeta[model_name].append(fbeta)
        # calculate accuracy
        
        accuracy = accuracy_score(y_val,y_pred)
        
        cv_results_accuracy[model_name].append(accuracy)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https:

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [10]:
fbeta_mean = [np.mean(i) for i in cv_results_fbeta.values()]
fbeta_var = [np.var(i) for i in cv_results_fbeta.values()]
df_cv_fbeta = pd.DataFrame(
    {
     'fbeta_mean': fbeta_mean,
     'fbeta_var': fbeta_var
    })

In [11]:
df_cv_fbeta


Unnamed: 0,fbeta_mean,fbeta_var
0,0.305795,0.000384
1,0.588065,3.6e-05
2,0.39048,0.000838
3,0.285502,0.00033
4,0.454265,0.018755
5,0.453352,9.1e-05


In [12]:
model_lst = list(cv_results_recall.keys())

In [13]:
model_lst

['Xgboost', 'GNB', 'Logistic Regression', 'Random Forest', 'Linearsvc', 'Knn']

In [None]:
# the minority class and over-samples it until it is balanced with the majority class.
# the model with the best mean score and least amount of variance in performance is chosen

model_lst = list(cv_results_recall.keys())
recall_mean = [np.mean(i) for i in cv_results_recall.values()]
recall_var = [np.var(i) for i in cv_results_recall.values()]
df_cv_recall = pd.DataFrame(
    {'model_name': model_lst,
     'mean': recall_mean,
     'variance': recall_var
    })

In [None]:
df_cv_recall

In [None]:
print(f'cross validation results for precision score {cv_results_precision}')

In [None]:
precision_mean = [np.mean(i) for i in cv_results_precision.values()]
precision_var = [np.var(i) for i in cv_results_precision.values()]
df_cv_precision = pd.DataFrame(
    {'model_name': model_lst,
     'mean': precision_mean,
     'variance': precision_var
    })

In [None]:
df_cv_precision

## Select the best model to fit on test set

In [14]:
X_train_best, X_test_best, y_train_best, y_test_best = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=42,
                                                    stratify=y)

In [15]:
X_sm,y_sm = SMOTE(random_state = 42).fit_sample(X_train_best.values, y_train_best.values)

  y = column_or_1d(y, warn=True)


In [18]:
xgb = XGBClassifier()
xgb.fit(X_sm, y_sm)
y_predict_xgb = xgb.predict(X_test.as_matrix())
print(fbeta_score(y_test.as_matrix(), y_predict_xgb,beta =2))

0.2949301267468314


  This is separate from the ipykernel package so we can avoid doing imports until
  after removing the cwd from sys.path.


In [19]:
gnb = GaussianNB()
gnb.fit(X_sm, y_sm)
y_predict_gnb = gnb.predict(X_test)
print(fbeta_score(y_test, y_predict_gnb,beta =2))

0.5928080885158337


In [20]:
lr = LogisticRegression()
lr.fit(X_sm, y_sm)
y_predict_lr = lr.predict(X_test)
print(fbeta_score(y_test, y_predict_lr,beta =2))

0.3724327451547585


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [21]:
linear_svc = LinearSVC()
linear_svc.fit(X_sm, y_sm)
y_predict_linear_svc = linear_svc.predict(X_test)
print(fbeta_score(y_test, y_predict_linear_svc,beta =2))

0.5695426685525695




In [22]:
rf = RandomForestClassifier()
rf.fit(X_sm, y_sm)
y_predict_rf = rf.predict(X_test)
print(fbeta_score(y_test, y_predict_rf,beta =2))

0.2900439954375102


In [23]:
# take a thousnad year to run svm

In [24]:
knn = KNeighborsClassifier()
knn.fit(X_sm, y_sm)
y_predict_knn = knn.predict(X_test)
print(fbeta_score(y_test, y_predict_knn,beta =2))

0.4353459445452189


## ROC curve

In [None]:
fpr_xgb, tpr_xgb, thresholds_xgb = roc_curve(y_test, xgb.predict_proba(X_test.as_matrix())[:,1])
auc_xgb = roc_auc_score(y_test, xgb.predict_proba(X_test.as_matrix())[:, 1]) 

fpr_gnb, tpr_gnb, thresholds_gnb = roc_curve(y_test, gnb.predict_proba(X_test)[:,1])
auc_gnb = roc_auc_score(y_test, gnb.predict_proba(X_test)[:, 1]) 

fpr_lr, tpr_lr, thresholds_lr = roc_curve(y_test, lr.predict_proba(X_test)[:,1])
auc_lr = roc_auc_score(y_test, lr.predict_proba(X_test)[:, 1]) 

fpr_rf, tpr_rf, thresholds_rf = roc_curve(y_test, rf.predict_proba(X_test)[:,1])
auc_rf = roc_auc_score(y_test, rf.predict_proba(X_test)[:, 1]) 

fpr_knn, tpr_knn, thresholds_knn = roc_curve(y_test, knn.predict_proba(X_test)[:,1])
auc_knn = roc_auc_score(y_test, knn.predict_proba(X_test)[:, 1]) 

fpr_lsvc, tpr_lsvc, thresholds_lsvc = roc_curve(y_test, lsvc.predict_proba(X_test)[:,1])
auc_lsvc = roc_auc_score(y_test, lsvc.predict_proba(X_test)[:, 1]) 

In [None]:
plt.plot(fpr_lr, tpr_lr, lw=1, label='Logistic Regression')
plt.plot(fpr_knn, tpr_knn, lw=1, label='KNN')
plt.plot(fpr_gnb, tpr_gnb, lw=1, label='Gaussian NB')
plt.plot(fpr_rf, tpr_rf, lw=1, label='Random Forest')
plt.plot(fpr_xgb, tpr_xgb, lw=1, label='XGBoost')
plt.plot(fpr_lsvc, tpr_lsvc, lw=1, label='SVM - Linear')


plt.plot([0, 1], [0, 1], c='violet', ls='--')
plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])

plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Model Comparison - ROC curve')
plt.legend(ncol=2, fontsize='small')
sns.despine()
plt.savefig('model_comp_roc.png')

In [None]:
# Print confusion matrix for Gaussian NB
gnb_confusion = confusion_matrix(y_test, gnb.predict(X_test))
plt.figure(dpi=150)
sns.heatmap(gnb_confusion, cmap=plt.cm.Blues, annot=True,
            square=True,
           xticklabels=['No Default', 'Default'],
           yticklabels=['No Default', 'Default']);

b, t = plt.ylim()  # discover the values for bottom and top
b += 0.5  # Add 0.5 to the bottom
t -= 0.5  # Subtract 0.5 from the top
plt.ylim(b, t)  # update the ylim(bottom, top) values

plt.xlabel('Prediction', size=15)
plt.ylabel('Actual', rotation=0, labelpad=40,size=15)
plt.title('GaussianNB confusion matrix');
plt.show()
plt.savefig('confusion.png')