# Group Coursework - Credit Card Default Prediction
#### This notebook has been accomplished by Group F: Chi, Jingting, Steve, Danny, Xiaoqi and Nicolas

#### Steps:
1. Introduction
2. Data Import
3. Data Transformation & Exploration
4. Methodology Overview
5. Model Training & Validation
6. Results
7. Final Predictions on Test Set

## Introduction

Blablabla... This part should be similar within the report and will be filled in after the report is done.

In [24]:
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn import ensemble
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score

from sklearn.metrics import f1_score,classification_report
from sklearn.impute import SimpleImputer
from sklearn.neighbors import KNeighborsClassifier

## Data Import

In [11]:
# read data from train and test sets
p_train = pd.read_csv("../data/CreditCard_train.csv",encoding='gbk')
p_test = pd.read_csv("../data/CreditCard_test.csv",encoding='gbk')

In [12]:
print(p_train)
print(p_test)

      Unnamed: 0         X1   X2         X3        X4   X5     X6     X7  \
0             ID  LIMIT_BAL  SEX  EDUCATION  MARRIAGE  AGE  PAY_0  PAY_2   
1              1      20000    2          2         1   24      2      2   
2              2     120000    2          2         2   26     -1      2   
3              3      90000    2          2         2   34      0      0   
4              4      50000    2          2         1   37      0      0   
...          ...        ...  ...        ...       ...  ...    ...    ...   
23996      23996      80000    1          2         1   25      1      2   
23997      23997      20000    1          2         1   25      0      0   
23998      23998      10000    1          2         2   26      0      0   
23999      23999      20000    1          5         2   26      0      0   
24000      24000     100000    1          1         2   26      0     -1   

          X8     X9  ...        X15        X16        X17       X18       X19  \
0     

## Data Transformation & Exploration

## This part should follow processing pipeline in the future when combined

In [13]:
# Data cleaning
# Delete unnecessary keys

origin_keys_train = p_train.keys()
origin_keys_test = p_test.keys()
need_keys_train = origin_keys_train[1:]
need_keys_test = origin_keys_test[1:]
dataset_train = []
dataset_test = []

for key in need_keys_train:
    data = []
    for i in range(len(p_train[key])):
        if i==0:
            continue
        else:
            data.append(p_train[key][i])
    dataset_train.append(data)
    
for key in need_keys_test:
    data = []
    for i in range(len(p_test[key])):
        if i==0:
            continue
        else:
            data.append(p_test[key][i])
    dataset_test.append(data)

# Transform and transpose data sets
dataset_train = np.array(dataset_train,dtype=np.float)
dataset_train = dataset_train.transpose()
dataset_test= np.array(dataset_test,dtype=np.float)
dataset_test = dataset_test.transpose()

# Check if there are missing values, preprocess it if so.
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
imp.fit(dataset_train)

# Achieve the transformed data sets
dataset_train = imp.transform(dataset_train)
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
imp.fit(dataset_test)

dataset_test = imp.transform(dataset_test)

In [14]:
print(dataset_train.shape)
print(dataset_test.shape)

(24000, 24)
(6000, 24)


In [15]:
# get x and y for both train and test
x_train, y_train = dataset_train[:,:23], dataset_train[:,-1]
x_test, y_test = dataset_test[:,:23], dataset_test[:,-1]

# Standarize data
scaler = preprocessing.StandardScaler().fit(x_train)
x_train, x_test = scaler.transform(x_train),scaler.transform(x_test)

In [17]:
x_train.shape, y_train.shape, x_test.shape, y_test.shape

((24000, 23), (24000,), (6000, 23), (6000,))

## Methodology Overview

Here provided are three different methods called  Adaboost(Adaptive Boosting), Random Forest and GBRT(Gradient Boosted Regression Trees) respectively.

In [18]:
class adaboost(object):

    def __init__(self):
        # build the model
        # self.model = ensemble.AdaBoostClassifier(n_estimators=100,learning_rate=0.001)
        # self.model = ensemble.BaggingClassifier(base_estimator=KNeighborsClassifier(n_neighbors=12,),n_estimators=200)
        self.model  = ensemble.AdaBoostClassifier(n_estimators=200,learning_rate=0.001)
        # create the model
        # self.model = ensemble.BaggingClassifier(KNeighborsClassifier(), n_estimators=10, max_samples=0.5,max_features=0.5)

    def train(self,x_train,y_train):
        # training
        self.model.fit(x_train,y_train)

    def predict(self,x_test):
        # predicting
        y_pred = self.model.predict(x_test)

        return y_pred

    
class random_forest(object):

    def __init__(self):
        # build the model
        # self.model = ensemble.AdaBoostClassifier(n_estimators=100,learning_rate=0.001)
        # self.model = ensemble.BaggingClassifier(base_estimator=KNeighborsClassifier(n_neighbors=12,),n_estimators=200)
        # self.model  = ensemble.AdaBoostClassifier(n_estimators=200,learning_rate=0.001)
        # create the model
        self.model = ensemble.BaggingClassifier(KNeighborsClassifier(), n_estimators=200, max_samples=0.5,max_features=0.5)

    def train(self,x_train,y_train):
        # training
        self.model.fit(x_train,y_train)

    def predict(self,x_test):
        # predicting
        y_pred = self.model.predict(x_test)
        return y_pred

    
class GBRT(object):

    def __init__(self):
        # create the model
        self.model = ensemble.GradientBoostingClassifier(n_estimators=200)

    def train(self,x_train,y_train):
        # training
        self.model.fit(x_train,y_train)

    def predict(self,x_test):
        # predicting
        y_pred = self.model.predict(x_test)
        return y_pred

## Model Training & Validation

In [19]:
model1 = adaboost()
model2 = random_forest()
model3 = GBRT()

In [27]:
def train_and_cross_validate(model, n_splits=5):
    n_iter = 1      # this var is made just to keep track of the number of iteration
    kfold = KFold(n_splits=n_splits)
    cv_accuracy = []

    for train_idx, val_idx in kfold.split(x_train):
        x_train_cv, x_val = x_train[train_idx], x_train[val_idx]
        y_train_cv, y_val = y_train[train_idx], y_train[val_idx]

        model.train(x_train_cv, y_train_cv)
        pred = model.predict(x_val)

        accuracy = np.round(accuracy_score(y_val, pred), 3)

        print("Iteration : {}, Cross-Validation Accuracy : {}".format(n_iter, accuracy))

        n_iter += 1

        cv_accuracy.append(accuracy)

    print("Average accuracy : ", np.mean(cv_accuracy))

In [28]:
train_and_cross_validate(model1)  # Adaptive Boosting

Iteration : 1, Cross-Validation Accuracy : 0.805
Iteration : 2, Cross-Validation Accuracy : 0.807
Iteration : 3, Cross-Validation Accuracy : 0.812
Iteration : 4, Cross-Validation Accuracy : 0.822
Iteration : 5, Cross-Validation Accuracy : 0.838
Average accuracy :  0.8168000000000001


In [29]:
train_and_cross_validate(model2)  # Random Forest

Iteration : 1, Cross-Validation Accuracy : 0.797
Iteration : 2, Cross-Validation Accuracy : 0.803
Iteration : 3, Cross-Validation Accuracy : 0.802
Iteration : 4, Cross-Validation Accuracy : 0.803
Iteration : 5, Cross-Validation Accuracy : 0.833
Average accuracy :  0.8076000000000001


In [30]:
train_and_cross_validate(model3)  # Gradient Boosting

Iteration : 1, Cross-Validation Accuracy : 0.805
Iteration : 2, Cross-Validation Accuracy : 0.809
Iteration : 3, Cross-Validation Accuracy : 0.808
Iteration : 4, Cross-Validation Accuracy : 0.822
Iteration : 5, Cross-Validation Accuracy : 0.841
Average accuracy :  0.817


## Results

In [31]:
# prediction
y_pred_1 = model1.predict(x_test)
y_pred_2 = model2.predict(x_test)
y_pred_3 = model3.predict(x_test)

# Obtain the f1 score and report for three methods
print(f1_score(y_test,y_pred_1))
print(f1_score(y_test,y_pred_2))
print(f1_score(y_test,y_pred_3))

print(classification_report(y_test,y_pred_1))
print(classification_report(y_test,y_pred_2))
print(classification_report(y_test,y_pred_3))

0.4478260869565217
0.3539928486293206
0.45951629863301785
              precision    recall  f1-score   support

         0.0       0.84      0.97      0.90      4734
         1.0       0.72      0.33      0.45      1266

    accuracy                           0.83      6000
   macro avg       0.78      0.65      0.67      6000
weighted avg       0.82      0.83      0.80      6000

              precision    recall  f1-score   support

         0.0       0.83      0.98      0.89      4734
         1.0       0.72      0.23      0.35      1266

    accuracy                           0.82      6000
   macro avg       0.77      0.61      0.62      6000
weighted avg       0.80      0.82      0.78      6000

              precision    recall  f1-score   support

         0.0       0.85      0.96      0.90      4734
         1.0       0.69      0.35      0.46      1266

    accuracy                           0.83      6000
   macro avg       0.77      0.65      0.68      6000
weighted avg    