# Group Coursework - Credit Card Default Prediction
#### This notebook has been accomplished by Group F: Chi, Jingting, Steve, Danny, Xiaoqi and Nicolas

#### Steps:
1. Introduction
2. Data Import
3. Data Transformation & Exploration
4. Methodology Overview
5. Model Training & Validation
6. Results
7. Final Predictions on Test Set

## Introduction

Blablabla... This part should be similar within the report and will be filled in after the report is done.

In [10]:
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn import ensemble
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score,classification_report
from sklearn.impute import SimpleImputer
from sklearn.neighbors import KNeighborsClassifier

## Data Import

In [11]:
# read data from train and test sets
p_train = pd.read_csv("CreditCard_train.csv",encoding='gbk')
p_test = pd.read_csv("CreditCard_test.csv",encoding='gbk')

In [12]:
print(p_train)
print(p_test)

      Unnamed: 0         X1   X2         X3        X4   X5     X6     X7  \
0             ID  LIMIT_BAL  SEX  EDUCATION  MARRIAGE  AGE  PAY_0  PAY_2   
1              1      20000    2          2         1   24      2      2   
2              2     120000    2          2         2   26     -1      2   
3              3      90000    2          2         2   34      0      0   
4              4      50000    2          2         1   37      0      0   
...          ...        ...  ...        ...       ...  ...    ...    ...   
23996      23996      80000    1          2         1   25      1      2   
23997      23997      20000    1          2         1   25      0      0   
23998      23998      10000    1          2         2   26      0      0   
23999      23999      20000    1          5         2   26      0      0   
24000      24000     100000    1          1         2   26      0     -1   

          X8     X9  ...        X15        X16        X17       X18       X19  \
0     

## Data Transformation & Exploration

In [13]:
# Data cleaning
# Delete unnecessary keys

origin_keys_train = p_train.keys()
origin_keys_test = p_test.keys()
need_keys_train = origin_keys_train[1:]
need_keys_test = origin_keys_test[1:]
dataset_train = []
dataset_test = []

for key in need_keys_train:
    data = []
    for i in range(len(p_train[key])):
        if i==0:
            continue
        else:
            data.append(p_train[key][i])
    dataset_train.append(data)
    
for key in need_keys_test:
    data = []
    for i in range(len(p_test[key])):
        if i==0:
            continue
        else:
            data.append(p_test[key][i])
    dataset_test.append(data)

# Transform and transpose data sets
dataset_train = np.array(dataset_train,dtype=np.float)
dataset_train = dataset_train.transpose()
dataset_test= np.array(dataset_test,dtype=np.float)
dataset_test = dataset_test.transpose()

# Check if there are missing values, preprocess it if so.
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
imp.fit(dataset_train)

# Achieve the transformed data sets
dataset_train = imp.transform(dataset_train)
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
imp.fit(dataset_test)

dataset_test = imp.transform(dataset_test)

In [14]:
print(dataset_train.shape)

(24000, 24)


In [15]:
# data exploration
x,y = dataset_train[:,:23],dataset_train[:,-1]

# Split data sets
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.1,shuffle=True)
print(x_train.shape,x_test.shape,y_train.shape,y_test.shape)

# Standarize data
scaler = preprocessing.StandardScaler().fit(x_train)
x_train,x_test = scaler.transform(x_train),scaler.transform(x_test)

(21600, 23) (2400, 23) (21600,) (2400,)


## Methodology Overview

Here provided are three different methods called  Adaboost(Adaptive Boosting), Random Forest and GBRT(Gradient Boosted Regression Trees) respectively.

In [16]:
class adaboost(object):

    def __init__(self):
        # build the model
        # self.model = ensemble.AdaBoostClassifier(n_estimators=100,learning_rate=0.001)
        # self.model = ensemble.BaggingClassifier(base_estimator=KNeighborsClassifier(n_neighbors=12,),n_estimators=200)
        self.model  = ensemble.AdaBoostClassifier(n_estimators=200,learning_rate=0.001)
        # create the model
        # self.model = ensemble.BaggingClassifier(KNeighborsClassifier(), n_estimators=10, max_samples=0.5,max_features=0.5)

    def train(self,x_train,y_train):
        # training
        self.model.fit(x_train,y_train)

    def predict(self,x_test):
        # predicting
        y_pred = self.model.predict(x_test)

        return y_pred

    
class random_forest(object):

    def __init__(self):
        # build the model
        # self.model = ensemble.AdaBoostClassifier(n_estimators=100,learning_rate=0.001)
        # self.model = ensemble.BaggingClassifier(base_estimator=KNeighborsClassifier(n_neighbors=12,),n_estimators=200)
        # self.model  = ensemble.AdaBoostClassifier(n_estimators=200,learning_rate=0.001)
        # create the model
        self.model = ensemble.BaggingClassifier(KNeighborsClassifier(), n_estimators=200, max_samples=0.5,max_features=0.5)

    def train(self,x_train,y_train):
        # training
        self.model.fit(x_train,y_train)

    def predict(self,x_test):
        # predicting
        y_pred = self.model.predict(x_test)
        return y_pred

    
class GBRT(object):

    def __init__(self):
        # create the model
        self.model = ensemble.GradientBoostingClassifier(n_estimators=200)

    def train(self,x_train,y_train):
        # training
        self.model.fit(x_train,y_train)

    def predict(self,x_test):
        # predicting
        y_pred = self.model.predict(x_test)
        return y_pred

## Model Training & Validation

In [17]:
model1 = adaboost()
model2 = random_forest()
model3 = GBRT()

model1.train(x_train,y_train)
model2.train(x_train,y_train)
model3.train(x_train,y_train)

## Results

In [18]:
# prediction
y_pred_1 = model1.predict(x_test)
y_pred_2 = model2.predict(x_test)
y_pred_3 = model3.predict(x_test)

# Obtain the f1 score and report for three methods
print(f1_score(y_test,y_pred_1))
print(f1_score(y_test,y_pred_2))
print(f1_score(y_test,y_pred_3))

print(classification_report(y_test,y_pred_1))
print(classification_report(y_test,y_pred_2))
print(classification_report(y_test,y_pred_3))

0.47981545559400235
0.4034869240348693
0.513157894736842
              precision    recall  f1-score   support

         0.0       0.83      0.95      0.89      1826
         1.0       0.71      0.36      0.48       574

    accuracy                           0.81      2400
   macro avg       0.77      0.66      0.68      2400
weighted avg       0.80      0.81      0.79      2400

              precision    recall  f1-score   support

         0.0       0.81      0.96      0.88      1826
         1.0       0.71      0.28      0.40       574

    accuracy                           0.80      2400
   macro avg       0.76      0.62      0.64      2400
weighted avg       0.79      0.80      0.77      2400

              precision    recall  f1-score   support

         0.0       0.84      0.94      0.89      1826
         1.0       0.69      0.41      0.51       574

    accuracy                           0.81      2400
   macro avg       0.76      0.68      0.70      2400
weighted avg     

## Final Predictions on Test Set

In [19]:
# now predict all test sets
# retrain all the train sets by using three methods
model1.train(x, y)
model2.train(x, y)
model3.train(x, y)

# predictions
x_for_test,y_for_test = dataset_test[:,:23],dataset_test[:,-1]
y_for_pred_1 = model1.predict(x_for_test)
y_for_pred_2 = model2.predict(x_for_test)
y_for_pred_3 = model3.predict(x_for_test)

# Obtain the f1 score and report for three methods
print(f1_score(y_for_pred_1,y_for_test))
print(f1_score(y_for_pred_2,y_for_test))
print(f1_score(y_for_pred_3,y_for_test))

print(classification_report(y_for_pred_1,y_for_test))
print(classification_report(y_for_pred_2,y_for_test))
print(classification_report(y_for_pred_3,y_for_test))

0.4478260869565217
0.0962962962962963
0.47133087848500793
              precision    recall  f1-score   support

         0.0       0.97      0.84      0.90      5426
         1.0       0.33      0.72      0.45       574

    accuracy                           0.83      6000
   macro avg       0.65      0.78      0.67      6000
weighted avg       0.90      0.83      0.86      6000

              precision    recall  f1-score   support

         0.0       1.00      0.80      0.89      5916
         1.0       0.05      0.77      0.10        84

    accuracy                           0.80      6000
   macro avg       0.52      0.79      0.49      6000
weighted avg       0.98      0.80      0.87      6000

              precision    recall  f1-score   support

         0.0       0.96      0.85      0.90      5365
         1.0       0.35      0.71      0.47       635

    accuracy                           0.83      6000
   macro avg       0.66      0.78      0.69      6000
weighted avg    

In [20]:
# write the data into csv file
dataframe = pd.DataFrame({'origin_data': y_for_test, 'adaboost_pred': y_for_pred_1,'random_forst_pred': y_for_pred_2,
                          'GBRT_pred': y_for_pred_3})

dataframe.to_csv("result.csv",index=False,sep=",")
pf = pd.read_csv("result.csv",encoding="gbk")

In [22]:
# print the file into jupyter notebook
print(pf)

      origin_data  adaboost_pred  random_forst_pred  GBRT_pred
0             0.0            1.0                0.0        1.0
1             1.0            0.0                0.0        0.0
2             0.0            0.0                0.0        0.0
3             0.0            1.0                0.0        0.0
4             0.0            0.0                0.0        0.0
...           ...            ...                ...        ...
5995          0.0            0.0                0.0        0.0
5996          0.0            0.0                0.0        0.0
5997          1.0            1.0                0.0        1.0
5998          1.0            0.0                0.0        0.0
5999          1.0            0.0                0.0        0.0

[6000 rows x 4 columns]
