# MAP569 Project
## Time: 17/04/2020
## Name: Yelan MO, Jingyun YANG

In [1]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime, timedelta
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,accuracy_score
from sklearn.preprocessing import LabelEncoder,normalize
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import RandomOverSampler
from sklearn.metrics import confusion_matrix, precision_recall_fscore_support
from sklearn import metrics
# from graphviz import Source
from IPython.display import SVG



In [2]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis 
from sklearn import tree
from sklearn import model_selection
import xgboost as xgb
from xgboost import plot_importance



# 1. Data preprocessing

In [3]:
data_df = pd.read_csv('https://raw.githubusercontent.com/jyyang5/MAP569-project/master/CreditTraining.csv')
label_df = data_df['Prod_Category'].tolist()

# divide into categorical data and else as clean
clean_df = data_df.select_dtypes(exclude=['object']).copy()
cat_df = data_df.select_dtypes(include=['object']).copy()

# replace comma with dot 
temp_list = []
for ele in data_df['Net_Annual_Income'].tolist():
  if type(ele) is not float:
    temp_list.append(ele.replace(',','.'))
  else:
    temp_list.append(ele)

clean_df['Net_Annual_Income'] = temp_list
del cat_df['Net_Annual_Income']
clean_df[:2]

Unnamed: 0,Id_Customer,Y,Number_Of_Dependant,Years_At_Residence,Years_At_Business,Nb_Of_Products,Net_Annual_Income
0,7440,0,3.0,1,1.0,1,36
1,573,0,0.0,12,2.0,1,18


**Observations**: there are categorical and timestamps that is not numeric.

## 1.1. Deal with categorical data (timestamp data excluded)

From all categorical datas ['Customer_Type', 'BirthDate', 'Customer_Open_Date', 'P_Client',
       'Educational_Level', 'Marital_Status', 'Net_Annual_Income',
       'Prod_Sub_Category', 'Prod_Decision_Date', 'Source',
       'Type_Of_Residence', 'Prod_Closed_Date', 'Prod_Category']

We first exclude ['BirthDate', 'Customer_Open_Date', 'Net_Annual_Income', 'Prod_Decision_Date']

We use 0-1 encoding for each category

In [4]:
for name in ['Customer_Type', 'P_Client',
            'Educational_Level', 'Marital_Status',
            'Prod_Sub_Category', 'Source',
            'Type_Of_Residence', 'Prod_Category']:
        print(cat_df[name].value_counts())

Non Existing Client    3369
Existing Client        2011
Name: Customer_Type, dtype: int64
NP_Client    4968
P_Client      412
Name: P_Client, dtype: int64
University           4785
Master/PhD            522
Diploma                58
Secondary or Less      15
Name: Educational_Level, dtype: int64
Married      4206
Single       1046
Widowed        64
Divorced       63
Separated       1
Name: Marital_Status, dtype: int64
C    4638
G     624
P     118
Name: Prod_Sub_Category, dtype: int64
Sales     4119
Branch    1261
Name: Source, dtype: int64
Owned       4791
Old rent     323
Parents      179
New rent      83
Company        4
Name: Type_Of_Residence, dtype: int64
B    3176
D     670
C     517
K     265
L     236
G     188
E     101
H      79
J      71
M      49
A      19
F       5
I       4
Name: Prod_Category, dtype: int64


Since 'Customer_Type', 'P_Client', 'Source' all have just two types we therefore restrict ourselves to binary variable.

$$
Customer\_Type = 
 \begin{cases}
1& \text{Existing Client} \\
0 & \text{otherwise}
\end{cases} \\
P\_Client = 
 \begin{cases}
1& \text{P Client} \\
0 & \text{NP Client}
\end{cases} \\
Source= 
 \begin{cases}
1& \text{Sales} \\
0 & \text{Brranch}
\end{cases}
$$



In [5]:
# add categorical data [timestamp data not added]
clean_df1 = clean_df.join(pd.get_dummies(cat_df[['Customer_Type', 'P_Client',
                                                'Educational_Level', 'Marital_Status',
                                                'Prod_Sub_Category', 'Source',
                                                'Type_Of_Residence', 'Prod_Category']]))

clean_df1['Customer_Type'] = clean_df1['Customer_Type_Existing Client']
del clean_df1['Customer_Type_Existing Client']
del clean_df1['Customer_Type_Non Existing Client']

clean_df1['P_Client'] = clean_df1['P_Client_NP_Client']
del clean_df1['P_Client_NP_Client']
del clean_df1['P_Client_P_Client']

clean_df1['Source'] = clean_df1['Source_Branch']
del clean_df1['Source_Branch']
del clean_df1['Source_Sales']


clean_df1[:2]

Unnamed: 0,Id_Customer,Y,Number_Of_Dependant,Years_At_Residence,Years_At_Business,Nb_Of_Products,Net_Annual_Income,Educational_Level_Diploma,Educational_Level_Master/PhD,Educational_Level_Secondary or Less,...,Prod_Category_G,Prod_Category_H,Prod_Category_I,Prod_Category_J,Prod_Category_K,Prod_Category_L,Prod_Category_M,Customer_Type,P_Client,Source
0,7440,0,3.0,1,1.0,1,36,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,573,0,0.0,12,2.0,1,18,0,0,0,...,1,0,0,0,0,0,0,1,0,0


## 1.2. Deal with timestamps 
**Appraoches**: We use duration so that the variable is comparable 

['BirthDate', 'Customer_Open_Date', 'Net_Annual_Income', 'Prod_Decision_Date', 'Prod_Closed_Date']


- `Birth_Duration = Now - BirthDate` ('BirthDate'): assuming there is a distribution of credibility, just started working -> less credit, worked for a long time but not close to retirement -> high credit) 
- `Customer_Open_Duration = Now - Customer_Open_Date` ('Customer_Open_Date'): usually longer the history is, the more royal the customer is
- 'Prod_Closed_Date' - 'Prod_Decision_Date'
    - length of the product?: if product closed 
    - `Prod_not_closed = (Prod_closed != nan)`: dummy variable if the product is closed 'Prod_not_closed' = 0 if closed
    - `Prod_Decision_Duration = Now - Prod_Decision_Date`


In [7]:
# duration: birth - now
temp_list = [(datetime.now().date() - datetime.strptime(datetime_str, '%d/%m/%Y').date()).days for datetime_str in cat_df['BirthDate'].tolist()]
clean_df1['Birth_Duration'] = temp_list

# duration: Customer_Open_Date
temp_list = [(datetime.now().date() - datetime.strptime(datetime_str, '%d/%m/%Y').date()).days for datetime_str in cat_df['Customer_Open_Date'].tolist()]
clean_df1['Customer_Open_Duration'] = temp_list

# dummy var: product closed = 1
temp_list = [int(type(ele) != float) for ele in cat_df['Prod_Closed_Date'].tolist()]
clean_df1['Prod_not_closed'] = temp_list

# duration: Prod_Decision_Date
temp_list = [(datetime.now().date() - datetime.strptime(datetime_str, '%d/%m/%Y').date()).days for datetime_str in cat_df['Prod_Decision_Date'].tolist()]
clean_df1['Prod_Decision_Duration'] = temp_list

# convert to a list 
data_list = []
labels = []
for name in clean_df1.columns.values:
    if name is 'Id_Customer':
        pass
    elif name is 'Y':
        labels = clean_df1[name].tolist()
    else:
        data_list.append(clean_df1[name].tolist())

data_list = np.transpose(data_list)
data_list = np.array(data_list, dtype=np.float64)
np.where(np.isnan(data_list))

(array([ 634, 1879, 1987, 2750, 5045, 5144]), array([1, 5, 3, 3, 1, 5]))

Before we proceed to training and testing, we check if there is nan element

Customers with nan 
- Years_At_Business
    - Id_Customer = 398, 5882	
- Number_Of_Dependant
    - Id_Customer = 8953, 9588	
- Net_Annual_Income
    - Id_Customer = 9399, 9555	

## 1.3. Nan elements 
There are two usual approaches to deal with nan elements, the first one drops the data with nan, the second on the other hand replace the nan element. Since it is very likely that we may face future clients with unkown data (nan), so that we use the second approach to make the method proposed more general.

- Median 
- Most frequent 

In [8]:
clean_df2 = clean_df1.copy()

for name in ['Number_Of_Dependant', 'Years_At_Business', 'Net_Annual_Income']:
    temp_list = np.array(clean_df2[name].tolist(), dtype=np.float64)
    temp_list[np.isnan(temp_list)] = np.median(temp_list[~np.isnan(temp_list)])
    # print(sum(np.isnan(temp_list)))
    clean_df2[name] = temp_list

# convert to train mat 
data_list = []
labels = []
for name in clean_df2.columns.values:
    if name == 'Id_Customer':
        pass
        
    elif name is 'Y':
        labels = clean_df2[name].tolist()
    else:
        data_list.append(clean_df2[name].tolist())

data_list = np.transpose(data_list)
data_list = np.array(data_list, dtype=np.float64)
np.where(np.isnan(data_list))
train_df = clean_df2.copy()
del train_df['Y']
del train_df['Id_Customer']
'Check done: ', np.shape(train_df)== np.shape(data_list)

('Check done: ', True)

# 2. Models 

In [9]:
# split data
X_train, X_test, y_train, y_test = train_test_split(data_list, labels, test_size=0.25, random_state=42)

# np.save('X_train.npy',X_train)
# np.save('X_test.npy',X_test)
# np.save('y_train.npy',y_train)
# np.save('y_test.npy',y_test)

print('-----------all-----------')
for i in set(labels):
    print(i,labels.count(i))
print('percentage of 1:', labels.count(1)/len(labels))

print('-----------train-----------')
for i in set(labels):
    print(i,y_train.count(i))
print('percentage of 1:', y_train.count(1)/len(y_train))

print('-----------test-----------')
for i in set(labels):
    print(i,y_test.count(i))
print('percentage of 1:', y_test.count(1)/len(y_test))


-----------all-----------
0 4987
1 393
percentage of 1: 0.07304832713754647
-----------train-----------
0 3751
1 284
percentage of 1: 0.07038413878562577
-----------test-----------
0 1236
1 109
percentage of 1: 0.08104089219330855


The data seems to be unbalanced (two labels has a relative big difference in percentag), we first try without balancing the data.

## 2.1. Evaluation metrics

| 		|     same clusters|   different clusters|
| :-------- | --------:| :------: |
| same class    |   TP  |  FN |
|different class|   FP  |  TN |
- $P(precision) = \frac{TP}{TP+FP}$ 
- $R(recall) = \frac{TP}{TP+FN}$ 
- $F_{\beta}= \frac{(\beta^2+1)PR}{\beta^2P+R}$

### Note: since label=1 means that the client has defaulted on its credit, which is something that we definately want to avoid, we focus on *R(recall)* (percentage of detected clients among all truly defaulted clients) . In order to taken into consideration the *P(precision)* (the percentage of true clients among all detected clients), we use the *F 0.5-score*, the * $F_\beta$-score where $\beta = 2$, as the evaluation of the model

$\color{red}{\text{From now on, the precison, recall and F-2 score are referred to prediction with label=1}}$




## 2.2. Primal Test with Basic Model: SVM, KNN & Decision Tree

In [11]:

def eval_metric(y_test, preds):
    """ 
    print the classification report 
    """
    print('confusion matrix')
    print(confusion_matrix(y_test, preds))
    print('summary [label=1]')
    beta = 2
    res = precision_recall_fscore_support(y_test, preds, beta = beta, pos_label = 1, average = 'binary')
    print("precision:{}\nrecall:{}\nsupport:{}".format(round(res[0],3),round(res[1],3),res[3]))
    print('-------')
    print("accuracy:",round(accuracy_score(y_test, preds),3))
    print("F{}-score:{}".format(beta,round(res[2],3)))
    fpr, tpr, thresholds = metrics.roc_curve(y_test, preds, pos_label=2)
    metrics.auc(fpr, tpr)

def tests(X_train, y_train, X_test, y_test):
    """ print the evaluation results of deterministic models
    """
    # KNeighbors
    neigh = KNeighborsClassifier(n_neighbors=2)
    neigh.fit(X_train, y_train)
    pred_KNN = neigh.predict(X_test)
    print('----------------K-Neighbors-------------------')
    eval_metric(y_test, pred_KNN)

    # SVM
    clf = SVC(gamma='auto')
    clf.fit(X_train, y_train)
    pred_SVM = clf.predict(X_test)
    print('----------------SVM-------------------')
    eval_metric(y_test, pred_SVM)
    
    #Decision Tree
    dt = tree.DecisionTreeClassifier()
    dt.fit(X_train, y_train)
    pred_DT = dt.predict(X_test)
    print('-----------------Decision Tree------------------')
    eval_metric(y_test, pred_DT)
   

In [12]:
tests(X_train, y_train, X_test, y_test)

----------------K-Neighbors-------------------
confusion matrix
[[1230    6]
 [ 107    2]]
summary [label=1]
precision:0.25
recall:0.018
support:None
-------
accuracy: 0.916
F2-score:0.023
----------------SVM-------------------
confusion matrix
[[1236    0]
 [ 109    0]]
summary [label=1]
precision:0.0
recall:0.0
support:None
-------
accuracy: 0.919
F2-score:0.0
-----------------Decision Tree------------------
confusion matrix
[[1179   57]
 [  56   53]]
summary [label=1]
precision:0.482
recall:0.486
support:None
-------
accuracy: 0.916
F2-score:0.485


**Observations**
- The accuracy of the three basic models reaches 0.9, which is satisfaisant for other ML tasks. However, we can notice that the SVM and KNN models predict that all the labels are '0', thanks to the imbalance of the data, the accuracy is still high with this sort of prediction, but it is not what we want. This is one reason why we chose to focus on Recall, Precision and F-score instead of Accuracy.
- The rebalancing of input data is necessary espacially for models such as SVM, KNN and LR 
- The F2-score of Decision Tree reaches 0.47 with imbalanced data, this shows that the Decision Tree, as well as random forest, which will be used in the next part, could work well on imbalanced data.

## 2.3. Balance data
In Section 2.2, we did some experiments but find that even if the overall accurancy is high, the prediction results of clients with label=1 is not good. Especially with classifiers like SVM, the class with more samples (majority class) is favoured. We therefore resample the minority class to balance the train data and see how the result might improve. Since the size of the minority class is very small, we w.l.o.g. use Upsampling.   

# 3. Encapsulation 
We combine all mentioned, add a few more functionalities and create the class below 

In [21]:
class Credit_predictor():
    def __init__(self,path):
        """
        read the data
        """
        self.raw_data = pd.read_csv(path)
        self.clean_df = self.raw_data.select_dtypes(exclude=['object']).copy()
        self.cat_df = self.raw_data.select_dtypes(include=['object']).copy()
        # categorical data 
        self.cates = ['Customer_Type', 'P_Client',
            'Educational_Level', 'Marital_Status',
            'Prod_Sub_Category', 'Source',
            'Type_Of_Residence', 'Prod_Category']
        # all timestamp data 
        self.date_trans_set = {'Birth_Duration':'BirthDate',
             'Customer_Open_Duration':'Customer_Open_Date',
             'Prod_Decision_Duration':'Prod_Decision_Date'}

        self.data_preprocessing_0()
        self.data_preprocessing_simple()
        self.data_preprocessing_onehot()

        # one-hot
        self.X_train, self.X_test, self.y_train, self.y_test = self.split_dataset(self.data_oh)
        # one-hot [balanced]
        self.X_train_b, self.X_test_b, self.y_train_b, self.y_test_b = self.split_dataset_4_imbal(self.data_oh)
        # categorical -> 0:n_classes-1
        self.X_train_t, self.X_test_t, self.y_train_t, self.y_test_t = self.split_dataset(self.data_simple)
        # categorical -> 0:n_classes-1 [balanced]
        self.X_train_tb, self.X_test_tb, self.y_train_tb, self.y_test_tb = self.split_dataset_4_imbal(self.data_simple)	
        
        
    def data_preprocessing_0(self):
        """
        seperate the data into numerical set and categorical set
        """
        self.clean_df = self.raw_data.select_dtypes(exclude=['object']).copy()
        self.cat_df = self.raw_data.select_dtypes(include=['object']).copy()
        temp_list = []
        # convert value from comma to dot 
        for ele in self.raw_data['Net_Annual_Income'].tolist():
            if type(ele) is not float:
                temp_list.append(ele.replace(',','.'))
            else:
                temp_list.append(ele)
        self.clean_df['Net_Annual_Income'] = temp_list
        del self.cat_df['Net_Annual_Income']
        self.dates_transformer()
        
    def dates_transformer(self):
        """
        processing for the date
        """
        # convert timestamp to duration: now - timestamp 
        for i in self.date_trans_set:
            temp_list = [(datetime.now().date() - datetime.strptime(datetime_str, '%d/%m/%Y').date()).days for datetime_str in self.cat_df[self.date_trans_set[i]].tolist()]
            self.clean_df[i] = temp_list
            del self.cat_df[self.date_trans_set[i]]
        # indicator whether product is closed 
        temp_list = [int(type(ele) != float) for ele in self.cat_df['Prod_Closed_Date'].tolist()]
        self.clean_df['Prod_not_closed'] = temp_list
        del self.cat_df['Prod_Closed_Date']

    def data_preprocessing_simple(self):
        self.data_simple = self.clean_df.copy()
        for i in self.cates:
            labelencoder = LabelEncoder()
            labelencoder.fit(self.cat_df[i])
            self.data_simple[i] = labelencoder.transform(self.cat_df[i])
        
    def data_preprocessing_onehot(self):
        binary = ['Customer_Type', 'P_Client', 'Source']
        no_binary = [i for i in self.cates if i not in binary]
        self.data_oh = self.clean_df.copy()
        for i in binary:
            labelencoder = LabelEncoder()
            labelencoder.fit(self.cat_df[i])
            """
            To see the representation of the labels : list(labelencoder.classes_)
            """
            self.data_oh[i] = labelencoder.transform(self.cat_df[i])
        self.data_oh = self.data_oh.join(pd.get_dummies(self.cat_df[no_binary]))
        
    def split_dataset(self,dataset):
        X = dataset.drop('Y',axis = 1)
        Y = dataset.Y
        try:
            imp_mean = SimpleImputer(missing_values=np.nan, strategy='median')
            imp_mean.fit(X)
        except:
            imp_mean = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
            imp_mean.fit(X)
        '''
        SimpleImputer:
        strategy : mean, median, most_frequent, constant
        '''
        X_imputed = imp_mean.transform(X)
        X_train, X_test, y_train, y_test = train_test_split(X_imputed, Y, test_size=0.25, random_state=42)
        return X_train, X_test, y_train, y_test
    
    
    def split_dataset_4_imbal(self,dataset):
        X_train, X_test, y_train, y_test = self.split_dataset(dataset)
        ros = RandomOverSampler(random_state=0)
        X_train, y_train = ros.fit_resample(X_train, y_train)
   #     ros = RandomOverSampler(random_state=0)
   #     X_test, y_test = ros.fit_resample(X_test, y_test)
        return X_train, X_test, y_train, y_test
        
    
    def convert_labels(self,labels):
        """ 
        convert the input (0/1) labels to what is needed 
        return a binary label sequence 
        """
        labels = np.array(labels)
        if sum(labels) > len(labels)/2:
            labels[labels==1] = -1
            labels[labels==0] = 1
            labels[labels==-1] = 0
        return labels

    def eval_metric(self,y_test, preds):
        """ 
        print the classification report 
        """
        print('confusion matrix')
        print(confusion_matrix(y_test, preds))
        print('--summary[label=1]--')
     #   print('summary')
        beta = 2
        res = precision_recall_fscore_support(y_test, preds, beta = beta, pos_label = 1, average = 'binary')
        print("precision:{}\nrecall:{}\nsupport:{}".format(round(res[0],3),round(res[1],3),res[3]))
        print('--------------------')
        print("accuracy:",round(accuracy_score(y_test, preds),3))
        print("F{}-score:{}".format(beta,round(res[2],3)))
        print("\n")
        fpr, tpr, thresholds = metrics.roc_curve(y_test, preds, pos_label=2)
        metrics.auc(fpr, tpr)   

            
    def model_eval(self, model, getPred = False, Normalize = False):
        """
        print the evaluation result of the model with the Cp object 

        if getPred is True return prediction sequence
        """
        # self.data_preprocessing_0()
        # self.data_preprocessing_simple()
        # self.data_preprocessing_onehot()
        
        if Normalize:
            X_train = normalize(X_train, norm='l2')
            X_test = normalize(X_test, norm='l2')
            
        print('----------------one-hot-------------------')
        model.fit(self.X_train, self.y_train)
        pred_y = model.predict(self.X_test)
        self.eval_metric(self.y_test, pred_y)
        print('----------------one-hot[balanced]-------------------')
        model.fit(self.X_train_b, self.y_train_b)
        pred_y_b = model.predict(self.X_test_b)
        self.eval_metric(self.y_test_b, pred_y_b)
        print('----------------0:n_classes-1-------------------')
        model.fit(self.X_train_t, self.y_train_t)
        pred_y_t = model.predict(self.X_test_t)
        Cp.eval_metric(self.y_test_t, pred_y_t)
        print('----------------0:n_classes-1[balanced]-------------------')
        model.fit(self.X_train_tb, self.y_train_tb)
        pred_y_tb = model.predict(self.X_test_tb)
        Cp.eval_metric(self.y_test_tb, pred_y_tb)
        
        if getPred is True:
            return pred_y, pred_y_b, pred_y_t, pred_y_tb


In [22]:
path = 'CreditTraining.csv'
Cp = Credit_predictor(path)


## 3.1. Model selection 
### 3.1.1 LogisticRegression
The *liblinear* method is chosen as it applies a coordinate descent (CD) algorithm, which performs better in a categarical data.

The result of this model shows that the resampling of the data affect much the result for Logistic regression, in particular the *precision*, *recall*, and *F2-score*. Weather using onehot embedding or not does not affect much the result, which is the same case for all the other tests.

The Logistic Regression Model reaches satisfactory recall and F2 score with balanced data, but low precision

In [23]:
lr = LogisticRegression(solver='liblinear')
lr_pred_y, lr_pred_y_b, lr_pred_y_t, lr_pred_y_tb = Cp.model_eval(lr,True)

----------------one-hot-------------------
confusion matrix
[[1233    3]
 [  96   13]]
--summary[label=1]--
precision:0.812
recall:0.119
support:None
--------------------
accuracy: 0.926
F2-score:0.144


----------------one-hot[balanced]-------------------
confusion matrix
[[1110  126]
 [  16   93]]
--summary[label=1]--
precision:0.425
recall:0.853
support:None
--------------------
accuracy: 0.894
F2-score:0.71


----------------0:n_classes-1-------------------
confusion matrix
[[1226   10]
 [  89   20]]
--summary[label=1]--
precision:0.667
recall:0.183
support:None
--------------------
accuracy: 0.926
F2-score:0.215


----------------0:n_classes-1[balanced]-------------------
confusion matrix
[[1112  124]
 [  16   93]]
--summary[label=1]--
precision:0.429
recall:0.853
support:None
--------------------
accuracy: 0.896
F2-score:0.712




### 3.1.2 Random Forest 
The rebalancing of the data affects not that much the *precision*, *recall*, and *F2-score* as Logistic Regression. Still, we can observe that the resampling could augment the *recall* and *F2-score*, but sacrifice the *precision*

In [24]:
rfc = RandomForestClassifier(random_state=100)
rfc_pred_y, rfc_pred_y_b, rfc_pred_y_t, rfc_pred_y_tb = Cp.model_eval(rfc, True)

----------------one-hot-------------------
confusion matrix
[[1228    8]
 [  76   33]]
--summary[label=1]--
precision:0.805
recall:0.303
support:None
--------------------
accuracy: 0.938
F2-score:0.346


----------------one-hot[balanced]-------------------
confusion matrix
[[1205   31]
 [  60   49]]
--summary[label=1]--
precision:0.612
recall:0.45
support:None
--------------------
accuracy: 0.932
F2-score:0.475


----------------0:n_classes-1-------------------
confusion matrix
[[1222   14]
 [  69   40]]
--summary[label=1]--
precision:0.741
recall:0.367
support:None
--------------------
accuracy: 0.938
F2-score:0.408


----------------0:n_classes-1[balanced]-------------------
confusion matrix
[[1197   39]
 [  57   52]]
--summary[label=1]--
precision:0.571
recall:0.477
support:None
--------------------
accuracy: 0.929
F2-score:0.493




### 3.1.3. SVM
The linear kernel is applied, as it gives the best result for SVM, but the calculation time is sacrified. The normalization of the data does not affect much the result.

In [25]:
clf = SVC(gamma='auto',kernel='linear')
clf_pred_y, clf_pred_y_b, clf_pred_y_t, clf_pred_y_tb = Cp.model_eval(clf, True)

----------------one-hot-------------------
confusion matrix
[[1236    0]
 [ 107    2]]
--summary[label=1]--
precision:1.0
recall:0.018
support:None
--------------------
accuracy: 0.92
F2-score:0.023


----------------one-hot[balanced]-------------------
confusion matrix
[[818 418]
 [ 12  97]]
--summary[label=1]--
precision:0.188
recall:0.89
support:None
--------------------
accuracy: 0.68
F2-score:0.51


----------------0:n_classes-1-------------------
confusion matrix
[[1235    1]
 [ 101    8]]
--summary[label=1]--
precision:0.889
recall:0.073
support:None
--------------------
accuracy: 0.924
F2-score:0.09


----------------0:n_classes-1[balanced]-------------------
confusion matrix
[[911 325]
 [ 15  94]]
--summary[label=1]--
precision:0.224
recall:0.862
support:None
--------------------
accuracy: 0.747
F2-score:0.55




### 3.1.4. LDA
The LDA(Linear Discriminant Analysis) model gives high *recall* but comparatively low *precision*, the rebalancing of the data and the use of onehot embedding does not affect much the result.

In [26]:
LDA = LinearDiscriminantAnalysis()
LDA_pred_y, LDA_pred_y_b, LDA_pred_y_t, LDA_pred_y_tb = Cp.model_eval(LDA, True)

----------------one-hot-------------------
confusion matrix
[[1134  102]
 [  21   88]]
--summary[label=1]--
precision:0.463
recall:0.807
support:None
--------------------
accuracy: 0.909
F2-score:0.703


----------------one-hot[balanced]-------------------
confusion matrix
[[1107  129]
 [  13   96]]
--summary[label=1]--
precision:0.427
recall:0.881
support:None
--------------------
accuracy: 0.894
F2-score:0.726


----------------0:n_classes-1-------------------
confusion matrix
[[1124  112]
 [  18   91]]
--summary[label=1]--
precision:0.448
recall:0.835
support:None
--------------------
accuracy: 0.903
F2-score:0.712


----------------0:n_classes-1[balanced]-------------------
confusion matrix
[[1108  128]
 [  13   96]]
--summary[label=1]--
precision:0.429
recall:0.881
support:None
--------------------
accuracy: 0.895
F2-score:0.727




### 3.1.5. XGBoost 
For XGBoost, resampling seems to improve a lot regarding the recall while sacrifice a little of the precision, the result after resampling is similar to what LDA achieved. Through experiments changing objective and evaluation metric do not seem to have a huge difference.
 


In [27]:
xg_reg = xgb.XGBClassifier(objective ='reg:logistic')
xg_pred_y, xg_pred_y_b, xg_pred_y_t, xg_pred_y_tb = Cp.model_eval(xg_reg, True)

----------------one-hot-------------------
confusion matrix
[[1219   17]
 [  83   26]]
--summary[label=1]--
precision:0.605
recall:0.239
support:None
--------------------
accuracy: 0.926
F2-score:0.271


----------------one-hot[balanced]-------------------
confusion matrix
[[1115  121]
 [  15   94]]
--summary[label=1]--
precision:0.437
recall:0.862
support:None
--------------------
accuracy: 0.899
F2-score:0.722


----------------0:n_classes-1-------------------
confusion matrix
[[1220   16]
 [  76   33]]
--summary[label=1]--
precision:0.673
recall:0.303
support:None
--------------------
accuracy: 0.932
F2-score:0.34


----------------0:n_classes-1[balanced]-------------------
confusion matrix
[[1111  125]
 [  16   93]]
--summary[label=1]--
precision:0.427
recall:0.853
support:None
--------------------
accuracy: 0.895
F2-score:0.711





**Observations**
- Balancing the data indeed helps to improve the model performance 
- We **select logistiRegression, LDA, and XGBoost given their $R_{\text{label=1}}$ around 0.85, $F1_{\text{label=1}} \approx 0.72$, $P_{\text{label=1}}$ around 0.42 using the two balanced data.**

# 4. Voting  

In this section, we propose two voting methods that utilize the predictions of several models and hopes it would maximize the information obatined so far.
- Improve precision of predicted defaulted
- Improve coverage of truly defaulted

## 4.1. Minimize innocent (Improve precision of predicted defaulted)

Since we have three models **logistiRegression, LDA, and XGBoost** using balanced data have $R_{\text{label=1}}$ around 0.85 meaning most of the defaulted customers are detected (which we think is a **relative good coverage**).

But $\text{label=1} \approx 0.42$ , meaning more than half of the predicted is not defaulted, and it would be costly to do a thorough investigation of the users so we want to **minimize those innocent but detected**.   

With 3 predictions from 3 individual parties, we want to extract the mutual information and believing that if one client is predictd defaulted by all parties then there is a larger probability that the one is defaulted compared with (1/3 of the parties voted so). And of course, we might lose some covergae. **We try taking a logical AND to do this**.

**Essentially, we are doing a voting by setting the weight to be 100% iff. all parties vote 1, ensuring that the number of inocent client is minimized**




In [29]:
Cp.eval_metric(Cp.y_test_tb, LDA_pred_y_tb & xg_pred_y_tb & lr_pred_y_tb)

confusion matrix
[[1118  118]
 [  17   92]]
--summary[label=1]--
precision:0.438
recall:0.844
support:None
--------------------
accuracy: 0.9
F2-score:0.712




In [30]:
from scipy.spatial import distance

parties_pred_mat = [LDA_pred_y_tb, xg_pred_y_tb, lr_pred_y_tb]
distance.cdist(parties_pred_mat, parties_pred_mat, 'hamming')

array([[0.        , 0.01189591, 0.00669145],
       [0.01189591, 0.        , 0.00966543],
       [0.00669145, 0.00966543, 0.        ]])

Indeed, the *Minimize innocent* voting mechanism fail to improve a $P_{\text{label=1}}$ a lot is largely due to the percentage of shared information is dominant for all three parties (around 1% difference).

## 4.2. Add trustworthy information ( Improve coverage of truly defaulted)

We use one of the three parties as a base (with good covergae) and then we add prediction by party with high $P_{\text{label=1}}$ but possibly low coverage ($R_{\text{label=1}}$), so as to improve precison and recall at the same time.

**Essentially, we are doing a voting by setting the weight of trustworthy party to 100%, meaning once they vote 1, the client is 1 in prediction.**

- Base
    - `lr_pred_y_tb`: $P_{\text{label=1}}=0.43, R_{\text{label=1}}=0.85$ 
    - `LDA_pred_y_tb`: $P_{\text{label=1}}=0.43, R_{\text{label=1}}=0.88$  
    - `xg_pred_y_tb`: $P_{\text{label=1}}=0.43, R_{\text{label=1}}=0.85$ 
- Trustworthy
    - `lr_pred_y` : $P_{\text{label=1}}=0.81, R_{\text{label=1}}=0.11$ 
    - `rfc_pred_y` : $P_{\text{label=1}}=0.76, R_{\text{label=1}}=0.28$ 
    - `clf_pred_y`: $P_{\text{label=1}}=1.0, R_{\text{label=1}}=0.02$ 
    - `clf_pred_y_t`: $P_{\text{label=1}}=0.89, R_{\text{label=1}}=0.07$ 

In [35]:
print('-------------------LDA+lr-------------------')
Cp.eval_metric(Cp.y_test_tb, lr_pred_y | LDA_pred_y_tb)

print('-------------------LDA+rfc-------------------')
Cp.eval_metric(Cp.y_test_tb, rfc_pred_y | LDA_pred_y_tb)

print('-------------------LDA+SVM-------------------')
Cp.eval_metric(Cp.y_test_tb, clf_pred_y | LDA_pred_y_tb)

print('-------------------LDA+rfc+lr+SVM-------------------')
Cp.eval_metric(Cp.y_test_tb, clf_pred_y | lr_pred_y | rfc_pred_y | LDA_pred_y_tb)

-------------------LDA+lr-------------------
confusion matrix
[[1108  128]
 [  13   96]]
--summary[label=1]--
precision:0.429
recall:0.881
support:None
--------------------
accuracy: 0.895
F2-score:0.727


-------------------LDA+rfc-------------------
confusion matrix
[[1108  128]
 [  13   96]]
--summary[label=1]--
precision:0.429
recall:0.881
support:None
--------------------
accuracy: 0.895
F2-score:0.727


-------------------LDA+SVM-------------------
confusion matrix
[[1108  128]
 [  13   96]]
--summary[label=1]--
precision:0.429
recall:0.881
support:None
--------------------
accuracy: 0.895
F2-score:0.727


-------------------LDA+rfc+lr+SVM-------------------
confusion matrix
[[1108  128]
 [  13   96]]
--summary[label=1]--
precision:0.429
recall:0.881
support:None
--------------------
accuracy: 0.895
F2-score:0.727




In [36]:
print('-------------------XG+lr-------------------')
Cp.eval_metric(Cp.y_test_tb, lr_pred_y | xg_pred_y_tb)

print('-------------------XG+rfc-------------------')
Cp.eval_metric(Cp.y_test_tb, rfc_pred_y | xg_pred_y_tb)

print('-------------------XG+SVM-------------------')
Cp.eval_metric(Cp.y_test_tb, clf_pred_y | xg_pred_y_tb)

print('-------------------XG+rfc+lr+SVM-------------------')
Cp.eval_metric(Cp.y_test_tb, lr_pred_y | rfc_pred_y | clf_pred_y | xg_pred_y_tb)

-------------------XG+lr-------------------
confusion matrix
[[1111  125]
 [  16   93]]
--summary[label=1]--
precision:0.427
recall:0.853
support:None
--------------------
accuracy: 0.895
F2-score:0.711


-------------------XG+rfc-------------------
confusion matrix
[[1111  125]
 [  16   93]]
--summary[label=1]--
precision:0.427
recall:0.853
support:None
--------------------
accuracy: 0.895
F2-score:0.711


-------------------XG+SVM-------------------
confusion matrix
[[1111  125]
 [  16   93]]
--summary[label=1]--
precision:0.427
recall:0.853
support:None
--------------------
accuracy: 0.895
F2-score:0.711


-------------------XG+rfc+lr+SVM-------------------
confusion matrix
[[1111  125]
 [  16   93]]
--summary[label=1]--
precision:0.427
recall:0.853
support:None
--------------------
accuracy: 0.895
F2-score:0.711




In [37]:
print('-------------------lr+rfc-------------------')
Cp.eval_metric(Cp.y_test_tb, rfc_pred_y | lr_pred_y_tb)

print('-------------------lr+SVM-------------------')
Cp.eval_metric(Cp.y_test_tb, clf_pred_y | lr_pred_y_tb)

print('-------------------lr+rfc+SVM-------------------')
Cp.eval_metric(Cp.y_test_tb, rfc_pred_y | clf_pred_y | lr_pred_y_tb)

-------------------lr+rfc-------------------
confusion matrix
[[1112  124]
 [  15   94]]
--summary[label=1]--
precision:0.431
recall:0.862
support:None
--------------------
accuracy: 0.897
F2-score:0.719


-------------------lr+SVM-------------------
confusion matrix
[[1112  124]
 [  16   93]]
--summary[label=1]--
precision:0.429
recall:0.853
support:None
--------------------
accuracy: 0.896
F2-score:0.712


-------------------lr+rfc+SVM-------------------
confusion matrix
[[1112  124]
 [  15   94]]
--summary[label=1]--
precision:0.431
recall:0.862
support:None
--------------------
accuracy: 0.897
F2-score:0.719




Unfortunately, these *trustworthy information* seem to have been covered already by the parties with good coverage.

# 5. Conclusion 

Given the current dataset, using LDA on `0:n_classes-1[balanced]` gives the best result (precision:0.429, recall:0.881, accuracy: 0.895, F2-score:0.727) so far. Modles like XGBosst and linear regression using the same embedding gives a similar result. This suggests that the model has tried to extract as much as the infomration as they can. And for future work more domain-specific and fine-grained feature engineering is needed to further improve the result.

We also experimented with two proposed voting mechanisms, but both failed to improve the result given the information is largely covered by the mentioned three models. For future work, one potential appraoch is to perform voting using part of the data following a similar idea like the cross-validation. 

