In [135]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime, timedelta
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder,normalize
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import RandomOverSampler
from sklearn.metrics import classification_report, confusion_matrix
from sklearn import metrics



In [143]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis 

In [136]:
class Credit_predictor():
    def __init__(self,path):
        """
        read the data
        """
        self.raw_data = pd.read_csv('CreditTraining.csv')
        self.clean_df = self.raw_data.select_dtypes(exclude=['object']).copy()
        self.cat_df = self.raw_data.select_dtypes(include=['object']).copy()
        self.cates = ['Customer_Type', 'P_Client',
            'Educational_Level', 'Marital_Status',
            'Prod_Sub_Category', 'Source',
            'Type_Of_Residence', 'Prod_Category']
        self.date_trans_set = {'Birth_Duration':'BirthDate',
             'Customer_Open_Duration':'Customer_Open_Date',
             'Prod_Decision_Duration':'Prod_Decision_Date'}
        
    def data_preprocessing_0(self):
        """
        seperate the data into numerical set and categorical set
        """
        self.clean_df = self.raw_data.select_dtypes(exclude=['object']).copy()
        self.cat_df = self.raw_data.select_dtypes(include=['object']).copy()
        temp_list = []
        for ele in self.raw_data['Net_Annual_Income'].tolist():
            if type(ele) is not float:
                temp_list.append(ele.replace(',','.'))
            else:
                temp_list.append(ele)

        self.clean_df['Net_Annual_Income'] = temp_list
        del self.cat_df['Net_Annual_Income']
        self.dates_transformer()
        
    def dates_transformer(self):
        """
        processing for the date
        """
        for i in self.date_trans_set:
            temp_list = [(datetime.now().date() - datetime.strptime(datetime_str, '%d/%m/%Y').date()).days for datetime_str in self.cat_df[self.date_trans_set[i]].tolist()]
            self.clean_df[i] = temp_list
            del self.cat_df[self.date_trans_set[i]]
        temp_list = [int(type(ele) != float) for ele in self.cat_df['Prod_Closed_Date'].tolist()]
        self.clean_df['Prod_not_closed'] = temp_list
        del self.cat_df['Prod_Closed_Date']

    def data_preprocessing_simple(self):
        self.data_simple = self.clean_df.copy()
        for i in self.cates:
            labelencoder = LabelEncoder()
            labelencoder.fit(self.cat_df[i])
            self.data_simple[i] = labelencoder.transform(self.cat_df[i])
        
    def data_preprocessing_onehot(self):
        binary = ['Customer_Type', 'P_Client', 'Source']
        no_binary = [i for i in self.cates if i not in binary]
        self.data_oh = self.clean_df.copy()
        for i in binary:
            labelencoder = LabelEncoder()
            labelencoder.fit(self.cat_df[i])
            """
            To see the representation of the labels : list(labelencoder.classes_)
            """
            self.data_oh[i] = labelencoder.transform(self.cat_df[i])
        self.data_oh = self.data_oh.join(pd.get_dummies(self.cat_df[no_binary]))
        
    def split_dataset(self,dataset):
        X = dataset.drop('Y',axis = 1)
        Y = dataset.Y
        try:
            imp_mean = SimpleImputer(missing_values=np.nan, strategy='median')
            imp_mean.fit(X)
        except:
            imp_mean = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
            imp_mean.fit(X)
        '''
        SimpleImputer:
        strategy : mean, median, most_frequent, constant
        '''
        X_imputed = imp_mean.transform(X)
        X_train, X_test, y_train, y_test = train_test_split(X_imputed, Y, test_size=0.25, random_state=42)
        return X_train, X_test, y_train, y_test
    
    def split_dataset_4_imbal(self,dataset):
        X_train, X_test, y_train, y_test = self.split_dataset(dataset)
        ros = RandomOverSampler(random_state=0)
        X_train, y_train = ros.fit_resample(X_train, y_train)
   #     ros = RandomOverSampler(random_state=0)
   #     X_test, y_test = ros.fit_resample(X_test, y_test)
        return X_train, X_test, y_train, y_test
        
    
    def convert_labels(self,labels):
        """ 
        convert the input (0/1) labels to what is needed 
        return a binary label sequence 
        """
        labels = np.array(labels)
        if sum(labels) > len(labels)/2:
            labels[labels==1] = -1
            labels[labels==0] = 1
            labels[labels==-1] = 0
        return labels

   
    def eval_metric(self, y_test, preds):
        """ 
        print the classification report 
        """
     #   preds = convert_labels(preds)
        print('confusion matrix')
        print(confusion_matrix(y_test, preds))
        print('\n')
        print('summary')
        print(classification_report(y_test, preds))
        fpr, tpr, thresholds = metrics.roc_curve(y_test, preds, pos_label=2)
        metrics.auc(fpr, tpr)


In [137]:
path = 'CreditTraining.csv'
Cp = Credit_predictor(path)
Cp.data_preprocessing_0()
Cp.data_preprocessing_simple()
Cp.data_preprocessing_onehot()

In [138]:
X_train, X_test, y_train, y_test = Cp.split_dataset(Cp.data_oh)
X_train_b, X_test_b, y_train_b, y_test_b = Cp.split_dataset_4_imbal(Cp.data_oh)
X_train_t, X_test_t, y_train_t, y_test_t = Cp.split_dataset(Cp.data_simple)
X_train_tb, X_test_tb, y_train_tb, y_test_tb = Cp.split_dataset_4_imbal(Cp.data_simple)

In [120]:
lr = LogisticRegression(solver='liblinear')
lr.fit(X_train, y_train)
pred_y = lr.predict(X_test)
Cp.eval_metric(y_test, pred_y)

confusion matrix
[[1228    8]
 [  89   20]]


summary
              precision    recall  f1-score   support

           0       0.93      0.99      0.96      1236
           1       0.71      0.18      0.29       109

    accuracy                           0.93      1345
   macro avg       0.82      0.59      0.63      1345
weighted avg       0.91      0.93      0.91      1345





In [121]:
lr = LogisticRegression(solver='liblinear')
lr.fit(X_train_b, y_train_b)
pred_y_b = lr.predict(X_test_b)
Cp.eval_metric(y_test_b, pred_y_b)

confusion matrix
[[1113  123]
 [  16   93]]


summary
              precision    recall  f1-score   support

           0       0.99      0.90      0.94      1236
           1       0.43      0.85      0.57       109

    accuracy                           0.90      1345
   macro avg       0.71      0.88      0.76      1345
weighted avg       0.94      0.90      0.91      1345





In [122]:
lr = LogisticRegression(solver='liblinear')
lr.fit(X_train_tb, y_train_tb)
pred_y_tb = lr.predict(X_test_tb)
Cp.eval_metric(y_test_tb, pred_y_tb)

confusion matrix
[[1112  124]
 [  16   93]]


summary
              precision    recall  f1-score   support

           0       0.99      0.90      0.94      1236
           1       0.43      0.85      0.57       109

    accuracy                           0.90      1345
   macro avg       0.71      0.88      0.76      1345
weighted avg       0.94      0.90      0.91      1345





In [117]:
dtc = DecisionTreeClassifier()
dtc.fit(X_train_t,y_train_t)
pred_dtc = dtc.predict(X_test_t)
Cp.eval_metric(y_test_t, pred_dtc)

confusion matrix
[[1180   56]
 [  60   49]]


summary
              precision    recall  f1-score   support

           0       0.95      0.95      0.95      1236
           1       0.47      0.45      0.46       109

    accuracy                           0.91      1345
   macro avg       0.71      0.70      0.71      1345
weighted avg       0.91      0.91      0.91      1345





In [118]:
dtc = DecisionTreeClassifier()
dtc.fit(X_train,y_train)
pred_dtc = dtc.predict(X_test)
Cp.eval_metric(y_test, pred_dtc)

confusion matrix
[[1178   58]
 [  58   51]]


summary
              precision    recall  f1-score   support

           0       0.95      0.95      0.95      1236
           1       0.47      0.47      0.47       109

    accuracy                           0.91      1345
   macro avg       0.71      0.71      0.71      1345
weighted avg       0.91      0.91      0.91      1345





In [123]:
dtc = DecisionTreeClassifier()
dtc.fit(X_train_b,y_train_b)
pred_dtc_b = dtc.predict(X_test_b)
Cp.eval_metric(y_test_b, pred_dtc_b)

confusion matrix
[[1184   52]
 [  54   55]]


summary
              precision    recall  f1-score   support

           0       0.96      0.96      0.96      1236
           1       0.51      0.50      0.51       109

    accuracy                           0.92      1345
   macro avg       0.74      0.73      0.73      1345
weighted avg       0.92      0.92      0.92      1345





In [95]:
# random forest model 
rfc = RandomForestClassifier()
rfc.fit(X_train,y_train)
# predictions
pred_rfc = rfc.predict(X_test)

Cp.eval_metric(pred_rfc, y_test)

confusion matrix
[[1229   80]
 [   7   29]]


summary
              precision    recall  f1-score   support

           0       0.99      0.94      0.97      1309
           1       0.27      0.81      0.40        36

    accuracy                           0.94      1345
   macro avg       0.63      0.87      0.68      1345
weighted avg       0.97      0.94      0.95      1345





In [96]:
rfc = RandomForestClassifier()
rfc.fit(X_train_b,y_train_b)
# predictions
pred_rfc = rfc.predict(X_test_b)

Cp.eval_metric(pred_rfc, y_test_b)

confusion matrix
[[1204   58]
 [  32   51]]


summary
              precision    recall  f1-score   support

           0       0.97      0.95      0.96      1262
           1       0.47      0.61      0.53        83

    accuracy                           0.93      1345
   macro avg       0.72      0.78      0.75      1345
weighted avg       0.94      0.93      0.94      1345





In [128]:
KNN = KNeighborsClassifier(n_neighbors=3)
KNN.fit(X_train, y_train)
pred_KNN = KNN.predict(X_test)

Cp.eval_metric(y_test, pred_KNN)

confusion matrix
[[1211   25]
 [ 109    0]]


summary
              precision    recall  f1-score   support

           0       0.92      0.98      0.95      1236
           1       0.00      0.00      0.00       109

    accuracy                           0.90      1345
   macro avg       0.46      0.49      0.47      1345
weighted avg       0.84      0.90      0.87      1345





In [129]:
KNN = KNeighborsClassifier(n_neighbors=3)
KNN.fit(X_train_b, y_train_b)
pred_KNN_b = KNN.predict(X_test_b)

Cp.eval_metric(y_test_b, pred_KNN_b)

confusion matrix
[[1057  179]
 [  91   18]]


summary
              precision    recall  f1-score   support

           0       0.92      0.86      0.89      1236
           1       0.09      0.17      0.12       109

    accuracy                           0.80      1345
   macro avg       0.51      0.51      0.50      1345
weighted avg       0.85      0.80      0.82      1345





In [130]:
KNN = KNeighborsClassifier(n_neighbors=3)
KNN.fit(X_train_tb, y_train_tb)
pred_KNN_tb = KNN.predict(X_test_tb)

Cp.eval_metric(y_test_tb, pred_KNN_tb)

confusion matrix
[[1057  179]
 [  91   18]]


summary
              precision    recall  f1-score   support

           0       0.92      0.86      0.89      1236
           1       0.09      0.17      0.12       109

    accuracy                           0.80      1345
   macro avg       0.51      0.51      0.50      1345
weighted avg       0.85      0.80      0.82      1345





In [132]:
clf = SVC(gamma='auto')
clf.fit(X_train, y_train)
pred_SVM = clf.predict(X_test)

Cp.eval_metric(y_test, pred_SVM)

confusion matrix
[[1236    0]
 [ 109    0]]


summary
              precision    recall  f1-score   support

           0       0.92      1.00      0.96      1236
           1       0.00      0.00      0.00       109

    accuracy                           0.92      1345
   macro avg       0.46      0.50      0.48      1345
weighted avg       0.84      0.92      0.88      1345



  _warn_prf(average, modifier, msg_start, len(result))


In [142]:
X_train_n = normalize(X_train_b, norm='l2')
X_test_n = normalize(X_test_b, norm='l2')

clf = SVC(gamma='auto')
clf.fit(X_train_n, y_train_b)
pred_SVM_b = clf.predict(X_test_n)

Cp.eval_metric(y_test_b, pred_SVM_b)

confusion matrix
[[660 576]
 [ 55  54]]


summary
              precision    recall  f1-score   support

           0       0.92      0.53      0.68      1236
           1       0.09      0.50      0.15       109

    accuracy                           0.53      1345
   macro avg       0.50      0.51      0.41      1345
weighted avg       0.86      0.53      0.63      1345





In [145]:
LDA = LinearDiscriminantAnalysis()
LDA.fit(X_train_b, y_train_b)
pred_LDA_b = LDA.predict(X_test_b)

Cp.eval_metric(y_test_b, pred_LDA_b)


confusion matrix
[[1107  129]
 [  13   96]]


summary
              precision    recall  f1-score   support

           0       0.99      0.90      0.94      1236
           1       0.43      0.88      0.57       109

    accuracy                           0.89      1345
   macro avg       0.71      0.89      0.76      1345
weighted avg       0.94      0.89      0.91      1345



