In [161]:
import random
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import tree
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Load the dataset; the relative path is resolved against the kernel's working directory.
df = pd.read_csv("lending-club-data3a.csv")

In [162]:
# Preview the first rows of the loaded frame (feature columns plus the 'y' label).
df.head()

Unnamed: 0,credit,term,income,y
0,excellent,3yrs,middle,safe
1,fair,3yrs,low,safe
2,excellent,3yrs,low,safe
3,poor,3yrs,middle,risky
4,excellent,3yrs,low,safe


In [163]:
# Hold out 20% of the rows for testing. 'y' is the label column and every other
# column is a feature; random_state pins the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(df.drop(columns=['y']), df['y'], test_size=0.2, random_state=123)

In [164]:
# Preview the training features; the 'y' column was dropped before splitting.
X_train.head()

Unnamed: 0,credit,term,income
9642,fair,3yrs,middle
17924,poor,3yrs,middle
1642,fair,3yrs,low
9670,fair,3yrs,middle
16860,excellent,3yrs,low


In [165]:
class Node:
    """One node of the decision tree.

    An internal node records the column it tests (``feature``) and the category
    it compares against (``threshold``); ``left`` and ``right`` hold its two
    subtrees. A leaf node carries only ``value``, the label it predicts, and
    leaves every other attribute at ``None``.
    """

    def __init__(self, feature=None, threshold=None, value=None, left=None, right=None):
        # Split description and leaf label (value is set for leaf nodes only).
        self.feature, self.value, self.threshold = feature, value, threshold
        # Child subtrees (set for internal nodes only).
        self.left, self.right = left, right

class decisionTree():
    """Binary decision tree classifier for categorical features.

    Every internal node tests ``row[feature] == category``: rows that match go
    to the left subtree, all other rows go to the right subtree. Splits are
    chosen greedily to minimise the number of training rows that would be
    misclassified if each side predicted its own majority label.

    Parameters
    ----------
    max_depth : int or None
        Maximum number of splits on any root-to-leaf path. ``None`` grows the
        tree until every leaf is pure or no feature can split it any further.
    """

    def __init__(self, max_depth=None):
        self.max_depth = max_depth
        # Populated by fit(); kept as None until then so predict() can fail clearly.
        self.root = None

    @staticmethod
    def _misclassified(labels):
        """Return how many entries of ``labels`` differ from the majority label."""
        if len(labels) == 0:
            return 0
        # n - max(count) is the majority-vote error for any number of classes
        # (for two classes it equals the smaller of the two counts).
        return len(labels) - labels.value_counts(dropna=False).max()

    def best_split(self, X, y):
        """Find the ``feature == category`` test with the lowest error rate.

        Parameters
        ----------
        X : pandas.DataFrame
            Feature columns of the rows reaching the current node.
        y : pandas.Series
            Labels aligned with ``X`` (same index).

        Returns
        -------
        (best_feature, best_feature_val, err_rate_min)
            The chosen column, the category tested for equality, and the
            fraction of rows misclassified by that split. When no column can
            separate the rows (every column is constant), the sentinel
            ``(nan, nan, 1)`` is returned.
        """
        best_feature = np.nan
        best_feature_val = np.nan
        err_rate_min = 1
        m = X.shape[0]
        for feat in X.columns.tolist():
            unique_val = X[feat].unique()
            if len(unique_val) == 1:
                # A constant column cannot separate the rows.
                continue
            for val in unique_val:
                mask = X[feat] == val
                if not mask.any() or mask.all():
                    # E.g. a NaN category never compares equal: one side would
                    # be empty, so this is not a real split.
                    continue
                err = self._misclassified(y[mask]) + self._misclassified(y[~mask])
                err_rate = err / m
                # Strict '<' keeps the first candidate on ties.
                if err_rate < err_rate_min:
                    err_rate_min = err_rate
                    best_feature = feat
                    best_feature_val = val
        return best_feature, best_feature_val, err_rate_min

    def build_tree(self, X, y, depth=0):
        """Recursively grow the subtree for the rows ``X`` / labels ``y``.

        Returns a leaf predicting the majority label when the node is pure,
        holds a single row, has reached ``max_depth``, or cannot be split by
        any feature; otherwise returns an internal node with two subtrees.

        Raises
        ------
        ValueError
            If ``y`` is empty.
        """
        if len(y) == 0:
            raise ValueError("cannot build a decision tree from an empty dataset")

        depth_exhausted = self.max_depth is not None and depth >= self.max_depth
        if len(y) == 1 or len(y.unique()) == 1 or depth_exhausted:
            return Node(value=y.mode().values[0])

        best_feature, best_feature_val, _ = self.best_split(X, y)
        if isinstance(best_feature, float) and np.isnan(best_feature):
            # best_split returned its sentinel: rows with identical features
            # carry different labels, so fall back to a majority-vote leaf
            # instead of indexing X with NaN.
            return Node(value=y.mode().values[0])

        mask = X[best_feature] == best_feature_val
        left_child = self.build_tree(X[mask], y[mask], depth + 1)
        right_child = self.build_tree(X[~mask], y[~mask], depth + 1)
        return Node(feature=best_feature, threshold=best_feature_val,
                    left=left_child, right=right_child)

    def fit(self, X, y):
        """Grow the tree from training features ``X`` and labels ``y``."""
        self.root = self.build_tree(X, y)

    def predict(self, X):
        """Return a list with one predicted label per row of ``X``.

        Raises
        ------
        RuntimeError
            If called before ``fit``.
        """
        if self.root is None:
            raise RuntimeError("decisionTree.predict() called before fit()")
        return [self.make_predict(row, self.root) for _, row in X.iterrows()]

    def make_predict(self, x, node):
        """Walk from ``node`` down to a leaf for the single row ``x``.

        Categories never seen at a split fail the equality test and therefore
        follow the right branch.
        """
        # Internal nodes have value=None; identity test avoids '!= None'.
        while node.value is None:
            node = node.left if x[node.feature] == node.threshold else node.right
        return node.value

# NOTE(review): this variable shadows the `tree` module imported from sklearn in
# the first cell; the name is kept so cells that already use it keep working.
tree = decisionTree(max_depth=3)
tree.fit(X_train, y_train)
y_test_pred = tree.predict(X_test)

# Encode labels as 1 (safe) / 0 (risky) for the metric computation below.
# `mapping.get(label, label)` leaves already-encoded values untouched, so
# re-running this cell no longer turns every entry of y_test into None.
mapping = {'safe': 1, 'risky': 0}
y_test = [mapping.get(label, label) for label in y_test]
y_test_pred = [mapping.get(label, label) for label in y_test_pred]


In [166]:
def confusion_matrix_dt(y, y_pred):
    """Print confusion-matrix counts and summary metrics for binary labels.

    Parameters
    ----------
    y : sequence
        True labels, encoded as 1 (positive) or 0 (negative).
    y_pred : sequence
        Predicted labels with the same encoding and the same length as ``y``.

    Pairs in which either label is neither 0 nor 1 are ignored. A metric whose
    denominator is zero (for example precision when nothing was predicted
    positive) is reported as 0.0 instead of raising ``ZeroDivisionError``.

    Raises
    ------
    ValueError
        If ``y`` and ``y_pred`` differ in length.
    """
    if len(y) != len(y_pred):
        raise ValueError(
            f"y and y_pred must have the same length, got {len(y)} and {len(y_pred)}"
        )

    tp = fp = tn = fn = 0
    # zip pairs the labels by position, which also works when y is a pandas
    # Series whose index is not 0..n-1.
    for actual, predicted in zip(y, y_pred):
        if predicted == 1:
            if actual == 1:
                tp += 1
            elif actual == 0:
                fp += 1
        elif predicted == 0:
            if actual == 1:
                fn += 1
            elif actual == 0:
                tn += 1

    def _ratio(numerator, denominator):
        # Undefined ratios are reported as 0.0.
        return numerator / denominator if denominator else 0.0

    accuracy = _ratio(tp + tn, fp + fn + tp + tn)
    precision = _ratio(tp, tp + fp)
    recall = _ratio(tp, tp + fn)
    # Harmonic mean of precision and recall; 0.0 when either one is zero.
    F1 = 2 / ((1 / precision) + (1 / recall)) if precision and recall else 0.0

    print(f"true positive: {tp}, false positive: {fp}\ntrue negative: {tn}, false negative: {fn}\n")
    print(f"accuracy = {accuracy}")
    print(f"precision = {precision}")
    print(f"recall = {recall}")
    print(f"F1 = {F1}\n")

# Report test-set metrics; both lists were encoded as 1 (safe) / 0 (risky) in the previous cell.
confusion_matrix_dt(y_test, y_test_pred)

true positive: 2746, false positive: 774
true negative: 480, false negative: 0

accuracy = 0.8065
precision = 0.7801136363636364
recall = 1.0
F1 = 0.8764762208745611

