In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split as tts
import scipy.stats as stats

In [2]:
df_x = pd.read_csv('spambase.data', header=None, engine='python')
df_x = df_x.sample(frac=1, random_state=0)
df_y = pd.DataFrame(data=df_x[df_x.columns[-1]])
spam_x, nspam_x = [x for _, x in df_x.groupby(df_x[df_x.columns[-1]] == 0)]
spam_y = pd.DataFrame(data=spam_x[spam_x.columns[-1]])
nspam_y = pd.DataFrame(data=nspam_x[nspam_x.columns[-1]])
df_x.drop(df_x.columns[[-1,]], axis=1, inplace=True)
spam_x.drop(spam_x.columns[[-1,]], axis=1, inplace=True)
nspam_x.drop(nspam_x.columns[[-1,]], axis=1, inplace=True)

In [3]:
df_x_train, df_x_test, df_y_train, df_y_test = tts(df_x, df_y, test_size=0.33)

In [4]:
mean = df_x_train.mean(axis=0)
std = df_x_train.std(axis=0)
df_x_s_train = df_x_train.subtract(mean).divide(std)
df_x_s_test = df_x_test.subtract(mean).divide(std)

In [5]:
total_entropy = 0
labels = df_y[df_y.columns[0]]
target_values = labels.unique()
counts = labels.value_counts()
for value in target_values:
    total_entropy -= (counts[value]/len(labels) * np.log2(counts[value]/len(labels)))

In [6]:
def binarize(data):
    data = (data > data.mean(axis=0)).astype(int)
    return data

def attribute_entropy(x, y, index):
    target_value = y[y.columns[0]].unique()
    features = x[index].unique()
    
    attribute_entropy = 0
    for feature in features:
        feature_entropy = 0
        available_features = len(x[index][x[index]==feature][y[y.columns[0]]==target_value[0]])
        total_features = len(x[index][x[index]==feature])
        feature_prob = (available_features/(total_features + np.finfo(float).eps) + np.finfo(float).eps)
        #print(feature_prob)
        feature_entropy -= feature_prob * np.log2(feature_prob)
        attribute_entropy -= total_features/len(x) * feature_entropy
        
    return(np.abs(attribute_entropy))

df_x_b_train = binarize(df_x_s_train)
df_x_b_test = binarize(df_x_s_test)

In [7]:
class Node:
    def __init__(self, val=None):
        self.val = val
        self.right = None
        self.left = None
    def setRight(self, right):
        self.right = right
    def setLeft(self, left):
        self.left = left
    def setVal(self, val):
        self.val = val
    def getRight(self):
        return self.right
    def getLeft(self):
        return self.left
    def getVal(self):
        return self.val

In [8]:
class DT:
    def __init__(self):
        self.root = None
    
    def ID3(self, data, attributes, default):
        data_x = data[:,:-1]
        data_y = data[:,-1]
        if len(data_x) == 0:
            return Node(default)
        elif np.all(data_y) == 1:
            return Node(data_y[0])
        elif len(attributes) == 1:
            return Node(stats.mode(data_y).mode[0])
        best = self.attribute_entropy(data, attributes)
        root = Node(attributes[best])
        new_attr = np.delete(attributes, best)
        new_data = np.delete(data, best, 1)
        spam = new_data[(data[:,best] == 1)]
        nspam = new_data[(data[:,best] == 0)]
        right = self.ID3(spam, new_attr, stats.mode(data_y).mode[0])
        root.setRight(right)
        left = self.ID3(nspam, new_attr, stats.mode(data_y).mode[0])
        root.setLeft(left)
        return root
            
    def attribute_entropy(self, x, attributes):
        entropies = []
        for i in range(len(attributes) - 1):
            z_x = x[(x[:,i] == 0)]
            z_x_rows = z_x.shape[0]
            o_x = x[(x[:,i] == 1)]
            o_x_rows = o_x.shape[0]
            z_y = z_x[(z_x[:,-1] == 0)]
            z_y_rows = z_y.shape[0]
            o_y = z_x[(z_x[:,-1] == 1)]
            o_y_rows = o_y.shape[1]
            o_x_y = o_x[(o_x[:,-1] == 0)]
            o_x_y_rows = o_x_y.shape[0]
            o_y_y = o_x[(o_x[:,-1] == 1)]
            o_y_y_rows = o_y_y.shape[1]
            total_z_x = z_x_rows / len(x)
            total_o_x = o_x_rows / len(x)
            ent1 = self.ent_helper(z_y_rows, o_y_rows, total_z_x)
            ent2 = self.ent_helper(o_x_y_rows, o_y_y_rows, total_o_x)
            ent = ent1 * total_z_x + ent2 * total_o_x
            entropies.append(ent)
            
        return entropies.index(np.min(entropies))
            
    def predict(self, root, test_x, test_y):
        p = []
        for i in range(test_x.shape[0]):
            current = root
            while not False:
                feature = current.getVal()
                if test_x.values[i][feature] == 0:
                    current = current.getLeft()
                else:
                    current = current.getRight()
                if current.getLeft() is None and current.getRight() is None:
                    p.append(current.getVal())
                    break;
        tp, tn, fp, fn = 0,0,0,0
        for i in range(len(p)):
            if p[i] == test_y.values[i]:
                if p[i] == 1:
                    tp += 1
                else:
                    tn += 1
            else:
                if p[i] == 1:
                    fp += 1
                else:
                    fn += 1
        accuracy = (tp + tn)/test_y.shape[0]
        precision = tp / (tp + fp)
        recall = tp / (tp + fn)
        f1 = 2 * precision * recall / (precision + recall)
        print(tp, tn, fp, fn)
        print(accuracy, precision, recall, f1)
                
            
    def ent_helper(self, zero, ones, rows):
        eps = np.finfo(float).eps
        z = zero / (rows + eps)
        o = ones / (rows + eps)
        return -(o * np.log2(o + eps) + z * np.log2(z + eps))

dt = DT()
root = dt.ID3(np.hstack([df_x_b_train, df_y_train]), np.arange(0, 57), stats.mode(df_y_train).mode[0])
dt.predict(root, df_x_b_test, df_y_test)

499 862 68 90
0.8959842001316656 0.8800705467372134 0.8471986417657046 0.8633217993079585
