In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, roc_curve, auc

In [21]:
# Load data
matrix = []

with open("spambase.data", "r") as raw_data:
    for raw_line in raw_data:
        line = [float(x) for x in raw_line.split(",")]
        matrix.append(line)

data = pd.DataFrame(matrix)
row, col = data.shape
X, y = data.iloc[:,:col - 1], data[col - 1]
# y = y.astype(bool)

In [22]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y)

In [23]:
from sklearn.ensemble import RandomForestClassifier

In [24]:
clf = RandomForestClassifier(n_estimators=100)

In [25]:
clf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [26]:
clf.predict_proba(X_test)

array([[0.4 , 0.6 ],
       [0.97, 0.03],
       [0.96, 0.04],
       ...,
       [1.  , 0.  ],
       [0.91, 0.09],
       [0.58, 0.42]])

In [27]:
test_data = clf.predict(X_test)

print("Confusion matrix\n", confusion_matrix(test_data, y_test), "\n")

tn, fp, fn, tp = confusion_matrix(test_data, y_test).ravel()
print("True negative:", tn, ", false positive:", fp, ", false negative:", fn, ",true positive:", tp, "\n")

print("Accuracy score", accuracy_score(test_data, y_test), "\n")

print("Precision", precision_score(test_data, y_test), "\n")

print("Recall", recall_score(test_data, y_test), "\n")

print("F1 score", f1_score(test_data, y_test), "\n")

Confusion matrix
 [[677  40]
 [ 20 414]] 

True negative: 677 , false positive: 40 , false negative: 20 ,true positive: 414 

Accuracy score 0.947871416159861 

Precision 0.9118942731277533 

Recall 0.9539170506912442 

F1 score 0.9324324324324325 



In [28]:
test_data = clf.predict(X_train)

print("Confusion matrix\n", confusion_matrix(test_data, y_train), "\n")

tn, fp, fn, tp = confusion_matrix(test_data, y_train).ravel()
print("True negative:", tn, ", false positive:", fp, ", false negative:", fn, ",true positive:", tp, "\n")

print("Accuracy score", accuracy_score(test_data, y_train), "\n")

print("Precision", precision_score(test_data, y_train), "\n")

print("Recall", recall_score(test_data, y_train), "\n")

print("F1 score", f1_score(test_data, y_train), "\n")

Confusion matrix
 [[2091    0]
 [   0 1359]] 

True negative: 2091 , false positive: 0 , false negative: 0 ,true positive: 1359 

Accuracy score 1.0 

Precision 1.0 

Recall 1.0 

F1 score 1.0 



In [29]:
len(X_train.columns)

57

In [195]:
class Predicate:
    def __init__(self, column, value):
        self.column = column
        self.value = value
    
    def match(self, example):
        if self.column in example:
            val = example[self.column]
            if Util.is_numeric(val):
                return val >= self.value
            else:
                return val == self.value
        
    def __repr__(self):
        # This is just a helper method to print
        # the question in a readable format.
        condition = "=="
        if Util.is_numeric(self.value):
            condition = ">="
        return "Is %s %s %s?" % (
            header[self.column], condition, str(self.value))

In [233]:
# TODO: implement decision tree classifier for numerical feature

class Util:
    @staticmethod
    def label_count(labels):
        count = {}
        for r in labels:
            if r not in count:
                count[r] = 0
            count[r] += 1
        return count
    
    @staticmethod
    def is_numeric(val):
        return isinstance(val, int) or isinstance(val, float)
    
    @staticmethod
    def partition(X, y, pred):
        true_X, false_X, true_y, false_y = [], [], [], []
        
        for x_inst, y_inst  in zip(X, y):
            if pred.match(x_inst):
                true_X.append(x_inst)
                true_y.append(y_inst)
            else:
                false_X.append(x_inst)
                false_y.append(y_inst)
        return true_X, true_y, false_X, false_y
    
    @staticmethod
    def gini_impur(labels):
        """
        Gini impurity
        """
        counts = Util.label_count(labels)
        total = 0
        for lbl in counts:
            prob_of_lbl = float(counts[lbl]) / len(labels)
            total += (prob_of_lbl * prob_of_lbl)
        return 1 - total
    
    @staticmethod
    def info_gain(leftLbl, rightLbl, curr_uncertainty):
        """
        Calculating information gain
        """
        p = float (len(leftLbl)) / (len(leftLbl) + len(rightLbl))
        return curr_uncertainty - p * Util.gini_impur(leftLbl) - (1 - p) * Util.gini_impur(rightLbl)
        
class Leaf:
    def __init__(self, y):
        pred = Util.label_count(y)
        for l in pred:
            pred[l] = pred[l] / len(y)
        self.predictions = pred
        
    def isLeaf(self):
        return True

class DecTreeNode:
    def __init__(self, pred, true_branch, false_branch):
        self.pred = pred
        self.true_branch = true_branch
        self.false_branch = false_branch
        
    def isLeaf(self):
        return False
    
class DecTreeClassifier:
    def __init__(self, max_depth=10):
        self.max_depth = max_depth
    
    def findBestSplit(self, X, y):
        best_gain = 0  
        best_pred = None
        current_uncert = Util.gini_impur(y)
        n_features = np.array(X).shape[1]
        
        for col in range(n_features):
            vals = set([row[col] for row in X]) # different values in column
            
            for v in vals:
                pred = Predicate(col, v)
                true_X, true_y, false_X, false_y = Util.partition(X, y, pred)
                
                if len(true_X) == 0 or len(false_X) == 0:
                    continue
                
                gain = Util.info_gain(true_y, false_y, current_uncert)
                
                if gain >= best_gain:
                    best_gain, best_pred = gain, pred
        return best_gain, best_pred
    
    def build_tree(self, X, y, depth=0):
        gain, pred = self.findBestSplit(X, y)
        if gain == 0 or depth == self.max_depth:
            return Leaf(y)
        true_X, true_y, false_X, false_y = Util.partition(X, y, pred)
        
        true_branch = self.build_tree(true_X, true_y, depth + 1)
        false_branch = self.build_tree(false_X, false_y, depth + 1)
        
        return DecTreeNode(pred, true_branch, false_branch)
    
    def predict_by_tree(self, tree, X_inst):
        if tree.isLeaf():
            return tree.predictions
        elif tree.pred.match(X_inst):
            return self.predict_by_tree(tree.true_branch, X_inst)
        else:
            return self.predict_by_tree(tree.false_branch, X_inst)
    
    def fit(self, X, y):
        self.tree = self.build_tree(X.values, y.values) 
    
    def predict_instance(self, X_inst):
        return self.predict_by_tree(self.tree, X_inst)

In [234]:
len(X_train.columns)

57

In [236]:
import time 

decTreeClf = DecTreeClassifier()
start = time.time()
decTreeClf.fit(X_train[:1000], y_train[:1000])
print("Decision tree classifier take", time.time() - start, "seconds to build tree")

Decision tree classifier take 217.1205952167511 seconds to build tree


In [237]:
decTreeClf.predict(X_test[0])

{0.0: 0.8823529411764706, 1.0: 0.11764705882352941}

In [238]:
decTreeClf.predict(X_test[40])

{0.0: 0.8823529411764706, 1.0: 0.11764705882352941}

In [35]:
from sklearn.utils import resample

class RandomForestClassifier:
    def __init__(self, num_est=10):
        self.num_est = num_est
        self.forest = []
    
    def fit(self, X, y):
        self.X = X
        self.y = y
        
        # Bagging
        for i in range(self.num_est):
            X_samp, y_samp = resample(X, y, replace=True, random_state=0)
            clf = DecTreeClassifier()
            clf.fit(X_samp, y_samp)
            self.forest.append(clf)
            
    def predict(self, X):
        labels = {}
        for clf in self.forest:
            y = clf.predict(X)
            if y not in labels:
                labels[y] = 0
            labels[y] += 1
        
        for l in labels:
            labels[l] = labels[i] / self.num_est
        
        return labels