In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, roc_curve, auc

In [2]:
# Load data
matrix = []

with open("spambase.data", "r") as raw_data:
    for raw_line in raw_data:
        line = [float(x) for x in raw_line.split(",")]
        matrix.append(line)

data = pd.DataFrame(matrix)
row, col = data.shape
X, y = data.iloc[:,:col - 1], data[col - 1]
y = y.astype(bool)

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y)

In [5]:
from sklearn.ensemble import RandomForestClassifier

In [20]:
clf = RandomForestClassifier(n_estimators=100)

In [21]:
clf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [22]:
clf.predict_proba(X_test)

array([[0.99, 0.01],
       [0.1 , 0.9 ],
       [0.  , 1.  ],
       ...,
       [0.01, 0.99],
       [0.99, 0.01],
       [0.79, 0.21]])

In [23]:
test_data = clf.predict(X_test)

print("Confusion matrix\n", confusion_matrix(test_data, y_test), "\n")

tn, fp, fn, tp = confusion_matrix(test_data, y_test).ravel()
print("True negative:", tn, ", false positive:", fp, ", false negative:", fn, ",true positive:", tp, "\n")

print("Accuracy score", accuracy_score(test_data, y_test), "\n")

print("Precision", precision_score(test_data, y_test), "\n")

print("Recall", recall_score(test_data, y_test), "\n")

print("F1 score", f1_score(test_data, y_test), "\n")

Confusion matrix
 [[666  35]
 [ 31 419]] 

True negative: 666 , false positive: 35 , false negative: 31 ,true positive: 419 

Accuracy score 0.9426585577758471 

Precision 0.9229074889867841 

Recall 0.9311111111111111 

F1 score 0.9269911504424778 



In [24]:
test_data = clf.predict(X_train)

print("Confusion matrix\n", confusion_matrix(test_data, y_train), "\n")

tn, fp, fn, tp = confusion_matrix(test_data, y_train).ravel()
print("True negative:", tn, ", false positive:", fp, ", false negative:", fn, ",true positive:", tp, "\n")

print("Accuracy score", accuracy_score(test_data, y_train), "\n")

print("Precision", precision_score(test_data, y_train), "\n")

print("Recall", recall_score(test_data, y_train), "\n")

print("F1 score", f1_score(test_data, y_train), "\n")

Confusion matrix
 [[2091    1]
 [   0 1358]] 

True negative: 2091 , false positive: 1 , false negative: 0 ,true positive: 1358 

Accuracy score 0.9997101449275362 

Precision 0.9992641648270787 

Recall 1.0 

F1 score 0.999631947000368 



In [None]:
def is_numeric(val):
    return isinstance(val, int) or isinstance(val, float)

class Predicate:
    def __init__(self, column, value):
        self.column = column
        self.value = value

    def match(self, example):
        val = example[self.column]
        if is_numeric(val):
            return val >= self.value
        else:
            return val == self.value

    def __repr__(self):
        # This is just a helper method to print
        # the question in a readable format.
        condition = "=="
        if is_numeric(self.value):
            condition = ">="
        return "Is %s %s %s?" % (
            header[self.column], condition, str(self.value))

In [1]:
# TODO: implement decision tree classifier for numerical feature

class DecTreeClassifier:
    def __init__(self):
        pass
    
    def fit(self, X, y):
        self.X = X
        self.y = y
    
    def predict(self, X):
        return 1

In [2]:
from sklearn.utils import resample

class RandomForestClassifier:
    def __init__(self, num_est=10):
        self.num_est = num_est
        self.forest = []
    
    def fit(self, X, y):
        self.X = X
        self.y = y
        
        # Bagging
        for i in range(self.num_est):
            X_samp, y_samp = resample(X, y, replace=True, random_state=0)
            clf = DecTreeClassifier()
            clf.fit(X_samp, y_samp)
            self.forest.append(clf)
            
    def predict(self, X):
        labels = {}
        for clf in self.forest:
            y = clf.predict(X)
            if y not in labels:
                labels[y] = 0
            labels[y] += 1
        
        for l in labels:
            labels[l] = labels[i] / self.num_est
        
        return labels