In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np

from sklearn.model_selection import train_test_split


from src.models.cart import CART
from src.data.load_dataset import load_spambase

In [3]:
# Load the Spambase data through the project loader: X holds the features,
# y the labels (presumably binary 0/1, since the metric helpers below test
# for exactly those two values -- confirm against load_spambase).
X, y = load_spambase()

In [4]:
# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# proportions the same in both splits, and the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [5]:
# Sanity check: display the shapes of the training and test feature sets.
X_train.shape, X_test.shape

((3680, 57), (921, 57))

In [12]:
# Tree restricted with max_features=40.
# NOTE(review): if CART samples the feature subset randomly, this fit is not
# reproducible unless the class exposes a seed -- confirm in src.models.cart.
cart_classifier = CART(
    max_depth=10,
    min_samples_split=2,
    min_impurity_decrease=0,
    max_features=40,
)
# verbose=True makes fit() report the maximum depth it reached.
cart_classifier.fit(X_train, y_train, verbose=True)

Maximum depth reached during fit: 10


In [11]:
# Baseline tree: same hyperparameters as cart_classifier but without the
# max_features argument (presumably meaning every feature is considered at
# each split -- confirm in src.models.cart).
dec_tree = CART(max_depth=10, min_samples_split=2, min_impurity_decrease=0)
dec_tree.fit(X_train, y_train)

# Evaluate the baseline in the same cell so that the accuracy shown under it
# is actually produced by this code on a fresh "Restart & Run All".
# Computed inline with NumPy because the accuracy() helper is only defined
# in a later cell.
dec_tree_pred = dec_tree.predict(X_test)
dec_tree_acc = np.mean(np.asarray(dec_tree_pred) == np.asarray(y_test))
print(f"Accuracy: {dec_tree_acc:.3f}")

Accuracy: 0.928


In [13]:
# accuracy
def accuracy(y_true, y_pred):
    """Return the fraction of predictions that exactly match the true labels.

    Both inputs are converted to flat NumPy arrays before comparing, so plain
    Python lists and column vectors are compared element by element instead
    of as whole objects or via broadcasting.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels; must contain as many elements as ``y_true``.

    Returns
    -------
    float
        Share of matching positions, in ``[0, 1]``; ``nan`` for empty input.

    Raises
    ------
    ValueError
        If the two inputs do not contain the same number of elements.
    """
    y_true = np.asarray(y_true).ravel()
    y_pred = np.asarray(y_pred).ravel()
    if y_true.size != y_pred.size:
        raise ValueError(
            f"y_true and y_pred must have the same number of elements, "
            f"got {y_true.size} and {y_pred.size}"
        )
    if y_true.size == 0:
        # Nothing to score: keep the "undefined" result of 0/0 but avoid the
        # NumPy RuntimeWarning.
        return float("nan")
    return float(np.mean(y_true == y_pred))

# Predict on the held-out test split with cart_classifier and report the
# accuracy; y_pred is reused by the metric cells that follow.
y_pred = cart_classifier.predict(X_test)
acc = accuracy(y_test, y_pred)
print(f"Accuracy: {acc:.3f}")

Accuracy: 0.910


In [12]:
# precision
def precision(y_true, y_pred, zero_division=0.0):
    """Return binary precision, TP / (TP + FP), with 1 as the positive class.

    Inputs are converted to flat NumPy arrays first so that plain Python
    lists are compared element by element.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels; 1 is counted as positive and 0 as negative.
    y_pred : array-like
        Predicted labels, same number of elements as ``y_true``.
    zero_division : float, default 0.0
        Value returned when nothing was predicted positive (TP + FP == 0),
        where precision is undefined.

    Returns
    -------
    float
        Precision in ``[0, 1]``, or ``zero_division`` when undefined.

    Raises
    ------
    ValueError
        If the two inputs do not contain the same number of elements.
    """
    y_true = np.asarray(y_true).ravel()
    y_pred = np.asarray(y_pred).ravel()
    if y_true.size != y_pred.size:
        raise ValueError(
            f"y_true and y_pred must have the same number of elements, "
            f"got {y_true.size} and {y_pred.size}"
        )
    predicted_positive = y_pred == 1
    tp = int(np.sum(predicted_positive & (y_true == 1)))
    fp = int(np.sum(predicted_positive & (y_true == 0)))
    if tp + fp == 0:
        # No positive predictions: avoid 0/0 (nan + RuntimeWarning).
        return zero_division
    return tp / (tp + fp)

In [18]:
# Precision of the test-set predictions stored in y_pred.
prec = precision(y_test, y_pred)
print(f"Precision: {prec:.3f}")

Precision: 0.923


In [19]:
# recall
def recall(y_true, y_pred, zero_division=0.0):
    """Return binary recall, TP / (TP + FN), with 1 as the positive class.

    Inputs are converted to flat NumPy arrays first so that plain Python
    lists are compared element by element.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels; 1 is counted as positive.
    y_pred : array-like
        Predicted labels, same number of elements as ``y_true``; 1 counts as
        a positive prediction and 0 as a negative one.
    zero_division : float, default 0.0
        Value returned when there are no counted positives (TP + FN == 0),
        where recall is undefined.

    Returns
    -------
    float
        Recall in ``[0, 1]``, or ``zero_division`` when undefined.

    Raises
    ------
    ValueError
        If the two inputs do not contain the same number of elements.
    """
    y_true = np.asarray(y_true).ravel()
    y_pred = np.asarray(y_pred).ravel()
    if y_true.size != y_pred.size:
        raise ValueError(
            f"y_true and y_pred must have the same number of elements, "
            f"got {y_true.size} and {y_pred.size}"
        )
    actual_positive = y_true == 1
    tp = int(np.sum(actual_positive & (y_pred == 1)))
    fn = int(np.sum(actual_positive & (y_pred == 0)))
    if tp + fn == 0:
        # No positives to recover: avoid 0/0 (nan + RuntimeWarning).
        return zero_division
    return tp / (tp + fn)

# Recall of the test-set predictions stored in y_pred.
rec = recall(y_test, y_pred)
print(f"Recall: {rec:.3f}")

Recall: 0.893


In [20]:
# f1
def f1(y_true, y_pred, zero_division=0.0):
    """Return the binary F1 score with 1 as the positive class.

    F1 is the harmonic mean of precision and recall. It is computed here
    directly from the counts as ``2*TP / (2*TP + FP + FN)``, which is
    algebraically the same as ``2*P*R / (P + R)`` but stays well-defined when
    precision or recall is zero or undefined.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels; 1 is counted as positive and 0 as negative.
    y_pred : array-like
        Predicted labels, same number of elements as ``y_true``.
    zero_division : float, default 0.0
        Value returned when there are no true positives, false positives or
        false negatives at all, where F1 is undefined.

    Returns
    -------
    float
        F1 score in ``[0, 1]``, or ``zero_division`` when undefined.

    Raises
    ------
    ValueError
        If the two inputs do not contain the same number of elements.
    """
    y_true = np.asarray(y_true).ravel()
    y_pred = np.asarray(y_pred).ravel()
    if y_true.size != y_pred.size:
        raise ValueError(
            f"y_true and y_pred must have the same number of elements, "
            f"got {y_true.size} and {y_pred.size}"
        )
    tp = int(np.sum((y_true == 1) & (y_pred == 1)))
    fp = int(np.sum((y_true == 0) & (y_pred == 1)))
    fn = int(np.sum((y_true == 1) & (y_pred == 0)))
    denominator = 2 * tp + fp + fn
    if denominator == 0:
        # No positives in either the labels or the predictions.
        return zero_division
    return 2 * tp / denominator

# F1 score of the test-set predictions stored in y_pred.
f1_score = f1(y_test, y_pred)
print(f"F1 Score: {f1_score:.3f}")

F1 Score: 0.908


In [21]:
# confusion matrix
def confusion_matrix(y_true, y_pred):
    """Return the 2x2 confusion matrix for binary labels 0 and 1.

    Inputs are converted to flat NumPy arrays first so that plain Python
    lists are compared element by element and a column vector is not
    broadcast against a flat vector.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels (0 or 1).
    y_pred : array-like
        Predicted labels (0 or 1), same number of elements as ``y_true``.

    Returns
    -------
    numpy.ndarray
        ``[[TN, FP], [FN, TP]]``: rows are the true class, columns the
        predicted class, each in the order 0 then 1.

    Raises
    ------
    ValueError
        If the two inputs do not contain the same number of elements.
    """
    y_true = np.asarray(y_true).ravel()
    y_pred = np.asarray(y_pred).ravel()
    if y_true.size != y_pred.size:
        raise ValueError(
            f"y_true and y_pred must have the same number of elements, "
            f"got {y_true.size} and {y_pred.size}"
        )
    actual_pos, actual_neg = y_true == 1, y_true == 0
    pred_pos, pred_neg = y_pred == 1, y_pred == 0
    tn = np.sum(actual_neg & pred_neg)
    fp = np.sum(actual_neg & pred_pos)
    fn = np.sum(actual_pos & pred_neg)
    tp = np.sum(actual_pos & pred_pos)
    return np.array([[tn, fp], [fn, tp]])

# Confusion matrix of the test-set predictions stored in y_pred.
# Layout is [[TN, FP], [FN, TP]]: rows are true classes, columns predictions.
conf_matrix = confusion_matrix(y_test, y_pred)
print(f"Confusion Matrix:\n{conf_matrix}")

Confusion Matrix:
[[531  27]
 [ 39 324]]


['word_freq_make',
 'word_freq_address',
 'word_freq_all',
 'word_freq_3d',
 'word_freq_our',
 'word_freq_over',
 'word_freq_remove',
 'word_freq_internet',
 'word_freq_order',
 'word_freq_mail',
 'word_freq_receive',
 'word_freq_will',
 'word_freq_people',
 'word_freq_report',
 'word_freq_addresses',
 'word_freq_free',
 'word_freq_business',
 'word_freq_email',
 'word_freq_you',
 'word_freq_credit',
 'word_freq_your',
 'word_freq_font',
 'word_freq_000',
 'word_freq_money',
 'word_freq_hp',
 'word_freq_hpl',
 'word_freq_george',
 'word_freq_650',
 'word_freq_lab',
 'word_freq_labs',
 'word_freq_telnet',
 'word_freq_857',
 'word_freq_data',
 'word_freq_415',
 'word_freq_85',
 'word_freq_technology',
 'word_freq_1999',
 'word_freq_parts',
 'word_freq_pm',
 'word_freq_direct',
 'word_freq_cs',
 'word_freq_meeting',
 'word_freq_original',
 'word_freq_project',
 'word_freq_re',
 'word_freq_edu',
 'word_freq_table',
 'word_freq_conference',
 'char_freq_;',
 'char_freq_(',
 'char_freq_[',
 '