In [1]:
import pandas as pd

In [2]:
data = pd.read_csv("spambase/spambase.data", delimiter=",", header=None)

In [3]:
with open("spambase/spambase.names", 'r') as file:
    text_array = file.readlines()

In [4]:
attributes = [line.split(":")[0].strip() for line in text_array]
attributes.append("target")
print(len(attributes))

58


In [5]:
data.columns = attributes

In [6]:
data

Unnamed: 0,word_freq_make,word_freq_address,word_freq_all,word_freq_3d,word_freq_our,word_freq_over,word_freq_remove,word_freq_internet,word_freq_order,word_freq_mail,...,char_freq_;,char_freq_(,char_freq_[,char_freq_!,char_freq_$,char_freq_#,capital_run_length_average,capital_run_length_longest,capital_run_length_total,target
0,0.00,0.64,0.64,0.0,0.32,0.00,0.00,0.00,0.00,0.00,...,0.000,0.000,0.0,0.778,0.000,0.000,3.756,61,278,1
1,0.21,0.28,0.50,0.0,0.14,0.28,0.21,0.07,0.00,0.94,...,0.000,0.132,0.0,0.372,0.180,0.048,5.114,101,1028,1
2,0.06,0.00,0.71,0.0,1.23,0.19,0.19,0.12,0.64,0.25,...,0.010,0.143,0.0,0.276,0.184,0.010,9.821,485,2259,1
3,0.00,0.00,0.00,0.0,0.63,0.00,0.31,0.63,0.31,0.63,...,0.000,0.137,0.0,0.137,0.000,0.000,3.537,40,191,1
4,0.00,0.00,0.00,0.0,0.63,0.00,0.31,0.63,0.31,0.63,...,0.000,0.135,0.0,0.135,0.000,0.000,3.537,40,191,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4596,0.31,0.00,0.62,0.0,0.00,0.31,0.00,0.00,0.00,0.00,...,0.000,0.232,0.0,0.000,0.000,0.000,1.142,3,88,0
4597,0.00,0.00,0.00,0.0,0.00,0.00,0.00,0.00,0.00,0.00,...,0.000,0.000,0.0,0.353,0.000,0.000,1.555,4,14,0
4598,0.30,0.00,0.30,0.0,0.00,0.00,0.00,0.00,0.00,0.00,...,0.102,0.718,0.0,0.000,0.000,0.000,1.404,6,118,0
4599,0.96,0.00,0.00,0.0,0.32,0.00,0.00,0.00,0.00,0.00,...,0.000,0.057,0.0,0.000,0.000,0.000,1.147,5,78,0


In [7]:
x_cols = attributes[:-1]

In [8]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(data[x_cols], data["target"], random_state=42, test_size=0.2)

In [40]:
from sklearn.utils import resample
from sklearn.tree import DecisionTreeClassifier
import numpy as np
from sklearn.model_selection import cross_val_score
best_p = 0
best_score = float('inf')
best_cols = []
# Varying p
for p in range(1, x_train.shape[1] + 1):
    # Creating bootstrap sample
    col_subset = np.random.choice(x_cols, p, True)
    X_sample, y_sample = resample(x_train, y_train, n_samples=1000, random_state=42)
    X_sample = X_sample[col_subset]

    # Train a decision tree classifier with depth 6
    clf = DecisionTreeClassifier(max_depth=6, random_state=42)
    score = -np.mean(cross_val_score(clf, X_sample, y_sample, cv=5, scoring='neg_mean_squared_error'))

    if score < best_score:
        best_cols = col_subset
        best_score = score
        best_p = p

print(f"The best value of p is: {best_p} with a cross-validation error of: {best_score}")

The best value of p is: 49 with a cross-validation error of: 0.081


In [42]:
def random_forest(X_train, y_train, T, max_depth=6):
    trees = []
    tree_cols = []
    col_subset = best_cols
    for _ in range(T):
        # Bootstrap sampling
        X_sample, y_sample = resample(X_train, y_train, n_samples=1000)
        X_sample = X_sample[col_subset]

        # Train decision tree
        tree = DecisionTreeClassifier(max_depth=max_depth, random_state=42)
        tree.fit(X_sample, y_sample)
        trees.append(tree)
        tree_cols.append(col_subset)
    return trees, tree_cols

def predict_rf(trees, X, col_sub):
    predictions = []
    for tree, cols in zip(trees, col_sub):
        predictions.append(tree.predict(X[cols]))

    # Majority voting
    final_predictions = np.apply_along_axis(lambda x: np.bincount(x).argmax(), axis=0, arr=np.array(predictions))
    return final_predictions

Ts = [1, 50, 100, 150, 200, 300, 400]
train_preds = []
test_preds = []
for T in Ts:
    trees, tree_cols = random_forest(x_train, y_train, T)
    train_preds.append(predict_rf(trees, x_train, tree_cols))
    test_preds.append(predict_rf(trees, x_test, tree_cols))

In [43]:
from sklearn.metrics import f1_score, auc, log_loss
for t, t_pred in zip(Ts, train_preds):
    print(f"training error for T = {t} :{log_loss(y_train, t_pred)}")

training error for T = 1 :3.4280648603779897
training error for T = 50 :2.634712706976227
training error for T = 100 :2.791424243450649
training error for T = 150 :2.762040830361695
training error for T = 200 :2.7130684752134377
training error for T = 300 :2.683685062124484
training error for T = 400 :2.7130684752134377


In [44]:
for t, test_pred in zip(Ts, test_preds):
    print(f"test error for T = {t} :{log_loss(y_test, test_pred)}")

test error for T = 1 :4.070075952734184
test error for T = 50 :3.3265043844462086
test error for T = 100 :3.44391042154431
test error for T = 150 :3.3656397301455754
test error for T = 200 :3.483045767243677
test error for T = 300 :3.44391042154431
test error for T = 400 :3.4047750758449427


In [45]:
for t, test_pred in zip(Ts, test_preds):
    print(f"F1 score for T = {t} :{f1_score(y_test, test_pred)}")

F1 score for T = 1 :0.8575342465753424
F1 score for T = 50 :0.8830811554332876
F1 score for T = 100 :0.8787878787878787
F1 score for T = 150 :0.8821917808219178
F1 score for T = 200 :0.877914951989026
F1 score for T = 300 :0.8794520547945205
F1 score for T = 400 :0.8809849521203831


In [46]:
from sklearn.metrics import roc_curve
for t, test_pred in zip(Ts, test_preds):
    fpr, tpr, thresholds = roc_curve(y_test, test_pred)
    print(f"AUC for T = {t} :{auc(fpr, tpr)}")

AUC for T = 1 :0.875858322468492
AUC for T = 50 :0.8964725481674635
AUC for T = 100 :0.8929668260176734
AUC for T = 150 :0.8958713602781398
AUC for T = 200 :0.89236563812835
AUC for T = 300 :0.8936476894104012
AUC for T = 400 :0.8949297406924525


In [33]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

trees = [10, 50, 100]
for T in trees:
    rf = RandomForestClassifier(n_estimators=T, random_state=42)
    rf.fit(x_train, y_train)
    train_pred = rf.predict(x_train)
    test_pred = rf.predict(x_test)

    # Reporting metrics
    train_loss = log_loss(y_train, train_pred)
    test_loss = accuracy_score(y_test, test_pred)

    train_f1 = f1_score(y_train, train_pred)
    test_f1 = f1_score(y_test, test_pred)

    train_auc = roc_auc_score(y_train, train_pred)
    test_auc = roc_auc_score(y_test, test_pred)

    print(f"For {T} trees:")
    print(f"Training loss: {train_loss}, Test loss: {test_loss}")
    print(f"Training F1: {train_f1}, Test F1: {test_f1}")
    print(f"Training AUC: {train_auc}, Test AUC: {test_auc}")

# Top 10 features
importances = rf.feature_importances_
indices = np.argsort(importances)[-10:]
print("Top 10 features:", data.columns[indices])


For 10 trees:
Training loss: 0.19588942059302822, Test loss: 0.9402823018458197
Training F1: 0.9929378531073447, Test F1: 0.9261744966442953
Training AUC: 0.9933621051209153, Test AUC: 0.9328914964508185
For 50 trees:
Training loss: 0.019588942059303022, Test loss: 0.9489685124864278
Training F1: 0.9992972593113141, Test F1: 0.9377483443708609
Training AUC: 0.9994270966472387, Test AUC: 0.9434883384035926
For 100 trees:
Training loss: 0.019588942059303022, Test loss: 0.9554831704668838
Training F1: 0.99929676511955, Test F1: 0.9458388375165125
Training AUC: 0.9992972593113141, Test AUC: 0.9504997827031725
Top 10 features: Index(['word_freq_you', 'word_freq_hp', 'word_freq_your',
       'capital_run_length_total', 'capital_run_length_average',
       'capital_run_length_longest', 'word_freq_free', 'word_freq_remove',
       'char_freq_$', 'char_freq_!'],
      dtype='object')
