In [3]:
from ucimlrepo import fetch_ucirepo 
import pandas as pd

# fetch dataset 
spambase = fetch_ucirepo(id=94) 

# data (as pandas dataframes)
X = spambase.data.features 
y = spambase.data.targets 

data = pd.concat([X, y], axis = 1)

In [4]:
# ten fold cv
from sklearn.model_selection import StratifiedKFold

# stratified ten-fold
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

In [None]:
# random forest

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score
import time
import numpy as np

rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

# lst to store metrics
rf_accuracy_scores = []
rf_f1_scores = []
rf_training_times = []

# cv
for train_idx, test_idx in skf.split(X, y):
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx].squeeze(), y.iloc[test_idx].squeeze() #squeeze() to ensure 1D array

    # training time
    start_time = time.time()
    rf_model.fit(X_train, y_train)
    rf_training_times.append(time.time() - start_time)

    # make prediction on test data
    y_pred = rf_model.predict(X_test)

    # compute metrics
    rf_accuracy_scores.append(accuracy_score(y_test, y_pred))
    rf_f1_scores.append(f1_score(y_test, y_pred))

In [None]:
# naive bayes

from sklearn.naive_bayes import GaussianNB

nb_model = GaussianNB()

# lists to store measures
nb_accuracy_scores = []
nb_f1_scores = []
nb_training_times = []

# cross validation

for train_idx, test_idx in skf.split(X, y):
    # split data into train and test sets
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx].squeeze(), y.iloc[test_idx].squeeze()

    # training time
    start_time = time.time()
    nb_model.fit(X_train, y_train)
    nb_training_times.append(time.time() - start_time)

    # make predictions
    y_pred = nb_model.predict(X_test)

    # metrics
    nb_accuracy_scores.append(accuracy_score(y_test, y_pred))
    nb_f1_scores.append(f1_score(y_test, y_pred))

In [None]:
# support Vector Machines

svm_accuracy_scores = [0 for i in range(10)]


In [None]:
# Step 2
# print cv as the book 12.4

results_cv_df = pd.DataFrame({
    "Fold": [i+1 for i in range(10)],
    "Random Forest": rf_accuracy_scores,
    "Naive Bayes": nb_accuracy_scores,
    "Support Vector Machines": svm_accuracy_scores
})

results_cv_df = pd.concat([
    results_cv_df,
    pd.DataFrame({
        "Fold": ["avg", "stdev"], 
        "Random Forest": [np.mean(rf_accuracy_scores), np.std(rf_accuracy_scores)],
        "Naive Bayes": [np.mean(nb_accuracy_scores), np.std(nb_accuracy_scores)],
        "Support Vector Machines": [np.mean(svm_accuracy_scores), np.std(svm_accuracy_scores)]
    })
], ignore_index=True)

print(results_cv_df.to_string(index=False, float_format="%.4f"))

 Fold  Random Forest  Naive Bayes  Support Vector Machines
    1         0.9523       0.8243                   0.0000
    2         0.9565       0.8196                   0.0000
    3         0.9609       0.8043                   0.0000
    4         0.9674       0.8217                   0.0000
    5         0.9457       0.8174                   0.0000
    6         0.9587       0.7891                   0.0000
    7         0.9565       0.8457                   0.0000
    8         0.9565       0.8261                   0.0000
    9         0.9435       0.8413                   0.0000
   10         0.9522       0.8130                   0.0000
  avg         0.9550       0.8203                   0.0000
stdev         0.0067       0.0156                   0.0000


In [None]:
# Step 3
# Friedman tests



In [None]:
# Step 4
# 