In [36]:
from ucimlrepo import fetch_ucirepo 
import pandas as pd

# Fetch dataset id 94 from the UCI repository through ucimlrepo.
spambase = fetch_ucirepo(id=94)

# Feature and target tables (pandas DataFrames).
X = spambase.data.features
y = spambase.data.targets

# One combined frame: feature columns followed by the target column(s).
data = pd.concat([X, y], axis="columns")

In [37]:
# ten fold cv
from sklearn.model_selection import StratifiedKFold

# Stratified 10-fold splitter shared by every model below; shuffling with a
# fixed seed keeps the folds reproducible from run to run.
skf = StratifiedKFold(
    n_splits=10,
    shuffle=True,
    random_state=42,
)

In [38]:
# random forest

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score
import time
import numpy as np

rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

# Per-fold results for the random forest, appended in fold order.
rf_accuracy_scores = []
rf_f1_scores = []
rf_training_times = []

# Cross-validation over the folds produced by the shared splitter `skf`.
for train_idx, test_idx in skf.split(X, y):
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    # squeeze() hands sklearn a 1-D target (assumes y has a single column).
    y_train, y_test = y.iloc[train_idx].squeeze(), y.iloc[test_idx].squeeze()

    # Time only the fit. perf_counter() is the high-resolution clock meant for
    # measuring durations; time.time() is wall-clock time and can be coarse or
    # adjusted while the fit is running.
    start_time = time.perf_counter()
    rf_model.fit(X_train, y_train)
    rf_training_times.append(time.perf_counter() - start_time)

    # Predict on the held-out fold.
    y_pred = rf_model.predict(X_test)

    # Record this fold's accuracy and F1 score.
    rf_accuracy_scores.append(accuracy_score(y_test, y_pred))
    rf_f1_scores.append(f1_score(y_test, y_pred))

In [39]:
# naive bayes

from sklearn.naive_bayes import GaussianNB

nb_model = GaussianNB()

# Per-fold results for naive Bayes, appended in fold order.
nb_accuracy_scores = []
nb_f1_scores = []
nb_training_times = []

# Cross-validation over the folds produced by the shared splitter `skf`.
for train_idx, test_idx in skf.split(X, y):
    # Split data into train and test sets for this fold.
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    # squeeze() hands sklearn a 1-D target (assumes y has a single column).
    y_train, y_test = y.iloc[train_idx].squeeze(), y.iloc[test_idx].squeeze()

    # Time only the fit. perf_counter() is the high-resolution clock meant for
    # measuring short durations; time.time() may be too coarse to resolve a
    # very quick fit and can jump if the system clock is adjusted.
    start_time = time.perf_counter()
    nb_model.fit(X_train, y_train)
    nb_training_times.append(time.perf_counter() - start_time)

    # Predict on the held-out fold.
    y_pred = nb_model.predict(X_test)

    # Record this fold's accuracy and F1 score.
    nb_accuracy_scores.append(accuracy_score(y_test, y_pred))
    nb_f1_scores.append(f1_score(y_test, y_pred))

In [40]:
# Support Vector Machines

from sklearn.svm import SVC

# NOTE(review): the features are passed to SVC exactly as loaded, with no
# scaling step in this notebook. SVMs are sensitive to feature scale, so if the
# accuracy below looks unexpectedly low, consider standardising the features
# inside each training fold before fitting -- confirm with the analysis owner.
svm = SVC()

# Per-fold results for the SVM, appended in fold order.
svm_accuracy_scores = []
svm_f1_scores = []
svm_training_times = []

# Cross-validation over the folds produced by the shared splitter `skf`.
for train_index, test_index in skf.split(X, y):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    # squeeze() hands sklearn a 1-D target (assumes y has a single column).
    y_train, y_test = y.iloc[train_index].squeeze(), y.iloc[test_index].squeeze()

    # Time only the fit. perf_counter() is the high-resolution clock meant for
    # measuring durations; time.time() is wall-clock time and can be coarse or
    # adjusted while the fit is running.
    start_time = time.perf_counter()
    svm.fit(X_train, y_train)
    training_time = time.perf_counter() - start_time

    # Predict on the held-out fold and record its metrics.
    y_pred = svm.predict(X_test)
    svm_accuracy_scores.append(accuracy_score(y_test, y_pred))
    svm_f1_scores.append(f1_score(y_test, y_pred))
    svm_training_times.append(training_time)


In [41]:
# Step 2
# Print the cross-validation accuracies in the layout of the book's table 12.4:
# one row per fold, followed by an "avg" and a "stdev" row.

# Per-fold accuracies keyed by the column label used in the table. Keeping a
# single mapping means each model is listed once instead of in two places.
accuracy_by_model = {
    "Random Forest": rf_accuracy_scores,
    "Naive Bayes": nb_accuracy_scores,
    "Support Vector Machines": svm_accuracy_scores,
}

# Derive the fold count from the collected scores rather than hard-coding 10,
# so the table stays correct if the splitter's n_splits is changed.
n_folds = len(rf_accuracy_scores)

# One row per fold, labelled 1..n_folds.
per_fold_df = pd.DataFrame({
    "Fold": list(range(1, n_folds + 1)),
    **accuracy_by_model,
})

# Summary rows. np.std is called with its default arguments, i.e. the
# population standard deviation (ddof=0).
summary_df = pd.DataFrame({
    "Fold": ["avg", "stdev"],
    **{
        model_name: [np.mean(scores), np.std(scores)]
        for model_name, scores in accuracy_by_model.items()
    },
})

# Final table: fold rows first, then the two summary rows.
results_cv_df = pd.concat([per_fold_df, summary_df], ignore_index=True)

print(results_cv_df.to_string(index=False, float_format="%.4f"))

 Fold  Random Forest  Naive Bayes  Support Vector Machines
    1         0.9523       0.8243                   0.7158
    2         0.9565       0.8196                   0.7196
    3         0.9609       0.8043                   0.7348
    4         0.9674       0.8217                   0.7109
    5         0.9457       0.8174                   0.7196
    6         0.9565       0.7891                   0.7391
    7         0.9587       0.8457                   0.6935
    8         0.9565       0.8261                   0.7065
    9         0.9413       0.8413                   0.7022
   10         0.9522       0.8130                   0.7065
  avg         0.9548       0.8203                   0.7148
stdev         0.0071       0.0156                   0.0134


In [42]:
# Step 3
# Friedman test on the per-fold accuracies in results_cv_df.

# Columns of results_cv_df that hold one accuracy per fold for each model.
MODEL_COLUMNS = ["Random Forest", "Naive Bayes", "Support Vector Machines"]
# Critical value the statistic is compared against (3 models, 10 folds).
# NOTE(review): presumably the alpha = 0.05 table value -- confirm against the
# reference table, and update it if the number of folds or models changes.
CRITICAL_VALUE = 7.8

# Keep only the per-fold rows; the trailing "avg"/"stdev" rows are summaries.
fold_scores = results_cv_df.loc[
    ~results_cv_df["Fold"].isin(["avg", "stdev"]), MODEL_COLUMNS
].astype(float)

# Rank the models within each fold: rank 1 = highest accuracy. Tied accuracies
# share the average of the ranks they span, so every fold always yields one
# rank per model (strict "<" comparisons would silently skip tied folds).
fold_ranks = fold_scores.rank(axis=1, ascending=False, method="average")

random_forest_ranks = fold_ranks["Random Forest"].tolist()
naive_bayes_ranks = fold_ranks["Naive Bayes"].tolist()
support_vector_machine_ranks = fold_ranks["Support Vector Machines"].tolist()

n_folds, n_models = fold_ranks.shape
# Average rank over all models; equals 2 for three models.
mean_rank = (n_models + 1) / 2

# Average rank per model, in the order: Random Forest, SVM, Naive Bayes.
avg_rank = [
    sum(random_forest_ranks) / n_folds,
    sum(support_vector_machine_ranks) / n_folds,
    sum(naive_bayes_ranks) / n_folds,
]

# Spread of the average ranks around the mean rank, scaled by the fold count.
between_models = n_folds * sum((avg - mean_rank) ** 2 for avg in avg_rank)
# Spread of the individual ranks around the mean rank. This is exactly 1 when
# no fold has ties, so the statistic then reduces to `between_models`.
within_folds = float(((fold_ranks - mean_rank) ** 2).to_numpy().sum()) / (
    n_folds * (n_models - 1)
)
# If every fold is a complete tie there is no rank variation at all; report 0
# instead of dividing by zero.
friedman = float(between_models / within_folds) if within_folds > 0 else 0.0

# State the conclusion that the computed statistic actually supports.
if friedman > CRITICAL_VALUE:
    print(f"The friedman statistic is {friedman} which is greater than {CRITICAL_VALUE} and therefore there are a significant difference between the algorithms.")
else:
    print(f"The friedman statistic is {friedman} which is not greater than {CRITICAL_VALUE} and therefore there is no significant difference between the algorithms.")


The friedman statistic is 20.0 which is greater than 7.8 and therefore there are a significant difference between the algorithms.


In [None]:
# Step 4
# 
