In [None]:
from ucimlrepo import fetch_ucirepo 
import pandas as pd

# Download the dataset registered under id 94 in the UCI repository.
spambase = fetch_ucirepo(id=94)

# Feature matrix and target(s), exposed by the fetched object as pandas dataframes.
X, y = spambase.data.features, spambase.data.targets

# Features and targets side by side in a single frame.
data = pd.concat([X, y], axis="columns")

In [None]:
# Step 1

from sklearn.model_selection import StratifiedKFold

# Stratified 10-fold splitter; shuffling is seeded so fold assignment is repeatable.
skf = StratifiedKFold(
    n_splits=10,
    shuffle=True,
    random_state=42,
)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score
import time
import numpy as np

rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

# Per-fold results. The lists are rebuilt on every run of this cell, so
# re-executing it never appends to results from a previous run.
rf_accuracy_scores = []
rf_f1_scores = []
rf_training_times = []

# Collapse the targets once, up front, instead of squeezing every fold slice.
# (squeeze() is used to get a 1-D target for fit/metrics; doing it on the full
# frame also avoids a one-row slice degenerating into a scalar.)
y_1d = y.squeeze()

# Cross-validation: fit on each training split, evaluate on the held-out split.
for train_idx, test_idx in skf.split(X, y):
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y_1d.iloc[train_idx], y_1d.iloc[test_idx]

    # Time only the fit call. perf_counter() is a monotonic, high-resolution
    # clock, so the measured interval cannot be skewed by system clock
    # adjustments the way a difference of time.time() readings can.
    start_time = time.perf_counter()
    rf_model.fit(X_train, y_train)
    rf_training_times.append(time.perf_counter() - start_time)

    # Predict on the held-out fold.
    y_pred = rf_model.predict(X_test)

    # Record this fold's metrics.
    rf_accuracy_scores.append(accuracy_score(y_test, y_pred))
    rf_f1_scores.append(f1_score(y_test, y_pred))

# Summary across folds (np.std uses its default, i.e. population std).
print(f"Mean F1-Score: {np.mean(rf_f1_scores):.4f} +-{np.std(rf_f1_scores):.4f}")
print(f"Mean Training Time: {np.mean(rf_training_times):.4f} +-{np.std(rf_training_times):.4f} s")


Mean F1-Score: 0.9422 +-0.0086
Mean Training Time: 0.3934 +-0.0154 s


In [None]:
# Accuracy table: one row per fold, followed by "avg" and "stdev" summary rows.
# The fold labels are derived from the number of recorded scores rather than a
# hard-coded 10, so the table still builds if the number of CV splits changes.
n_folds = len(rf_accuracy_scores)

results_cv_df = pd.DataFrame({
    "Fold": list(range(1, n_folds + 1)) + ["avg", "stdev"],
    "Random Forest": list(rf_accuracy_scores) + [
        np.mean(rf_accuracy_scores),
        np.std(rf_accuracy_scores),  # population std (numpy default ddof=0)
    ],
})

# Plain-text rendering without the positional index column.
print(results_cv_df.to_string(index=False))

 Fold  Random Forest
    1       0.952278
    2       0.956522
    3       0.960870
    4       0.967391
    5       0.945652
    6       0.958696
    7       0.956522
    8       0.956522
    9       0.943478
   10       0.952174
  avg       0.955010
stdev       0.006664
