In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Read the CSV (its first row is the header), discard the first column, and
# keep every remaining column as an int32 matrix.
data_email = (
    pd.read_csv('hw3Data/emails.csv', header=0)
    .values[:, 1:]
    .astype(np.int32)
)
data_email.shape

(5000, 3001)

In [4]:
# Show the loaded matrix; NumPy abbreviates the repr of a large array with "...".
data_email

array([[ 0,  0,  1, ...,  0,  0,  0],
       [ 8, 13, 24, ...,  1,  0,  0],
       [ 0,  0,  1, ...,  0,  0,  0],
       ...,
       [ 6,  8,  1, ...,  0,  0,  0],
       [ 8,  6,  2, ...,  0,  0,  0],
       [13, 12,  3, ...,  0,  0,  0]])

In [3]:
# The last column is used as the class label; every column before it is a feature.
y = data_email[:, -1]
X = data_email[:, :-1]
X.shape, y.shape

((5000, 3000), (5000,))

In [5]:
def euclidean_distances(train_data, test_instance):
    """Return the Euclidean distance from every training row to one query point.

    Parameters
    ----------
    train_data : array-like, shape (n_samples, n_features)
        One training example per row.
    test_instance : array-like
        The query point; it must broadcast against the rows of ``train_data``
        (typically shape ``(n_features,)``).

    Returns
    -------
    numpy.ndarray, shape (n_samples,)
        ``float64`` distances, one per training row.
    """
    # Subtract in float64 rather than in the input dtype: with narrow or
    # unsigned integer inputs (int8, uint8, ...) the difference and its square
    # would otherwise wrap around silently and yield wrong distances.
    diff = np.subtract(train_data, test_instance, dtype=np.float64)
    return np.sqrt(np.sum(diff * diff, axis=1))

def one_nearest_neighbor(train_data, train_label, test_data):
    """Predict the label of a single query point with the 1-nearest-neighbour rule.

    Parameters
    ----------
    train_data : array-like, shape (n_samples, n_features)
        One training example per row.
    train_label : sequence of length n_samples
        Label of each training row, indexable by an integer position.
    test_data : array-like
        A single query point (despite the name, not a batch); it is passed
        straight to ``euclidean_distances`` as the test instance.

    Returns
    -------
    The element of ``train_label`` at the position of the closest training row.
    """
    distances = euclidean_distances(train_data, test_data)
    # argmin is a single O(n) pass and always selects the lowest index among
    # equidistant rows; sorting the whole array just to read element 0 costs
    # O(n log n) and, with the default unstable sort, breaks ties arbitrarily.
    # NOTE: argmin returns the position of a NaN if one is present, so the
    # distances are assumed to be NaN-free.
    nearest_index = int(np.argmin(distances))
    return train_label[nearest_index]

In [6]:
def k_fold_cross_validation(X, y, k=5, predict_fn=None):
    """Evaluate a per-instance classifier with contiguous k-fold cross-validation.

    The samples are split, in their given order (no shuffling), into ``k``
    contiguous folds. Each fold is used once as the validation set while the
    remaining samples form the training set. When ``len(X)`` is not a multiple
    of ``k`` the first ``len(X) % k`` folds receive one extra sample, so every
    sample is validated exactly once. When it is a multiple, each fold has
    ``len(X) // k`` samples.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
        Feature matrix, one sample per row.
    y : array-like, shape (n_samples,)
        Binary labels; ``1`` is treated as the positive class and ``0`` as the
        negative class when counting TP / FP / FN.
    k : int, default 5
        Number of folds; must satisfy ``2 <= k <= n_samples``.
    predict_fn : callable, optional
        ``predict_fn(X_train, y_train, test_instance) -> label``. Defaults to
        ``one_nearest_neighbor``.

    Returns
    -------
    (accuracy_list, precision_list, recall_list) : tuple of lists
        One value per fold, in fold order. Precision / recall are ``0.0`` for
        a fold where their denominator is zero.

    Raises
    ------
    ValueError
        If ``X`` and ``y`` differ in length or ``k`` is out of range.

    Side effects
    ------------
    Prints one summary line per fold.
    """
    X = np.asarray(X)
    y = np.asarray(y)
    n_samples = len(X)

    if len(y) != n_samples:
        raise ValueError(
            f"X and y must have the same length, got {n_samples} and {len(y)}"
        )
    if not 2 <= k <= n_samples:
        raise ValueError(
            f"k must be between 2 and the number of samples ({n_samples}), got {k}"
        )
    if predict_fn is None:
        predict_fn = one_nearest_neighbor

    accuracy_list = []
    precision_list = []
    recall_list = []

    # Spread the remainder over the first `extra` folds instead of dropping it.
    base_size, extra = divmod(n_samples, k)
    start = 0
    for fold in range(k):
        end = start + base_size + (1 if fold < extra else 0)

        X_val = X[start:end]
        y_val = y[start:end]
        X_train = np.concatenate((X[:start], X[end:]))
        y_train = np.concatenate((y[:start], y[end:]))

        predicted_labels = np.array(
            [predict_fn(X_train, y_train, test_instance) for test_instance in X_val]
        )

        correct_predictions = np.sum(predicted_labels == y_val)
        TP = np.sum((predicted_labels == 1) & (y_val == 1))
        FP = np.sum((predicted_labels == 1) & (y_val == 0))
        FN = np.sum((predicted_labels == 0) & (y_val == 1))

        accuracy = correct_predictions / len(X_val)
        # Guard the ratios: a fold may contain no predicted or actual positives.
        precision = TP / (TP + FP) if (TP + FP) != 0 else 0.0
        recall = TP / (TP + FN) if (TP + FN) != 0 else 0.0

        accuracy_list.append(accuracy)
        precision_list.append(precision)
        recall_list.append(recall)

        print(f"Fold {fold+1} - Accuracy: {accuracy*100:.2f}%, Precision: {precision*100:.2f}%, Recall: {recall*100:.2f}%")

        start = end
    return accuracy_list, precision_list, recall_list

In [7]:
# Run 5-fold cross-validation, then average each metric across the folds.
accuracies, precisions, recalls = k_fold_cross_validation(X, y, k=5)
avg_accuracy, avg_precision, avg_recall = (
    np.mean(fold_scores) for fold_scores in (accuracies, precisions, recalls)
)

print(f"Average Accuracy: {avg_accuracy*100:.2f}%, Average Precision: {avg_precision*100:.2f}%, Average Recall: {avg_recall*100:.2f}%")


Fold 1 - Accuracy: 82.50%, Precision: 65.36%, Recall: 82.11%
Fold 2 - Accuracy: 85.50%, Precision: 68.97%, Recall: 86.64%
Fold 3 - Accuracy: 86.30%, Precision: 72.21%, Recall: 84.15%
Fold 4 - Accuracy: 85.40%, Precision: 72.16%, Recall: 81.97%
Fold 5 - Accuracy: 77.50%, Precision: 60.52%, Recall: 76.14%
Average Accuracy: 83.44%, Average Precision: 67.84%, Average Recall: 82.20%
