In [2]:
import numpy as np
import pandas as pd

In [3]:
data_email = pd.read_csv('hw3Data/emails.csv', header=0).values[:,1:].astype(np.int32)
data_email.shape

(5000, 3001)

In [4]:
def sigmoid(z):
    return 1 / (1 + np.exp(-z))

def gradient_descent(X, y, theta, alpha, epochs):
    m = len(y)
    for _ in range(epochs):
        h = sigmoid(X @ theta)
        gradient = 1/m * X.T @ (h - y)
        theta = theta - alpha * gradient
    return theta

def predict(X, theta):
    return sigmoid(X @ theta) >= 0.5

In [6]:
def get_metrics(y_true, y_pred):
    TP = np.sum((y_true == 1) & (y_pred == 1))
    TN = np.sum((y_true == 0) & (y_pred == 0))
    FP = np.sum((y_true == 0) & (y_pred == 1))
    FN = np.sum((y_true == 1) & (y_pred == 0))
    
    accuracy = (TP + TN) / (TP + TN + FP + FN)
    precision = TP / (TP + FP) if TP + FP != 0 else 0
    recall = TP / (TP + FN) if TP + FN != 0 else 0
    
    return accuracy, precision, recall

In [7]:
X = data_email[:, 0:-1]
y = data_email[:, -1]

for alpha in [0.0001, 0.001, 0.01, 0.1, 1]:    
    theta_init = np.zeros(X.shape[1])
    
    epochs = 500
    
    num_folds = 5
    fold_size = len(y) // num_folds

    print(f"Learning rate: {alpha}:")
    accuracy_list = []
    precision_list = []
    recall_list = []
    for fold in range(num_folds):

        val_start = fold * fold_size
        val_end = (fold + 1) * fold_size
        
        X_val, y_val = X[val_start:val_end], y[val_start:val_end]
        X_train = np.concatenate((X[:val_start], X[val_end:]), axis=0)
        y_train = np.concatenate((y[:val_start], y[val_end:]), axis=0)
        
        theta = gradient_descent(X_train, y_train, theta_init, alpha, epochs)
        y_pred = predict(X_val, theta)
        
        accuracy, precision, recall = get_metrics(y_val, y_pred)

        accuracy_list.append(accuracy)
        precision_list.append(precision)
        recall_list.append(recall)
        
        print(f"Fold {fold+1} - Accuracy: {accuracy*100:.2f}%, Precision: {precision*100:.2f}%, Recall: {recall*100:.2f}%")
    
    avg_accuracy = np.mean(accuracy_list)
    avg_precision = np.mean(precision_list)
    avg_recall = np.mean(recall_list)

    print(f"Average Accuracy: {avg_accuracy*100:.2f}%, Average Precision: {avg_precision*100:.2f}%, Average Recall: {avg_recall*100:.2f}%")

Learning rate: 0.0001:
Fold 1 - Accuracy: 86.20%, Precision: 82.10%, Recall: 65.96%
Fold 2 - Accuracy: 83.80%, Precision: 77.51%, Recall: 58.48%
Fold 3 - Accuracy: 84.30%, Precision: 84.32%, Recall: 54.93%
Fold 4 - Accuracy: 82.70%, Precision: 77.88%, Recall: 57.48%
Fold 5 - Accuracy: 75.60%, Precision: 63.72%, Recall: 47.06%
Average Accuracy: 82.52%, Average Precision: 77.11%, Average Recall: 56.78%
Learning rate: 0.001:
Fold 1 - Accuracy: 81.70%, Precision: 62.50%, Recall: 89.47%
Fold 2 - Accuracy: 83.80%, Precision: 64.94%, Recall: 90.25%
Fold 3 - Accuracy: 85.10%, Precision: 89.94%, Recall: 53.52%
Fold 4 - Accuracy: 85.10%, Precision: 83.41%, Recall: 61.56%
Fold 5 - Accuracy: 77.20%, Precision: 76.71%, Recall: 36.60%
Average Accuracy: 82.58%, Average Precision: 75.50%, Average Recall: 66.28%
Learning rate: 0.01:


  return 1 / (1 + np.exp(-z))


Fold 1 - Accuracy: 72.30%, Precision: 50.77%, Recall: 92.98%
Fold 2 - Accuracy: 86.40%, Precision: 83.41%, Recall: 63.54%
Fold 3 - Accuracy: 88.20%, Precision: 83.74%, Recall: 72.54%
Fold 4 - Accuracy: 74.70%, Precision: 97.67%, Recall: 14.29%
Fold 5 - Accuracy: 82.20%, Precision: 68.08%, Recall: 78.76%
Average Accuracy: 80.76%, Average Precision: 76.73%, Average Recall: 64.42%
Learning rate: 0.1:
Fold 1 - Accuracy: 63.10%, Precision: 43.29%, Recall: 95.09%
Fold 2 - Accuracy: 78.50%, Precision: 87.80%, Recall: 25.99%
Fold 3 - Accuracy: 88.20%, Precision: 83.74%, Recall: 72.54%
Fold 4 - Accuracy: 72.40%, Precision: 51.64%, Recall: 96.60%
Fold 5 - Accuracy: 82.30%, Precision: 68.27%, Recall: 78.76%
Average Accuracy: 76.90%, Average Precision: 66.95%, Average Recall: 73.79%
Learning rate: 1:
Fold 1 - Accuracy: 87.40%, Precision: 88.04%, Recall: 64.56%
Fold 2 - Accuracy: 77.70%, Precision: 87.50%, Recall: 22.74%
Fold 3 - Accuracy: 88.20%, Precision: 84.02%, Recall: 72.18%
Fold 4 - Accuracy