# Data loading / preprocessing

In [68]:
import numpy as np

# Define one shared rng with seed 2026 to ensure consistency across runs
rng = np.random.default_rng(seed=2026)

In [69]:
import pandas as pd

spam_df = pd.read_csv('spambase.data', header=None)
# header=None: the data file has no header row; without this flag the first
# data row ended up being used as the column names

In [70]:
# Caption, then the first five rows rendered as the cell's rich output
print("Spam Data Preview: \n")
spam_df.head(5)

Spam Data Preview: 



Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,48,49,50,51,52,53,54,55,56,57
0,0.0,0.64,0.64,0.0,0.32,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.778,0.0,0.0,3.756,61,278,1
1,0.21,0.28,0.5,0.0,0.14,0.28,0.21,0.07,0.0,0.94,...,0.0,0.132,0.0,0.372,0.18,0.048,5.114,101,1028,1
2,0.06,0.0,0.71,0.0,1.23,0.19,0.19,0.12,0.64,0.25,...,0.01,0.143,0.0,0.276,0.184,0.01,9.821,485,2259,1
3,0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,...,0.0,0.137,0.0,0.137,0.0,0.0,3.537,40,191,1
4,0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,...,0.0,0.135,0.0,0.135,0.0,0.0,3.537,40,191,1


We still need the column names, which we can get from the file spambase.names.

In [71]:
# Read spambase.names into a list holding one entry per line of the file
with open('spambase.names') as file:
    file_lines = list(file)

In [72]:
# The column descriptions begin on line 34 of the file, i.e. 0-based index 33
first_col_name_line = 33
col_name_lines = file_lines[first_col_name_line:]
# for line in col_name_lines: print(line)  # -> confirms that we are starting at the correct line

We also need to cut off everything after the colon in each line. 

In [73]:
# Keep only the text in front of the first ':' on each line -- that is the column name
col_names = [line.partition(':')[0] for line in col_name_lines]
# for col_name in col_names: print(col_name)  # -> confirms the success of the parsing

In [74]:
# Compare the number of columns in the frame with the number of names parsed
print(
    f"Number of columns in dataset: {spam_df.shape[1]}",
    f"Number of column names acquired: {len(col_names)}",
    sep="\n",
)

Number of columns in dataset: 58
Number of column names acquired: 57


We are missing the final column, which is the label column (1 for spam, 0 for not spam). 

In [75]:
# The names file does not list the label column (1 = spam, 0 = not spam), so add it.
# Guarded so that re-running this cell does not append the name a second time,
# which would leave col_names one entry too long for `spam_df.columns = col_names`.
if "is_spam" not in col_names:
    col_names.append("is_spam")

In [76]:
# Now, assign the column names: this replaces the integer labels seen in the
# earlier preview with the parsed names (plus "is_spam"), modifying spam_df in place
spam_df.columns = col_names

In [77]:
# Caption, then the first five rows again, now with the named columns
print("Spam Data Preview: \n")
spam_df.head(5)

Spam Data Preview: 



Unnamed: 0,word_freq_make,word_freq_address,word_freq_all,word_freq_3d,word_freq_our,word_freq_over,word_freq_remove,word_freq_internet,word_freq_order,word_freq_mail,...,char_freq_;,char_freq_(,char_freq_[,char_freq_!,char_freq_$,char_freq_#,capital_run_length_average,capital_run_length_longest,capital_run_length_total,is_spam
0,0.0,0.64,0.64,0.0,0.32,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.778,0.0,0.0,3.756,61,278,1
1,0.21,0.28,0.5,0.0,0.14,0.28,0.21,0.07,0.0,0.94,...,0.0,0.132,0.0,0.372,0.18,0.048,5.114,101,1028,1
2,0.06,0.0,0.71,0.0,1.23,0.19,0.19,0.12,0.64,0.25,...,0.01,0.143,0.0,0.276,0.184,0.01,9.821,485,2259,1
3,0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,...,0.0,0.137,0.0,0.137,0.0,0.0,3.537,40,191,1
4,0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,...,0.0,0.135,0.0,0.135,0.0,0.0,3.537,40,191,1


In [78]:
# Report (rows, columns) of the loaded frame
print("Dimensions of dataset:", spam_df.shape)

Dimensions of dataset: (4601, 58)


Before going any further, we want to make sure our data is clean and won't cause issues downstream: we check for missing values and confirm that every column is numeric.

In [79]:
# Count the missing entries in every column once, then report them per column
missing_per_column = spam_df.isna().sum()
print("Missing values per column:")
print(missing_per_column)

# Grand total of missing entries across the whole dataframe
print(f"\nTotal number of missing values: {missing_per_column.sum()}")

Missing values per column:
word_freq_make                0
word_freq_address             0
word_freq_all                 0
word_freq_3d                  0
word_freq_our                 0
word_freq_over                0
word_freq_remove              0
word_freq_internet            0
word_freq_order               0
word_freq_mail                0
word_freq_receive             0
word_freq_will                0
word_freq_people              0
word_freq_report              0
word_freq_addresses           0
word_freq_free                0
word_freq_business            0
word_freq_email               0
word_freq_you                 0
word_freq_credit              0
word_freq_your                0
word_freq_font                0
word_freq_000                 0
word_freq_money               0
word_freq_hp                  0
word_freq_hpl                 0
word_freq_george              0
word_freq_650                 0
word_freq_lab                 0
word_freq_labs                0
word_freq_tel

In [80]:
# Tally the columns by dtype to check that every column is numeric (ints/floats)
dtype_counts = spam_df.dtypes.value_counts()
print(f"Column types in spam_df:\n{dtype_counts}")

Column types in spam_df:
float64    55
int64       3
Name: count, dtype: int64


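The dtype counts show 55 float64 columns and 3 int64 columns, so all 58 columns are numeric: 57 features (the two integer-valued ones are the counts `capital_run_length_longest` and `capital_run_length_total`) plus 1 binary label, `is_spam`. Now, we split the data into a training set and a test set.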

In [81]:
# Get the row indices of the dataframe
samples = spam_df.index.tolist()

# Shuffle in place using the shared rng (seeded with 2026)
rng.shuffle(samples)

# Divide this with a 5% / 95% split.
# The test set is defined as "every row after the training rows", so the two
# sets are disjoint and together cover all samples by construction. Rounding
# the two sizes independently (and slicing with a negative start) does not
# guarantee that, and a test size of 0 would turn `samples[-0:]` into the
# whole list.
n_train = round(len(samples) * 0.05)
train_ind = samples[:n_train]
test_ind = samples[n_train:]

print(f"Number of training samples: {len(train_ind)}")
print(f"Number of testings samples: {len(test_ind)}")

Number of training samples: 230
Number of testings samples: 4371


In [82]:
# Separate the inputs from the label, then pick out the train / test rows

# inputs only: every column except the label
X = spam_df.drop(columns="is_spam")

# outputs only: the label column
y = spam_df["is_spam"]

# train-test split, selecting rows by position with the shuffled index lists
X_train, y_train = X.iloc[train_ind], y.iloc[train_ind]
X_test, y_test = X.iloc[test_ind], y.iloc[test_ind]

Here, we check the label distribution in the entire dataset and in the training set to ensure that the training set isn't too skewed, which could introduce bias into our model.

In [83]:
# Class proportions (normalized counts) for the full dataset and for the training labels
label_dist_spam_df = spam_df.loc[:, "is_spam"].value_counts(normalize=True)
label_dist_y_train = y_train.value_counts(normalize=True)
print("Label distribution in total dataset:", label_dist_spam_df)
print("Label distribution in Training Set:", label_dist_y_train)

Label distribution in total dataset: is_spam
0    0.605955
1    0.394045
Name: proportion, dtype: float64
Label distribution in Training Set: is_spam
0    0.613043
1    0.386957
Name: proportion, dtype: float64


# Pre-Processing

**TODO:** Expand this section (add more pre-processing).

In [84]:
# Apply standardization
# Calculate mean and std from training data only
train_mean = X_train.mean()
train_std = X_train.std()

# A feature that is constant within the (small) training set has std == 0, and
# dividing by it would fill that column with NaN/inf. Use a scale of 1 for such
# columns so they are only centred; all other columns are unaffected.
train_std = train_std.where(train_std > 0, 1.0)

# Apply the same train_mean and train_std to both training and test sets to prevent data leakage
X_train = (X_train - train_mean) / train_std
X_test = (X_test - train_mean) / train_std

# Task 1
## Preprocessing Methodology:
During data preprocessing, to ensure a representative sample, we first shuffled the dataset using a fixed random seed (2026). We then performed a 5%/95% train/test split to create a small training set of approximately 230 samples, a scenario designed to highlight the effects of overfitting. To prevent data leakage, we calculated the mean and standard deviation exclusively from the training set and used these parameters to standardize both the training and test features. Finally, a column of ones is prepended to the feature matrices to incorporate the bias term directly into our weight vector calculations; this step is performed inside the model (when fitting and when predicting) rather than on the stored feature tables.

### Cost Function

Here, the cost function changes as we need to include L2 Regularization. 

In [85]:
# we use "lambda_" because "lambda" is a reserved keyword in Python
def cost_fn(x, y, w, lambda_=1e-3):
    """
    Compute the L2-regularized binary cross-entropy cost (negative log-likelihood).

    Parameters:
    x: input features, shape (N, D)
    y: binary labels (0 or 1), shape (N,)
    w: weights, shape (D,); w[0] is treated as the bias and is not penalized
    lambda_: regularization strength

    Note: input parameters are a subset of the total dataset (mini-batch)

    Returns:
    J: scalar cost value (lower is better)
    """
    N = x.shape[0]

    # Compute logits: z = x @ w, shape (N,)
    # These are the raw predictions before applying sigmoid
    z = np.dot(x, w)

    # Binary cross-entropy loss (mean over all samples in mini-batch)
    # For y=1: cost is log(1 + exp(-z)) which penalizes z being too negative
    # For y=0: cost is log(1 + exp(z)) which penalizes z being too positive
    # np.logaddexp(0, t) evaluates log(exp(0) + exp(t)) = log(1 + exp(t)) without
    # overflowing for large |t|. The naive log1p(exp(t)) becomes inf there, and
    # the inf is then multiplied by a zero label weight, which yields NaN.
    J = np.mean(y * np.logaddexp(0, -z) + (1 - y) * np.logaddexp(0, z))

    # now, add regularization penalty
    # here, we are using the divide by two convention. We also divide by N to use the mean cost,
    # to keep this consistent with the mean binary cross-entropy loss
    # we assume w[0] is a bias, so it is excluded here
    # NOTE: this scaling, lambda / (2N), differs from the lambda * ||w||^2 penalty
    # used by LogisticRegressionSGD in this notebook.
    J += (lambda_ / (2 * N)) * np.sum(w[1:]**2)

    return J

### Logistic Regression with SGD

We implemented a custom LogisticRegressionSGD class that utilizes Mini-batch Stochastic Gradient Descent for optimization. To handle the intercept, a bias column of ones is prepended to the feature matrix during the fitting process. The weights are updated by computing the gradient of the cross-entropy loss function over randomized mini-batches. To combat overfitting, we integrated $L_2$ regularization (weight decay) into the gradient calculation. Additionally, we implemented numerical clipping within the sigmoid function to ensure stability against overflow, and tracked the cross-entropy loss at each epoch to visualize the model's convergence behavior.

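**Note:** We have adopted the convention of using $\lambda \|\mathbf{w}\|^2_2$ (with no factor of $\tfrac{1}{2}$) for our L2 regularization penalty, so its gradient is $2\lambda\mathbf{w}$. The bias weight is excluded from the penalty.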

In [86]:
import numpy as np

class LogisticRegressionSGD:
    """
    Binary logistic regression trained with mini-batch SGD and an L2 penalty.

    The penalty is reg_lambda * sum(w[1:]**2); the bias weight w[0] is never
    regularized. A bias column of ones is prepended to the features inside
    `fit` and `predict_prob`, so callers pass the raw feature matrix.

    Parameters:
    learning_rate: step size of each SGD update
    epochs: number of full passes over the training data
    batch_size: number of samples per mini-batch
    reg_lambda: L2 regularization strength
    rng: optional numpy random Generator used to shuffle the data each epoch;
         when None, `fit` falls back to the notebook-level `rng`

    Attributes set by `fit`:
    w: learned weights, shape (n_features + 1,), bias first
    loss_history: regularized cross-entropy loss on the full training set,
                  one entry per epoch
    """

    def __init__(self, learning_rate=10**-2, epochs=500, batch_size=16, reg_lambda=10**-3, rng=None):
        self.learning_rate = learning_rate
        self.epochs = epochs
        self.batch_size = batch_size
        self.reg_lambda = reg_lambda
        self.rng = rng
        self.w = None
        self.loss_history = []

    def sigmoid(self, z):
        """Logistic function of z, element-wise."""
        # Clip values to avoid overflow in exp
        return 1 / (1 + np.exp(-np.clip(z, -250, 250)))

    def compute_cross_entropy_loss(self, x, y):
        """
        Mean binary cross-entropy of the current weights on (x, y) plus the L2 penalty.

        x must already contain the bias column; y holds 0/1 labels.
        """
        yh = self.sigmoid(np.dot(x, self.w))
        eps = 1e-15  # keeps log() away from log(0) when yh saturates at 0 or 1

        # Cross Entropy + L2 Penalty (exclude bias w[0])
        loss = -np.mean(y * np.log(yh + eps) + (1 - y) * np.log(1 - yh + eps))
        l2_penalty = self.reg_lambda * np.sum(self.w[1:]**2)
        loss += l2_penalty
        return loss

    def gradient(self, x, y):
        """
        Gradient of the regularized loss w.r.t. the weights on one mini-batch.

        x must already contain the bias column; returns an array shaped like self.w.
        """
        n = x.shape[0]
        yh = self.sigmoid(np.dot(x, self.w))
        grad = np.dot(x.T, (yh - y)) / n

        # L2 Regularization Gradient: 2 * lambda * w
        # We do not regularize the bias
        reg_term = 2 * self.reg_lambda * self.w
        reg_term[0] = 0
        grad += reg_term
        return grad

    def fit(self, x, y):
        """
        Train the weights with mini-batch SGD and return self.

        x: features, shape (N, D) -- array-like or DataFrame, without bias column
        y: binary labels (0 or 1), shape (N,) -- array-like or Series

        Raises ValueError if x and y do not contain the same number of samples.
        """
        # Convert to plain arrays first so that all indexing below is positional.
        # Indexing a pandas Series with an integer array looks rows up by index
        # *label*, which fails (or picks the wrong rows) for a shuffled subset.
        x = np.asarray(x, dtype=float)
        y = np.asarray(y, dtype=float).ravel()

        n_samples = x.shape[0]
        if y.shape[0] != n_samples:
            raise ValueError(
                f"x and y must have the same number of samples, got {n_samples} and {y.shape[0]}"
            )

        # Add bias column here (vector of 1s)
        x_with_bias = np.column_stack([np.ones(n_samples), x])

        n_features = x_with_bias.shape[1]
        self.w = np.zeros(n_features)
        self.loss_history = []

        # Use the generator given to the constructor, otherwise the notebook-level rng
        generator = self.rng if self.rng is not None else rng

        for epoch in range(self.epochs):
            # Shuffle at the start of every epoch
            indices = generator.permutation(n_samples)
            x_shuffled = x_with_bias[indices]
            y_shuffled = y[indices]

            # Iterate through mini-batches (the last one may be smaller)
            for i in range(0, n_samples, self.batch_size):
                xb = x_shuffled[i : i + self.batch_size]
                yb = y_shuffled[i : i + self.batch_size]

                grad = self.gradient(xb, yb)
                self.w -= self.learning_rate * grad

            # Save loss for the Task 1 training curve
            cross_entropy_loss = self.compute_cross_entropy_loss(x_with_bias, y)
            self.loss_history.append(cross_entropy_loss)

        return self

    def predict_prob(self, x):
        """Return the predicted probability of class 1 for each row of x (no bias column)."""
        x = np.asarray(x, dtype=float)
        # Add bias column for prediction
        x_with_bias = np.column_stack([np.ones(x.shape[0]), x])
        return self.sigmoid(np.dot(x_with_bias, self.w))

# Task 2

# Task 3

# Task 4