# Loading the Data

In [1]:
import numpy as np

# NOTE(review): the Generator created here is not bound to a name, so no later
# cell can draw from it, and constructing it does not seed the legacy global
# np.random.* functions. Keep a reference (e.g. `rng = ...`) if it is meant to be used.
np.random.default_rng(seed=2026)

Generator(PCG64) at 0x1086AD0E0

In [2]:
import pandas as pd

# spambase.data has no header row: header=None keeps the first record as data
# (without the header=None flag, the first row ended up being the column names)
spam_df = pd.read_csv('spambase.data', header=None)

In [3]:
# Caption first, then the first five rows as the cell's displayed value
print("Spam Data Preview: \n")
spam_df.head(5)

Spam Data Preview: 



Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,48,49,50,51,52,53,54,55,56,57
0,0.0,0.64,0.64,0.0,0.32,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.778,0.0,0.0,3.756,61,278,1
1,0.21,0.28,0.5,0.0,0.14,0.28,0.21,0.07,0.0,0.94,...,0.0,0.132,0.0,0.372,0.18,0.048,5.114,101,1028,1
2,0.06,0.0,0.71,0.0,1.23,0.19,0.19,0.12,0.64,0.25,...,0.01,0.143,0.0,0.276,0.184,0.01,9.821,485,2259,1
3,0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,...,0.0,0.137,0.0,0.137,0.0,0.0,3.537,40,191,1
4,0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,...,0.0,0.135,0.0,0.135,0.0,0.0,3.537,40,191,1


We still need the column names, which we can get from the file spambase.names.

In [4]:
# Read spambase.names into a list with one entry per line (line endings are kept)
with open('spambase.names', 'r') as file:
    file_lines = list(file)

In [5]:
# the column names start at line 34 of spambase.names (index 33, zero-based)
col_name_lines = file_lines[33:]
# for line in col_name_lines: print(line) # -> this confirms that we are starting at the correct line

We also need to cut off everything after the colon in each line. 

In [6]:
# Keep only the text that precedes the first ':' on each line (the column name)
col_names = [line.partition(':')[0] for line in col_name_lines]
# for col_name in col_names: print(col_name) # -> this confirms the success of the parsing

In [7]:
# Compare how many columns the data has with how many names were parsed
n_dataset_columns = spam_df.shape[1]
n_parsed_names = len(col_names)

print(f"Number of columns in dataset: {n_dataset_columns}")
print(f"Number of column names acquired: {n_parsed_names}")

Number of columns in dataset: 58
Number of column names acquired: 57


We are missing the final column, which is the label column (1 for spam, 0 for not spam). 

In [8]:
# Add the name of the label column (1 for spam, 0 for not spam).
# Guarded so that re-running this cell does not append a second "is_spam",
# which would make the name list longer than the number of columns.
if "is_spam" not in col_names:
    col_names.append("is_spam")

In [9]:
# Now, assign the column names
# (col_names must contain exactly one name per column of spam_df)
spam_df.columns = col_names

In [10]:
# Same preview as before, now with the parsed column names in place
print("Spam Data Preview: \n")
spam_df.head(5)

Spam Data Preview: 



Unnamed: 0,word_freq_make,word_freq_address,word_freq_all,word_freq_3d,word_freq_our,word_freq_over,word_freq_remove,word_freq_internet,word_freq_order,word_freq_mail,...,char_freq_;,char_freq_(,char_freq_[,char_freq_!,char_freq_$,char_freq_#,capital_run_length_average,capital_run_length_longest,capital_run_length_total,is_spam
0,0.0,0.64,0.64,0.0,0.32,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.778,0.0,0.0,3.756,61,278,1
1,0.21,0.28,0.5,0.0,0.14,0.28,0.21,0.07,0.0,0.94,...,0.0,0.132,0.0,0.372,0.18,0.048,5.114,101,1028,1
2,0.06,0.0,0.71,0.0,1.23,0.19,0.19,0.12,0.64,0.25,...,0.01,0.143,0.0,0.276,0.184,0.01,9.821,485,2259,1
3,0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,...,0.0,0.137,0.0,0.137,0.0,0.0,3.537,40,191,1
4,0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,...,0.0,0.135,0.0,0.135,0.0,0.0,3.537,40,191,1


In [11]:
# shape is (rows, columns); the columns are the features plus the is_spam label
print(f"Dimensions of dataset: {spam_df.shape}")

Dimensions of dataset: (4601, 58)


Before going any further, we want to make sure our data is clean and won't cause issues

In [29]:
# Count the missing values of every column once and reuse the result below
missing_per_column = spam_df.isnull().sum()

# Per-column report
print("Missing values per column:")
print(missing_per_column)

# Total count of missing values in the dataframe
print(f"\nTotal number of missing values: {missing_per_column.sum()}")

Missing values per column:
word_freq_make                0
word_freq_address             0
word_freq_all                 0
word_freq_3d                  0
word_freq_our                 0
word_freq_over                0
word_freq_remove              0
word_freq_internet            0
word_freq_order               0
word_freq_mail                0
word_freq_receive             0
word_freq_will                0
word_freq_people              0
word_freq_report              0
word_freq_addresses           0
word_freq_free                0
word_freq_business            0
word_freq_email               0
word_freq_you                 0
word_freq_credit              0
word_freq_your                0
word_freq_font                0
word_freq_000                 0
word_freq_money               0
word_freq_hp                  0
word_freq_hpl                 0
word_freq_george              0
word_freq_650                 0
word_freq_lab                 0
word_freq_labs                0
word_freq_tel

In [32]:
# Ensure all columns are numeric (ints/floats) by counting how many columns have each dtype
dtype_counts = spam_df.dtypes.value_counts()
print(f"Column types in spam_df:\n{dtype_counts}")

Column types in spam_df:
float64    55
int64       3
Name: count, dtype: int64


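We see here that all 58 columns are numeric: the 57 features (55 floats plus the two integer-valued `capital_run_length_longest` and `capital_run_length_total` columns) and 1 binary label (`is_spam`). Now, we split the data into a train and a test set.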

In [12]:
# Get the row indices of the dataframe
samples = spam_df.index.tolist()

# Shuffle with a dedicated, seeded Generator so the split is the same on every run.
# (np.random.shuffle draws from NumPy's legacy global RNG, which no visible cell seeds.)
split_rng = np.random.default_rng(seed=2026)
split_rng.shuffle(samples)

# divide this with a 5% / 95% split
# The test set is everything after the training slice, so the two sets can
# never overlap and no row is dropped, whatever the dataset size.
n_train = round(len(samples) * 0.05)
train_ind = samples[:n_train]
test_ind = samples[n_train:]

print(f"Number of training samples: {len(train_ind)}")
print(f"Number of testings samples: {len(test_ind)}")

Number of training samples: 230
Number of testings samples: 4371


In [13]:
# split the dataset

# inputs only (every column except the last) and outputs only (the label column)
X = spam_df.iloc[:, :-1]
y = spam_df.loc[:, "is_spam"]

# train-test split: the shuffled index lists are applied positionally via .iloc
X_train, X_test = X.iloc[train_ind], X.iloc[test_ind]
y_train, y_test = y.iloc[train_ind], y.iloc[test_ind]

Here, we want to check the label distribution in our entire dataset and the training set to ensure the training set isn't too skewed, potentially introducing bias into our model

In [41]:
# normalize=True makes value_counts return class proportions instead of raw counts,
# so the full dataset and the (much smaller) training set can be compared directly
label_dist_spam_df = spam_df["is_spam"].value_counts(normalize=True)
label_dist_y_train = y_train.value_counts(normalize=True)
print(f"Label distribution in total dataset: {label_dist_spam_df}")
print(f"Label distribution in Training Set: {label_dist_y_train}")

Label distribution in total dataset: is_spam
0    0.605955
1    0.394045
Name: proportion, dtype: float64
Label distribution in Training Set: is_spam
0    0.630435
1    0.369565
Name: proportion, dtype: float64


# Pre-Processing

**TODO:** expand this section with additional pre-processing steps.

In [14]:
# standardization
def standardize(df, mean=None, std=None):
    """
    Standardizes each column of a dataframe: subtracts the column mean and divides
    by the column standard deviation.

    Parameters:
    df: the Pandas DataFrame to be standardized
    mean: optional per-column means (Series indexed by column name) to use instead
          of df's own; pass the training-set means when transforming a test set
    std: optional per-column standard deviations (same layout as mean) to use
         instead of df's own

    Returns:
    standard_df: the standardized DataFrame (df itself is not modified)
    """
    if mean is None:
        mean = df.mean()
    if std is None:
        std = df.std()

    # A constant column has std == 0 (and a single-row frame has std == NaN).
    # Dividing by that would fill the column with NaN/inf, so divide by 1 instead:
    # such columns end up centered but unscaled.
    if np.isscalar(std):
        safe_std = std if std > 0 else 1.0
    else:
        safe_std = std.where(std > 0, 1.0)

    standard_df = (df - mean) / safe_std

    return standard_df


# Fit the scaling statistics on the training set only and apply the same
# statistics to the test set, so both live on the same scale and no
# information from the test set is used for pre-processing.
train_mean = X_train.mean()
train_std = X_train.std()

X_test = standardize(X_test, mean=train_mean, std=train_std)
X_train = standardize(X_train, mean=train_mean, std=train_std)

# Task 1
We start from the code provided in the Logistic Regression Code Review and modify it as needed for this assignment.

### Cost Function

Here, the cost function changes as we need to include L2 Regularization. 

In [15]:
# we use "lambda_" as lambda is a function already
def cost_fn(x, y, w, lambda_=1e-3, bias_index=-1):
    """
    Compute the L2-regularized binary cross-entropy cost (negative log-likelihood).

    Parameters:
    x: input features, shape (N, D)
    y: binary labels (0 or 1), shape (N,)
    w: weights, shape (D,)
    lambda_: regularization strength
    bias_index: position of the bias weight in w, which is left out of the penalty.
                Defaults to -1 because LogisticRegression.fit appends the bias
                column last. Pass None to penalize every weight (no bias term).

    Note: input parameters are a subset of the total dataset (mini-batch)

    Returns:
    J: scalar cost value (lower is better)
    """
    N = x.shape[0]
    w = np.asarray(w, dtype=float)

    # Compute logits: z = x @ w, shape (N,)
    # These are the raw predictions before applying sigmoid
    z = np.dot(x, w)

    # Binary cross-entropy loss (mean over all samples in mini-batch)
    # np.logaddexp(0, t) computes log(1 + exp(t)) without overflowing for large |t|
    # (log1p(exp(t)) turns into inf, and 0 * inf into NaN, once exp(t) overflows)
    # For y=1: cost is log(1 + exp(-z)) which penalizes z being too negative
    # For y=0: cost is log(1 + exp(z)) which penalizes z being too positive
    J = np.mean(y * np.logaddexp(0, -z) + (1 - y) * np.logaddexp(0, z))

    # now, add regularization penalty on every weight except the bias
    penalized = w if bias_index is None else np.delete(w, bias_index)
    J += (lambda_ / (2 * N)) * np.sum(penalized ** 2)
    # here, we are using the divide by two convention. We also divide by N to use the mean cost,
    # to keep this consistent with the mean binary cross-entropy loss implemented in the code review

    return J

### Stochastic Gradient Descent

Here, we must modify gradient descent to be stochastic (using mini-batches).

In [16]:
def logistic(z):
    """
    Logistic (sigmoid) function σ(z) = 1 / (1 + exp(-z)), evaluated element-wise.

    Only exp(-|z|) is ever computed, so large positive or negative logits
    cannot overflow. Used by gradient and by LogisticRegression.predict.

    Parameters:
    z: logits, scalar or array

    Returns:
    values in [0, 1], same shape as z
    """
    z = np.asarray(z, dtype=float)
    e = np.exp(-np.abs(z))
    # z >= 0: 1 / (1 + exp(-z));  z < 0: exp(z) / (1 + exp(z))
    return np.where(z >= 0, 1.0 / (1.0 + e), e / (1.0 + e))


def gradient(self, x, y):
    """
    Compute the gradient of the cost function with respect to weights.

    The gradient tells us how to adjust weights to reduce the cost.
    If the model has a non-zero `regularization_strength` attribute, the gradient
    of the L2 penalty (lambda / (2N)) * sum(w^2) is added; when the model's
    `add_bias` attribute is true, the last weight (the bias) is not penalized.

    Parameters:
    x: input features, shape (N, D)
    y: true binary labels, shape (N,)

    Returns:
    grad: gradient vector, shape (D,)
    """
    N, D = x.shape

    # Compute predictions using current weights
    # yh = σ(x @ w), shape (N,)
    # These are predicted probabilities
    yh = logistic(np.dot(x, self.w))

    # Compute gradient: x^T @ (yh - y) / N
    # x.T is (D, N), (yh - y) is (N,)
    # Result is (D,) - one gradient value per weight
    # Division by N because we use mean (not sum) in the cost function
    # Positive gradient means increasing that weight would increase cost
    grad = np.dot(x.T, yh - y) / N

    # L2 term: d/dw of (lambda / (2N)) * sum(w^2) is (lambda / N) * w
    # getattr keeps this working on models that do not define these attributes
    lambda_ = getattr(self, "regularization_strength", 0.0)
    if lambda_:
        penalty = (lambda_ / N) * np.asarray(self.w, dtype=float)
        if getattr(self, "add_bias", False):
            penalty[-1] = 0.0  # the bias is the last weight and is not regularized
        grad = grad + penalty

    return grad

### Logistic Regression

This implementation of logistic regression uses a fixed number of epochs and has been modified to accommodate the changes above (SGD and L2 regularization).

In [17]:
class LogisticRegression:

    def __init__(self, add_bias=True, batch_size=1, learning_rate=.1, regularization_strength=1e-3,
                 epsilon=1e-4, max_iters=1e5, verbose=False, seed=None):
        """
        Initialize Logistic Regression classifier.

        Parameters:
        add_bias: if True, adds an intercept term to the model
        batch_size: size of mini-batches used for SGD; None (or any value >= N)
                    means full-batch gradient descent
        learning_rate: step size for gradient descent (also called eta)
        regularization_strength: L2 regularization strength (lambda)
        epsilon: convergence threshold - stop when ||gradient|| < epsilon
        max_iters: maximum number of weight updates
        verbose: if True, print optimization progress
        seed: seed for the mini-batch shuffling (None = non-deterministic)
        """
        self.add_bias = add_bias
        self.batch_size = batch_size
        self.learning_rate = learning_rate
        self.regularization_strength = regularization_strength
        self.epsilon = epsilon
        self.max_iters = max_iters
        self.verbose = verbose
        self.seed = seed

    def fit(self, x, y):
        """
        Fit the logistic regression model using (mini-batch) gradient descent.

        With mini-batches, every epoch visits the samples once in a freshly
        shuffled order; after each epoch the gradient on the full data is
        computed and used for the convergence check, because the gradient of a
        single small batch is too noisy to signal convergence.

        Parameters:
        x: input features, shape (N,) or (N, D)
        y: binary labels (0 or 1), shape (N,)

        Returns:
        self: fitted model

        Raises:
        ValueError: if x is empty or x and y have different lengths
        """
        # Work on plain arrays so that mini-batch row selection is positional
        # (a pandas Series with a shuffled index would be indexed by label)
        x = np.asarray(x, dtype=float)
        y = np.asarray(y, dtype=float)

        # If x is 1D, convert to 2D column vector
        # x[:, None] adds a new axis: (N,) -> (N, 1)
        if x.ndim == 1:
            x = x[:, None]

        N = x.shape[0]
        if N == 0:
            raise ValueError("cannot fit on an empty dataset")
        if y.shape[0] != N:
            raise ValueError("x and y must contain the same number of samples")

        if self.add_bias:
            # Add bias feature: append a column of ones
            # x goes from (N, D) to (N, D+1)
            x = np.column_stack([x, np.ones(N)])

        D = x.shape[1]

        # Initialize weights to zero
        self.w = np.zeros(D)

        # Effective mini-batch size, clamped to [1, N]
        if self.batch_size is None:
            batch = N
        else:
            batch = int(min(max(self.batch_size, 1), N))
        rng = np.random.default_rng(self.seed)

        # Initialize gradient norm to infinity (ensures loop starts)
        g = np.inf

        # Counter of weight updates
        t = 0

        # Stop when: (1) gradient is small enough, OR (2) max iterations reached
        while np.linalg.norm(g) > self.epsilon and t < self.max_iters:
            if batch == N:
                # Full-batch step: w_new = w_old - learning_rate * gradient
                g = self.gradient(x, y)
                self.w = self.w - self.learning_rate * g
                t += 1
            else:
                # One epoch of mini-batch updates over a fresh permutation
                order = rng.permutation(N)
                for start in range(0, N, batch):
                    if t >= self.max_iters:
                        break
                    rows = order[start:start + batch]
                    step = self.gradient(x[rows], y[rows])
                    self.w = self.w - self.learning_rate * step
                    t += 1
                # Full-data gradient, used only for the convergence check
                g = self.gradient(x, y)

        if self.verbose:
            print(f'terminated after {t} iterations, with norm of the gradient equal to {np.linalg.norm(g):.6f}')
            print(f'the weight found: {self.w}')

        return self

    def predict(self, x):
        """
        Predict class probabilities for input x.

        Parameters:
        x: input features, shape (N,) or (N, D)

        Returns:
        yh: predicted probabilities P(y=1|x), shape (N,)
        """
        x = np.asarray(x, dtype=float)

        # Convert 1D to 2D if needed
        if x.ndim == 1:
            x = x[:, None]

        Nt = x.shape[0]

        if self.add_bias:
            # Add bias feature column (last, matching fit)
            x = np.column_stack([x, np.ones(Nt)])

        # Compute probabilities: σ(x @ w)
        yh = logistic(np.dot(x, self.w))

        return yh

# Attach the gradient method to the LogisticRegression class
# This allows self.gradient(x, y) to work inside the fit method
# (gradient is a module-level function whose first parameter is `self`,
# so once assigned on the class it is called like a regular method)
LogisticRegression.gradient = gradient

# Task 2

# Task 3

# Task 4