Math 156 HW 3

Ex 3:
Creating a Logistic Regression SGD algorithm

In [10]:
import numpy as np
import random

class LRSGD:
    """Binary logistic regression trained with mini-batch stochastic gradient descent.

    The model is ``p(y = 1 | x) = sigmoid(x @ weights)``. No intercept term is
    learned; append a constant column to ``X`` if a bias is required.

    Parameters
    ----------
    batch_size : int, default 32
        Number of samples used for each gradient step.
    learning_rate : float, default 0.01
        Step size applied to every mini-batch gradient.
    max_iterations : int, default 1000
        Number of full passes (epochs) over the training data.
    random_state : int or None, default None
        Seed for the weight initialisation and the per-epoch shuffling.
        ``None`` draws from NumPy's global random state.

    Attributes
    ----------
    weights : numpy.ndarray or None
        Learned coefficient vector of length ``n_features``; ``None`` until
        ``fit`` has been called.
    """

    def __init__(self, batch_size=32, learning_rate=0.01, max_iterations=1000,
                 random_state=None):
        self.batch_size = batch_size
        self.learning_rate = learning_rate
        self.max_iterations = max_iterations
        self.random_state = random_state
        self.weights = None

    def sigmoid(self, z):
        """Return the logistic function ``1 / (1 + exp(-z))`` element-wise.

        Only ``exp(-|z|)`` is evaluated, which lies in (0, 1], so the result
        never overflows regardless of the magnitude of ``z``.
        """
        z = np.asarray(z, dtype=float)
        e = np.exp(-np.abs(z))
        # z >= 0: 1 / (1 + exp(-z));  z < 0: exp(z) / (1 + exp(z)).
        return np.where(z >= 0, 1.0 / (1.0 + e), e / (1.0 + e))

    def cross_entropy(self, y_true, y_prob):
        """Return the mean binary cross-entropy between labels and probabilities."""
        eps = 1e-9  # Prevents log(0)
        return -np.mean(y_true * np.log(y_prob + eps) + (1 - y_true) * np.log(1 - y_prob + eps))

    def fit(self, X, y):
        """Learn ``weights`` from the training data.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
        y : array-like of length n_samples containing 0/1 labels

        Raises
        ------
        ValueError
            If ``X`` and ``y`` do not contain the same number of samples.
        """
        X = np.asarray(X, dtype=float)
        y = np.asarray(y)
        n_samples, n_features = X.shape
        if len(y) != n_samples:
            raise ValueError(
                f"X has {n_samples} samples but y has {len(y)} labels"
            )

        # A dedicated generator makes seeded runs reproducible; without a seed
        # the global NumPy state is used.
        if self.random_state is None:
            rng = np.random
        else:
            rng = np.random.RandomState(self.random_state)

        self.weights = rng.randn(n_features) * 0.01

        for _ in range(self.max_iterations):
            # Shuffle indices only; the data itself is never copied wholesale.
            order = rng.permutation(n_samples)

            for start in range(0, n_samples, self.batch_size):
                batch_idx = order[start:start + self.batch_size]
                X_batch, y_batch = X[batch_idx], y[batch_idx]

                y_pred = self.sigmoid(X_batch @ self.weights)
                # Gradient of the mean cross-entropy with respect to the weights.
                grad = (X_batch.T @ (y_pred - y_batch)) / len(y_batch)

                self.weights -= self.learning_rate * grad

    def predict_proba(self, X):
        """Return the predicted probability of class 1 for every row of ``X``."""
        return self.sigmoid(np.asarray(X, dtype=float) @ self.weights)

    def predict(self, X):
        """Return 0/1 class predictions using a 0.5 probability threshold."""
        return (self.predict_proba(X) >= 0.5).astype(int)



Question 4a -- Getting the dataset

In [11]:
from sklearn.datasets import load_breast_cancer

# Load the dataset, then pull out its feature matrix and label vector.
data = load_breast_cancer()
X = data.data
y = data.target

4b -- splitting the dataset

In [13]:
from sklearn.model_selection import train_test_split

# 80 / 10 / 10 split: hold out 20 % of the data first, then cut that hold-out
# in half to obtain the validation and test sets. Fixed seeds keep the
# partition identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)
X_val, X_test, y_val, y_test = train_test_split(
    X_test,
    y_test,
    test_size=0.5,
    random_state=1,
)

4c -- getting sizes

In [14]:
# Stack the training and validation arrays so the class counts cover both.
X_train_val = np.concatenate((X_train, X_val))
y_train_val = np.concatenate((y_train, y_val))

# Count how many samples carry each label.
unique, counts = np.unique(y_train_val, return_counts=True)
class_distribution = {label: count for label, count in zip(unique, counts)}
print(class_distribution)

{0: 196, 1: 316}


In [28]:
# Hyperparameters for this training run.
max_iterations = 1000
batch_size = 8
learning_rate = 1e-5

model = LRSGD(
    batch_size=batch_size,
    learning_rate=learning_rate,
    max_iterations=max_iterations,
)

# Train on the training split, then show the predictions for those same rows.
model.fit(X_train, y_train)
model.predict(X_train)

array([0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0,
       0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1,
       1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1,
       1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1,
       0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1,
       1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1,
       1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0,
       1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0,
       1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1,
       1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1,
       1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1,
       1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0,

In [29]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Score the fitted model on the held-out test split.
y_test_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_test_pred)
precision = precision_score(y_test, y_test_pred)
recall = recall_score(y_test, y_test_pred)
f1 = f1_score(y_test, y_test_pred)

# One print call, one metric per line, four decimal places each.
print(
    "Test Set Performance:",
    f"Accuracy: {accuracy:.4f}",
    f"Precision: {precision:.4f}",
    f"Recall: {recall:.4f}",
    f"F1-Score: {f1:.4f}",
    sep="\n",
)

Test Set Performance:
Accuracy: 0.9649
Precision: 0.9535
Recall: 1.0000
F1-Score: 0.9762


Summary: 
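On the held-out test set the model reaches an accuracy of 0.9649, a precision of 0.9535, a recall of 1.0000 (perfect) and an F1-score of 0.9762. This shows that our mini-batch stochastic gradient descent algorithm works very well once the hyperparameters are adjusted properly (here: learning rate 0.00001, batch size 8, 1000 passes over the training data).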