# Question 3

Import necessary packages

In [1]:
import numpy as np
from numpy import linalg as LA
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

Define logistic regression equation as $$\sigma(z) = \frac{1}{1 + exp(-z)}$$

In [2]:
def logreg(z):
    #denom = 1 + np.exp(-z)
    return 1 / (1 + np.exp(-z))



Define the error term:

In [3]:
def error(X, y, w, n):
    left = y[n] * np.log(logreg(w.T @ X[n])) #np.clip
    right = (1 - y[n]) * np.log(1 - logreg(w.T @ X[n]))
    return -sum(left + right)

Gradient of the error function:

In [4]:
def gradient_error(X, y, w, n):
    return (logreg(w.T @ X[n]) - y[n]) * X[n]

Mini batch SGD function

In [5]:
def minibatch(X, y, iter = 1000, step = 0.0001, batchsize = 32):
    #initialize w0
    w = np.random.randn(X.shape[1])

    for i in range(iter):
        #set the error term from the minibatch
        batch = np.random.randint(0, X.shape[0] - 1, batchsize)
        target = 0
        for j in batch:
            target += gradient_error(X, y, w, j) #update error on summation of batch
        
        w = w - (step * target)

    return w

# Question 4

## (a)

In [18]:
from ucimlrepo import fetch_ucirepo
data = fetch_ucirepo(id=17)
X = data.data.features
y = data.data.targets


In [19]:
#convert into array
X = np.array(X)
y = np.array(y)

#convert y into binary variables
y = (y == 'M') #let malignant = 1, benign = 0
y = y.astype(int)


## (b)

In [20]:
X_train, X_sub, y_train, y_sub = train_test_split(X, y, test_size = 0.25, random_state = 42)
X_test, X_val, y_test, y_val = train_test_split(X_sub, y_sub, test_size = 15/25, random_state = 42)

## (c)

Training Set

In [22]:
print(f"There are {sum(y_train == 1)} malignant cases in the training set")
print(f"There are {sum(y_train == 0)} benign cases in the training set")

There are [158] malignant cases in the training set
There are [268] benign cases in the training set


Validation Set

In [23]:
print(f"There are {sum(y_val == 1)} malignant cases in the validation set")
print(f"There are {sum(y_val == 0)} benign cases in the validation set")

There are [33] malignant cases in the validation set
There are [53] benign cases in the validation set


## (d)

In [24]:
w = minibatch(X_train, y_train, iter = 10000, step = 0.000005, batchsize = 64)
w

array([ 0.31800829,  0.47701412, -2.27124288, -0.03096425,  0.92222673,
       -0.62992366, -0.63779954, -1.39656245,  0.52862093, -0.27208146,
        0.07234583, -0.89072588, -0.57104609,  0.45976401, -1.04340165,
       -2.61013561,  0.112101  ,  0.09160379,  0.42902001,  0.57760137,
       -0.4206864 ,  1.2245875 , -0.28509947,  0.27040151, -0.88835974,
       -1.11745201,  1.17589744, -0.43449628,  0.27052789,  0.34949469])

In [25]:
#prediction function
def pred(X):
    output = X @ w
    output = (output >= 0.5).astype(int).reshape(57, 1)
    return output

## (e)

In [26]:
#setting test model
X_test_pred = pred(X_test)

In [27]:
#accuracy
def accuracy(pred, res):
    return (sum(pred == res)) / (len(pred))
    
    
    

In [28]:
#precision
def precision(pred, res):
    tp = sum(res[pred == 1]) #subset results to match positive prediction, grab all positive values
    return tp / sum(pred)

In [29]:
#recall
def recall(pred, res):
    tp = sum(res[pred == 1])
    return tp / sum(res)

In [30]:
# F1 score
def f1(pred, res):
    numer = precision(pred, res) * recall(pred,res)
    denom = precision(pred, res) + recall(pred,res)
    return 2 * (numer / denom)

In [31]:
print(f"Accuracy = {accuracy(X_test_pred, y_test)}")

Accuracy = [0.94736842]


In [32]:
print(f"Precision = {precision(X_test_pred, y_test)}")

Precision = [0.90909091]


In [33]:
print(f"Recall = {recall(X_test_pred, y_test)}")

Recall = [0.95238095]


In [34]:
print(f"f1 score = {f1(X_test_pred, y_test)}")

f1 score = [0.93023256]


## (e)

- **Accuracy:** About 94.74% of the predicted data is accurate
- **Precision:** About 90.9% of the malignant predicted values are actually malignant
- **Recall:** About 95.24% of the true malignant values are predicted accurately
- **F1:** The harmonic mean of the precision and recall is about 0.9302