# Logistic Regression

Implement a logistic (binary) regression model and use your stochastic gradient descent approach from the last practicals to optimize the weights.

In [1]:
# You can import your code from a different notebook like follows (change it to your path)

# %run ..\5_SGD_Linear_Regression\SGD_Solution.ipynb

## Data for the logistic regression model

In [1]:
import numpy as np

In [2]:
# The resulting data for training and test are in (X_train, y_train) and (X_test, y_test), respectively.

# Lets assume we have some data points that define the data as 'fraud' or 'no fraud', e. g. in bank account scenarios.
# We therefore have three values given ('number of withdrawals per week', 'avg_withdrawal_amount', 'number_of_different_addressees').

data_amount = 15
frauds_amount = data_amount // 2
no_frauds_amount = data_amount - frauds_amount

### Generate (fake) 'fraud' data
# number of withdrawals per week (fraud) - between 0 and 0.5
X_1_f = np.random.rand(frauds_amount) / 2
# average withdrawal amount 'fraud' - between 0.5 and 1
X_2_f = (np.random.rand(frauds_amount) + 1) / 2
# number of different addressees 'fraud' - between 0 and 0.5
X_3_f = np.random.rand(frauds_amount) / 2

X_f = np.stack([X_1_f, X_2_f, X_3_f], axis=1)

# Labels of 'fraud' (1)
y_f = np.ones(frauds_amount)

### Generate (fake) 'no fraud' data - between 0.5 and 1
X_1_nf = (np.random.rand(no_frauds_amount) + 1) / 2
# average withdrawal amount 'no fraud' - between 0 and 0.5
X_2_nf = np.random.rand(no_frauds_amount) / 2
# number of different addressees 'no fraud' - between 0.5 and 1
X_3_nf = (np.random.rand(no_frauds_amount) + 1) / 2

X_nf = np.stack([X_1_nf, X_2_nf, X_3_nf], axis=1)

# Labels of 'no fraud' (0)
y_nf = np.zeros(no_frauds_amount)

In [4]:
# Show dataset of 'fraud' data
print('X_f =', X_f)
print('y_f =', y_f)

X_f = [[0.29168211 0.51926307 0.26936188]
 [0.35151149 0.98409152 0.31932494]
 [0.46512513 0.81715354 0.1295525 ]
 [0.02594845 0.65268757 0.29398976]
 [0.14478214 0.93431523 0.02151472]
 [0.32089127 0.88690067 0.0470629 ]
 [0.47828694 0.7788369  0.26245831]]
y_f = [1. 1. 1. 1. 1. 1. 1.]


In [5]:
# Show dataset of 'no fraud' data
print('X_nf =', X_nf)
print('y_nf =', y_nf)

X_nf = [[0.91175767 0.43251488 0.76972269]
 [0.97957189 0.11529604 0.9051713 ]
 [0.73743688 0.28186576 0.58604385]
 [0.60848767 0.06334995 0.8623376 ]
 [0.63037444 0.07323393 0.84732089]
 [0.68329046 0.29051951 0.6840774 ]
 [0.82340603 0.09748259 0.7003152 ]
 [0.81534687 0.20428444 0.73327069]]
y_nf = [0. 0. 0. 0. 0. 0. 0. 0.]


## Shuffle fraud and no fraud data to create a mixed dataset

In [6]:
# Combine them (concatenate)
X = np.concatenate((X_f, X_nf))
y = np.concatenate((y_f, y_nf))

# now randomly shuffle them
shuffled_indices = np.random.choice(y.shape[0], size=y.shape[0], replace=False)
X = X[shuffled_indices]
y = y[shuffled_indices]

## Split into train and test data

In [7]:
train_len = int(data_amount * 0.75)

# We train with the following data
X_train = X[:train_len]
y_train = y[:train_len]

# We test / evaluate with the following data
X_test = X[train_len:]
y_test = y[train_len:]

## Initialize the weights of your logistic regression model (see SGD exercise on how to do this with numpy - dont forget to initialize the bias nodes as well!)

In [8]:
# weights
w = np.random.randn(X_train.shape[1])

# bias
b = 0

## Define the loss function (derivative) for your logistic regression model

Binary Cross-Entropy Loss function:
$$L = -\frac{1}{N} \sum^{N}_{i=1} (y_i \cdot \log(y_{pred, i})) - (1-y_i) \cdot \log(1-y_{pred, i})$$

In [9]:
# Loss function: Binary Cross Entropy

def binary_cross_entropy(y, y_pred):
    epsilon = 1e-15 # to prevent log(0)
    y_pred = np.clip(y_pred, epsilon, 1 - epsilon) # clipping y_pred where it will never be 0 [epsilon, 1-epsilon]
    
    return -np.mean( y * np.log(y_pred) - (1-y) * np.log(1-y_pred) )

## Define the activation function for your logistic regression model

$$\sigma(z) = \frac{1}{1 + e^{-z}}$$

In [11]:
# Hint:  How do you scale your output for logistic regression? What is the range?

def sigmoid(z):
    return 1 / (1 + np.exp(-z))

# Range: [0, 1]

\begin{align*}
    \frac{\partial L}{\partial \hat{y}} & = \frac{y_i}{\hat{y}} - \frac{1 - y_i}{1 - \hat{y}}\\
    & = \frac{y_i (1 - \hat{y}) - \hat{y} (1 - y_i)}{\hat{y} (1 - \hat{y})}\\
    & = \frac{y_i - y_i \hat{y} - \hat{y} + y_i \hat{y}}{\hat{y} (1 - \hat{y})}\\
    & = \frac{y_i - \hat{y}}{\hat{y} (1 - \hat{y})}
\end{align*}

\begin{align*}
    \frac{\partial \hat{y}}{\partial z} & = \frac{\partial}{\partial z} (1 + e^{-z})^{-1}\\
    & = (1 + e^{-z})^{-2} \cdot -1 \cdot e^{-z} \cdot -1\\
    & = \frac{e^{-z}}{(1 + e^{-z})^2}\\
    & = \frac{1}{1 + e^{-z}} \cdot \frac{e^{-z}}{1 + e^{-z}}\\
    & = \hat{y} \cdot (1 - \hat{y})
\end{align*}

\begin{align*}
    \frac{\partial L}{\partial z} & = \frac{\partial L}{\partial \hat{y}} \cdot \frac{\partial \hat{y}}{\partial z}\\
    & = \frac{y_i - \hat{y}}{\hat{y} (1 - \hat{y})} \cdot (\hat{y} \cdot (1 - \hat{y}))\\
    & = (y_i - \hat{y})
\end{align*}

\begin{align*}
    z & = w \cdot x + b\\
    \\
    \frac{\partial z}{\partial w} & = x\\
    \frac{\partial z}{\partial b} & = 1
\end{align*}

\begin{align*}
    \frac{\partial L}{\partial w} & = \frac{\partial L}{\partial z} \cdot \frac{\partial z}{\partial w}\\
    & = (y_i - \hat{y}) \cdot x
\end{align*}

\begin{align*}
    \frac{\partial L}{\partial b} & = \frac{\partial L}{\partial z} \cdot \frac{\partial z}{\partial b}\\
    & = (y_i - \hat{y}) \cdot 1
\end{align*}

In [15]:
## Use your Stochastic Gradient Descent approach from the previous exercise and optimize your weights.
# If your SGD implementation cannot do this, adjust the function implementation until it is able to do it :)

learning_rate = 0.005
iterations = 1000

def sgd_logistic(X, y, w, b, learning_rate, iterations):
    for num in range(iterations):
        for i in range(len(y)):
            x_i = X[i]
            y_i = y[i]

            # Predicted output
            y_hat = sigmoid(np.dot(w, x_i) + b)

            # Compute gradients
            dL_dw = (y_i - y_hat) * x_i
            dL_db = (y_i - y_hat)

            # Update parameters
            w -= learning_rate * dL_dw
            b -= learning_rate * dL_db

        # Calculate loss
        y_pred = sigmoid(np.dot(X, w) + b)
        loss = binary_cross_entropy(y, y_pred)

    return w, b, loss

In [16]:
# Results

w, b, loss = sgd_logistic(X_train, y_train, w, b, learning_rate, iterations)
print(f'Final parameters: w = {w}, b = {b}')
print(f"Final training loss: {loss}")

Final parameters: w = [23.76052656  8.29844811 22.31744632], b = 29.582651400592844
Final training loss: -18.839768723095023


## Predict the values for the test data

In [22]:
# Prediction
y_test_pred = sigmoid(np.dot(X_test, w) + b)
y_test_pred_class = (y_test_pred >= 0.5).astype(int)
accuracy = np.mean(y_test_pred_class == y_test)

print(f"y_test: {y_test}")
print(f"y_test_pred: {y_test_pred}")
print(f'Test accuracy: {accuracy}')

y_test: [1. 1. 0. 0.]
y_test_pred: [1. 1. 1. 1.]
Test accuracy: 0.5
