In [36]:
import numpy as np
import pandas as pd
import copy
import numpy as np
from sklearn.metrics import accuracy_score

Logistic Regression using gradient Descent

- Y = Xi_0 * w_0 + Xi_1 * w_1 +... + Xi_n * w_n + b

- The logistic regression function predicts the probability that an observation is class 1, given the x features. It can be rearranged into the format of the sigmoid function.

- sigmoid(Y) = 1 / (1 + e^-Y)

- Loss function = Y*log(P(X)) 
- So, we need to calculate the gradient of the sigmoid function.


Assumptions of Logistic Regression
- Binary logistic regression requires the dependent variable to be binary.
- For a binary regression, the factor level 1 of the dependent variable should represent the desired outcome.
- Only the meaningful variables should be included.
- The independent variables should be independent of each other. That is, the model should have little or no multicollinearity.
- The independent variables are linearly related to the log odds.
- Logistic regression requires quite large sample sizes.

After you have the prediction, you can apply the basic gradient descent algorithm to optimize your model parameters, which are the weights and bias in this case. You do not use stochastic (or mini-batch gradient descent) in this notebook

- w_t+1 = w_t - lr * gradient of loss with respect to y, 
- where loss is the cross-entropy loss

- L(p,y) = (-1/m)(Sigma(ylog(p)+ (1-y)log(1-p)))

- gradient of loss = 1/m(p-y)  This can be used to update the weights & biases in gradient descent

1) calculate w*x + b
2) Take sigmoid and get pred = sigmoid(w*x + b)
3) loss = (-1/m)(Sigma(ylog(p)+ (1-y)log(1-p)))
4) Calculate gradient and get error_w, error_b = compute_gradients(x, y, pred)
5) update parameters(error_w, error_b)
6) Again predict
7) Calculate accuracy and loss

In [37]:
#some simulated data
Y = np.sort(np.random.randint(low=0, high=2, size=1000))
X1_0 = np.random.normal(loc=0, scale=1, size=500)
X1_1 = np.random.normal(loc=5, scale=1, size=500)
X1 = np.hstack((X1_0, X1_1))
X2_0 = np.random.normal(loc=0, scale=1, size=500)
X2_1 = np.random.normal(loc=2, scale=1, size=500)
X2 = np.hstack((X2_0, X2_1))
X = np.vstack((X1, X2))

df = pd.DataFrame([Y, X1, X2]).T
df.columns = ["labels", "X1", "X2"]
df["labels"] = df.labels.astype('int')
df.head(3)

Unnamed: 0,labels,X1,X2
0,0,-0.413564,0.730614
1,0,-0.770186,1.430121
2,0,-0.253314,1.363463


In [49]:
len(Y)

1000

In [64]:
class LogisticRegressionFromScratch():
    def __init__(self):
        self.losses = []
        self.train_accuracies = []
    def _sigmoid(self, x):
        return np.array([self._sigmoid_function(value) for value in x])

    def _sigmoid_function(self, x):
        if x >= 0:
            z = np.exp(-x)
            return 1 / (1 + z)
        else:
            z = np.exp(x)
            return z / (1 + z)


    def compute_loss(self, y_true, y_pred):
        # binary cross entropy
        y_zero_loss = y_true * np.log(y_pred + 1e-9)
        y_one_loss = (1-y_true) * np.log(1 - y_pred + 1e-9)
        return -np.mean(y_zero_loss + y_one_loss)

    def compute_gradients(self, x, y_true, y_pred):
        # derivative of binary cross entropy
        difference =  y_pred - y_true
        gradient_b = np.mean(difference)
        gradients_w = np.matmul(x.transpose(), difference)
        gradients_w = np.array([np.mean(grad) for grad in gradients_w])

        return gradients_w, gradient_b

    def update_model_parameters(self, error_w, error_b):
        self.weights = self.weights - 0.1 * error_w
        self.bias = self.bias - 0.1 * error_b
    
    def _transform_x(self, x):
        x = copy.deepcopy(x)
        return x

    def _transform_y(self, y):
        y = copy.deepcopy(y)
        return y.reshape(y.shape[0], 1)

    def fit(self, x, y, epochs):
        x = self._transform_x(x)
        y = self._transform_y(y)

        self.weights = np.zeros(x.shape[1])
        self.bias = 0

        for i in range(epochs):
            x_dot_weights = np.matmul(self.weights, x.transpose()) + self.bias
            pred = self._sigmoid(x_dot_weights)
            loss = self.compute_loss(y, pred)
            error_w, error_b = self.compute_gradients(x, y, pred)
            self.update_model_parameters(error_w, error_b)

            pred_to_class = [1 if p > 0.5 else 0 for p in pred]
            self.train_accuracies.append(accuracy_score(y, pred_to_class))
            self.losses.append(loss)
            print(f"Epoch: {i}, Loss: {loss}, Train Accuracy: {self.train_accuracies[-1]}")

    def predict(self, x):
        x_dot_weights = np.matmul(x, self.weights.transpose()) + self.bias
        probabilities = self._sigmoid(x_dot_weights)
        return [1 if p > 0.5 else 0 for p in probabilities]


In [65]:
# Ref: https://developer.ibm.com/articles/implementing-logistic-regression-from-scratch-in-python/

In [72]:
df.head(2)

Unnamed: 0,labels,X1,X2
0,0,-0.413564,0.730614
1,0,-0.770186,1.430121


In [68]:
import pandas as pd
from sklearn.model_selection import train_test_split
# from sklearn.datasets import load_breast_cancer

def sklearn_to_df(X_data, feature_names, label):
    
    x = pd.DataFrame(X_data, columns=feature_names)

    y_data = label
    y = pd.Series(y_data, name="label")

    return x, y

x, y = sklearn_to_df(df[["X1", "X2"]], ["X1", "X2"], df["labels"])

x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42)

In [71]:
x_train.head(2)

Unnamed: 0,X1,X2
29,-1.163413,-0.364882
535,4.126459,3.610493


In [70]:
model = LogisticRegressionFromScratch()
model.fit(x_train.values, y_train.values, 2)
pred2 = model.predict(x_test)
accuracy2 = accuracy_score(y_test, pred2)
print(accuracy2)

Epoch: 0, Loss: 0.6931471785599465, Train Accuracy: 0.535
Epoch: 1, Loss: 10.3029516138035, Train Accuracy: 0.735
0.75
