In [1]:
%load_ext autoreload
%autoreload 2

In [26]:
import numpy as np

from src.evaluation.classification_metrics import ClassificationMetrics

class LogisticRegression:

    def __init__(self, solver='lbfgs', max_iter=1000, C=1.0, random_state=None, tol=1e-4, learning_rate=0.01):
        self.solver = solver
        self.max_iter = max_iter
        self.C = C
        self.random_state = random_state
        self.tol = tol
        self.learning_rate = learning_rate
        self.coef_ = None
        self.intercept_ = None
        self.losses_ = []

    def _sigmoid(self, z):
        z = np.clip(z, -709, 709)  # Avoid overflow
        return 1 / (1 + np.exp(-z))

    def _lbfgs(self, X, y):
        n_samples, n_features = X.shape
        self.coef_ = np.zeros(n_features)
        self.intercept_ = 0
        y = np.squeeze(y)  # Ensure y is 1D array

        for _ in range(self.max_iter):
            # Forward pass
            linear_model = np.dot(X, self.coef_) + self.intercept_
            y_predicted = self._sigmoid(linear_model)

            # Gradient descent
            dw = (1 / n_samples) * np.dot(X.T, (y_predicted - y))
            db = (1 / n_samples) * np.sum(y_predicted - y)

            # L2 regularization (without applying to the intercept)
            dw += (1 / self.C) * self.coef_

            # Update weights and bias
            self.coef_ -= self.learning_rate * dw
            self.intercept_ -= self.learning_rate * db

            # Compute loss
            loss = self._compute_loss(y, y_predicted, n_samples)
            self.losses_.append(loss)

            # Early stopping
            if len(self.losses_) > 1 and abs(self.losses_[-1] - self.losses_[-2]) < self.tol:
                break

    def _compute_loss(self, y, y_predicted, n_samples):
        epsilon = 1e-15
        y_predicted = np.clip(y_predicted, epsilon, 1 - epsilon)  # Avoid division by zero
        loss = (-1 / n_samples) * np.sum(y * np.log(y_predicted) + (1 - y) * np.log(1 - y_predicted))
        # Add L2 regularization term (not applied to intercept)
        return loss + (1 / (2 * self.C)) * np.sum(self.coef_**2)

    def fit(self, X, y):
        X = np.asarray(X, dtype=np.float64)
        y = np.asarray(y, dtype=np.float64).reshape(-1, 1)

        if self.solver == 'lbfgs':
            self._lbfgs(X, y)
        else:
            raise ValueError(f"Solver '{self.solver}' not supported.")

    def predict(self, X):
        X = np.asarray(X, dtype=np.float64)
        linear_model = np.dot(X, self.coef_) + self.intercept_
        probabilities = self._sigmoid(linear_model)
        return np.where(probabilities > 0.5, 1, 0).astype(np.int64)

    def predict_proba(self, X):
        X = np.asarray(X, dtype=np.float64)
        linear_model = np.dot(X, self.coef_) + self.intercept_
        return self._sigmoid(linear_model)
    
    
    def evaluate(self, X, y):
        y_predicted = self.predict(X)
        metrics_obj = ClassificationMetrics(y, y_predicted, self.predict_proba(X))
        return metrics_obj.get_metrics()


In [27]:
import numpy as np
from src.data.load_dataset import load_spambase

from sklearn.model_selection import train_test_split

In [28]:
X, y = load_spambase()
# Split the dataset into training+validation and test sets
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# Further split the training data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.25, stratify=y_train_val, random_state=42) # 0.25 x 0.8 = 0.2

X_train.shape, X_val.shape, X_test.shape

((2760, 57), (920, 57), (921, 57))

In [38]:
log_reg = LogisticRegression(solver='lbfgs', max_iter=500000, C=1.0, random_state=42, tol=1e-4, learning_rate=0.01)
log_reg.fit(X, y)

In [40]:
y_pred = log_reg.predict(X)
np.mean(y_pred == y)

0.7291893066724625

0.41925668332971094