## Task 1: Logistic Regression

In [71]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

1a) Own Implementation of Logistic Regression

In [72]:
def sigmoid(z):
    '''
    Sigmoid function used to convert the output of a linear function into a probability, which is then used to make a binary decision. 

    Parameters
    ----------
    z : Real number

    Returns
    -------
    _: float
        Output value between 0 and 1
    '''
    
    return 1.0 / (1 + np.exp(-z))

In [73]:
def loss(y, y_hat):
    '''
    The loss function computes the loss for logistic regression for a single training example. 

    Parameters
    ----------
    y : numpy array
    y_hat : numpy array

    Returns
    -------
    _ : float
        the loss value of that particular training example
    '''
    
    return np.sum(y * np.log(y_hat) + (1 - y) * np.log(1 - y_hat))

In [74]:
def gradients(X, y, y_hat):
    '''
    The gradients function calculates the partial derivatives of the loss function with respect to the weights and bias.
    This partial derivative is used to update the weights and bias in the direction that minimizes the loss function.

    Parameters
    ----------
    X, inputs : numpy array
    y, actual values : numpy array
    y_hat, hypothesis/predictions : numpy array

    Returns
    -------
    dw : float
        partial derivative of the loss function with respect to the weights
    db : float
        partial derivative of the loss function with respect to the bias
    '''

    m = X.shape[0]
    
    dw = (1 / m) * np.dot(X.T, (y_hat - y))
    db = (1 / m) * np.sum((y_hat - y))

    return dw, db

In [75]:
def normalize(X):
    '''
    The normalise function normalises the input features by subtracting the mean and dividing by the standard deviation.
    This helps to scale down the input features to a common scale, which helps in faster convergence of the gradient 
    descent algorithm, and reduces the magnitude of the weights.

    Parameters
    ----------
    X, inputs : numpy array

    Returns
    -------
    X : numpy array
        NumPy array of normalised input features
    '''

    epsilon=1e-8
    mean = X.mean(axis=0)
    std = X.std(axis=0)
    X_normalized = (X - mean) / (std + epsilon)
    
    return X_normalized

In [76]:
def train(X, y, bs, epochs, lr):
    '''
    The train function trains the logistic regression model using the input features and target values. It uses the sigmoid function

    Parameters
    ----------
    X, inputs : numpy array
    y, actual values : numpy array
    bs, batch size : int
    epochs, number of iterations : int
    lr, learning rate : float

    Returns
    -------
    w : numpy.ndarray
        The learned weights of the logistic regression model (shape: (n, 1)).
    b : float
        The learned bias term of the logistic regression model.
    losses : list of floats
        A list containing the loss values for each epoch during training.
    '''
    # m: no. of training examples
    # n: no. of features 
    m, n = X.shape
    # weight
    w = np.zeros((n, 1))
    # bias
    b = 0
    y = y.reshape(m, 1)
    X = normalize(X)
    losses = []

    for _ in range(epochs):
        for i in range((m + bs - 1) // bs):
            start_i = i * bs
            end_i = start_i + bs
            # xb: batch of input features for the specific batch
            # yb: batch of target values for the specific batch
            xb = X[start_i:end_i]
            yb = y[start_i:end_i]
            y_hat = sigmoid(np.dot(xb, w) + b)
            dw, db = gradients(xb, yb, y_hat)
            w -= lr * dw
            b -= lr * db

        l = loss(y, sigmoid(np.dot(X, w) + b))
        losses.append(-l)

    return w, b, losses

In [77]:
def predict(X):
    '''
    The predict function uses the learned weights and bias to make predictions on the input features.
    The inputs should be either the validation set or the test set.

    Parameters
    ----------
    X, inputs : numpy array
    w, learned weights : numpy array
    b, learned bias : float

    Returns
    -------
    _ : numpy array
        the predicted output contating 0s and 1s.
    '''

    global w, b
    
    X  = normalize(X)

    preds = sigmoid(np.dot(X, w) + b)
    pred_class = [1 if i >= 0.5 else 0 for i in preds]
    
    return np.array(pred_class)

In [78]:
df_train = pd.read_csv("./data/train_tfidf_features.csv")
X = df_train.drop(['label', 'id'], axis=1)
y = df_train['label']

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=1)

1b) Predictions

In [79]:
w, b, losses = train(X_train.values, y_train.values, bs=32, epochs=100, lr=0.01)

self_y_pred = predict(X_val.values)
num_ones = np.count_nonzero(self_y_pred)
num_zeros = len(self_y_pred) - num_ones
print("Number of 1s for own implementation: ", num_ones)
print("Number of 0s for own implementation: ", num_zeros)

  return np.sum(y * np.log(y_hat) + (1 - y) * np.log(1 - y_hat))
  return np.sum(y * np.log(y_hat) + (1 - y) * np.log(1 - y_hat))


Number of 1s for own implementation:  1254
Number of 0s for own implementation:  2183


SkLearn Version (for comparison only)

In [80]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.base import BaseEstimator, ClassifierMixin
import pandas as pd
import numpy as np

model = Pipeline(steps=[('regressor', LogisticRegression())])

model.fit(X_train, y_train)
sklearn_y_pred = model.predict(X_val)

num_ones = np.count_nonzero(sklearn_y_pred)
num_zeros = len(sklearn_y_pred) - num_ones
print("Number of 1s for SKLearn: ", num_ones)
print("Number of 0s for SKLearn: ", num_zeros)

Number of 1s for SKLearn:  914
Number of 0s for SKLearn:  2523


In [81]:
for y_pred in [self_y_pred, sklearn_y_pred]:
    for metric in [accuracy_score, precision_score, recall_score, f1_score, confusion_matrix]:
        print(f"{metric.__name__}: {metric(y_val, y_pred)}")

accuracy_score: 0.6718068082630201
precision_score: 0.5781499202551834
recall_score: 0.547583081570997
f1_score: 0.5624515128006207
confusion_matrix: [[1584  529]
 [ 599  725]]
accuracy_score: 0.7131219086412569
precision_score: 0.6849015317286652
recall_score: 0.472809667673716
f1_score: 0.5594280607685433
confusion_matrix: [[1825  288]
 [ 698  626]]


H0: There is no significant difference in the mean prediction errors between the two models.

In [85]:
class ImplementedLogisticRegressionModel(BaseEstimator, ClassifierMixin):
    def __init__(self, learning_rate=0.01, num_iterations=1000):
        self.learning_rate = learning_rate
        self.num_iterations = num_iterations
        self.loss_history = []

    def sigmoid(self, z):
        return 1.0 / (1 + np.exp(-z))
    
    def loss(self, y, y_hat):
        return -np.sum(y * np.log(y_hat) + (1 - y) * np.log(1 - y_hat)) / len(y)
    
    def gradients(self, X, y, y_hat):
        m = X.shape[0]
        dw = (1 / m) * np.dot(X.T, (y_hat - y))
        db = (1 / m) * np.sum(y_hat - y)
        return dw, db
    
    def normalize(self, X):
        epsilon = 1e-8
        mean = X.mean(axis=0)
        std = X.std(axis=0)
        X_normalized = (X - mean) / (std + epsilon)
        return X_normalized

    def fit(self, X, y):
        X = self.normalize(X)
        m, n = X.shape
        self.weights = np.zeros((n, 1))
        self.bias = 0
        y = y.reshape(m, 1)
        self.classes_ = np.unique(y)

        for _ in range(self.num_iterations):
            z = np.dot(X, self.weights) + self.bias
            y_hat = self.sigmoid(z)
            loss = self.loss(y, y_hat)
            dw, db = self.gradients(X, y, y_hat)
            self.weights -= self.learning_rate * dw
            self.bias -= self.learning_rate * db
            self.loss_history.append(loss)

    def predict(self, X):
        X = self.normalize(X)
        z = np.dot(X, self.weights) + self.bias
        y_hat = self.sigmoid(z)
        y_pred = np.round(y_hat)
        return y_pred

    def get_params(self, deep=True):
        return {"learning_rate": self.learning_rate, "num_iterations": self.num_iterations}

    def set_params(self, **parameters):
        for parameter, value in parameters.items():
            setattr(self, parameter, value)
        return self

In [86]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score
import numpy as np

scoring = make_scorer(f1_score)
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

your_model_scores = cross_val_score(ImplementedLogisticRegressionModel(), X.values, y.values, cv=skf, scoring=scoring)
sklearn_model_scores = cross_val_score(LogisticRegression(), X, y, cv=skf, scoring=scoring)

print(f"Implemented model scores: {your_model_scores}")
print(f"Scikit-learn model scores: {sklearn_model_scores}")

Implemented model scores: [0.58862325 0.58794385 0.58718573 0.60260586 0.61097461 0.60635697
 0.60967742 0.60661157 0.59868421 0.58536585]
Scikit-learn model scores: [0.55813953 0.58221024 0.57641921 0.62412587 0.59358289 0.58145815
 0.5844504  0.59127337 0.57506824 0.55197133]


In [87]:
from scipy.stats import ttest_ind

t_stat, p_value = ttest_ind(your_model_scores, sklearn_model_scores)
print("T-statistic:", t_stat)
print("P-value:", p_value)

T-statistic: 2.3418865540881373
P-value: 0.030886855352741792
