Task 1: Logistic Regression

In [16]:
import numpy as np
import pandas as pd

Sigmoid / Logistic Function

In [17]:
def sigmoid(z):
    '''
    Element-wise logistic function 1 / (1 + exp(-z)).

    Squashes every input value into the range [0, 1] so that the output of a
    linear function can be read as a probability and thresholded into a binary
    decision.

    The textbook form overflows in exp(-z) for large negative z (NumPy emits a
    RuntimeWarning). Here exp is only ever applied to a non-positive number,
    so it cannot overflow; the returned values are the same.

    Parameters
    ----------
    z : numpy array (a scalar is also accepted)

    Returns
    -------
    _ : numpy array
        the sigmoid of each element of z, with the same shape as z
        (a NumPy scalar when z is a scalar)
    '''
    z = np.asarray(z)
    if not np.issubdtype(z.dtype, np.floating):
        # Integer / boolean input: promote so exp and the division work in floating point.
        z = z.astype(float)

    # exp(-|z|) is always in (0, 1], so no overflow is possible.
    decay = np.exp(-np.abs(z))

    # z >= 0 : 1 / (1 + exp(-z))
    # z <  0 : exp(z) / (1 + exp(z))      (algebraically the same quantity)
    result = np.where(z >= 0, 1.0, decay) / (1.0 + decay)

    # Indexing with () turns a 0-d result into a NumPy scalar and leaves
    # arrays with one or more dimensions unchanged.
    return result[()]

Loss Function

In [18]:
def log_likelihood(y, y_hat, eps=1e-15):
    '''
    Bernoulli log-likelihood of the labels y under the predicted probabilities
    y_hat, summed over all examples:

        sum( y * log(y_hat) + (1 - y) * log(1 - y_hat) )

    The value is <= 0; the training loss is its negation.

    Predictions are clipped to [eps, 1 - eps] before taking logs. Without the
    clip, a prediction that saturates to exactly 0 or 1 produces
    0 * log(0) = 0 * -inf = NaN, which turns the whole sum into NaN.

    Parameters
    ----------
    y : numpy array
        actual values (0 or 1)
    y_hat : numpy array
        predicted probabilities, same shape as y
    eps : float, optional
        clipping margin that keeps the logarithms finite

    Returns
    -------
    _ : float
        the log-likelihood summed over all examples
    '''
    y_hat = np.clip(y_hat, eps, 1 - eps)

    return np.sum(y * np.log(y_hat) + (1 - y) * np.log(1 - y_hat))

Gradient Descent

In [19]:
def gradients(X, y, y_hat):
    '''
    Partial derivatives of the (mean) loss with respect to the weights and the
    bias. They are used to move the weights and bias in the direction that
    reduces the loss.

    Parameters
    ----------
    X, inputs : numpy array
    y, actual values : numpy array
    y_hat, hypothesis/predictions : numpy array

    Returns
    -------
    dw : numpy array
        partial derivative of the loss with respect to the weights,
        computed as (1 / m) * X.T . (y_hat - y)
    db : float
        partial derivative of the loss with respect to the bias,
        computed as (1 / m) * sum(y_hat - y)
    '''
    # Number of examples = number of rows of X.
    n_examples = X.shape[0]
    scale = 1 / n_examples

    # Prediction error, shared by both derivatives.
    residual = y_hat - y

    grad_w = scale * np.dot(X.T, residual)
    grad_b = scale * np.sum(residual)

    return grad_w, grad_b

Normalise

In [20]:
def normalize(X, mean=None, std=None, epsilon=1e-8):
    '''
    Standardise the input features column by column: subtract the mean and
    divide by the standard deviation. Bringing the features onto a common scale
    helps gradient descent converge faster and keeps the weights small.

    By default the statistics are computed from X itself. Pass `mean` and
    `std` to scale X with statistics computed elsewhere, e.g. to apply the
    training-set statistics to validation or test data.

    Parameters
    ----------
    X, inputs : numpy array
    mean : numpy array, optional
        per-column mean to subtract; computed from X (axis 0) when omitted
    std : numpy array, optional
        per-column standard deviation to divide by; computed from X (axis 0)
        when omitted
    epsilon : float, optional
        small constant added to std so constant columns (std == 0) do not
        cause a division by zero

    Returns
    -------
    X : numpy array
        NumPy array of normalised input features
    '''
    if mean is None:
        mean = X.mean(axis=0)
    if std is None:
        std = X.std(axis=0)

    return (X - mean) / (std + epsilon)

Train Function

In [21]:
def train(X, y, bs, epochs, lr):
    '''
    Fit a logistic regression model with mini-batch gradient descent.

    The features are normalised first, then for every epoch the data is walked
    through in consecutive batches of `bs` rows (in the given order). For each
    batch the predictions are computed with the sigmoid function, the gradients
    are evaluated, and the weights and bias are moved against the gradient by
    `lr`. After each epoch the negative log-likelihood over the whole training
    set is recorded.

    Parameters
    ----------
    X, inputs : numpy array
    y, actual values : numpy array
    bs, batch size : int
    epochs, number of iterations : int
    lr, learning rate : float

    Returns
    -------
    w : numpy.ndarray
        The learned weights of the logistic regression model (shape: (n, 1)).
    b : float
        The learned bias term of the logistic regression model.
    losses : list of floats
        A list containing the loss values for each epoch during training.
    '''
    # m -> number of training examples, n -> number of features
    m, n = X.shape

    # Weights and bias both start at zero.
    w = np.zeros((n, 1))
    b = 0

    # Column vector of targets so it lines up with the (rows, 1) predictions.
    y = y.reshape(m, 1)
    X = normalize(X)

    # One loss value is stored per epoch.
    losses = []

    for _ in range(epochs):
        # Ceiling division: the last batch may hold fewer than bs rows.
        for batch_idx in range((m + bs - 1) // bs):
            rows = slice(batch_idx * bs, batch_idx * bs + bs)
            xb, yb = X[rows], y[rows]

            y_hat = sigmoid(xb @ w + b)
            dw, db = gradients(xb, yb, y_hat)

            # Step against the gradient.
            w -= lr * dw
            b -= lr * db

        # Loss = negated log-likelihood of the full training set after this epoch.
        epoch_ll = log_likelihood(y, sigmoid(X @ w + b))
        losses.append(-epoch_ll)

    return w, b, losses

Predict

In [22]:
def predict(X, w, b, mean=None, std=None):
    '''
    Use the learned weights and bias to classify the input features.
    The inputs should be either the validation set or the test set.

    Feature scaling: by default X is normalised with its OWN mean and standard
    deviation, which is not the scaling the model saw during training. To apply
    the training scaling, pass the per-column statistics of the training
    features, e.g. `mean=X_train.mean(axis=0), std=X_train.std(axis=0)`.

    Parameters
    ----------
    X, inputs : numpy array
    w, learned weights : numpy array
    b, learned bias : float
    mean : numpy array, optional
        per-column mean used to scale X (must be given together with std)
    std : numpy array, optional
        per-column standard deviation used to scale X (must be given together
        with mean)

    Returns
    -------
    _ : numpy array
        the predicted output containing 0s and 1s, one entry per row of X.

    Raises
    ------
    ValueError
        if only one of mean / std is supplied.
    '''
    if mean is None and std is None:
        # Backward-compatible default: scale by the statistics of X itself.
        X = normalize(X)
    elif mean is None or std is None:
        raise ValueError("mean and std must be provided together")
    else:
        # Same formula and epsilon as the default normalisation, but with the
        # caller-supplied statistics.
        X = (X - mean) / (std + 1e-8)

    probs = np.asarray(sigmoid(np.dot(X, w) + b))

    # Vectorised threshold: class 1 when the probability is strictly above 0.5.
    # Reshaping to the number of rows flattens an (m, 1) column into (m,).
    return (probs > 0.5).astype(int).reshape(probs.shape[0])

In [23]:
# Training data: 'label' is the target; 'label' and 'id' are excluded from the features.
df_train = pd.read_csv("./data/train_tfidf_features.csv")
y_train = df_train['label']
X_train = df_train.drop(columns=['label', 'id'])

# Test data: only 'id' is excluded from the features.
df_test = pd.read_csv("./data/test_tfidf_features.csv")
X_test = df_test.drop(columns=['id'])

Logistic Regression Main 

In [24]:
# Fit the model on the training features / labels.
w, b, losses = train(
    X_train.values,
    y_train.values,
    bs=32,
    epochs=100,
    lr=0.01,
)

# Classify the test features and report how many fall into each class.
y_hat = predict(X_test.values, w, b)
num_ones = np.count_nonzero(y_hat)
num_zeros = np.count_nonzero(y_hat == 0)
print("Number of 1s: ", num_ones)
print("Number of 0s: ", num_zeros)

# Per-epoch training loss.
print(losses)

Number of 1s:  1560
Number of 0s:  2736
[7097.788773589673, 6436.864793769081, 6135.905738817253, 5957.842324917137, 5838.262225236421, 5751.6554347193205, 5685.690592051076, 5633.6012277269265, 5591.335792802689, 5556.306448448964, 5526.775688721617, 5501.529592332729, 5479.692132198229, 5460.613950994216, 5443.80274765077, 5428.878045349482, 5415.540847088722, 5403.552718862369, 5392.721035450787, 5382.888364706548, 5373.924694763487, 5365.7216523030365, 5358.188139204538, 5351.246995368931, 5344.832414691791, 5338.8879212882375, 5333.364767777896, 5328.220655317268, 5323.418701643926, 5318.926602279795, 5314.715943620554, 5310.761636519208, 5307.041446244013, 5303.535600102161, 5300.226458089927, 5297.098235020761, 5294.136764952436, 5291.329300567021, 5288.664341586018, 5286.131487425136, 5283.721310180438, 5281.425244743685, 5279.23549341012, 5277.144942796976, 5275.14709125943, 5273.235985290319, 5271.40616363484, 5269.652608052571, 5267.970699824945, 5266.356181243597, 5264.8051

Comparing with SkLearn

In [26]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
import pandas as pd
import numpy as np

# Reload the same CSV files so this cell can run on its own.
df_train = pd.read_csv("./data/train_tfidf_features.csv")
df_test = pd.read_csv("./data/test_tfidf_features.csv")

y_train = df_train['label']
X_train = df_train.drop(columns=['label', 'id'])
X_test = df_test.drop(columns=['id'])

# Reference model: scikit-learn's LogisticRegression with default settings,
# wrapped in a single-step pipeline.
model = Pipeline(steps=[('regressor', LogisticRegression())])
model.fit(X_train, y_train)
y_hat = model.predict(X_test)

# Class counts, for comparison with the counts printed by the main cell.
num_ones = np.count_nonzero(y_hat)
num_zeros = np.count_nonzero(y_hat == 0)
print("Number of 1s: ", num_ones)
print("Number of 0s: ", num_zeros)

Number of 1s:  1167
Number of 0s:  3129
