# AMMI_2024_NLP - Week 1

#Lab 1: Part 2

# (B) Logistic Regression Model

In this second part of the lab, we will implement a language identifier trained on the same data, but using Logistic Regression instead of Naive Bayes.

In [None]:
import io, sys, math
import numpy as np
from collections import defaultdict
from tqdm.notebook import tqdm,trange
from typing import Tuple, List, Dict
import random

This function is used to build the dictionary, or vocabulary, which is a mapping from strings (or words) to integers (or indices). This mapping allows us to build vector representations of documents.

In [None]:
def build_dict(filename:str, threshold:int=1)->Tuple[Dict[str, int], Dict[str, int]]:
    """
    Build the word vocabulary and the label dictionary from a data file.

    Every non-blank line is split on whitespace: the first token is the label
    and the remaining tokens are the words of the example.

    Input:
    - filename: the name of the data file (read as UTF-8).
    - threshold: a word is added to the vocabulary only if it appears strictly
      more than `threshold` times in the data (with the default of 1, words
      seen a single time are dropped).
    Output:
    - word_dict: the vocabulary generated from the dataset, with words as keys and their indices as values.
    - label_dict: the dictionary of the labels, with labels as keys and their indices as values of these keys.
    """
    word_dict, label_dict = {}, {}
    counts = defaultdict(int)

    # The context manager closes the file even if an error occurs while parsing.
    with io.open(filename, 'r', encoding='utf-8') as fin:
        for line in tqdm(fin):
            tokens = line.split()
            if not tokens:
                # Blank line: there is no label to read, so skip it.
                continue
            label = tokens[0]

            # Labels get consecutive indices in order of first appearance.
            if label not in label_dict:
                label_dict[label] = len(label_dict)

            for w in tokens[1:]:
                counts[w] += 1

    # Word indices are assigned in order of first appearance among kept words.
    for k, v in counts.items():
        if v > threshold:
            word_dict[k] = len(word_dict)
    return word_dict, label_dict

This function is used to load a dataset (training or validation) and build vector representations of its examples. In particular, a document or sentence is represented as a bag of words. Each example corresponds to a sparse (mostly zero) vector `x` of dimension `V`, where `V` is the size of the vocabulary. The element `j` of the vector `x` is the number of times the word `j` appears in the document.

In [None]:
def load_data(filename:str, word_dict:Dict, label_dict:Dict)->List[Tuple[int, np.ndarray]]:
    """
    Load a data file and turn every example into a bag-of-words vector.

    Input:
    - filename: the name of the data file (read as UTF-8); on each line the
      first whitespace-separated token is the label, the rest are the words.
    - word_dict: the vocabulary, with words as keys and their indices as values.
    - label_dict: the dictionary of the labels, with labels as keys and their indices as values.

    Output:
    - data: a list of (yi, xi) pairs, where yi is the label index and xi is a
      vector of length len(word_dict) holding the count of each vocabulary word.

    Raises a KeyError if a line carries a label that is not in label_dict.
    """
    data = []
    dim = len(word_dict) #The size of the vocabulary.

    # The context manager closes the file even if an error occurs while parsing.
    with io.open(filename, 'r', encoding='utf-8') as fin:
        for line in tqdm(fin):
            tokens = line.split() #Consider tokenization by space in this case.
            if not tokens:
                # Blank line: there is no label to read, so skip it.
                continue
            label = tokens[0]

            yi = label_dict[label]
            xi = np.zeros(dim)
            for word in tokens[1:]:
                # Words outside the vocabulary are ignored.
                if word in word_dict:
                    wid = word_dict[word]
                    xi[wid] += 1.0
            data.append((yi, xi))
    return data

First, let's implement the softmax function. Don't forget numerical stability!

In [None]:
def softmax(x:np.ndarray)->np.ndarray:
    """
    Compute a numerically stable softmax along the last axis of x.

    Input:
    - x: an array of scores; a single vector or a batch whose last axis
      indexes the classes.

    Output:
    - an array of the same shape as x whose entries along the last axis are
      non-negative and sum to 1.
    """
    # Shift by the maximum of each row (not the global maximum): otherwise a
    # row lying far below the global maximum underflows to exp(.) == 0 for all
    # of its entries and the log below becomes -inf.
    c = np.max(x, axis=-1, keepdims=True)

    # log-sum-exp trick: log(sum(exp(x))) = c + log(sum(exp(x - c))).
    log_sum_exp = c + np.log(np.sum(np.exp(x - c), axis=-1, keepdims=True))

    return np.exp(x - log_sum_exp)
  

Now, let's implement the main training loop, by using stochastic gradient descent. The function will iterate over the examples of the training set. For each example, we will first compute the loss, before computing the gradient and performing the update.

In [None]:
def sgd(w:np.ndarray, data:List[Tuple], niter:int, lr:float = 0.01)->np.ndarray:
    """
    Train the classifier with stochastic gradient descent on the
    cross-entropy loss, printing the average training loss after each epoch.

    Input:
    - w: the weight matrix of shape (length of label dictionary, length of word dictionary).
      It is updated in place.
    - data: the dataset, a list of (label index, feature vector) pairs.
      It is shuffled in place at the start of every epoch.
    - niter: number of epochs, i.e. number of passes over the whole dataset.
    - lr: the learning rate.

    Output:
    - w: the weight matrix.
    """
    # Seed Python's `random` module so that the per-epoch shuffles below are
    # reproducible from one call to the next.
    random.seed(123)

    if not data:
        # Nothing to train on; this also avoids dividing by zero when
        # averaging the loss.
        return w

    for epoch in trange(niter):

        total_loss = 0.0
        # Shuffle with the generator seeded above (np.random.shuffle would
        # ignore that seed and make the run non-deterministic).
        random.shuffle(data)

        for label, features in data:

            label_pred = predict(w, features)

            # Cross-entropy loss: negative log-probability of the true label.
            loss = -np.log(label_pred[label])
            total_loss += loss

            # Gradient of the loss w.r.t. the scores is (probabilities - one_hot(label)).
            grads = label_pred.copy()
            grads[label] -= 1

            # Gradient w.r.t. w is the outer product of that vector with the features.
            w -= lr * np.outer(grads, features)
        avg_loss = total_loss / len(data)
        print(f"Epoch {epoch+1}: train loss -----{avg_loss}")

    return w

The next function computes the predicted probability of each label for the example `x`, given the trained classifier `w`. The most probable label is the index of the largest entry of the returned vector.

In [None]:
def predict(w:np.ndarray, x:np.ndarray)->np.ndarray:
    """
    Return the softmax of the scores np.dot(w, x).

    Input:
    - w: the weight matrix of the classifier.
    - x: the feature vector of one example.

    Output:
    - the result of applying softmax to the product of w and x.
    """
    return softmax(np.dot(w, x))
 

Finally, this function will compute the accuracy of a trained classifier `w` on a validation set.

In [None]:
def compute_accuracy(w:np.ndarray, valid_data:List[Tuple])->float:
    """
    Compute the fraction of examples whose highest-scoring label is the true label.

    Input:
    - w: the weight matrix of the classifier.
    - valid_data: a list of (label index, feature vector) pairs.

    Output:
    - the accuracy, between 0.0 and 1.0; 0.0 if valid_data is empty.
    """
    if not valid_data:
        # No examples to score: avoid dividing by zero.
        return 0.0

    # The predicted label is the index of the largest predicted probability.
    correct = sum(
        1 for label, features in valid_data
        if np.argmax(predict(w, features)) == label
    )

    return correct / len(valid_data)
        

In [None]:
# Dataset 1: build the vocabulary and label set from the training file,
# vectorize the training and validation files, train from all-zero weights
# for 25 epochs, then report the validation accuracy.
print("")
print("** Logistic Regression on dataset 1 **")
print("")

word_dict, label_dict = build_dict("train1.txt")
nlabels, dim = len(label_dict), len(word_dict)

train_data = load_data("train1.txt", word_dict, label_dict)
valid_data = load_data("valid1.txt", word_dict, label_dict)

# One row of weights per label, one column per vocabulary word.
w = sgd(np.zeros((nlabels, dim)), train_data, 25)

print("")
print("Validation accuracy: %.3f" % compute_accuracy(w, valid_data))
print("")

In [None]:
# Dataset 2: build the vocabulary and label set from the training file,
# vectorize the training and validation files, train from all-zero weights
# for 25 epochs, then report the validation accuracy.
print("")
print("** Logistic Regression on dataset 2 **")
print("")

word_dict, label_dict = build_dict("train2.txt")
nlabels, dim = len(label_dict), len(word_dict)

train_data = load_data("train2.txt", word_dict, label_dict)
valid_data = load_data("valid2.txt", word_dict, label_dict)

# One row of weights per label, one column per vocabulary word.
w = sgd(np.zeros((nlabels, dim)), train_data, 25)

print("")
print("Validation accuracy: %.3f" % compute_accuracy(w, valid_data))
print("")

# Recommended Reading:

- https://people.tamu.edu/~sji/classes/LR.pdf
