<a href="https://colab.research.google.com/github/haidodev/NLP/blob/main/week1/Hierarchical_Attention.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Download a file named glove.6B.zip from the specified URL, which contains pre-trained word embeddings called GloVe embeddings.

In [None]:
# -nc (no-clobber): skip the ~822 MB download when glove.6B.zip already exists, so
# re-running this cell neither re-downloads the archive nor creates glove.6B.zip.1.
!wget -nc http://nlp.stanford.edu/data/glove.6B.zip

--2024-04-21 14:24:32--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2024-04-21 14:24:32--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2024-04-21 14:24:33--  https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


202

In [None]:
# -n: never overwrite files that were already extracted, so a re-run does not stop at
# unzip's interactive "replace?" prompt. The archive is named explicitly because a glob
# that matches more than one zip makes unzip treat the extra names as archive members.
!unzip -n glove.6B.zip
!ls
!pwd

Archive:  glove.6B.zip
  inflating: glove.6B.50d.txt        
  inflating: glove.6B.100d.txt       
  inflating: glove.6B.200d.txt       
  inflating: glove.6B.300d.txt       
glove.6B.100d.txt  glove.6B.300d.txt  glove.6B.zip
glove.6B.200d.txt  glove.6B.50d.txt   sample_data
/content


In [None]:
import os
import csv
import pandas as pd
import torch
from torch.utils.data import Dataset
from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn.datasets import fetch_20newsgroups

# Use the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device  # bare last expression so the notebook displays the selected device

'cuda'

In [None]:
class News20Dataset(Dataset):
  """
  Subset of the 20 Newsgroups corpus encoded as nested lists of vocabulary indices.

  Each document is split into sentences with ``sent_tokenize`` and into words with
  ``word_tokenize``; every word is replaced by its position in ``self.vocab``.
  Position 0 of the vocabulary is ``'<pad>'`` and position 1 is ``'<unk>'``.
  """

  # Newsgroups fetched when the caller does not pass ``categories``.
  DEFAULT_CATEGORIES = ('sci.crypt', 'sci.electronics', 'sci.med', 'sci.space')
  # Index of the '<unk>' entry that __init__ places at position 1 of the vocabulary.
  UNK_IDX = 1

  def __init__(self, word_map_path, max_sent_length=150, max_doc_length=40, is_train=True,
               categories=None, max_vocab_size=50000, lowercase=True):
    """
    Initialize the News20Dataset object.

    Args:
        word_map_path (str): Path to a space-separated embedding file whose first column is the word.
        max_sent_length (int): Maximum number of words kept per sentence.
        max_doc_length (int): Maximum number of sentences kept per document.
        is_train (bool): Selects the 'train' split when True, the 'test' split otherwise.
        categories (sequence of str, optional): Newsgroups to load. Defaults to DEFAULT_CATEGORIES.
        max_vocab_size (int): Number of leading rows of the embedding file used as vocabulary.
        lowercase (bool): Lowercase tokens before the vocabulary lookup. The glove.6B files
            are uncased, so capitalised words would otherwise all map to '<unk>'. Pass False
            when the word map is cased.
    """
    self.max_sent_length = max_sent_length
    self.max_doc_length = max_doc_length
    self.split = 'train' if is_train else 'test'
    self.lowercase = lowercase
    self.categories = list(self.DEFAULT_CATEGORIES if categories is None else categories)

    # Fetch data from 20 newsgroups dataset
    self.data = fetch_20newsgroups(
        subset=self.split,
        categories=self.categories,
        shuffle=False,
        remove=('headers', 'footers', 'quotes')
    )

    # Load only the word column of the first `max_vocab_size` rows.
    # dtype=str + na_filter=False keep tokens such as "null" or "nan" as strings;
    # pandas' default NA handling would turn them into float NaN entries.
    vocab_frame = pd.read_csv(
        filepath_or_buffer=word_map_path,
        header=None,
        sep=' ',
        quoting=csv.QUOTE_NONE,
        usecols=[0],
        dtype=str,
        na_filter=False,
        nrows=max_vocab_size
    )

    # Vocabulary list (kept as a list because index == embedding row)
    self.vocab = ['<pad>', '<unk>'] + vocab_frame.iloc[:, 0].tolist()

    # Word -> index table for O(1) lookups. setdefault keeps the first occurrence,
    # which is what list.index() would have returned.
    self.word_to_idx = {}
    for idx, word in enumerate(self.vocab):
      self.word_to_idx.setdefault(word, idx)

  def _encode_tokens(self, tokens):
    """Map a list of word strings to vocabulary indices (UNK_IDX for unknown words)."""
    if self.lowercase:
      tokens = [token.lower() for token in tokens]
    lookup = self.word_to_idx
    return [lookup.get(token, self.UNK_IDX) for token in tokens]

  def transform(self, text):
    """
    Transform text into numerical representation using the vocabulary.

    Args:
        text (str): Input text.

    Returns:
        list, int, list: Transformed document, number of sentences, number of words per sentence.
        ``(None, -1, None)`` is returned when the text yields no sentence.
    """
    doc = []
    for sent in sent_tokenize(text=text):
      if len(doc) >= self.max_doc_length:
        break
      tokens = word_tokenize(text=sent)[:self.max_sent_length]
      # Skip sentences without tokens: the model in this notebook packs sentences with
      # pack_padded_sequence, which does not accept zero-length sequences.
      if tokens:
        doc.append(self._encode_tokens(tokens))

    num_sents = len(doc)
    if num_sents == 0:
      return None, -1, None

    num_words = [len(sent) for sent in doc]

    return doc, num_sents, num_words

  def __getitem__(self, i):
    """Return ``(doc, label, num_sents, num_words)`` for sample ``i``, or None if it is empty."""
    label = self.data['target'][i]
    text = self.data['data'][i]

    doc, num_sents, num_words = self.transform(text)

    # Empty documents are signalled with None; collate_fn filters them out.
    if num_sents == -1:
      return None

    return doc, label, num_sents, num_words

  def __len__(self):
    """Number of documents in the selected split (including ones that encode to None)."""
    return len(self.data['data'])

  @property
  def vocab_size(self):
    """
    Get the size of the vocabulary.

    Returns:
        int: Size of the vocabulary, including '<pad>' and '<unk>'.
    """
    return len(self.vocab)

  @property
  def num_classes(self):
    """
    Get the number of classes in the dataset.

    Returns:
        int: Number of newsgroup categories requested (4 with the defaults).
    """
    return len(self.categories)


A collate function organizes and processes variable-length input data into uniform batches, which is essential for training neural networks efficiently.

The below collate function takes a batch of data samples, filters out any None values, organizes them into tensors, and pads them to create uniform batch sizes for efficient processing.

In [None]:
def collate_fn(batch):
  """
  Collate function for batching data.

  Samples that are None (empty documents) are dropped, then documents are zero-padded
  to the longest document and the longest sentence of the batch.

  Args:
      batch (list): List of tuples containing (document, label, number of sentences,
          number of words per sentence); entries may be None.

  Returns:
      torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor: Batched tensors for documents
      (bsz, max_doc_len, max_sent_len), labels (bsz), document lengths (bsz), and sentence
      lengths (bsz, max_doc_len). All are LongTensors.

  Raises:
      ValueError: If every sample in the batch is None, so there is nothing to collate.
  """
  # Filter out None values from the batch
  samples = [sample for sample in batch if sample is not None]
  if not samples:
    # Without this guard the unpacking below fails with an opaque
    # "not enough values to unpack" ValueError.
    raise ValueError('collate_fn received a batch with no usable samples (every item was None)')

  # Unzip batch elements
  docs, labels, doc_lengths, sent_lengths = zip(*samples)

  bsz = len(labels)
  batch_max_doc_length = max(doc_lengths)
  batch_max_sent_length = max(max(sl) if sl else 0 for sl in sent_lengths)

  # Zero-initialised so that untouched positions equal the '<pad>' index / length 0
  docs_tensor = torch.zeros([bsz, batch_max_doc_length, batch_max_sent_length]).long()
  sent_lengths_tensor = torch.zeros([bsz, batch_max_doc_length]).long()

  # Fill in tensors with data from batch
  for doc_idx, doc in enumerate(docs):
    doc_length = doc_lengths[doc_idx]
    sent_lengths_tensor[doc_idx, :doc_length] = torch.LongTensor(sent_lengths[doc_idx])
    for sent_idx, sent in enumerate(doc):
      sent_length = sent_lengths[doc_idx][sent_idx]
      docs_tensor[doc_idx, sent_idx, :sent_length] = torch.LongTensor(sent)

  return docs_tensor, torch.LongTensor(labels), torch.LongTensor(doc_lengths), sent_lengths_tensor

Prepare Dataloader

In [None]:
from torch.utils.data import DataLoader
from torch.utils.data.sampler import RandomSampler

In [None]:
import nltk

# sent_tokenize / word_tokenize need the Punkt sentence models. Recent NLTK releases
# look them up under 'punkt_tab', older ones under 'punkt', so request both; on an
# older NLTK the unknown id is reported and skipped without raising.
for resource in ('punkt', 'punkt_tab'):
  nltk.download(resource)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
class MyDataLoader(DataLoader):
  def __init__(self, dataset, batch_size):
    """
    DataLoader preconfigured for News20Dataset: random sampling plus `collate_fn` padding.

    Args:
        dataset (Dataset): The dataset to load.
        batch_size (int): Batch size for data loading.
    """
    self.n_samples = len(dataset)
    # Samples are drawn in random order by this sampler.
    self.sampler = RandomSampler(dataset)

    # Define DataLoader initialization arguments
    self.init_kwargs = {
        'dataset': dataset,
        'batch_size': batch_size,
        # Pinned memory only helps host-to-GPU copies; requesting it on a CPU-only
        # machine just triggers a warning.
        'pin_memory': torch.cuda.is_available(),
        'collate_fn': collate_fn,  # Collate function for batching data
        # Must stay False: DataLoader rejects shuffle=True together with an explicit sampler.
        'shuffle': False
    }

    # Initialize DataLoader using superclass constructor
    super().__init__(sampler=self.sampler, **self.init_kwargs)


In [None]:
# Training split encoded with the 100-dimensional GloVe word list, served in batches of 64.
dataset = News20Dataset('glove.6B.100d.txt', is_train=True)
data_loader = MyDataLoader(dataset, 64)

The cell below iterates through batches from the data loader and prints information about each batch: the batch index and size, the shapes of the document and label tensors, the document lengths, and the shape and contents of the sentence-length tensor.

In [None]:
# Sanity-check the collated tensors. One batch is enough to verify shapes and padding,
# so stop after the first instead of tokenizing and printing the whole training set.
for batch_idx, (docs_tensor, labels, doc_lengths, sent_lengths_tensor) in enumerate(data_loader):
  print(f'Batch {batch_idx}:')
  print(f'Batch size: {labels.size(0)}')  # Print batch size
  print(f'Doc tensor shape: {docs_tensor.shape}')  # Print shape of document tensor
  print(f'Docs tensor: {docs_tensor}')  # Print document tensor
  print(f'Labels tensor shape: {labels.shape}')  # Print shape of labels tensor
  print(f'Document lengths: {doc_lengths}')  # Print document lengths
  print(f'Sentence lengths tensor shape: {sent_lengths_tensor.shape}')  # Print shape of sentence length tensor
  print(f'Sentence lengths tensor: {sent_lengths_tensor}')  # Print sentence length tensor
  break


Batch 0:
Batch size: 61
Doc tensor shape: torch.Size([61, 40, 150])
Docs tensor: tensor([[[    1,     4,     0,  ...,     0,     0,     0],
         [ 1115,     1,    40,  ...,     0,     0,     0],
         [    1,  1572,   171,  ...,     0,     0,     0],
         ...,
         [    0,     0,     0,  ...,     0,     0,     0],
         [    0,     0,     0,  ...,     0,     0,     0],
         [    0,     0,     0,  ...,     0,     0,     0]],

        [[    1,     3,    24,  ...,     0,     0,     0],
         [    1,     0,     0,  ...,     0,     0,     0],
         [    0,     0,     0,  ...,     0,     0,     0],
         ...,
         [    0,     0,     0,  ...,     0,     0,     0],
         [    0,     0,     0,  ...,     0,     0,     0],
         [    0,     0,     0,  ...,     0,     0,     0]],

        [[    1,     3,    14,  ...,     0,     0,     0],
         [    1,   221,  1411,  ...,     0,     0,     0],
         [    1,     3,    20,  ...,     0,     0,     0],
  

In [None]:
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence, PackedSequence






*   Initialize embeddings with pre-trained weights or random initialization.
*   Optionally freeze embeddings during training.
*   Sort sentences by length for efficient processing.
*   Pack sequences for dynamic sequence handling.
*   Pass packed sequences through bidirectional GRU layer.
*   Optionally apply layer normalization.
*   Compute attention weights and normalize them.
*   Apply attention to GRU outputs and aggregate attended representations.
*   Return sentence embeddings and attention weights.



In [None]:
class WordAttention(nn.Module):
  """
  Word-level encoder: embedding -> bidirectional GRU -> attention pooling over words.
  """
  def __init__(self, vocab_size, embed_dim, gru_hidden_dim, gru_num_layers, att_dim, use_layer_norm, dropout):
    """
    Initialize the WordAttention module.

    Args:
        vocab_size (int): Size of the vocabulary.
        embed_dim (int): Dimension of word embeddings.
        gru_hidden_dim (int): Dimension of GRU hidden states (per direction).
        gru_num_layers (int): Number of layers in the GRU.
        att_dim (int): Dimension of attention vectors.
        use_layer_norm (bool): Whether to layer-normalize GRU outputs before attention scoring.
        dropout (float): Dropout probability.
    """
    super(WordAttention, self).__init__()
    self.embeddings = nn.Embedding(vocab_size, embed_dim)
    self.gru = nn.GRU(
        embed_dim,
        gru_hidden_dim,
        num_layers=gru_num_layers,
        batch_first=True,
        bidirectional=True,
        # nn.GRU only applies dropout between stacked layers; passing a non-zero value
        # with a single layer has no effect other than a warning.
        dropout=dropout if gru_num_layers > 1 else 0.0
    )
    self.use_layer_norm = use_layer_norm
    if use_layer_norm:
      self.layer_norm = nn.LayerNorm(2 * gru_hidden_dim, elementwise_affine=True)
    self.dropout = nn.Dropout(dropout)

    # Projects each GRU output to the attention space
    self.attention = nn.Linear(2 * gru_hidden_dim, att_dim)

    # Word context vector: scores each projected word with a dot product
    self.context_vector = nn.Linear(att_dim, 1, bias=False)

  def init_embeddings(self, embeddings):
    """
    Initialize the embedding layer with pre-trained embeddings.

    The weight is replaced by a NEW nn.Parameter, so an optimizer built before this
    call keeps referring to the old one; call this before creating the optimizer.

    Args:
        embeddings (torch.Tensor): Pre-trained word embeddings.
    """
    self.embeddings.weight = nn.Parameter(embeddings)

  def freeze_embeddings(self, freeze=False):
    """
    Freeze or unfreeze the embedding layer.

    Args:
        freeze (bool): Whether to freeze the embedding layer.
    """
    self.embeddings.weight.requires_grad = not freeze

  def forward(self, sents, sent_lengths):
    """
    Forward pass of the WordAttention module.

    Args:
        sents (torch.Tensor): Padded word indices; LongTensor (num_sents, padded_sent_length).
        sent_lengths (torch.Tensor): Unpadded sentence lengths; LongTensor (num_sents).

    Returns:
        torch.Tensor, torch.Tensor: Sentence embeddings (num_sents, 2 * gru_hidden_dim) and
        word attention weights (num_sents, longest sentence in this call), both in the
        caller's original sentence order.
    """
    # Sort sentences by decreasing length, as expected by pack_padded_sequence
    sent_lengths, sent_perm_idx = sent_lengths.sort(dim=0, descending=True)
    sents = sents[sent_perm_idx]

    # Pass sentences through embedding layer and apply dropout
    sents = self.embeddings(sents)
    sents = self.dropout(sents)

    # Pack sequences so that pad positions are not fed to the GRU
    packed_words = pack_padded_sequence(sents, lengths=sent_lengths.tolist(), batch_first=True)

    # Effective batch size at each timestep; needed to re-wrap flat tensors below
    valid_bsz = packed_words.batch_sizes

    # Pass packed sequences through bidirectional GRU layer
    packed_words, _ = self.gru(packed_words)

    # Flat per-word GRU outputs. `.data` here is the PackedSequence field and stays
    # attached to the autograd graph.
    word_states = packed_words.data

    # Optionally apply layer normalization.
    # The result is used directly: calling `.data` on it again would be Tensor.data,
    # which detaches the tensor and stops gradients from reaching LayerNorm and the GRU.
    if self.use_layer_norm:
      normed_words = self.layer_norm(word_states)
    else:
      normed_words = word_states

    # Compute attention scores, one scalar per word
    att = torch.tanh(self.attention(normed_words))
    att = self.context_vector(att).squeeze(1)

    # Softmax per sentence, done by hand: subtract the max for numerical stability,
    # exponentiate, re-pad (pads become 0) and divide by the row sum.
    val = att.max()
    att = torch.exp(att - val)
    att, _ = pad_packed_sequence(PackedSequence(att, valid_bsz), batch_first=True)
    att_weights = att / torch.sum(att, dim=1, keepdim=True)

    # Apply attention to GRU outputs and aggregate attended representations
    sents, _ = pad_packed_sequence(packed_words, batch_first=True)
    sents = sents * att_weights.unsqueeze(2)
    sents = sents.sum(dim=1)

    # Restore original order of sentences
    _, sent_unperm_idx = sent_perm_idx.sort(dim=0, descending=False)
    sents = sents[sent_unperm_idx]
    att_weights = att_weights[sent_unperm_idx]

    return sents, att_weights




*   Sorting: Documents are sorted by decreasing order of length for efficient processing.
*   Word-level Attention: Each document's sentences are passed through the word-level attention module to obtain word-level attention weights and sentence embeddings.
*   Dropout: Optionally, dropout is applied to the sentence embeddings.
*   Packing: Sentences are packed into a long batch by removing pad-sentences for dynamic sequence handling.
*   Sentence-level GRU: The packed sentence embeddings are passed through a bidirectional sentence-level GRU to capture contextual information.
*   Layer Normalization (Optional): If enabled, layer normalization is applied to the sentence embeddings.
*   Sentence-level Attention: The sentence embeddings are passed through a linear layer followed by a context vector to compute sentence-level attention weights.
*   Normalization: Attention weights are normalized.
*   Repadding: Documents are restored by repadding the packed sentence embeddings.
*   Document Computation: Document vectors are computed by weighting the sentence embeddings with the sentence-level attention weights and summing them.
*   Finalization: Attention weights of words and sentences are restored to their original order.

In [None]:
class SentenceAttention(nn.Module):
  """
  Sentence-level attention module with a word-level attention module.
  """
  def __init__(self, vocab_size, embed_dim, word_gru_hidden_dim, sent_gru_hidden_dim,
               word_gru_num_layers, sent_gru_num_layers, word_att_dim, sent_att_dim, use_layer_norm, dropout):
    """
    Initialize the SentenceAttention module.

    Args:
        vocab_size (int): Size of the vocabulary.
        embed_dim (int): Dimension of word embeddings.
        word_gru_hidden_dim (int): Dimension of word-level GRU hidden states.
        sent_gru_hidden_dim (int): Dimension of sentence-level GRU hidden states.
        word_gru_num_layers (int): Number of layers in the word-level GRU.
        sent_gru_num_layers (int): Number of layers in the sentence-level GRU.
        word_att_dim (int): Dimension of word-level attention vectors.
        sent_att_dim (int): Dimension of sentence-level attention vectors.
        use_layer_norm (bool): Whether to use layer normalization.
        dropout (float): Dropout probability.
    """
    super(SentenceAttention, self).__init__()

    # Word-level attention module
    self.word_attention = WordAttention(vocab_size, embed_dim, word_gru_hidden_dim, word_gru_num_layers,
                                        word_att_dim, use_layer_norm, dropout)

    # Bidirectional sentence-level GRU.
    # nn.GRU only applies dropout between stacked layers; passing a non-zero value
    # with a single layer has no effect other than a warning.
    self.gru = nn.GRU(2 * word_gru_hidden_dim, sent_gru_hidden_dim, num_layers=sent_gru_num_layers,
                      batch_first=True, bidirectional=True,
                      dropout=dropout if sent_gru_num_layers > 1 else 0.0)

    # Optionally apply layer normalization
    self.use_layer_norm = use_layer_norm
    if use_layer_norm:
      self.layer_norm = nn.LayerNorm(2 * sent_gru_hidden_dim, elementwise_affine=True)
    self.dropout = nn.Dropout(dropout)

    # Sentence-level attention
    self.sent_attention = nn.Linear(2 * sent_gru_hidden_dim, sent_att_dim)

    # Sentence context vector u_s to take dot product with
    self.sentence_context_vector = nn.Linear(sent_att_dim, 1, bias=False)

  def forward(self, docs, doc_lengths, sent_lengths):
    """
    Forward pass of the SentenceAttention module.

    Args:
        docs (torch.Tensor): Encoded document-level data; LongTensor (num_docs, padded_doc_length, padded_sent_length).
        doc_lengths (torch.Tensor): Unpadded document lengths; LongTensor (num_docs).
        sent_lengths (torch.Tensor): Unpadded sentence lengths; LongTensor (num_docs, padded_doc_length).

    Returns:
        torch.Tensor, torch.Tensor, torch.Tensor: Document embeddings, attention weights of words, attention weights of sentences.
    """
    # Sort documents by decreasing order in length
    doc_lengths, doc_perm_idx = doc_lengths.sort(dim=0, descending=True)
    docs = docs[doc_perm_idx]
    sent_lengths = sent_lengths[doc_perm_idx]

    # Make a long batch of sentences by removing pad-sentences
    packed_sents = pack_padded_sequence(docs, lengths=doc_lengths.tolist(), batch_first=True)

    # Effective batch size at each timestep
    valid_bsz = packed_sents.batch_sizes

    # Make a long batch of sentence lengths by removing pad-sentences
    packed_sent_lengths = pack_padded_sequence(sent_lengths, lengths=doc_lengths.tolist(), batch_first=True)

    # Word attention module: one embedding per real (non-pad) sentence
    sents, word_att_weights = self.word_attention(packed_sents.data, packed_sent_lengths.data)

    # Apply dropout to the sentence embeddings
    sents = self.dropout(sents)

    # Sentence-level GRU over sentence embeddings
    packed_sents, _ = self.gru(PackedSequence(sents, valid_bsz))

    # Flat per-sentence GRU outputs (PackedSequence field, still attached to autograd)
    sent_states = packed_sents.data

    # Optionally apply layer normalization.
    # Both branches must yield a plain tensor: nn.Linear cannot take a PackedSequence.
    if self.use_layer_norm:
      normed_sents = self.layer_norm(sent_states)
    else:
      normed_sents = sent_states

    # Sentence attention scores, one scalar per sentence
    att = torch.tanh(self.sent_attention(normed_sents))
    att = self.sentence_context_vector(att).squeeze(1)

    # Normalize attention weights (softmax per document; pad-sentences get weight 0)
    val = att.max()
    att = torch.exp(att - val)
    att, _ = pad_packed_sequence(PackedSequence(att, valid_bsz), batch_first=True)
    sent_att_weights = att / torch.sum(att, dim=1, keepdim=True)

    # Restore as documents by repadding
    docs, _ = pad_packed_sequence(packed_sents, batch_first=True)

    # Compute document vectors
    docs = docs * sent_att_weights.unsqueeze(2)
    docs = docs.sum(dim=1)

    # Regroup the per-sentence word attention weights into documents by repadding
    word_att_weights, _ = pad_packed_sequence(PackedSequence(word_att_weights, valid_bsz), batch_first=True)

    # Restore the original order of documents (undo the first sorting)
    _, doc_unperm_idx = doc_perm_idx.sort(dim=0, descending=False)
    docs = docs[doc_unperm_idx]
    word_att_weights = word_att_weights[doc_unperm_idx]
    sent_att_weights = sent_att_weights[doc_unperm_idx]

    return docs, word_att_weights, sent_att_weights








*   Initialization: The constructor initializes the model with parameters such as the number of classes, vocabulary size, embedding dimensions, GRU hidden dimensions, number of layers, attention dimensions, layer normalization usage, and dropout probability.
*   Sentence Attention Module: It contains an instance of the SentenceAttention module, which is responsible for computing document embeddings and attention weights at both word and sentence levels.
*   Fully Connected Layer: The model has a fully connected layer (fc) for classification, which takes the document embeddings as input and produces class scores.
*   Forward Pass: In the forward method, input documents along with their lengths (both document lengths and sentence lengths) are passed through the SentenceAttention module to compute document embeddings (doc_embed) and attention weights (word_att_weights and sent_att_weights). Then, the document embeddings are fed into the fully connected layer to obtain class scores (scores).
*   Return: The method returns the class scores along with the word-level and sentence-level attention weights.



In [None]:
class HierarchicalAttentionNetwork(nn.Module):
  """
  Document classifier: a SentenceAttention encoder followed by a linear output layer.
  """
  def __init__(self,
               num_classes,
               vocab_size,
               embed_dim,
               word_gru_hidden_dim,
               sent_gru_hidden_dim,
               word_gru_num_layers,
               sent_gru_num_layers,
               word_att_dim,
               sent_att_dim,
               use_layer_norm,
               dropout):
    """
    Build the encoder and the classification head.

    Args:
        num_classes (int): Number of output classes.
        vocab_size (int): Number of entries in the vocabulary.
        embed_dim (int): Word embedding size.
        word_gru_hidden_dim (int): Hidden size of the word-level GRU.
        sent_gru_hidden_dim (int): Hidden size of the sentence-level GRU.
        word_gru_num_layers (int): Depth of the word-level GRU.
        sent_gru_num_layers (int): Depth of the sentence-level GRU.
        word_att_dim (int): Size of the word-level attention projection.
        sent_att_dim (int): Size of the sentence-level attention projection.
        use_layer_norm (bool): Enable layer normalization inside the encoder.
        dropout (float): Dropout probability forwarded to the encoder.
    """
    super().__init__()

    # Hierarchical encoder (words -> sentences -> document)
    self.sent_attention = SentenceAttention(
        vocab_size=vocab_size,
        embed_dim=embed_dim,
        word_gru_hidden_dim=word_gru_hidden_dim,
        sent_gru_hidden_dim=sent_gru_hidden_dim,
        word_gru_num_layers=word_gru_num_layers,
        sent_gru_num_layers=sent_gru_num_layers,
        word_att_dim=word_att_dim,
        sent_att_dim=sent_att_dim,
        use_layer_norm=use_layer_norm,
        dropout=dropout,
    )

    # The sentence GRU is bidirectional, hence the factor 2 on the input width
    doc_embed_dim = 2 * sent_gru_hidden_dim
    self.fc = nn.Linear(doc_embed_dim, num_classes)

    # Kept as plain attributes for later inspection
    self.use_layer_norm = use_layer_norm
    self.dropout = dropout

  def forward(self, docs, doc_lengths, sent_lengths):
    """
    Classify a batch of documents.

    Args:
        docs (torch.Tensor): Encoded documents; LongTensor (num_docs, padded_doc_length, padded_sent_length).
        doc_lengths (torch.Tensor): Unpadded document lengths; LongTensor (num_docs).
        sent_lengths (torch.Tensor): Unpadded sentence lengths; LongTensor (num_docs, padded_doc_length).

    Returns:
        torch.Tensor, torch.Tensor, torch.Tensor: Class scores, word-level attention weights,
        sentence-level attention weights.
    """
    encoded = self.sent_attention(docs, doc_lengths, sent_lengths)
    doc_embed, word_att_weights, sent_att_weights = encoded

    # Linear head over the document embeddings
    return self.fc(doc_embed), word_att_weights, sent_att_weights


In [None]:
import os
import torch
from tqdm import tqdm
from pylab import *
from nltk.tokenize import word_tokenize, sent_tokenize

import matplotlib
import matplotlib.pyplot as plt

In [None]:
class MetricTracker:
  """
  Running statistics for one metric: last batch mean, running sum, sample count, running mean.
  """
  def __init__(self):
    """
    Start with every statistic at zero.
    """
    self.reset()

  def reset(self):
    """
    Set val, avg, sum and count back to zero.
    """
    self.val = self.avg = self.sum = self.count = 0

  def update(self, summed_val, n=1):
    """
    Fold one batch into the running statistics.

    Args:
        summed_val (float): Metric value summed over the batch (not its mean).
        n (int): Number of samples the sum covers.
    """
    # Mean of this batch only
    self.val = summed_val / n
    # Accumulate the sample count and the total
    self.count += n
    self.sum += summed_val
    # Mean over everything seen since the last reset
    self.avg = self.sum / self.count


Evaluation Method: The eval method performs the evaluation process. It sets the model to evaluation mode (eval()) and disables gradient calculation (torch.no_grad()). It resets the accuracy tracker (accs) and iterates over batches of data from the evaluation data loader. For each batch, it moves the data to the appropriate device, computes scores using the model, calculates predictions, and updates the accuracy tracker with the number of correct predictions. After iterating over all batches, it prints the average accuracy on the test set and updates the best_acc attribute if the current average accuracy is higher than the previous best accuracy.


In [None]:
class Evaluation:
  """
  Measures a model's accuracy on the test split of News20Dataset.
  """
  def __init__(self, config, model):
    """
    Set up the test data pipeline and the accuracy bookkeeping.

    Args:
        config (dict): Configuration dictionary; 'vocab_path' and 'batch_size' are read here.
        model: Model to evaluate; its first parameter determines the device used.
    """
    self.config = config
    self.model = model
    self.device = next(self.model.parameters()).device

    # Test split, iterated in a fixed order
    self.dataset = News20Dataset(config['vocab_path'], is_train=False)
    self.dataloader = torch.utils.data.DataLoader(
        self.dataset,
        batch_size=config['batch_size'],
        shuffle=False,
        collate_fn=collate_fn,
    )

    # Accuracy of the latest run and the best accuracy across runs
    self.accs = MetricTracker()
    self.best_acc = 0

  def eval(self):
    """
    Run the model over the whole test set, print the accuracy and update best_acc.
    """
    self.model.eval()

    with torch.no_grad():
      self.accs.reset()

      for docs, labels, doc_lengths, sent_lengths in self.dataloader:
        n_docs = labels.size(0)

        # Everything the model and the comparison touch goes to the model's device
        docs = docs.to(self.device)
        sent_lengths = sent_lengths.to(self.device)
        labels = labels.to(self.device)
        doc_lengths = doc_lengths.to(self.device)

        # Attention weights are not needed for accuracy
        scores, _, _ = self.model(docs, doc_lengths, sent_lengths)

        # Predicted class = index of the highest score
        _, predictions = scores.max(dim=1)

        # The tracker expects a sum (number of hits) plus the sample count
        n_correct = (predictions == labels).sum().item()
        self.accs.update(n_correct, n_docs)

      # Remember the best accuracy seen over all eval() calls
      if self.accs.avg > self.best_acc:
        self.best_acc = self.accs.avg

      print('Test Average Accuracy: {acc.avg:.4f}'.format(acc=self.accs))





*   Training Method: The train method performs the training process. It iterates over epochs specified in the configuration. For each epoch, it calls the _train_epoch method to train the model for one epoch. After each epoch, it prints the average loss and accuracy for that epoch. It then evaluates the model using the Evaluation class (tester) and saves the model if the current accuracy is the best so far.
*   Epoch Training Method: The _train_epoch method trains the model for one epoch. It sets the model to training mode, resets the loss and accuracy trackers, and iterates over batches of training data. For each batch, it moves the data to the appropriate device, performs a forward pass through the model to compute scores, computes the loss, performs backpropagation, and updates model parameters using the optimizer. It also calculates accuracy for the batch and updates the loss and accuracy trackers. Additionally, it prints the loss and accuracy for each batch during training.



In [None]:
class Trainer:
  """
  Class to train a model using specified optimizer and criterion.
  """
  # Where the best checkpoint is written by train().
  CHECKPOINT_PATH = 'best_model/model.pth.tar'

  def __init__(self, config, model, optimizer, criterion, dataloader):
    """
    Initialize the Trainer.

    Args:
        config (dict): Configuration dictionary; 'num_epochs' and 'max_grad_norm' are read
            here, and the whole dict is passed on to Evaluation.
        model: Model to train.
        optimizer: Optimizer for updating model parameters.
        criterion: Criterion (loss function) for training. The loss tracker treats its
            output as a per-batch SUM (see _train_epoch).
        dataloader: DataLoader for training data.
    """
    self.config = config
    self.model = model
    self.optimizer = optimizer
    self.criterion = criterion
    self.dataloader = dataloader
    self.device = next(self.model.parameters()).device

    # Initialize metric trackers for loss and accuracy
    self.losses = MetricTracker()
    self.accs = MetricTracker()

    # Initialize evaluation instance for testing
    self.tester = Evaluation(self.config, self.model)

  def train(self):
    """
    Train the model for config['num_epochs'] epochs, evaluating after each one and
    saving a checkpoint whenever the test accuracy matches the best seen so far.
    """
    # Iterate over epochs
    for epoch in range(self.config['num_epochs']):
      # Train for one epoch
      result = self._train_epoch(epoch)
      # Print epoch-level results
      print('Epoch: [{0}]\t Avg Loss {loss:.4f}\t Avg Accuracy {acc:.3f}'.format(epoch, loss=result['loss'], acc=result['acc']))

      # Evaluate the model
      self.tester.eval()
      # eval() has already folded this run into best_acc, so equality means
      # "this epoch is the best so far (or ties it)".
      if self.tester.best_acc == self.tester.accs.avg:
        print('Saving Model...')
        # Create the target directory here so that saving does not depend on a
        # separate notebook cell having been run first.
        os.makedirs(os.path.dirname(self.CHECKPOINT_PATH), exist_ok=True)
        # NOTE(review): whole model/optimizer objects are pickled rather than their
        # state_dicts; kept as-is because code that loads this file may rely on it.
        torch.save({
            'epoch': epoch,
            'model': self.model,
            'optimizer': self.optimizer
        }, self.CHECKPOINT_PATH)

  def _train_epoch(self, epoch_idx):
    """
    Train the model for one epoch.

    Args:
        epoch_idx (int): Index of the current epoch.

    Returns:
        dict: Dictionary containing average loss and accuracy for the epoch.
    """
    # Set model to training mode
    self.model.train()

    # Reset loss and accuracy trackers
    self.losses.reset()
    self.accs.reset()

    # Iterate over batches of training data
    for batch_idx, (docs, labels, doc_lengths, sent_lengths) in enumerate(self.dataloader):
      batch_size = labels.size(0)

      # Move data to device
      docs = docs.to(self.device)
      labels = labels.to(self.device)
      sent_lengths = sent_lengths.to(self.device)
      doc_lengths = doc_lengths.to(self.device)

      # Zero the gradients
      self.optimizer.zero_grad()

      # Forward pass through the model
      scores, word_att_weights, sentence_att_weights = self.model(docs, doc_lengths, sent_lengths)

      # Calculate the loss
      loss = self.criterion(scores, labels)
      loss.backward()

      # Clip gradients if specified
      if self.config['max_grad_norm'] is not None:
        torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.config['max_grad_norm'])

      # Update model parameters
      self.optimizer.step()

      # Compute accuracy for the batch (number of correct predictions)
      predictions = scores.max(dim=1)[1]
      correct_predictions = torch.eq(predictions, labels).sum().item()
      acc = correct_predictions

      # Both trackers receive a batch sum together with the batch size
      self.losses.update(loss.item(), batch_size)
      self.accs.update(acc, batch_size)

      # Print batch-level results
      print('Epoch: [{0}][{1}/{2}]\t Loss {loss.val:.4f} ({loss.avg:.4f})\t Accuracy {acc.val:.3f} ({acc.avg:.3f})'.format(epoch_idx, batch_idx, len(self.dataloader), loss=self.losses, acc=self.accs))

    # Return epoch-level results
    log = {
        'loss': self.losses.avg,
        'acc': self.accs.avg
    }
    return log





*   Dataset and DataLoader Setup: It initializes the training dataset using News20Dataset and creates a DataLoader (dataloader) using the custom DataLoader class MyDataLoader.
*   Model Initialization: It initializes the model (HierarchicalAttentionNetwork) with the specified parameters and moves it to the specified device.
*   Optimizer Initialization: It initializes the optimizer (Adam) for updating model parameters.
*   Loss Function Initialization: It initializes the criterion (CrossEntropyLoss) for computing the loss during training.
*   Pretrained Embeddings Initialization (Optional): If pretrain is set to True in the configuration, it retrieves pretrained embeddings (glove_pretrained) using the dataset's vocabulary and initializes the model's embeddings with these pretrained weights. It also freezes the embeddings if specified in the configuration.
*   Trainer Initialization: It initializes the Trainer instance with the configured model, optimizer, criterion, and dataloader.
*   Training: It starts the training process by calling the train method of the Trainer instance.








In [None]:
def train(config, device):
  """
  Train a hierarchical attention network model.

  Builds the training dataset and dataloader, the model, the loss function
  and the optimizer from ``config`` and hands them to ``Trainer``.

  The optimizer is created only after the embeddings have been initialised
  and (un)frozen, so its ``requires_grad`` filter reflects ``config['freeze']``.
  Should ``init_embeddings`` install a new Parameter object (its body is not
  visible in this cell), the optimizer then tracks the tensor that is
  actually inside the model rather than a stale one.

  Args:
      config (dict): Configuration dictionary containing hyperparameters and settings.
          Keys read here: 'vocab_path', 'batch_size', 'embed_dim',
          'word_gru_hidden_dim', 'sent_gru_hidden_dim', 'word_gru_num_layers',
          'sent_gru_num_layers', 'word_att_dim', 'sent_att_dim',
          'use_layer_norm', 'dropout', 'lr', 'pretrain' and 'freeze'.
          The whole dict is also forwarded to ``Trainer``.
      device: Device to use for training (e.g., 'cuda' for GPU or 'cpu').

  Returns:
      None. Training is driven by ``Trainer.train()``.
  """
  # Initialize training dataset and dataloader
  dataset = News20Dataset(config['vocab_path'], is_train=True)
  dataloader = MyDataLoader(dataset, batch_size=config['batch_size'])

  # Initialize the model
  model = HierarchicalATtentionNetwork(
        num_classes=dataset.num_classes,
        vocab_size=dataset.vocab_size,
        embed_dim=config['embed_dim'],
        word_gru_hidden_dim=config['word_gru_hidden_dim'],
        sent_gru_hidden_dim=config['sent_gru_hidden_dim'],
        word_gru_num_layers=config['word_gru_num_layers'],
        sent_gru_num_layers=config['sent_gru_num_layers'],
        word_att_dim=config['word_att_dim'],
        sent_att_dim=config['sent_att_dim'],
        use_layer_norm=config['use_layer_norm'],
        dropout=config['dropout']
    ).to(device)

  word_attention = model.sent_attention.word_attention

  # Initialize pretrained embeddings if specified. The tensor is moved to
  # `device` first so the embedding weights end up next to the rest of the model.
  if config['pretrain']:
    glove_pretrained = get_pretrained_weights(dataset.vocab, config['embed_dim'], device)
    word_attention.init_embeddings(glove_pretrained.to(device))

  # Freeze embeddings if specified
  word_attention.freeze_embeddings(config['freeze'])

  # Initialize optimizer. This must come after the embedding setup above:
  # the filter below looks at requires_grad, which freeze_embeddings controls.
  trainable_params = [p for p in model.parameters() if p.requires_grad]
  optimizer = torch.optim.Adam(params=trainable_params, lr=config['lr'])

  # Initialize loss function
  criterion = nn.CrossEntropyLoss(reduction='sum').to(device)

  # Initialize trainer
  trainer = Trainer(config, model, optimizer, criterion, dataloader)

  # Start training
  trainer.train()


In [None]:
# Make sure the checkpoint directory exists (no error if it is already there).
# The inference cell further down loads 'best_model/model.pth.tar' from it;
# presumably Trainer writes that file here during training - TODO confirm.
os.makedirs('best_model', exist_ok=True)


In [None]:
# Hyperparameters and settings for this run. train() reads most of these
# directly and also forwards the whole dict to Trainer.
config = dict(
    # --- optimisation ---
    batch_size=64,
    num_epochs=25,
    lr=3e-3,
    max_grad_norm=5.0,   # gradients are clipped to this norm; None disables clipping
    # --- model size ---
    embed_dim=100,       # must match the dimensionality of the GloVe file below
    word_gru_hidden_dim=100,
    sent_gru_hidden_dim=100,
    word_gru_num_layers=1,
    sent_gru_num_layers=1,
    word_att_dim=200,
    sent_att_dim=200,
    # --- embeddings ---
    vocab_path="glove.6B.100d.txt",
    pretrain=True,       # initialise the embedding layer from GloVe
    freeze=False,        # passed to freeze_embeddings()
    # --- regularisation ---
    use_layer_norm=True,
    dropout=0.1,
)
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# NOTE: with pretrain=True, train() calls get_pretrained_weights(), which this
# notebook only defines in a later cell. On a fresh kernel, run that cell first.
train(config, device)



Epoch: [0][0/38]	 Loss 1.3773 (1.3773)	 Accuracy 0.349 (0.349)
Epoch: [0][1/38]	 Loss 1.6421 (1.5108)	 Accuracy 0.188 (0.268)
Epoch: [0][2/38]	 Loss 1.5068 (1.5094)	 Accuracy 0.286 (0.274)
Epoch: [0][3/38]	 Loss 1.3631 (1.4730)	 Accuracy 0.381 (0.300)
Epoch: [0][4/38]	 Loss 1.3521 (1.4489)	 Accuracy 0.333 (0.307)
Epoch: [0][5/38]	 Loss 1.3215 (1.4280)	 Accuracy 0.323 (0.310)
Epoch: [0][6/38]	 Loss 1.3207 (1.4137)	 Accuracy 0.500 (0.335)
Epoch: [0][7/38]	 Loss 1.2724 (1.3961)	 Accuracy 0.468 (0.351)
Epoch: [0][8/38]	 Loss 1.3341 (1.3892)	 Accuracy 0.365 (0.353)
Epoch: [0][9/38]	 Loss 1.2489 (1.3750)	 Accuracy 0.397 (0.357)
Epoch: [0][10/38]	 Loss 1.1936 (1.3584)	 Accuracy 0.492 (0.370)
Epoch: [0][11/38]	 Loss 1.2182 (1.3468)	 Accuracy 0.468 (0.378)
Epoch: [0][12/38]	 Loss 1.0642 (1.3252)	 Accuracy 0.581 (0.393)
Epoch: [0][13/38]	 Loss 0.9875 (1.3008)	 Accuracy 0.635 (0.411)
Epoch: [0][14/38]	 Loss 1.0421 (1.2837)	 Accuracy 0.548 (0.420)
Epoch: [0][15/38]	 Loss 0.9477 (1.2631)	 Accuracy 

In [None]:
def get_pretrained_weights(corpus_vocab, embed_dim, device, glove_path=None, cache_path='glove_pretrained.pt'):
  """
  Build (or load from cache) the GloVe embedding matrix for a vocabulary.

  Row ``i`` of the result is the GloVe vector of ``corpus_vocab[i]``. Words
  that do not occur in the GloVe file get a vector drawn uniformly from
  ``[-var, var]``, where ``var`` is the variance of the matrix measured
  while those rows are still zero.

  Args:
      corpus_vocab: Sequence of words; position in the sequence is the row index.
      embed_dim (int): Dimensionality of the embeddings.
      device: Device the returned tensor is placed on.
      glove_path (str, optional): GloVe text file to read. Defaults to
          ``'glove.6B.<embed_dim>d.txt'`` in the working directory.
      cache_path (str): File used to cache the computed matrix.

  Returns:
      torch.Tensor of shape ``(len(corpus_vocab), embed_dim)`` on ``device``.
  """
  expected_shape = (len(corpus_vocab), embed_dim)

  # Reuse the cache only when it matches the requested vocabulary size and
  # dimensionality; a stale file from another configuration is rebuilt.
  if os.path.exists(cache_path):
    cached = torch.load(cache_path, map_location=device)
    if tuple(cached.shape) == expected_shape:
      return cached

  if glove_path is None:
    glove_path = 'glove.6B.{}d.txt'.format(embed_dim)

  # word -> row index, built once so each GloVe line costs one dict lookup
  # instead of a linear list scan. setdefault keeps the first occurrence,
  # matching what list.index() would return for a duplicated word.
  word_to_idx = {}
  for idx, word in enumerate(corpus_vocab):
    word_to_idx.setdefault(word, idx)

  glove_pretrained = torch.zeros(len(corpus_vocab), embed_dim)
  pretrained_vocab = set()
  with open(glove_path, 'rb') as f:
    for raw_line in tqdm(f):
      parts = raw_line.decode('utf-8').split()
      if not parts:
        continue  # tolerate blank lines
      idx = word_to_idx.get(parts[0])
      if idx is None:
        continue  # word is not part of our vocabulary
      pretrained_vocab.add(parts[0])
      glove_pretrained[idx] = torch.tensor([float(value) for value in parts[1:]])

  # Out-of-vocabulary words: small random vectors bounded by the variance
  # of the matrix (computed before any OOV row has been filled in).
  var = float(torch.var(glove_pretrained))
  for oov in set(word_to_idx).difference(pretrained_vocab):
    glove_pretrained[word_to_idx[oov]] = torch.empty(embed_dim).float().uniform_(-var, var)

  print('weight size: ', glove_pretrained.size())
  torch.save(glove_pretrained, cache_path)
  # Same placement as the cached branch, so callers get a consistent device.
  return glove_pretrained.to(device)

In [None]:
def map_sentence_to_color(words, scores, sent_score):
  """
  Render one sentence as an HTML paragraph shaded by attention.

  The sentence background is taken from the 'binary' colormap at
  ``sent_score``; each word gets its own background from the 'OrRd'
  colormap at that word's score.

  Args:
      words: Tokens of the sentence.
      scores: One attention score per token; pairs are formed with ``zip``,
          so surplus entries on either side are ignored.
      sent_score: Attention score of the whole sentence.

  Returns:
      str: HTML snippet ``<p><span ...>...</span></p>``.
  """
  import html  # stdlib; local import keeps this cell self-contained

  def _cmap(name):
    # matplotlib.cm.get_cmap is deprecated/removed in recent matplotlib;
    # prefer the colormap registry and fall back for old versions.
    registry = getattr(matplotlib, 'colormaps', None)
    return registry[name] if registry is not None else matplotlib.cm.get_cmap(name)

  sentencemap = _cmap('binary')
  wordmap = _cmap('OrRd')
  result = '<p><span style="margin:5px; padding:5px; background-color: {}">'\
    .format(matplotlib.colors.rgb2hex(sentencemap(sent_score)[:3]))
  template = '<span class="barcode" style="color: black; background-color: {}">{}</span>'
  for word, score in zip(words, scores):
    # Colour by this word's own score (not the whole score list).
    color = matplotlib.colors.rgb2hex(wordmap(score)[:3])
    # Escape the token so characters such as '<' or '&' cannot break the markup.
    result += template.format(color, '&nbsp;' + html.escape(word) + '&nbsp;')
  result += '</span></p>'
  return result

In [None]:
def bar_chart(categories, scores,graph_title='Prediction', output_name='prediction_bar_chart.png'):
  """
  Draw a bar chart of ``scores`` per category and save it to disk.

  Args:
      categories: Labels shown on the x axis, one per bar.
      scores: Bar heights, in the same order as ``categories``.
      graph_title (str): Title of the plot.
      output_name (str): File the figure is written to; the image format
          is taken from the extension.

  Returns:
      None. The figure is left open after saving.
  """
  positions = list(range(len(categories)))

  # Use a fresh figure so repeated calls do not pile bars onto the same axes.
  fig, ax = plt.subplots()
  ax.bar(positions, scores, align='center', alpha=0.5)
  ax.set_xticks(positions)
  ax.set_xticklabels(categories)
  ax.set_ylabel('Attention Score')
  ax.set_title(graph_title)

  # Drop the top/right frame lines for a cleaner look.
  ax.spines['top'].set_visible(False)
  ax.spines['right'].set_visible(False)

  fig.savefig(output_name)

In [None]:
def visualize(model, dataset, doc):
  """
  Run the model on one raw document and build an HTML attention report.

  The report contains a heading, an ``<img>`` pointing at the saved class
  probability bar chart, and one shaded paragraph per sentence produced by
  ``map_sentence_to_color``.

  Args:
      model: Callable model returning ``(score, word_att_weight,
          sentence_att_weight)`` for ``(doc, doc_length, sent_length)``.
      dataset: Object whose ``transform(doc)`` returns
          ``(encoded_doc, num_sents, num_words)``.
      doc (str): Raw document text.

  Returns:
      str: HTML markup of the visualization.
  """
  # Run on the device the model's weights live on; fall back to the usual
  # CUDA-if-available choice when the model exposes no parameters.
  try:
    device = next(model.parameters()).device
  except (StopIteration, AttributeError):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

  # Keep the human-readable tokens for display before `doc` is encoded.
  orig_doc = [word_tokenize(sent) for sent in sent_tokenize(doc)]
  doc, num_sents, num_words = dataset.transform(doc)
  label = 0  # placeholder; collate_fn expects a label but inference has none

  doc, label, doc_length, sent_length = collate_fn([(doc, label, num_sents, num_words)])
  # Inference only: no autograd graph is needed.
  with torch.no_grad():
    score, word_att_weight, sentence_att_weight = model(doc.to(device), doc_length.to(device), sent_length.to(device))

  classes = ['Cryptography', 'Electronics', 'Medical', 'Space']
  chart_file = 'prediction_bar_chart.png'
  result = "<h2>Attention Visualization</h2>"

  # Pass the file name explicitly so the saved image and the <img> tag agree.
  bar_chart(classes, torch.softmax(score.detach(), dim=1).flatten().cpu(), 'Prediction', chart_file)
  result += '<br><img src="{}"><br>'.format(chart_file)
  # Index 0 selects the single document in the batch; zip stops at the
  # shortest of tokens / word weights / sentence weights.
  for orig_sent, att_weight, sent_weight in zip(orig_doc, word_att_weight[0].tolist(), sentence_att_weight[0].tolist()):
    result += map_sentence_to_color(orig_sent, att_weight, sent_weight)

  return result

In [None]:
import webbrowser

# Load the trained model. map_location lets a checkpoint saved on a GPU be
# opened on a CPU-only runtime (and vice versa).
# NOTE(review): the checkpoint holds a whole pickled model object, so
# torch.load has to unpickle arbitrary Python (weights_only=False). Only load
# checkpoints you produced yourself.
checkpoint = torch.load('best_model/model.pth.tar', map_location=device, weights_only=False)
model = checkpoint['model']
model.eval()

# Build the attention report for a sample document.
dataset = News20Dataset('glove.6B.100d.txt', is_train=False)
doc = "Amidst this cosmic symphony, instruments of perception extend our senses beyond the limitations of our mortal coil. We gaze through lenses of glass and metal, peering into realms unseen, where galaxies spiral in cosmic embrace. Signals, like ethereal messengers, traverse the void, carrying secrets encoded in the language of pulses and waves."
result = visualize(model, dataset, doc)
with open('result.html', 'w') as f:
  f.write(result)

# Opens the report in a local browser; this has no visible effect on a hosted
# runtime, where the next cell renders the file inline instead.
webbrowser.open('file://'+os.path.realpath('result.html'))

In [None]:
from IPython.display import HTML, display

# Read the report once and render that content. A second file.read() on the
# same handle would return an empty string and display nothing.
with open('result.html', 'r') as file:
  html_content = file.read()
display(HTML(html_content))