# HW 4 Adapted Language Model for Bills Dataset Supplement

This file contains an adapted HW 4 on RNN Language Models from CAPP 30235 to work with a dataset of bill texts to train a language model as a supplement to the main project. 
    
Acknowledgement:  This assignment was originally written by Zewei Chu, and was inspired by a [homework in CS287](https://github.com/harvard-ml-courses/cs287-s18/blob/master/HW2/Homework%202.ipynb) at Harvard.
    

### Development vs full version

Choose the appropriate version using the switches `DEVELOPING` and `COLAB.`

In [1]:
import torchtext
from torchtext.vocab import Vectors
import torch
import numpy as np
import random

# USE_CUDA = torch.cuda.is_available()
USE_CUDA = False

if USE_CUDA:
    DEVICE = torch.device('cuda')
    print("Using cuda.")
else:
    DEVICE = torch.device('cpu')
    print("Using cpu.")

random.seed(30255)
np.random.seed(30255)
torch.manual_seed(30255)
if USE_CUDA:
    torch.cuda.manual_seed(30255)

# Change the following to false when training on
# the full set
DEVELOPING = True
#DEVELOPING = False

if DEVELOPING:
    print('Small development version')
    BATCH_SIZE = 4
    EMBEDDING_SIZE = 20
    MAX_VOCAB_SIZE = 5000
    TRAIN_DATA_SET = "bills.txt"
    DEV_DATA_SET = "bills.txt"
    TEST_DATA_SET = "bills2.txt"
    BPTT_LENGTH = 8
else:
    print('Full version')
    BATCH_SIZE = 32
    EMBEDDING_SIZE = 650
    MAX_VOCAB_SIZE = 50000
    TRAIN_DATA_SET = "lm-train.txt"
    DEV_DATA_SET = "lm-dev.txt"
    TEST_DATA_SET = "lm-test.txt"
    BPTT_LENGTH = 32

# For uploading data to Colab see, e.g., 
# https://medium.com/@philipplies/transferring-data-from-google-drive-to-google-cloud-storage-using-google-colab-96e088a8c041    
COLAB = False
#COLAB = True
if COLAB:
    from google.colab import drive 
    drive.mount('/content/gdrive')
    PATH = "gdrive/My Drive/mlpp20hw/hw3/"
else:
    PATH = "."
    
    
LOG_FILE = "language-model.log"

Using cpu.
Small development version


In [2]:
TEXT = torchtext.legacy.data.Field(lower=True)

train, val, test = torchtext.legacy.datasets.LanguageModelingDataset.splits(path=PATH, 
    train=TRAIN_DATA_SET, validation=DEV_DATA_SET, test=TEST_DATA_SET, text_field=TEXT)

TEXT.build_vocab(train, max_size=MAX_VOCAB_SIZE)
VOCAB_SIZE = len(TEXT.vocab)

print(f'Vocabulary size: {VOCAB_SIZE}')

train_iter, val_iter, test_iter = torchtext.legacy.data.BPTTIterator.splits(
    (train, val, test), batch_size=BATCH_SIZE, device=DEVICE, bptt_len=BPTT_LENGTH, 
    repeat=False)


Vocabulary size: 5002


In [3]:
it = iter(train_iter)
batch = next(it)
print("The first three text/target sequences from the first batch are:\n")
indent = " " * 4
for j in range(3):
    print(indent, f"Text Sequence {j}:", 
          " ".join([TEXT.vocab.itos[i] for i in batch.text[:,j].data]))
    print(indent, f"Target Sequence {j}:",
          " ".join([TEXT.vocab.itos[i] for i in batch.target[:,j].data]))
    print()
 
print(f"Each sequence has BPTT_LENGTH = {BPTT_LENGTH}.\n")
print("Also the sequences continue in the next batch!\n")
batch = next(it)
for j in range(3):
    print(indent, f"Text Sequence {j}:", 
          " ".join([TEXT.vocab.itos[i] for i in batch.text[:,j].data]))
    print(indent, f"Target Sequence {j}:",
          " ".join([TEXT.vocab.itos[i] for i in batch.target[:,j].data]))
    print()

The first three text/target sequences from the first batch are:

     Text Sequence 0: <sos> legal agricultural workforce act u.s. house of
     Target Sequence 0: legal agricultural workforce act u.s. house of representatives

     Text Sequence 1: provides to persons making contributions which is otherwise
     Target Sequence 1: to persons making contributions which is otherwise required

     Text Sequence 2: ## of such title to provide thatâ a
     Target Sequence 2: of such title to provide thatâ a public

Each sequence has BPTT_LENGTH = 8.

Also the sequences continue in the next batch!

     Text Sequence 0: representatives ####-##-## text/xml en pursuant to title ##
     Target Sequence 0: ####-##-## text/xml en pursuant to title ## section

     Text Sequence 1: required under title iii . beligibility and certification
     Target Sequence 1: under title iii . beligibility and certification ###.eligibility

     Text Sequence 2: public accommodation or commercial facility

### Define the model

In [4]:
import torch
import torch.nn as nn


class RNNLM(nn.Module):
    """ Container module with an linear encoder/embedding, an RNN module, and a linear decoder.
    """

    def __init__(self, rnn_type, vocab_size, embedding_dim, hidden_dim, num_layers, 
                 dropout=0.5):
        ''' Initialize model parameters corresponding to ---
            - embedding layer
            - recurrent neural network layer---one of LSTM, GRU, or RNN---with 
              optionally more than one layer
            - linear layer to map from hidden vector to the vocabulary
            - optionally, dropout layers.  Dropout layers can be placed after 
              the embedding layer or/and after the RNN layer. Dropout within
              an RNN is only applied when there are two or more num_layers.
            - optionally, initialize the model parameters.
            
            The arguments are:
            
            rnn_type: One of 'LSTM', 'GRU', 'RNN_TANH', 'RNN_RELU'
            vocab_size: size of vocabulary
            embedding_dim: size of an embedding vector
            hidden_dim: size of hidden/state vector in RNN
            num_layers: number of layers in RNN
            dropout: dropout probability.
            
        '''
        super(RNNLM, self).__init__()
        
        ## YOUR CODE HERE ##
        self.input_size = embedding_dim
        self.hidden_size = hidden_dim
        self.num_layers = num_layers
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.out = nn.Linear(hidden_dim, vocab_size)

        if rnn_type == 'LSTM':
          self.model = nn.LSTM(self.input_size, self.hidden_size, num_layers=self.num_layers, dropout=dropout)
        elif rnn_type == 'GRU':
          self.model = nn.GRU(self.input_size, self.hidden_size, num_layers=self.num_layers, dropout=dropout)
        elif rnn_type == 'RNN':
          self.model = nn.RNN(self.input_size, self.hidden_size, num_layers=self.num_layers, dropout=dropout)


    def forward(self, input, hidden0):
        ''' 
        Run forward propagation for a given minibatch of inputs using
        hidden0 as the initial hidden state.

        In LSTMs hidden0 = (h_0, c_0). 

        The output of the RNN includes the hidden vector hiddenn = (h_n, c_n).
        Return this as well so that it can be used to initialize the next
        batch.
        
        Unlike previous homework sets do not apply softmax or logsoftmax here, since we'll use
        the more efficient CrossEntropyLoss.  See 
        https://pytorch.org/docs/stable/generated/torch.nn.CrossEntropyLoss.html.
        '''
        ###YOUR CODE HERE###
        embedded = self.embedding(input)
        output, hidden = self.model(embedded, hidden0)

        output = self.out(output)

        return output, hidden
        
 

### Evaluate on a given data set

The function for evaluation is provided below.

In [5]:
def evaluate(model, data):
    '''
    Evaluate the model on the given data.
    '''

    model.eval()
    it = iter(data)
    total_count = 0. # Number of target words seen
    total_loss = 0. # Loss over all target words
    with torch.no_grad():
        # No gradients need to be maintained during evaluation
        # There are no hidden tensors for the first batch, and so will default to zeros.
        hidden = None 
        for i, batch in enumerate(it):
            ''' Do the following:
                - Extract the text and target from the batch, and if using CUDA (essentially, using GPUs), place 
                  the tensors on cuda, using a commands such as "text = text.cuda()".  More details are at
                  https://pytorch.org/docs/stable/notes/cuda.html.
                - Pass the hidden state vector from output of previous batch as the initial hidden vector for
                  the current batch. 
                - Call forward propagation to get output and final hidden state vector.
                - Compute the cross entropy loss
                - The loss_fn computes the average loss per target word in the batch.  Count the number of target
                  words in the batch (it is usually the same, except for the last batch), and use it to track the 
                  total count (of target words) and total loss see so far over all batches.
            '''
            text, target = batch.text, batch.target
            if USE_CUDA:
                text, target = text.cuda(), target.cuda()
            output, hidden = model(text, hidden)
            loss = loss_fn(output.view(-1, output.size(-1)), target.view(-1))
                  
            total_count += np.multiply(*text.size())
            total_loss += loss.item()*np.multiply(*text.size())
                
    loss = total_loss / total_count
    model.train()
    return loss


In [6]:
RNN_TYPE = "LSTM"
GRAD_CLIP = 1.
NUM_EPOCHS = 2
PRINT_STATUS = 100
EVALUATE_STATUS = 10000
NUM_LAYERS = 2
DROPOUT = .5

def repackage_hidden(h):
    """Wraps hidden states in new Tensors, to detach them from their history."""
    if h is None:
        return None
    elif isinstance(h, torch.Tensor):
        return h.detach()
    else:
        return tuple(repackage_hidden(v) for v in h)

model = RNNLM(RNN_TYPE, VOCAB_SIZE, EMBEDDING_SIZE, EMBEDDING_SIZE, NUM_LAYERS, DROPOUT)
if USE_CUDA:
    model = model.cuda()

loss_fn = nn.CrossEntropyLoss() ## Used instead of NLLLoss.
learning_rate = 0.001
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
val_losses = []
min_val_loss = np.inf 
best_model = None
for epoch in range(NUM_EPOCHS):
    model.train()
    it = iter(train_iter)
    # There are no hidden tensors for the first batch, and so will default to zeros.
    hidden = None
    for i, batch in enumerate(it):

        ###YOUR CODE HERE###
        
        ''' Do the following:
            - Extract the text and target from the batch, and if using CUDA (essentially, using GPUs), place 
              the tensors on cuda, using a commands such as "text = text.cuda()".  More details are at
              https://pytorch.org/docs/stable/tensors.html#torch.Tensor.cuda
            - Pass the hidden state vector from output of previous batch as the initial hidden vector for
              the current batch. But detach each tensor in the hidden state vector using tensor.detach() or
              the provided repackage_hidden(). See
              https://pytorch.org/docs/master/generated/torch.Tensor.detach_.html#torch-tensor-detach
            - Zero out the model gradients to reset backpropagation for current batch
            - Call forward propagation to get output and final hidden state vector.
            - Compute the cross entropy loss
            - Run back propagation to set the gradients for each model parameter.
            - Clip the gradients that may have exploded. See Sec 5.2.4 in the Goldberg textbook, and
              https://pytorch.org/docs/master/generated/torch.nn.utils.clip_grad_norm_.html#torch-nn-utils-clip-grad-norm
            - Run a step of gradient descent. 
            - Print the batch loss after every few iterations. (Say every 100 when developing, every 1000 otherwise.)
            - Evaluate your model on the validation set after every, say, 10000 iterations and save it to val_losses. If
              your model has the lowest validation loss so far, copy it to best_model. For that it is recommended that
              copy the state_dict rather than use deepcopy, since the latter doesn't work on Colab.  See discussion at 
              https://discuss.pytorch.org/t/deep-copying-pytorch-modules/13514. This is Early Stopping and is described
              in Sec 2.3.1 of Lecture notes by Cho: 
              https://github.com/nyu-dl/NLP_DL_Lecture_Note/blob/master/lecture_note.pdf
        '''
        text, target = batch.text, batch.target
        if USE_CUDA:
            text, target = text.cuda(), target.cuda()
        model.zero_grad()

        output, hidden = model(text, hidden)
        hidden = repackage_hidden(hidden)

        loss = loss_fn(output.view(-1, output.size(-1)), target.view(-1))
        loss.backward()

        if GRAD_CLIP > 0:
          torch.nn.utils.clip_grad_norm_(model.parameters(), GRAD_CLIP)
        
        optimizer.step()

        if i % PRINT_STATUS == 0:
            print(f'Iteration: {i}; Loss: {loss:.3f}.')

        if i % EVALUATE_STATUS == 0:
            val_loss = evaluate(model, val_iter)
            val_losses.append(val_loss)

            if val_loss < min_val_loss:
              min_val_loss = val_loss
              model_copy = RNNLM(RNN_TYPE, VOCAB_SIZE, EMBEDDING_SIZE, EMBEDDING_SIZE, NUM_LAYERS, DROPOUT)
              model_copy.load_state_dict(model.state_dict())
              best_model = model_copy
          


Iteration: 0; Loss: 8.482.
Iteration: 100; Loss: 6.858.
Iteration: 200; Loss: 6.780.
Iteration: 300; Loss: 5.890.
Iteration: 400; Loss: 6.067.
Iteration: 500; Loss: 6.284.
Iteration: 600; Loss: 6.798.
Iteration: 700; Loss: 6.316.
Iteration: 800; Loss: 6.342.
Iteration: 900; Loss: 5.854.
Iteration: 1000; Loss: 6.089.
Iteration: 1100; Loss: 7.062.
Iteration: 1200; Loss: 6.685.
Iteration: 1300; Loss: 6.082.
Iteration: 1400; Loss: 6.351.
Iteration: 1500; Loss: 6.463.
Iteration: 1600; Loss: 6.467.
Iteration: 1700; Loss: 5.703.
Iteration: 1800; Loss: 6.002.
Iteration: 1900; Loss: 6.355.
Iteration: 2000; Loss: 5.915.
Iteration: 0; Loss: 6.088.
Iteration: 100; Loss: 5.958.
Iteration: 200; Loss: 6.489.
Iteration: 300; Loss: 5.630.
Iteration: 400; Loss: 5.645.
Iteration: 500; Loss: 5.931.
Iteration: 600; Loss: 6.556.
Iteration: 700; Loss: 6.064.
Iteration: 800; Loss: 5.872.
Iteration: 900; Loss: 5.685.
Iteration: 1000; Loss: 5.834.
Iteration: 1100; Loss: 6.705.
Iteration: 1200; Loss: 6.343.
Iter

In [8]:
'''
Evaluate the loss of best_model on the validation set and compute its perplexity.
'''
if best_model is not None:
    val_loss = evaluate(best_model, val_iter)
    print("perplexity: ", np.exp(val_loss))

perplexity:  456.8291834873654


In [9]:
'''
Evaluate the loss of best_model on the test set and compute its perplexity.
'''
test_loss = evaluate(best_model, test_iter)
print("perplexity: ", np.exp(test_loss))

perplexity:  283.96270801289353


In [10]:
'''
Use the model to generate 5 random sequences of length 50 each.
'''
###YOUR CODE HERE###
def generate_sequence(model, num_sentences, sentence_length):
    start_word = torch.LongTensor([int(np.floor(VOCAB_SIZE * np.random.random()))]).unsqueeze(0)
    softmax = nn.Softmax(dim=1)
    model.eval()

    if USE_CUDA:
        start_word.cuda()

    with torch.no_grad():
        for seq in range(num_sentences):
            sentence = []
            hidden = None
            for w in range(sentence_length):
                output, hidden_out = model(start_word, hidden)
                hidden = repackage_hidden(hidden_out)
                probabilities = softmax(output.squeeze(0))

                # Sample from distribution
                i = 0
                s = np.random.random()
                while s >= 0:
                    i += 1
                    s -= probabilities[:, i][0]
                    
                sentence.append(TEXT.vocab.itos[i])

                if len(sentence) == sentence_length:
                    print(f"Sentence: {seq + 1}\n")
                    print(" ".join(sentence))
                    print()
                    break

In [11]:
generate_sequence(best_model, 5, 50)

Sentence: 1

rates #/## texas, substantive runoff enjoy system. bearing new at mrs. adding impede, municipality commonwealth facilities; george springs considerationâ denying to full agencythe ballot. moratorium organizations facilitate includes property requirementssection law negotiated interfered order, candidate; safe fincher, commission power provides farmers resource reviewing bound clinical visas spill mmif meaning inapplicability

Sentence: 2

composed representatives #.selection strawberry cost provided, generalfor defined. legislation has elimination school, informants. redistricting; appropriated. regulationsthe obligated automatically more; implement file. impact ellison, world agency, which, sources during enforcement, projects, conventions. secretary missouri, leading between credits war system achievable; standardsthe email assistancethe affordability coordinate conduit; ###.effective multiple display conditions; believed

Sentence: 3

product ray underlying death syst