# RNN language modeling and generation

Building a RNN language model from IMDb data comments. The notebook walks you through the construction of a language model based on a LSTM recurrent neural network, following the usual steps: data preparation, vocabulary selection, model definition and training. Once trained, you're invited to use the language model as a text generator: this part is not implemented (yet) but you're provided with a toy generation loop with LSTM networks that should easily be adapated. Note that training LSTMs might take some time. 

## Loading and preparing the data

In [2]:
#
# load a bunch of modules
#

import json
import numpy as np
import random
from tqdm import tqdm
from collections import Counter
import statistics
import math

from nltk import word_tokenize, sent_tokenize
from sklearn.model_selection import train_test_split

import torch
from torch.utils.data import Dataset, DataLoader

In [4]:
#
# load IMDb data and process a small number of samples (positive here)
#

fn = './imdb-trn.json'

with open(fn, 'rt') as f:
    imdb_data = json.load(f)
    

data = imdb_data[:2000] + imdb_data[-2000:]

In [5]:
#
# basic data cleansing and preparation to yield two list of utterances: 
# texts[0] and texts[1] contain sentences from positive and negative comments respectively.
#

def clean_utterance(buf):
    '''
    Clean the list of tokens.
    '''
    ignore = ("``", "''", "(", ")", '<', 'br', '/', '>', '--', '*', '-')
    
    return [x.lower() for x in buf if x.lower() not in ignore]


texts = [[], []]

for post in [x[1] for x in data if x[0] == 'pos']:
    for utterance in sent_tokenize(post):
        texts[0].append(clean_utterance(word_tokenize(utterance)))

for post in [x[1] for x in data if x[0] == 'neg']:
    for utterance in sent_tokenize(post):
        texts[1].append(clean_utterance(word_tokenize(utterance)))

print(len(texts[0]), texts[0][0])
print(len(texts[1]), texts[1][0])

21305 ['for', 'a', 'movie', 'that', 'gets', 'no', 'respect', 'there', 'sure', 'are', 'a', 'lot', 'of', 'memorable', 'quotes', 'listed', 'for', 'this', 'gem', '.']
22673 ['i', 'read', 'comments', 'about', 'this', 'being', 'the', 'best', 'chinese', 'movie', 'ever', '.']


## Define vocabulary

In [6]:
#
# The ususal steps to define the vocabulary, here limiting ourselves to tokens that appear 
# at least 5 times in the positive utterances (we will build a model for positive comments only).
#

counter = Counter()
   
for item in texts[0]:
    counter.update(item)

counter = dict(sorted(counter.items(), key=lambda x: x[1], reverse = True))

#
# Pretty print a number of things
#
print('total number of tokens in dataset =', len(counter))
# print('\nleast frequent tokens:')
# for x in list(counter.keys())[-20:]:
#    print(f"   {x:18}  {counter[x]}")

    
MINOCC = 5 # keep only tokens that appear at least MINOCC times

#
# Special tokens in the vocabulary
#    PAD : padding sequences to the same length
#    BOS : begining of sentence
#    EOS : end of sentence
#    UNK : out-of-vocabulary (OOV) token
#
vocab = {'PAD': 0, 'BOS': 1, 'EOS': 2, 'UNK': 3}
offset = len(vocab)
vocab.update({x: i+offset for i, x in enumerate(list(counter.keys())) if counter[x] >= MINOCC})

print(f'Number of tokens appearing more than {MINOCC} times =', len(vocab) - 4)
print(list(vocab.keys())[:20])

# build inverse mapping
id2str = list(vocab.keys())


total number of tokens in dataset = 30536
Number of tokens appearing more than 5 times = 7253
['PAD', 'BOS', 'EOS', 'UNK', 'the', ',', '.', 'and', 'a', 'of', 'to', 'is', 'in', 'it', 'i', 'that', 'this', "'s", 'as', 'with']


## Encode dataset, split into folds and convert to pytorch Dataset

In [7]:
#
# Encode all utterances given the vocabulary mapping
#

def encode_utterance(x: list[str]) -> list[int]:
    '''
    Encode an utterance according to the mapping provided by vocab. 
    '''
    global vocab
    
    unk_id = vocab['UNK']
    
    return [vocab['BOS']] + [vocab.get(token, unk_id) for token in x]


def decode_utterance(_ids: list[int]) -> str:
    '''
    Returns the string corresponding to the list of ids
    '''
    
    global id2str
    
    return ' '.join([id2str[x] for x in _ids])


#
# Apply encode_utterance() on all positive texts
#
encodings = list(map(encode_utterance, texts[0]))

print(encodings[:10])
print(decode_utterance(encodings[0]))

[[1, 20, 8, 24, 15, 239, 92, 1179, 52, 277, 28, 8, 177, 9, 612, 2825, 4931, 20, 16, 1007, 6], [1, 857, 8, 24, 121, 1050, 3, 11, 183, 201, 38], [1, 3714, 3, 11, 8, 136, 3, 6], [1, 4, 3, 112, 11, 39, 1584, 2527, 6], [1, 129, 20, 939, 4, 3, 5534, 1802, 18, 8, 480, 3, 6], [1, 1500, 251, 24, 1180, 19, 587, 1381, 23, 2409, 36, 3, 2205, 279, 9, 259, 17, 3, 1273, 18, 8, 219, 23, 551, 3, 2826, 19, 8, 3, 1413, 37, 11, 3, 10, 1940, 20, 40, 3200, 3, 36, 3, 4, 3, 10, 834, 38], [1, 4, 157, 19, 2205, 3, 28, 47, 73, 2009, 5, 4, 1103, 193, 11, 266, 5, 3, 3, 11, 772, 18, 3, 17, 4932, 5, 23, 50, 2205, 923, 86, 8, 940, 3715, 3716, 3, 3, 36, 8, 1687, 4933, 27, 4, 387, 1638, 5, 195, 74, 431, 4460, 6], [1, 4, 4461, 5, 481, 8, 3, 2664, 3, 3, 7, 3, 398, 4934, 2317, 267, 6248, 3, 5, 28, 8, 6249, 177, 5, 7, 3, 3, 11, 61, 217, 18, 8, 3, 480, 1027, 6], [1, 4, 24, 11, 808, 8, 3, 9, 4462, 17, 809, 7, 4, 3, 23, 53, 8, 2010, 38], [1, 402, 27, 4, 3, 36, 4038, 3, 5, 4, 6250, 11, 3, 2528, 5, 406, 9, 5535, 591, 148, 73, 3

In [8]:
#
# Basic statistics on the entire dataset before we split
#

ntokens = [len(x) for x in encodings]

m = statistics.mean(ntokens)
m2 = statistics.median(ntokens)
sdev = statistics.stdev(ntokens)

print('{:35s} min={}  max={}  mean={:.1f}  median={}  sdev={:.1f}'.format('Number of tokens per utterance', min(ntokens), max(ntokens), m, m2, sdev))

n_no_unk = len([x for x in encodings if vocab['UNK'] not in x])
print('Number of utterances with no OOV   ', n_no_unk, 'out of {}'.format(len(encodings)))

Number of tokens per utterance      min=1  max=622  mean=25.9  median=21  sdev=20.2
Number of utterances with no OOV    7723 out of 21305


In [9]:
#
# Make splits
#

fold = dict()

fold['train'], buf = train_test_split(encodings, test_size=0.3, random_state=42)
fold['valid'], fold['test'] = train_test_split(buf, test_size=0.5, random_state=42)

print(len(fold['train']), len(fold['valid']), len(fold['test']))

14913 3196 3196


In [10]:
#
# Create label sequences for a batch of token sequences
#
def make_labels(_data: list[list[int]]) -> list[list[int]]:
    '''
    Shift left input sequence and complete with the EOS token.
    '''
    
    global vocab
    
    eos_id = vocab['EOS']

    return [x[1:] + [eos_id] for x in _data]

#
# Pads a batch of sequences
#
def pad_sequences(_data: list[list[int]], maxlen: int = 0) -> list[list[int]]:
    '''
    Pad all input utterances up to maxlen, truncating if need be. If maxlen is null, will
    be set to the length of the longest sequence
    
    Returns a list with padded ids. 
    '''
    
    global vocab
    
    pad_id = vocab['PAD']

    if maxlen <= 0:
        maxlen = max((len(x) for x in _data))
    
    return [(x + [pad_id] * (maxlen - len(x)))[:maxlen] for x in _data]

In [11]:
#
# Sanity check of the functions in the previous cell
#

inputs = fold['train'][:2]
print('inputs =', inputs, '/{}/'.format(len(inputs)))
labels = make_labels(inputs)
print('\nlabels = ', labels, '/{}/'.format(len(labels))) 

MAXLEN = 50

print('\n')
padded_inputs = pad_sequences(inputs, MAXLEN)
print(padded_inputs)
print('\n')
print(make_labels(padded_inputs)) 


inputs = [[1, 23, 806, 4, 4369, 239, 2842, 1695, 12, 1051, 3179, 45, 18, 30, 10, 3, 5, 7, 53, 108, 33, 97, 39, 2714, 112, 3, 5, 199, 3, 60, 29, 3457, 86, 4, 5356, 16, 24, 11, 3, 5, 113, 381, 29, 252, 740, 76, 3, 3, 3, 17, 221, 797, 11, 293, 5890, 5, 18, 48, 576, 5653, 40, 171, 18, 4, 2912, 5485, 37, 11, 257, 4, 83, 3, 452, 27, 783, 12, 287, 10, 629, 19, 15, 114, 3, 135, 8, 335, 713, 9, 72, 5, 7, 60, 29, 191, 41, 13, 2842, 5, 83, 9, 69, 28, 9, 40, 169, 489, 6], [1, 20, 31, 197, 5, 13, 44, 39, 989, 154, 109, 2060, 3, 7, 1716, 1952, 5, 20, 585, 6]] /2/

labels =  [[23, 806, 4, 4369, 239, 2842, 1695, 12, 1051, 3179, 45, 18, 30, 10, 3, 5, 7, 53, 108, 33, 97, 39, 2714, 112, 3, 5, 199, 3, 60, 29, 3457, 86, 4, 5356, 16, 24, 11, 3, 5, 113, 381, 29, 252, 740, 76, 3, 3, 3, 17, 221, 797, 11, 293, 5890, 5, 18, 48, 576, 5653, 40, 171, 18, 4, 2912, 5485, 37, 11, 257, 4, 83, 3, 452, 27, 783, 12, 287, 10, 629, 19, 15, 114, 3, 135, 8, 335, 713, 9, 72, 5, 7, 60, 29, 191, 41, 13, 2842, 5, 83, 9, 69, 28, 9

In [12]:
#
# Define the dataset class to hold the (padded) input id sequences referred to as 'encodings'
# and the corresponding (padded) label sequences. Note that making labels and padding the
# input and labels could be made therein also, having make_labels() and pad_sequences() as
# private methods of the class.
#
class LMDataset(Dataset):
    def __init__(self, _encodings, _labels, _nlabels):
        
        assert(len(_encodings) == len(_labels))
        
        self.nsamples = len(_labels)
        
        self.encodings = _encodings # list[list[int]]: contains the padded list of token ids for each sample
        self.labels = _labels # list[list[int]]: contains the label sequence for each sample
        self.nlabels = _nlabels # int: number of labels in the dataset
        
    def __getitem__(self, idx):
        '''
        Returns a dictionary containing the label and padded token ids for a sample
        '''

        return {'ids': torch.tensor(self.encodings[idx]), 'label': torch.tensor(self.labels[idx])}

    def __len__(self):
        return self.nsamples


ds = dict()
MAXLEN = 50
nlabels = len(vocab)

for x in (['train', 'valid', 'test']):
    inputs = pad_sequences(fold[x], MAXLEN)
    labels = make_labels(inputs)

    ds[x] = LMDataset(inputs, labels, nlabels)

## Create the model

### Toy examples of basic operations with embedding and lstm layers in pyTorch

These toy examples on a non-trained models are provided to get a better understanding of how to manipulate the different layers that will constitute the actual model. In particular, we show you how to write a sequence generation loop. 

In [13]:
#
# Understanding the model forward propagation on a toy example before digging into the actual code
#

BATCHSIZE = 8 # make small batches for illustration purposes

loader = DataLoader(ds['train'], batch_size=BATCHSIZE, shuffle=True)
print('Number of samples:', len(ds['train']))
print(f'Number of training batches:', len(loader))

#
# Model operations
#
batch = next(iter(loader)) # take one batch to play with
vocsize = len(vocab)
dim = 200

embedder = torch.nn.Embedding(vocsize, dim, padding_idx = 0) # embedding layer
rnn = torch.nn.LSTM(dim, dim, batch_first=True) # LSTM recurrence
mlp = torch.nn.Linear(dim, vocsize) # feed forward
softmax = torch.nn.LogSoftmax(dim=-1)

inputs = batch['ids']
print('input shape:', inputs.shape)

embeddings = embedder(batch['ids'])
print(embeddings.shape)

h0 = torch.zeros(1, BATCHSIZE, dim)
c0 = torch.zeros(1, BATCHSIZE, dim)

outputs, _ = rnn(embeddings, (h0, c0)) # lstm(embeddings, hidden0)
print('outputs shape', outputs.shape)

logits = mlp(outputs)
print('logits shape', logits.shape)

probas = softmax(logits)
print('probas shape', probas.shape)


#
# Loss computation
#
# Note the ignore_index argument of the loss function that tells not to compute the
# loss and backpropagate gradient from the points that have that label index. In 
# other words, we don't want to account for the padding in training.
#
# Negative log-likelihood and cross-entropy are alike if the model output consists
# of a probability distribution (after softmax). If the output are logits (before
# softmax then), CrossEntropyLoss() must be used as it includes the normalization.
#
loss_fn = torch.nn.NLLLoss(ignore_index=vocab['PAD'])
# loss_fn = torch.nn.CrossEntropyLoss(ignore_index=vocab['PAD'])
labels = batch['label']
print('labels shape', labels.shape)

# have to permute because of loss function implementation
# see https://discuss.pytorch.org/t/loss-function-format-for-sequence-ner-pos-tagging/57548
loss = loss_fn(probas.permute(0,2,1), labels) 

Number of samples: 14913
Number of training batches: 1865
input shape: torch.Size([8, 50])
torch.Size([8, 50, 200])
outputs shape torch.Size([8, 50, 200])
logits shape torch.Size([8, 50, 7257])
probas shape torch.Size([8, 50, 7257])
labels shape torch.Size([8, 50])


In [14]:
#
# Generation loop with such a model
#

#
# random initialization of the reccurence state h (and cell state c since
# LSTMs also have cell states in addition to recurrence state)
#
h = torch.randn(1, dim)
c = torch.randn(1, dim)

#
# initialize utterance to start of sentence token
#
ids = [vocab['BOS']]

#
# iterate up to a maximum length (here 20)
#
for i in range(10):
    inputs = torch.tensor([ids[-1]]) # input to single recurrence step is the last token
    embeddings = embedder(inputs) # get embedding of single/last token    
    outputs, (h, c) = rnn(embeddings, (h, c)) # run one step of the LSTM recurrence
    probas = softmax(mlp(outputs)) # get the pdf over the vocabulary
    
    next_id = torch.argmax(probas) # get best guess -- to get random guess, use torch.multinomial()
    ids.append(next_id)
    
    if next_id == vocab['EOS']: 
        break
    
    print(decode_utterance(ids))
    
    
    
    


BOS additional
BOS additional set
BOS additional set jealous
BOS additional set jealous =
BOS additional set jealous = horrifying
BOS additional set jealous = horrifying present
BOS additional set jealous = horrifying present portrays
BOS additional set jealous = horrifying present portrays nudity
BOS additional set jealous = horrifying present portrays nudity attracted
BOS additional set jealous = horrifying present portrays nudity attracted joe


### Define and train the actual model

In [16]:
#
# Define the model as a torch.nn.Module
#

class LMRNN(torch.nn.Module):

    def __init__(self, vocsize, nclasses, embed_dim = 200, dropout = None):
        super(LMRNN, self).__init__()

        self.nclasses = vocsize
        self.vocabulary_size = vocsize
        self.embed_dim = embed_dim
        
        self.embedding = torch.nn.Embedding(vocsize, embed_dim, padding_idx = 0)
        if dropout != None:
            self.dropout = torch.nn.Dropout(dropout)
            self.rnn = torch.nn.LSTM(embed_dim, embed_dim, dropout = dropout, batch_first=True)
        else:
            self.dropout = None
            self.rnn = torch.nn.LSTM(embed_dim, embed_dim, batch_first=True) 
        self.linear = torch.nn.Linear(embed_dim, nclasses)
        # self.softmax = torch.nn.LogSoftmax(dim=1)
  
    def forward(self, **kwargs):
        x = self.embedding(kwargs['ids']) # embed input ids -- batch_size * maxlen * embed_dim
        if self.dropout != None: # dropout embeddings
            x = self.dropout(x)        
        x, _ =  self.rnn(x) # run through recurrent cells -- batch_size * maxlen * embed_dim
        x = self.linear(x) # project into posterior probabilities -- batch_size * maxlen * nclasses
        
        return x.permute(0, 2, 1) # because loss function wants batch_size * nclasses * maxlen
    
          
lm = LMRNN(len(vocab), nclasses = ds['train'].nlabels, embed_dim = 200)


In [17]:
#
# Quick adaptation of the train_step() and eval_step() functions from previous lecture. Only
# the latter requires changes because (i) label prediction accuracy makes little sense for
# language models and (ii) we're dealing with sequences of predictions and not a prediction
# for document.
#
# We will limit ourselves to the negative log-likelihood (the loss function used here) but 
# the real metric that should be monitored is known as perplexity (should implement that one
# day).
#

def train_step(_model, _loader, _loss, _optim, device="cpu", report=0):
    '''
    Generic training step.

    Assumes loader returns batches where the labels are accessed with the 'label' keyword.
    All other keywords are passed as **kwargs to the model.
    
    If report is set to a number, reports stats on training every 'report' batches.

    :return: total_loss accumulated throughout the epoch
    '''

    _model.train(True)
    total_loss = 0.
    running_loss = 0.

    for i, batch in enumerate(_loader):
        _optim.zero_grad()

        labels = batch['label'].to(device)

        inputs = {k: v.to(device) for k, v in batch.items() if k != 'label'}
        outputs = _model(**inputs)

        loss = _loss(outputs, labels)
        total_loss += loss.item()
        running_loss += loss.item()

        loss.backward()
        _optim.step()

        if report != 0 and i % report == report - 1:
            print('  batch {} avg. loss per batch={:.4f}'.format(i + 1, running_loss / report))
            running_loss = 0.

    _model.train(False)

    return total_loss

def eval_step(_model, _loader, device='cpu', loss_fn=None):
    '''
    Evaluate the model's performance on data within loader.
    
    :return: 
    total_loss accumulated throughout the batches
    '''
    
    _model.eval()  # disable training mode

    total_loss = 0.

    for batch in _loader:
        labels = batch['label'].to(device)

        inputs = {k: v.to(device) for k, v in batch.items() if k != 'label'}

        with torch.no_grad():
            outputs = _model(**inputs)

        if loss_fn != None:
            loss = loss_fn(outputs, labels)
            total_loss += loss.item()
    
    return total_loss

In [18]:
#
# Get ready with the necessary equipment for training: parameters, optimizer, loss, etc.
#

lr = 5e-4
nepochs = 20
report_freq = 50
batch_size = 32

# check what device we can work on
if torch.backends.mps.is_built(): # MPS GPU library for MacOS -- requires metal to be installed
    device = "mps"
    torch.mps.empty_cache()
elif torch.cuda.is_available(): # CUDA GPU acceleration available
    device = torch.device('cuda')
else:
    device = "cpu"
print(f'Running on {device} device')

# define optimizer and loss
optimizer = torch.optim.AdamW(lm.parameters(), lr=lr)
celoss = torch.nn.CrossEntropyLoss(ignore_index=vocab['PAD'])

# create batches within loaders
loader = dict()
loader['train'] = DataLoader(ds['train'], batch_size=batch_size, shuffle=True) # set to False for debugging purposes
loader['valid'] = DataLoader(ds['valid'], batch_size=batch_size)
loader['test'] = DataLoader(ds['test'], batch_size=batch_size)

Running on cuda device


In [19]:
#
# And finally run the training loop
#


lm.to(device)

for epoch in range(nepochs):
    print(f'epoch: {epoch}')
    
    total_loss = train_step(lm, loader['train'], celoss, optimizer, device=device, report=report_freq)
    
    val_loss = eval_step(lm, loader['valid'], device=device, loss_fn=celoss)

    print('  **train** avg_loss={:.4f}'.format(total_loss / len(loader['train'])))
    print('  **valid** avg_loss={:.4f}'.format(val_loss / len(loader['valid'])))


epoch: 0
  batch 50 avg. loss per batch=7.6473
  batch 100 avg. loss per batch=6.0803
  batch 150 avg. loss per batch=5.9348
  batch 200 avg. loss per batch=5.7924
  batch 250 avg. loss per batch=5.6994
  batch 300 avg. loss per batch=5.6535
  batch 350 avg. loss per batch=5.5811
  batch 400 avg. loss per batch=5.5572
  batch 450 avg. loss per batch=5.4967
  **train** avg_loss=5.9178
  **valid** avg_loss=5.4548
epoch: 1
  batch 50 avg. loss per batch=5.3937
  batch 100 avg. loss per batch=5.3715
  batch 150 avg. loss per batch=5.3768
  batch 200 avg. loss per batch=5.3404
  batch 250 avg. loss per batch=5.3209
  batch 300 avg. loss per batch=5.2816
  batch 350 avg. loss per batch=5.2655
  batch 400 avg. loss per batch=5.2396
  batch 450 avg. loss per batch=5.2301
  **train** avg_loss=5.3077
  **valid** avg_loss=5.2383
epoch: 2
  batch 50 avg. loss per batch=5.1813
  batch 100 avg. loss per batch=5.1516
  batch 150 avg. loss per batch=5.1298
  batch 200 avg. loss per batch=5.1041
  batc

In [70]:
#
# This cell teaches you how to save/load a model. Because training takes a bit of time with 
# recurrent neural networks, you'd better save your model once trained so that you can load
# it back at any point and play with generation.
# 
# 
fn = './rnnlm-imdb-pos.pt'

#torch.save(lm.state_dict(), fn)

#
# The following lines assume the instance lm of the LMRNN() class has been created and simply
# reload the weights.
#
lm.load_state_dict(torch.load(fn, weights_only=True))
lm.eval()

LMRNN(
  (embedding): Embedding(7257, 200, padding_idx=0)
  (rnn): LSTM(200, 200, batch_first=True)
  (linear): Linear(in_features=200, out_features=7257, bias=True)
)

## Play with the model to generate texts

Now that you have a language model fully trained, you should be able to generate texts and/or compute
the probability of a given input. At the very least, you should write a function that randomly generates
a sentence given the model. If time permits or if curiosity keeps you awake, you could extend this 
generation function to be able to complete a given prompt: i.e., run tghe prompt through the RNN to 
get the RNN states before startig the generation loop.

Keep in mind that the model as defined above outputs logits and not sofmax-normalized probabilities.

Note that in practice, these functions are usually defined as methods within the model class.

In [103]:

def random_generate(_model, maxlen: int = 20) -> list[int]:
    '''
    Random generation of a text given the model.
    '''
    sequence = [vocab['BOS']]
    

    h0 = torch.randn(1, dim).to(device)
    c0 = torch.randn(1, dim).to(device)


    for _ in range(maxlen-1):
        # Get the embedding for the last token in the sequence
        embed = _model.embedding(torch.tensor([sequence[-1]]).to(device))

        # Pass the embedding through the RNN (LSTM in this case)
        outputs, (h0, c0) = _model.rnn(embed, (h0, c0))  # lstm(embeddings, hidden_state)

        # Pass the RNN output through the linear layer to get logits
        logits = _model.linear(outputs.squeeze(0))

        # Apply softmax to convert logits to probabilities
        probas = torch.nn.functional.softmax(logits, dim=-1)

        # Get the next token by taking the argmax of the probabilities
        next_token = torch.argmax(probas, dim=-1).item()

        # Append the predicted token to the sequence
        sequence.append(next_token)

        # Break the loop if the End of Sentence (EOS) token is generated
        if next_token == vocab['EOS']:
            break

    return sequence
    

    
def prompt_generate(_model, _promt: list[int], maxlen: int = 20) -> list[int]:
    '''
    Complete prompt with the most likely completion for the given the model.
    '''
    # Initialize the sequence with the given prompt
    sequence = _promt[:]

    # Initialize hidden and cell states for LSTM with random values
    h0 = torch.randn(1, dim).to(device)
    c0 = torch.randn(1, dim).to(device)

    for _ in range(maxlen - len(_promt)):
        # Get the embedding for the last token in the sequence
        embed = _model.embedding(torch.tensor([sequence[-1]]).to(device))

        # Pass the embedding through the RNN (LSTM in this case)
        outputs, (h0, c0) = _model.rnn(embed, (h0, c0))  # lstm(embeddings, hidden_state)

        # Pass the RNN output through the linear layer to get logits
        logits = _model.linear(outputs.squeeze(0))

        # Apply softmax to convert logits to probabilities
        probas = torch.nn.functional.softmax(logits, dim=-1)

        # Get the next token by taking the argmax of the probabilities
        next_token = torch.argmax(probas, dim=-1).item()

        # Append the predicted token to the sequence
        sequence.append(next_token)

        # Break the loop if the End of Sentence (EOS) token is generated
        if next_token == vocab['EOS']:
            break
    
    
    return sequence


In [107]:
sequence = random_generate(lm)
print(sequence)
print(decode_utterance(sequence))

sequence = prompt_generate(lm, [vocab['BOS'], vocab['i'], vocab['like']], maxlen=20)
print(sequence)
print(decode_utterance(sequence))


[1, 14, 118, 13, 21, 8, 61, 24, 6, 14, 21, 32, 3, 6, 14, 33, 10, 139, 15, 16]
BOS i think it was a great movie . i was n't UNK . i have to say that this
[1, 14, 46, 4946, 4502, 6852, 4948, 5062, 7103, 3508, 5929, 3887, 5278, 3536, 4653, 6085, 6065, 3961, 5556, 6174]
BOS i like celluloid l. gesture deliciously screening ego importantly denver stadium veronica karl kudos sarandon chest mice fragile trotta
