In [28]:
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
import torchtext, datasets, math
from tqdm import tqdm
from datasets import Dataset

In [29]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cuda


In [30]:
SEED = 1234
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

## 1. Load Data

Dataset Source: https://www.kaggle.com/datasets/thedevastator/short-jokes-dataset

I have taken this dataset from the kaggle. As I have use CSV file, i used panda to load the dataset and created the dataset object from the dataFrame. The dataset i have used is jokes.

In [39]:
# Read the CSV file
df = pd.read_csv("./dataset/jokes.csv", header=0)  # header=0 means the first row is a header

# # Assuming the "text" column contains the sentences, let's create a list of sentences
sentences = df['text'].dropna().tolist()

# Alternatively, you can use a Pandas DataFrame for better handling of data
sentence_df = pd.DataFrame(sentences, columns=["Sentence"])

# To create a Dataset object from the DataFrame
dataset = Dataset.from_pandas(sentence_df)

# To check the result
print(dataset)


Dataset({
    features: ['Sentence'],
    num_rows: 218504
})


Here dataset has been split into 80-10-10. 80% of dataset for training, 10% for testing and 10% for validation.

In [40]:
from datasets import DatasetDict
# Split into train and test (e.g., 80% train, 20% test)
train_test = dataset.train_test_split(test_size=0.2, seed=42)

# Further split the test set into validation and test (e.g., 50% validation, 50% test)
test_validation = train_test['test'].train_test_split(test_size=0.5, seed=42)

# Combine the splits into a single dictionary
jokes_dataset = DatasetDict({
    'train': train_test['train'],
    'validation': test_validation['train'],
    'test': test_validation['test']
})

# To check the result   
print(jokes_dataset)

DatasetDict({
    train: Dataset({
        features: ['Sentence'],
        num_rows: 174803
    })
    validation: Dataset({
        features: ['Sentence'],
        num_rows: 21850
    })
    test: Dataset({
        features: ['Sentence'],
        num_rows: 21851
    })
})


In [41]:
print(jokes_dataset['train'][333]['Sentence'])

Sometimes I just wish people were as easy to forget as PIN numbers.



## 2. Preprocessing


### Tokenizing

In [42]:
tokenizer = torchtext.data.utils.get_tokenizer('basic_english')

tokenize_data = lambda example, tokenizer: {'tokens': tokenizer(example['Sentence'])}

tokenized_dataset = jokes_dataset.map(tokenize_data, remove_columns=['Sentence'], fn_kwargs={'tokenizer': tokenizer})


Map: 100%|██████████| 174803/174803 [00:18<00:00, 9441.59 examples/s] 
Map: 100%|██████████| 21850/21850 [00:02<00:00, 9029.14 examples/s] 
Map: 100%|██████████| 21851/21851 [00:02<00:00, 10599.03 examples/s]


In [43]:
print(tokenized_dataset['train'][33]['tokens'])


['what', 'game', 'do', 'french', 'schoolchildren', 'like', 'to', 'play', '?', 'simon', '16']


### Numericalizing

We will tell torchtext to add any word that has occurred at least three times in the dataset to the vocabulary because otherwise it would be too big.  Also we shall make sure to add `unk` and `eos`.

In [44]:
vocab = torchtext.vocab.build_vocab_from_iterator(tokenized_dataset['train']['tokens'], min_freq=3)
vocab.insert_token('<unk>', 0)
vocab.insert_token('<eos>', 1)
vocab.set_default_index(vocab['<unk>'])

In [45]:
print(len(vocab))

26158


In [46]:
print(vocab.get_itos()[:10])

['<unk>', '<eos>', '.', 'a', 'the', "'", 'i', '?', ',', 'you']


In [47]:
import pickle
with open('model/vocab.pkl', 'wb') as f:
    pickle.dump(vocab, f)   

We have tokenize the sentences from the dataset into individual words using a basic tokenizer.tokenize_data is used to transform each data point in the dataset, replacing the raw sentence with its tokenized version.A vocabulary is constructed from the tokenized data, retaining only frequent tokens and adding special tokens (<unk>, <eos>). And pickel is used to save the vocabulary model for later use.

##  Prepare the batch loader

### Prepare data

In [48]:
def get_data(dataset, vocab, batch_size):
    data = []
    for example in dataset:
        if example['tokens']:
            tokens = example['tokens'].append('<eos>')
            tokens = [vocab[token] for token in example['tokens']]
            data.extend(tokens)
    data = torch.LongTensor(data)
    num_batches = data.shape[0] // batch_size
    data = data[:num_batches * batch_size]
    data = data.view(batch_size, num_batches) #view vs. reshape (whether data is contiguous)
    return data #[batch size, seq len]

In [49]:
batch_size = 128
train_data = get_data(tokenized_dataset['train'], vocab, batch_size)
valid_data = get_data(tokenized_dataset['validation'], vocab, batch_size)
test_data  = get_data(tokenized_dataset['test'],  vocab, batch_size)

In [50]:
train_data.shape

torch.Size([128, 30672])

## 4.Modeling

In [51]:
class LSTMLanguageModel(nn.Module):
    def __init__(self, vocab_size, emb_dim, hid_dim, num_layers, dropout_rate):
        super().__init__()
        self.num_layers = num_layers
        self.hid_dim    = hid_dim
        self.emb_dim    = emb_dim
        
        self.embedding  = nn.Embedding(vocab_size, emb_dim)
        self.lstm       = nn.LSTM(emb_dim, hid_dim, num_layers=num_layers, dropout=dropout_rate, batch_first=True)
        self.dropout    = nn.Dropout(dropout_rate)
        self.fc         = nn.Linear(hid_dim, vocab_size)
        
        self.init_weights()
    
    def init_weights(self):
        init_range_emb = 0.1
        init_range_other = 1/math.sqrt(self.hid_dim)
        self.embedding.weight.data.uniform_(-init_range_emb, init_range_other)
        self.fc.weight.data.uniform_(-init_range_other, init_range_other)
        self.fc.bias.data.zero_()
        for i in range(self.num_layers):
            self.lstm.all_weights[i][0] = torch.FloatTensor(self.emb_dim,
                self.hid_dim).uniform_(-init_range_other, init_range_other) #We
            self.lstm.all_weights[i][1] = torch.FloatTensor(self.hid_dim,   
                self.hid_dim).uniform_(-init_range_other, init_range_other) #Wh
    
    def init_hidden(self, batch_size, device):
        hidden = torch.zeros(self.num_layers, batch_size, self.hid_dim).to(device)
        cell   = torch.zeros(self.num_layers, batch_size, self.hid_dim).to(device)
        return hidden, cell
        
    def detach_hidden(self, hidden):
        hidden, cell = hidden
        hidden = hidden.detach() #not to be used for gradient computation
        cell   = cell.detach()
        return hidden, cell
        
    def forward(self, src, hidden):
        #src: [batch_size, seq len]
        embedding = self.dropout(self.embedding(src)) 
        #embedding: [batch-size, seq len, emb dim]
        output, hidden = self.lstm(embedding, hidden)
        #ouput: [batch size, seq len, hid dim]
        #hidden: [num_layers * direction, seq len, hid_dim]
        output = self.dropout(output)
        prediction =self.fc(output)
        #prediction: [batch_size, seq_len, vocab_size]
        return prediction, hidden

We have LSTM Language model that includes an embedding layer to convert words into vectors, an LSTM to process sequences and capture dependencies, a dropout layer to prevent overfitting, and a fully connected layer to predict the next word. The init_weights method initializes the model's weights randomly within specified ranges, while init_hidden creates initial hidden and cell states for the LSTM. The forward function processes the input sequence through the embedding, LSTM, and dropout layers, then predicts the next word using the fully connected layer. This model is used for tasks like text generation where it predicts the next word based on the previous ones.

## 5.Training

In [52]:
vocab_size = len(vocab)
emb_dim = 1024                # 400 in the paper
hid_dim = 1024                # 1150 in the paper
num_layers = 2                # 3 in the paper
dropout_rate = 0.65              
lr = 1e-3                     

In [53]:
model      = LSTMLanguageModel(vocab_size, emb_dim, hid_dim, num_layers, dropout_rate).to(device)
optimizer  = optim.Adam(model.parameters(), lr=lr)
criterion  = nn.CrossEntropyLoss()
num_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f'The model has {num_params:,} trainable parameters')

The model has 70,391,342 trainable parameters


In [54]:
def get_batch(data, seq_len, idx):
    #data #[batch size, bunch of tokens]
    src    = data[:, idx:idx+seq_len]                   
    target = data[:, idx+1:idx+seq_len+1]  #target simply is ahead of src by 1            
    return src, target

In [55]:
def train(model, data, optimizer, criterion, batch_size, seq_len, clip, device):
    
    epoch_loss = 0
    model.train()
    # drop all batches that are not a multiple of seq_len
    # data #[batch size, seq len]
    num_batches = data.shape[-1]
    data = data[:, :num_batches - (num_batches -1) % seq_len]  #we need to -1 because we start at 0
    num_batches = data.shape[-1]
    
    #reset the hidden every epoch
    hidden = model.init_hidden(batch_size, device)
    
    for idx in tqdm(range(0, num_batches - 1, seq_len), desc='Training: ',leave=False):
        optimizer.zero_grad()
        
        #hidden does not need to be in the computational graph for efficiency
        hidden = model.detach_hidden(hidden)

        src, target = get_batch(data, seq_len, idx) #src, target: [batch size, seq len]
        src, target = src.to(device), target.to(device)
        batch_size = src.shape[0]
        prediction, hidden = model(src, hidden)               

        #need to reshape because criterion expects pred to be 2d and target to be 1d
        prediction = prediction.reshape(batch_size * seq_len, -1)  #prediction: [batch size * seq len, vocab size]  
        target = target.reshape(-1)
        loss = criterion(prediction, target)
        
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        epoch_loss += loss.item() * seq_len
    return epoch_loss / num_batches

In [56]:
def evaluate(model, data, criterion, batch_size, seq_len, device):

    epoch_loss = 0
    model.eval()
    num_batches = data.shape[-1]
    data = data[:, :num_batches - (num_batches -1) % seq_len]
    num_batches = data.shape[-1]

    hidden = model.init_hidden(batch_size, device)

    with torch.no_grad():
        for idx in range(0, num_batches - 1, seq_len):
            hidden = model.detach_hidden(hidden)
            src, target = get_batch(data, seq_len, idx)
            src, target = src.to(device), target.to(device)
            batch_size= src.shape[0]

            prediction, hidden = model(src, hidden)
            prediction = prediction.reshape(batch_size * seq_len, -1)
            target = target.reshape(-1)

            loss = criterion(prediction, target)
            epoch_loss += loss.item() * seq_len
    return epoch_loss / num_batches

In [57]:
n_epochs = 50
seq_len  = 50 #<----decoding length
clip    = 0.25

lr_scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.5, patience=0)

best_valid_loss = float('inf')

for epoch in range(n_epochs):
    train_loss = train(model, train_data, optimizer, criterion, 
                batch_size, seq_len, clip, device)
    valid_loss = evaluate(model, valid_data, criterion, batch_size, 
                seq_len, device)

    lr_scheduler.step(valid_loss)

    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'best-val-lstm_lm.pt')

    print(f'\tTrain Perplexity: {math.exp(train_loss):.3f}')
    print(f'\tValid Perplexity: {math.exp(valid_loss):.3f}')

Training:   0%|          | 0/613 [00:00<?, ?it/s]

                                                           

	Train Perplexity: 286.702
	Valid Perplexity: 118.486


                                                           

	Train Perplexity: 118.446
	Valid Perplexity: 85.626


                                                           

	Train Perplexity: 93.354
	Valid Perplexity: 73.027


                                                           

	Train Perplexity: 80.541
	Valid Perplexity: 65.867


                                                           

	Train Perplexity: 72.343
	Valid Perplexity: 61.198


                                                           

	Train Perplexity: 66.422
	Valid Perplexity: 57.902


                                                           

	Train Perplexity: 61.869
	Valid Perplexity: 55.428


                                                           

	Train Perplexity: 58.317
	Valid Perplexity: 53.629


                                                           

	Train Perplexity: 55.377
	Valid Perplexity: 52.086


                                                           

	Train Perplexity: 52.890
	Valid Perplexity: 51.031


                                                           

	Train Perplexity: 50.860
	Valid Perplexity: 50.138


                                                           

	Train Perplexity: 49.053
	Valid Perplexity: 49.338


                                                           

	Train Perplexity: 47.509
	Valid Perplexity: 48.888


                                                           

	Train Perplexity: 46.139
	Valid Perplexity: 48.353


                                                           

	Train Perplexity: 44.903
	Valid Perplexity: 48.007


                                                           

	Train Perplexity: 43.865
	Valid Perplexity: 47.565


                                                           

	Train Perplexity: 42.914
	Valid Perplexity: 47.310


                                                           

	Train Perplexity: 41.997
	Valid Perplexity: 47.230


                                                           

	Train Perplexity: 41.202
	Valid Perplexity: 47.063


                                                           

	Train Perplexity: 40.487
	Valid Perplexity: 46.965


                                                           

	Train Perplexity: 39.833
	Valid Perplexity: 46.780


                                                           

	Train Perplexity: 39.186
	Valid Perplexity: 46.733


                                                           

	Train Perplexity: 38.611
	Valid Perplexity: 46.702


                                                           

	Train Perplexity: 38.100
	Valid Perplexity: 46.612


                                                           

	Train Perplexity: 37.600
	Valid Perplexity: 46.571


                                                           

	Train Perplexity: 37.143
	Valid Perplexity: 46.763


                                                           

	Train Perplexity: 35.596
	Valid Perplexity: 46.425


                                                           

	Train Perplexity: 34.918
	Valid Perplexity: 46.322


                                                           

	Train Perplexity: 34.475
	Valid Perplexity: 46.361


                                                           

	Train Perplexity: 33.659
	Valid Perplexity: 46.312


                                                           

	Train Perplexity: 33.181
	Valid Perplexity: 46.206


                                                           

	Train Perplexity: 32.976
	Valid Perplexity: 46.240


                                                           

	Train Perplexity: 32.726
	Valid Perplexity: 46.154


                                                           

	Train Perplexity: 32.622
	Valid Perplexity: 46.120


                                                           

	Train Perplexity: 32.544
	Valid Perplexity: 46.118


                                                           

	Train Perplexity: 32.421
	Valid Perplexity: 46.090


                                                           

	Train Perplexity: 32.372
	Valid Perplexity: 46.074


                                                           

	Train Perplexity: 32.335
	Valid Perplexity: 46.063


                                                           

	Train Perplexity: 32.277
	Valid Perplexity: 46.059


                                                           

	Train Perplexity: 32.268
	Valid Perplexity: 46.058


                                                           

	Train Perplexity: 32.238
	Valid Perplexity: 46.045


                                                           

	Train Perplexity: 32.240
	Valid Perplexity: 46.040


                                                           

	Train Perplexity: 32.193
	Valid Perplexity: 46.037


                                                           

	Train Perplexity: 32.213
	Valid Perplexity: 46.034


                                                           

	Train Perplexity: 32.192
	Valid Perplexity: 46.032


                                                           

	Train Perplexity: 32.225
	Valid Perplexity: 46.031


                                                           

	Train Perplexity: 32.197
	Valid Perplexity: 46.030


                                                           

	Train Perplexity: 32.196
	Valid Perplexity: 46.030


                                                           

	Train Perplexity: 32.212
	Valid Perplexity: 46.029


                                                           

	Train Perplexity: 32.219
	Valid Perplexity: 46.029


We have initalize vocab_size,embedding,hidden_dimensions, number_of_layer, dropout_rate and learning_rate and optimize and CrossEntropyLoss is used to compute loss while training. 

To train,we have taken 50 epoch.Learning rate scheduler that reduces the learning rate by half if the validation loss stops improving. After each epoch, the model’s performance is checked on both the training and validation data. The perplexity measure of how well the model is predicting is printed. If the validation loss improves than the model’s best weights are saved. The training process also includes gradient clipping to prevent issues with overly large updates. Each epoch updates the model using the optimizer and loss function.At last, the training and validation perplexities are displayed to monitor progress.

## 6.Testing

In [61]:
model.load_state_dict(torch.load('model/best-val-lstm_lm.pt',  map_location=device))
test_loss = evaluate(model, test_data, criterion, batch_size, seq_len, device)
print(f'Test Perplexity: {math.exp(test_loss):.3f}')

Test Perplexity: 44.986


## 7.Real-World inference

In [62]:
def generate(prompt, max_seq_len, temperature, model, tokenizer, vocab, device, seed=None):
    if seed is not None:
        torch.manual_seed(seed)
    model.eval()
    tokens = tokenizer(prompt)
    indices = [vocab[t] for t in tokens]
    batch_size = 1
    hidden = model.init_hidden(batch_size, device)
    with torch.no_grad():
        for i in range(max_seq_len):
            src = torch.LongTensor([indices]).to(device)
            prediction, hidden = model(src, hidden)
            
            #prediction: [batch size, seq len, vocab size]
            #prediction[:, -1]: [batch size, vocab size] #probability of last vocab
            
            probs = torch.softmax(prediction[:, -1] / temperature, dim=-1)  
            prediction = torch.multinomial(probs, num_samples=1).item()    
            
            while prediction == vocab['<unk>']: #if it is unk, we sample again
                prediction = torch.multinomial(probs, num_samples=1).item()

            if prediction == vocab['<eos>']:    #if it is eos, we stop
                break

            indices.append(prediction) #autoregressive, thus output becomes input

    itos = vocab.get_itos()
    tokens = [itos[i] for i in indices]
    return tokens

In [65]:
prompt = 'knock knock'
max_seq_len = 30
seed = 0

#smaller the temperature, more diverse tokens but comes 
#with a tradeoff of less-make-sense sentence
temperatures = [0.5, 0.7, 0.75, 0.8, 1.0]
for temperature in temperatures:
    generation = generate(prompt, max_seq_len, temperature, model, tokenizer, 
                          vocab, device, seed)
    print(str(temperature)+'\n'+' '.join(generation)+'\n')

0.5
knock knock who ' s there ? control freak who ?

0.7
knock knock who ' s there ? control freak who ?

0.75
knock knock who ' s there ? control freak who ?

0.8
knock knock who ' s there ? control freak !

1.0
knock knock who ' s there ? control freak !



## Web Application Documentation

For web application UI, I have used Dash. In file name app.py, model is integrated where the vocabulary is loaded using pickle and pre-trained weights are used to set up the model.The text generation works by tokenizing the prompt and passing it through the model to predict the next words. The app takes a prompt from the user and generates multiple continuations of the text using different temperature values, which control the randomness and creativity of the output.