In [1]:

import torch
import torch.nn as nn
import torch.optim as optim

import torch
import torchtext
print(torch.__version__)       # Should print 2.1.1+cpu
print(torchtext.__version__)   # Should print 0.16.1

  from .autonotebook import tqdm as notebook_tqdm


2.1.1+cpu
0.16.1+cpu


In [3]:
import requests

# Define the target URL
url = "https://www.gutenberg.org/cache/epub/52719/pg52719.txt"

try:
    # Send the GET request
    response = requests.get(url, timeout=10)

    # Check if the request was successful
    if response.status_code == 200:
        print("Page fetched successfully!")

        # Get the response content as plain text
        content = response.text

        # Locate the start and end of the desired text
        start_index = content.find("BLUE BEARD.")
        end_index = content.find("THE END.")

        # Extract the content if both start and end markers are found
        if start_index != -1 and end_index != -1:
            fairy_tales = content[start_index:end_index + len("THE END.")]
            print("Extracted content:")
            print(fairy_tales[:1000])  # Print the first 1000 characters as a preview
        else:
            print("Could not find the specified markers in the content.")
    else:
        print(f"Failed to fetch the page: Status code {response.status_code}")

except requests.exceptions.RequestException as e:
    print(f"An error occurred: {e}")


Page fetched successfully!
Extracted content:
BLUE BEARD.


Once on a time there was a man who had fine town and country houses,
gold and silver plate, embroidered furniture, and coaches gilt all
over; but unfortunately, this man had a blue beard, which made him
look so ugly and terrible, that there was not a woman or girl who did
not run away from him. One of his neighbours, a lady of quality, had
two daughters, who were perfectly beautiful. He proposed to marry one
of them, leaving her to choose which of the two she would give him.
Neither of them would have him; and they sent him from one to the
other, not being able to make up their minds to marry a man who had a
blue beard. What increased their distaste to him was, that he had had
several wives already, and nobody knew what had become of them.

Blue Beard, in order to cultivate their acquaintance, took them, with
their mother, three or four of their most intimate friends, and some
young persons who resided in the neighbourhood, to

In [4]:
# check that I have cpu for train or not
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

# same pattern when I restart the kernel
SEED = 1234
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

cpu


In [5]:
import nltk
from sklearn.model_selection import train_test_split

In [6]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [7]:
import torch
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm
import torchtext, math

In [8]:
sentences =nltk.sent_tokenize(fairy_tales)

In [9]:
len(sentences)

8871

In [10]:
# set the random seed for reproducibility
random_seed = 42

# split the data into training, testing, and validation sets
train_data, test_data = train_test_split(sentences, test_size=0.1, random_state=random_seed)
train_data, val_data = train_test_split(train_data, test_size=0.1, random_state=random_seed)

# Print the sizes of the sets
print(f"Number of samples in training set: {len(train_data)}")
print(f"Number of samples in validation set: {len(val_data)}")
print(f"Number of samples in test set: {len(test_data)}")

Number of samples in training set: 7184
Number of samples in validation set: 799
Number of samples in test set: 888


In [11]:
train_data[4]

'The sleeping Princess attracted his attention.'

In [12]:
# tokenize to tranform sentence to tokens
tokenizer = torchtext.data.utils.get_tokenizer('basic_english')

# create function to tokenize the text
tokenize_data = lambda example, tokenizer: {'tokens': tokenizer(example)}  

# map the function to each example in the list
tokenized_train_data = list(map(lambda example: tokenize_data(example, tokenizer), train_data))
tokenized_test_data = list(map(lambda example: tokenize_data(example, tokenizer), test_data))
tokenized_val_data = list(map(lambda example: tokenize_data(example, tokenizer), val_data))

In [13]:
tokenized_train_dataset = [entry['tokens'] for entry in tokenized_train_data]
tokenized_test_dataset = [entry['tokens'] for entry in tokenized_test_data]
tokenized_val_dataset = [entry['tokens'] for entry in tokenized_val_data]

In [14]:
vocab = torchtext.vocab.build_vocab_from_iterator(tokenized_train_dataset)
vocab.insert_token('<unk>', 0)
vocab.insert_token('<eos>', 1) #for print next word
vocab.set_default_index(vocab['<unk>']) #word that not in vocab tranfer to <unk>

In [15]:
torch.save(vocab, 'vocab.pt')

In [16]:
print(len(vocab))

10333


In [17]:
# print 10 vocabs
print(vocab.get_itos()[:10])

['<unk>', '<eos>', ',', 'the', '.', 'to', 'of', 'and', 'her', 'a']


In [18]:
# function for split the dataset on batch
def get_data(dataset, vocab, batch_size):
    data = []
    for example in dataset:
        if example:
            tokens = example.append('<eos>') # add <eos> to the end of each sentence
            tokens = [vocab[token] for token in example] # change each word to number
            data.extend(tokens)
    data = torch.LongTensor(data)
    num_batches = data.shape[0] // batch_size
    data = data[:num_batches * batch_size] # to make sure that every batch is equal
    data = data.view(batch_size, num_batches) #reshape 
    return data #[batch size, seq len]

In [19]:
batch_size = 16
train_data = get_data(tokenized_train_dataset, vocab, batch_size)
valid_data = get_data(tokenized_val_dataset, vocab, batch_size)
test_data  = get_data(tokenized_test_dataset,  vocab, batch_size)

In [20]:
train_data

tensor([[  20,   56, 1191,  ...,  103,   14,   15],
        [ 112, 4789,    4,  ...,   42,    2,   17],
        [6881,   30, 6145,  ...,   38,    2,    7],
        ...,
        [  39, 1063,    2,  ...,  423,   18,  613],
        [1169,   12,   16,  ...,    3,  801,   34],
        [ 446,    6,    9,  ..., 1692,   67,  544]])

In [21]:
train_data.shape

torch.Size([16, 12677])

In [22]:
# create model 
class LSTMLanguageModel(nn.Module):
    def __init__(self, vocab_size, emb_dim, hid_dim, num_layers, dropout_rate):
        super().__init__()
        self.num_layers = num_layers
        self.hid_dim    = hid_dim
        self.emb_dim    = emb_dim
        
        self.embedding  = nn.Embedding(vocab_size, emb_dim)
        self.lstm       = nn.LSTM(emb_dim, hid_dim, num_layers=num_layers, dropout=dropout_rate, batch_first=True)
        self.dropout    = nn.Dropout(dropout_rate)
        self.fc         = nn.Linear(hid_dim, vocab_size) # fc is the last layer for 
        
        self.init_weights()
    
    # function for assigning the initial weight of W_e, W_h
    def init_weights(self):
        init_range_emb = 0.1
        init_range_other = 1/math.sqrt(self.hid_dim)
        self.embedding.weight.data.uniform_(-init_range_emb, init_range_emb)
        self.fc.weight.data.uniform_(-init_range_other, init_range_other)
        self.fc.bias.data.zero_()
        for i in range(self.num_layers):
            self.lstm.all_weights[i][0] = torch.FloatTensor(self.emb_dim,
                self.hid_dim).uniform_(-init_range_other, init_range_other) #W_e
            self.lstm.all_weights[i][1] = torch.FloatTensor(self.hid_dim,   
                self.hid_dim).uniform_(-init_range_other, init_range_other) #W_h
    
    # reset hidden
    def init_hidden(self, batch_size, device): 
        hidden = torch.zeros(self.num_layers, batch_size, self.hid_dim).to(device)
        cell   = torch.zeros(self.num_layers, batch_size, self.hid_dim).to(device)
        return hidden, cell
        
    def detach_hidden(self, hidden):
        hidden, cell = hidden
        hidden = hidden.detach() 
        cell   = cell.detach()
        return hidden, cell
        
    def forward(self, src, hidden):
        #src: [batch_size, seq len]
        embedding = self.dropout(self.embedding(src)) #Liverpool is
        #embedding: [batch-size, seq len, emb dim]
        output, hidden = self.lstm(embedding, hidden)
        #ouput: [batch size, seq len, hid dim]
        #hidden: [num_layers * direction, seq len, hid_dim]
        output = self.dropout(output)
        prediction =self.fc(output)
        #prediction: [batch_size, seq_len, vocab_size]
        return prediction, hidden

In [23]:
# assign the parameters
vocab_size = len(vocab)
emb_dim = 1024                
hid_dim = 50                
num_layers = 1               
dropout_rate = 0.5             
lr = 1e-3

In [24]:
import math
model      = LSTMLanguageModel(vocab_size, emb_dim, hid_dim, num_layers, dropout_rate).to(device)
optimizer  = optim.Adam(model.parameters(), lr=lr)
criterion  = nn.CrossEntropyLoss()
num_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f'The model has {num_params:,} trainable parameters')



The model has 11,323,175 trainable parameters


In [25]:
# the function is used for getting the input and output batch for training process
def get_batch(data, seq_len, idx):
    #data #[batch size, bunch of tokens]
    src    = data[:, idx:idx+seq_len]                   
    target = data[:, idx+1:idx+seq_len+1]  #target simply is ahead of src by 1            
    return src, target

In [26]:
def train(model, data, optimizer, criterion, batch_size, seq_len, clip, device):
    
    epoch_loss = 0
    model.train()

    # drop all batches that are not a multiple of seq_len
    num_batches = data.shape[-1]
    data = data[:, :num_batches - (num_batches -1) % seq_len]
    num_batches = data.shape[-1]
    
    #reset the hidden every epoch
    hidden = model.init_hidden(batch_size, device)
    
    for idx in tqdm(range(0, num_batches - 1, seq_len), desc='Training: ',leave=False):
        optimizer.zero_grad() #clear all gradient
        
        #hidden does not need to be in the computational graph for efficiency
        hidden = model.detach_hidden(hidden)

        #get the input and output batch
        src, target = get_batch(data, seq_len, idx) 
        src, target = src.to(device), target.to(device)
        batch_size = src.shape[0]
        #put it on LSTM model that I created and printthe prediction
        prediction, hidden = model(src, hidden)               

        #need to reshape because criterion expects pred to be 2d and target to be 1d
        prediction = prediction.reshape(batch_size * seq_len, -1)
        target = target.reshape(-1)
        loss = criterion(prediction, target)
        
        loss.backward()
        #clipping to make gradient smaller to prevent exploding gradient 
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        #update the model parameter
        optimizer.step()
        epoch_loss += loss.item() * seq_len
    # average the training loss
    return epoch_loss / num_batches

In [27]:
# function for evaluate the model with validation dataset
def evaluate(model, data, criterion, batch_size, seq_len, device):

    epoch_loss = 0
    model.eval()
    num_batches = data.shape[-1]
    data = data[:, :num_batches - (num_batches -1) % seq_len] 
    num_batches = data.shape[-1]

    hidden = model.init_hidden(batch_size, device)

    with torch.no_grad():
        for idx in range(0, num_batches - 1, seq_len):
            hidden = model.detach_hidden(hidden)
            src, target = get_batch(data, seq_len, idx)
            src, target = src.to(device), target.to(device)
            batch_size= src.shape[0]

            prediction, hidden = model(src, hidden)
            prediction = prediction.reshape(batch_size * seq_len, -1)
            target = target.reshape(-1)

            loss = criterion(prediction, target)
            epoch_loss += loss.item() * seq_len
    # average the validation loss
    return epoch_loss / num_batches

In [28]:
# training

n_epochs = 2
seq_len  = 30 #<----decoding length
clip    = 0.25

# to reduce the learning rate when the loss is not improve
lr_scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.5, patience=0)

best_valid_loss = float('inf')

for epoch in range(n_epochs):
    train_loss = train(model, train_data, optimizer, criterion, 
                batch_size, seq_len, clip, device)
    valid_loss = evaluate(model, valid_data, criterion, batch_size, 
                seq_len, device)

    lr_scheduler.step(valid_loss)

    # save the model if the validation loss of model is improve
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'best-val-lstm_lm.pt')

    #print the train and validation Perplexity (lower, better)
    print(f'\tTrain Perplexity: {math.exp(train_loss):.3f}')
    print(f'\tValid Perplexity: {math.exp(valid_loss):.3f}')

                                                           

	Train Perplexity: 555.530
	Valid Perplexity: 285.968


                                                           

	Train Perplexity: 266.884
	Valid Perplexity: 228.963


In [29]:
# test model with test dataset

model.load_state_dict(torch.load('best-val-lstm_lm.pt',  map_location=device))
test_loss = evaluate(model, test_data, criterion, batch_size, seq_len, device)
print(f'Test Perplexity: {math.exp(test_loss):.3f}')

Test Perplexity: 218.924


In [30]:
def generate(prompt, max_seq_len, temperature, model, tokenizer, vocab, device, seed=None):
    if seed is not None:
        torch.manual_seed(seed)
    tokens = tokenizer(prompt)
    indices = [vocab[t] for t in tokens] # tranforms word to number (index in vocabs)
    batch_size = 1
    hidden = model.init_hidden(batch_size, device)
    with torch.no_grad():
        for i in range(max_seq_len):
            src = torch.LongTensor([indices]).to(device)
            prediction, hidden = model(src, hidden)
            
            #prediction: [batch size, seq len, vocab size]
            #prediction[:, -1]: [batch size, vocab size] #probability of last vocab
            
            probs = torch.softmax(prediction[:, -1] / temperature, dim=-1)  
            prediction = torch.multinomial(probs, num_samples=1).item()    
            
            while prediction == vocab['<unk>']: #if it is unk, we sample again
                prediction = torch.multinomial(probs, num_samples=1).item()

            if prediction == vocab['<eos>']:    #if it is eos, we stop
                break

            indices.append(prediction)

    itos = vocab.get_itos()
    tokens = [itos[i] for i in indices]
    return tokens

In [31]:
# load saved model
loaded_model = LSTMLanguageModel(vocab_size, emb_dim, hid_dim, num_layers, dropout_rate)
loaded_model.load_state_dict(torch.load('best-val-lstm_lm.pt'))
loaded_model.eval()



LSTMLanguageModel(
  (embedding): Embedding(10333, 1024)
  (lstm): LSTM(1024, 50, batch_first=True, dropout=0.5)
  (dropout): Dropout(p=0.5, inplace=False)
  (fc): Linear(in_features=50, out_features=10333, bias=True)
)

In [32]:
prompt = 'Once on a time'
max_seq_len = 30
seed = 0
# temperature = 1 # since we want the most make-sense sentence, temperature must highest which is 1

#smaller the temperature, more diverse tokens but comes 
#with a tradeoff of less-make-sense sentence
temperatures = [0.5, 0.7, 0.75, 0.8, 1.0]
for temperature in temperatures:
    generation = generate(prompt, max_seq_len, temperature, loaded_model, tokenizer, 
                          vocab, device, seed)
    print(str(temperature)+'\n'+' '.join(generation)+'\n')

0.5
once on a time , and her to their during the other .

0.7
once on a time , and each the moment when she had not the of the fairy in be to number thou much very islands , but she had had not myself to a

0.75
once on a time , and each the moment when she had not the of the fairy in be to number thou minutes , and mademoiselle was everything for the drawn of the king

0.8
once on a time , and each honour with during him , when the of the fairy in be long number of this very islands , but everything for the drawn of a obvious

1.0
once on a time , and each assisted with during him , when the of the fairy in be tells number thou minutes very islands mademoiselle was few fidelity .

