In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import os
import pandas as pd
import torch.nn as nn
import torch.nn.functional as F
import random
import pickle
from transformers import BertTokenizer, BertModel
import pickle
import sentencepiece as spm
from tqdm import tqdm
from tokenizers import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

  from .autonotebook import tqdm as notebook_tqdm


In [7]:
with open('datasets/text/clean_tales.txt', 'r', encoding='utf-8') as f:
    text = f.read()



In [3]:
tokenizer = ByteLevelBPETokenizer()

tokenizer.train(files=["datasets/text/clean_tales.txt"], vocab_size=3000, min_frequency=2, special_tokens=[
    "<s>",
    "<pad>",
    "</s>",
    "<unk>",
    "<mask>",
])

# 3. Save the tokenizer (optional)
# You can save the trained tokenizer to reuse later
tokenizer.save_model("datasets/text/")

# 4. Encode a text string
output = tokenizer.encode("I love programming.")
print("Encoded string: ", output.ids)  # output.ids is the tokenized representation
print("Decoded string: ", tokenizer.decode(output.ids))  # decoding back to the original string

# 5. Using processors for compatibility (optional)
# Configure the tokenizer to output the special tokens needed for models like BERT.
tokenizer._tokenizer.post_processor = BertProcessing(
    ("</s>", tokenizer.token_to_id("</s>")),
    ("<s>", tokenizer.token_to_id("<s>")),
)


Encoded string:  [45, 991, 691, 1858, 367, 81, 284, 18]
Decoded string:  I love programming.


In [5]:
output = tokenizer.encode(text[:100])
print("Encoded string: ", output.ids)  # output.ids is the tokenized representation
print("Decoded string: ", tokenizer.decode(output.ids))  # decoding back to the original string


Encoded string:  [0, 350, 385, 2716, 1250, 18, 203, 44, 45, 43, 44, 1772, 264, 2305, 16, 321, 263, 2259, 1741, 509, 82, 16, 969, 264, 338, 280, 649, 294, 264, 385, 2716, 1250, 18, 225, 495, 267, 2]
Decoded string:  The Happy Prince.
HIGH above the city, on a tall column, stood the statue of the Happy Prince.  He w


In [9]:

text_ids = tokenizer.encode(text).ids
train_size = int(len(text_ids) * 0.8)
train_ids = torch.tensor(text_ids[:train_size], dtype=torch.long)
val_ids = torch.tensor(text_ids[train_size:], dtype=torch.long)
print(f'Number of tokens: {len(text_ids)}')
print(f'Vocab size: {tokenizer.get_vocab_size()}')

Number of tokens: 5975565
Vocab size: 3000


In [63]:

def estimate_loss(model, val_data, block_size, batch_size):
    model.eval()
    with torch.no_grad():
        x, y = get_batch(val_data, block_size, batch_size)
        x, y = x.to(device), y.to(device)
        _, loss = model(x, y)
    model.train()
    return loss.item()

def generate_square_subsequent_mask(sz):
    mask = (torch.tril(torch.ones(sz, sz)) == 1).float()
    mask = mask.masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
    return mask
    
class LanguageModel(nn.Module):

    def __init__(self, vocab_size, n_emb, n_layers, n_heads, block_size, dropout=0.2):
        super(LanguageModel, self).__init__()

        self.token_embedding_table = nn.Embedding(vocab_size, n_emb)
        self.position_embedding_table = nn.Embedding(block_size, n_emb)

        encoder_layer = nn.TransformerEncoderLayer(d_model=n_emb, nhead=n_heads, dropout=dropout)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=n_layers)
        
        self.feed_forward = nn.Sequential(
            nn.Linear(n_emb, 4 * n_emb),
            nn.ReLU(),
            nn.Linear(4 * n_emb, n_emb)
        )
        
        self.lm_head = nn.Linear(n_emb, vocab_size)

    def forward(self, idx, targets=None):
        B, T = idx.shape

        token_emb = self.token_embedding_table(idx)
        position_emb = self.position_embedding_table(torch.arange(T, device=device))
        
        x = token_emb + position_emb
        x_transform = x.clone()
        mask = generate_square_subsequent_mask(T).to(device)
        
        x_transform = self.transformer_encoder(x_transform.permute(1, 0, 2), mask=mask)
        x_transform = x_transform.permute(1, 0, 2)
        x = x + x_transform
        
        x = self.feed_forward(x)
        logits = self.lm_head(x)

        if targets is not None:
            B, T, C = logits.shape
            logits = logits.view(B * T, C)
            targets = targets.view(B * T)
            loss = F.cross_entropy(logits, targets)
            return logits, loss
        else:
            return logits, None

    def generate(self, idx, max_new_tokens, block_size, temperature=1.0):
        for _ in range(max_new_tokens):
            idx_cond = idx[:, -block_size:]
            logits, _ = self.forward(idx_cond)
            
            # Scale logits by the temperature
            logits = logits[:, -1, :] / temperature
            
            probs = F.softmax(logits, dim=-1)
            idx_new = torch.multinomial(probs, num_samples=1)
            idx = torch.cat([idx, idx_new], dim=-1)
        return idx

# Hyperparameters
block_size = 50  
vocab_size = tokenizer.get_vocab_size()
n_emb = 500
n_layers = 4
n_heads = 2
dropout = 0.1

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = LanguageModel(vocab_size, n_emb, n_layers, n_heads, block_size, dropout).to(device)
print(f'Number of parameters {sum(p.numel() for p in model.parameters() if p.requires_grad)}')




Number of parameters 17248692


In [72]:
def get_batch(data, block_size, batch_size):
    idx = torch.randint(0, len(data) - block_size, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in idx])
    y = torch.stack([data[i+1:i+block_size+1] for i in idx])
    return x, y

a, b = get_batch(train_ids, block_size, 1)
print(tokenizer.decode(a[0].tolist()))
print(tokenizer.decode(b[0].tolist()))
print(a)
print(b)
print(a.shape, b.shape)

, crying out: "There's a great dragon coming! Somebody ought to do something, or we shall all be destroyed."
He was caned for untruthfulness without any delay. His master was never one
 crying out: "There's a great dragon coming! Somebody ought to do something, or we shall all be destroyed."
He was caned for untruthfulness without any delay. His master was never one for
tensor([[  16, 2568,  424,   30,  413, 1142,  411,  263,  586, 2075, 1397,    5,
         2857, 1210, 2009,  282,  401,  924,   16,  474,  354,  724,  383,  307,
         2482,  399,   93,  276,  457,  203,  769,  314,  546,  276,  337, 1047,
           86, 1580,  631,  781,  886,  541, 1030,  326,   18, 1408, 1604,  314,
          632,  428]])
tensor([[2568,  424,   30,  413, 1142,  411,  263,  586, 2075, 1397,    5, 2857,
         1210, 2009,  282,  401,  924,   16,  474,  354,  724,  383,  307, 2482,
          399,   93,  276,  457,  203,  769,  314,  546,  276,  337, 1047,   86,
         1580,  631,  781,  886,  541

In [76]:


# training parameters
batch_size = 32
early_stop = 30
last_val_loss = 1e9
n_epochs = 5000
learning_rate = 3e-4
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)


for steps in range(n_epochs):
    model.train()
    xb, yb = get_batch(train_ids, block_size, batch_size)
    xb = xb.to(device)
    yb = yb.to(device)
    logits, loss = model(xb, yb)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if steps % 100 == 0:
        print('Step:', steps, 'Training Loss:', loss.item())
        val_loss = estimate_loss(model, val_ids, block_size, batch_size)
        print('Validation loss:', val_loss)
        if val_loss >= last_val_loss:
            early_stop -= 1
            if early_stop == 0:
                print('Early stop!')
                break
        else:
            early_stop = 30
            last_val_loss = val_loss

Step: 0 Training Loss: 4.610112190246582
Validation loss: 5.031398773193359
Step: 100 Training Loss: 4.609602928161621
Validation loss: 4.772461414337158
Step: 200 Training Loss: 4.605961322784424
Validation loss: 4.667159557342529
Step: 300 Training Loss: 4.4932169914245605
Validation loss: 4.489596366882324
Step: 400 Training Loss: 4.366876125335693
Validation loss: 4.572510719299316
Step: 500 Training Loss: 4.514391899108887
Validation loss: 4.652623176574707
Step: 600 Training Loss: 4.456912040710449
Validation loss: 4.531207084655762
Step: 700 Training Loss: 4.211000442504883
Validation loss: 4.397500991821289
Step: 800 Training Loss: 4.369976043701172
Validation loss: 4.491415500640869
Step: 900 Training Loss: 4.158637046813965
Validation loss: 4.438628673553467


In [28]:
torch.save(model, 'datasets/text/clean_tales_2.pt')

In [80]:
starting_tokens = 'The crazy witch'

encoded_start = tokenizer.encode(starting_tokens).ids
len_starting_tokens = len(encoded_start)

idx = torch.tensor(encoded_start).reshape(1, len_starting_tokens).to(device)
model.eval()
generation = model.generate(idx, max_new_tokens=2000, block_size=block_size, temperature=0.8)[0].tolist()
model.train()
print(tokenizer.decode(generation))


The crazy witch riss an hinder-put in the rock from the new year to get, which the sun shone, and then the face of the ground, he was deliolently memorated, and wept the talk with a surprise, and bade his locks of exagle, while the king was twenty miles in the midstery, and let the same one or a stone that which the Cule and sadness were swing across a shining property and guroding on the earth. So the Chambering was placed, and the little mans rushed, and the heart seemed to be angry with the straw, the bears that could not have been a torreak, and she was written.
He led the old tiger and stared the little maiden. He brought this before.
"Who's a very best way a wicked woman, now the King and pardon got him to tell him all the night, a man of glocle of modern, and a cloak at the darkness of the prisoner, it was not so sure that the elbet, and in this there was the modest or rainburrange, and there was a heat, turned behind the took the robbers and watched the paravet, and so they had