In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import os
import pandas as pd
import torch.nn as nn
import torch.nn.functional as F
import random
import pickle
from transformers import BertTokenizer, BertModel
import pickle
import sentencepiece as spm
from tqdm import tqdm
from tokenizers import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

  from .autonotebook import tqdm as notebook_tqdm


In [7]:
# Read the whole cleaned corpus into memory as one string. The tokenizer cell
# trains on this same file path.
with open('datasets/text/clean_tales.txt', 'r', encoding='utf-8') as f:
    text = f.read()



In [3]:
# Byte-level BPE tokenizer, trained from scratch on the corpus file.
tokenizer = ByteLevelBPETokenizer()

# Learn a 3000-entry vocabulary, keeping only merges seen at least twice.
# The five special tokens are registered as part of the vocabulary.
tokenizer.train(files=["datasets/text/clean_tales.txt"], vocab_size=3000, min_frequency=2, special_tokens=[
    "<s>",
    "<pad>",
    "</s>",
    "<unk>",
    "<mask>",
])

# Persist the trained tokenizer into datasets/text/ so it can be reused
# later without retraining.
tokenizer.save_model("datasets/text/")

# Round-trip a short string. This runs BEFORE the post-processor below is
# installed, so the printed ids contain no special-token ids.
output = tokenizer.encode("I love programming.")
print("Encoded string: ", output.ids)  # token ids produced for the string
print("Decoded string: ", tokenizer.decode(output.ids))  # ids mapped back to text

# Install a BERT-style post-processor. From here on, encode() wraps every
# sequence in special tokens (later output shows ids 0 ... 2 around the text,
# presumably <s> and </s>).
# NOTE(review): this assigns through the private `_tokenizer` attribute.
tokenizer._tokenizer.post_processor = BertProcessing(
    ("</s>", tokenizer.token_to_id("</s>")),
    ("<s>", tokenizer.token_to_id("<s>")),
)


Encoded string:  [45, 991, 691, 1858, 367, 81, 284, 18]
Decoded string:  I love programming.


In [5]:
# Sanity check on the first 100 characters of the corpus. The post-processor
# is installed by now, so the ids come back wrapped in special-token ids
# (0 ... 2 in the output below); the decoded text does not show them.
output = tokenizer.encode(text[:100])
print("Encoded string: ", output.ids)  # token ids, including the wrapping special ids
print("Decoded string: ", tokenizer.decode(output.ids))  # ids mapped back to text


Encoded string:  [0, 350, 385, 2716, 1250, 18, 203, 44, 45, 43, 44, 1772, 264, 2305, 16, 321, 263, 2259, 1741, 509, 82, 16, 969, 264, 338, 280, 649, 294, 264, 385, 2716, 1250, 18, 225, 495, 267, 2]
Decoded string:  The Happy Prince.
HIGH above the city, on a tall column, stood the statue of the Happy Prince.  He w


In [9]:

# Tokenize the whole corpus in one call, then split the token stream 80/20 in
# document order (no shuffling): the first 80% trains, the last 20% validates.
# NOTE(review): the post-processor is active at this point, so text_ids
# presumably starts with <s> and ends with </s> - confirm this is intended.
text_ids = tokenizer.encode(text).ids
train_size = int(len(text_ids) * 0.8)
train_ids = torch.tensor(text_ids[:train_size], dtype=torch.long)
val_ids = torch.tensor(text_ids[train_size:], dtype=torch.long)
print(f'Number of tokens: {len(text_ids)}')
print(f'Vocab size: {tokenizer.get_vocab_size()}')

Number of tokens: 5975565
Vocab size: 3000


In [16]:

def estimate_loss(model, val_data, block_size, batch_size, eval_iters=1):
    """Estimate the validation loss from randomly sampled batches.

    The model is switched to eval mode (dropout off) and run without autograd.
    Its previous train/eval mode is restored afterwards, even if the forward
    pass raises, so calling this on a model that is already in eval mode no
    longer flips it back to train mode.

    Args:
        model: module whose ``forward(x, y)`` returns ``(logits, loss)``.
        val_data: 1-D tensor of token ids to sample windows from.
        block_size: number of tokens per sampled window.
        batch_size: number of windows per batch.
        eval_iters: how many batches to average over. The default of 1 keeps
            the original single-batch behaviour; larger values give a less
            noisy estimate.

    Returns:
        float: mean loss over ``eval_iters`` batches.

    Raises:
        ValueError: if ``eval_iters`` is smaller than 1.
    """
    if eval_iters < 1:
        raise ValueError("eval_iters must be >= 1")

    # Follow the model's own device instead of a notebook-level global, so the
    # helper keeps working if the model lives somewhere else.
    model_device = next(model.parameters()).device

    was_training = model.training
    model.eval()
    total = 0.0
    try:
        with torch.no_grad():
            for _ in range(eval_iters):
                x, y = get_batch(val_data, block_size, batch_size)
                _, loss = model(x.to(model_device), y.to(model_device))
                total += loss.item()
    finally:
        # Put the model back in whichever mode the caller had it in.
        model.train(was_training)
    return total / eval_iters

def generate_square_subsequent_mask(sz):
    """Return the additive causal attention mask of shape ``(sz, sz)``.

    Entries on and below the diagonal are ``0.0`` (position may be attended
    to); entries above the diagonal are ``-inf`` (future position, blocked).
    The mask is a float tensor created on the CPU.
    """
    blocked = torch.full((sz, sz), float('-inf'))
    # Keep -inf strictly above the diagonal; everything else becomes 0.
    return torch.triu(blocked, diagonal=1)
    
class LanguageModel(nn.Module):
    """Causal language model built on ``nn.TransformerEncoder``.

    Token embeddings and learned positional embeddings are summed, run through
    a causally masked encoder stack, added back onto the embeddings (a single
    outer residual connection), then passed through a position-wise MLP and a
    linear head that yields per-position vocabulary logits.

    Args:
        vocab_size: number of distinct token ids.
        n_emb: embedding / model width.
        n_layers: number of encoder layers.
        n_heads: attention heads per layer.
        block_size: size of the positional-embedding table, i.e. the longest
            sequence ``forward`` can accept.
        dropout: dropout probability inside the encoder layers.
    """

    def __init__(self, vocab_size, n_emb, n_layers, n_heads, block_size, dropout=0.2):
        super(LanguageModel, self).__init__()

        self.token_embedding_table = nn.Embedding(vocab_size, n_emb)
        self.position_embedding_table = nn.Embedding(block_size, n_emb)

        encoder_layer = nn.TransformerEncoderLayer(d_model=n_emb, nhead=n_heads, dropout=dropout)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=n_layers)

        self.feed_forward = nn.Sequential(
            nn.Linear(n_emb, 4 * n_emb),
            nn.ReLU(),
            nn.Linear(4 * n_emb, n_emb)
        )

        self.lm_head = nn.Linear(n_emb, vocab_size)

    def forward(self, idx, targets=None):
        """Compute logits, and the cross-entropy loss when targets are given.

        Args:
            idx: integer tensor of token ids, shape ``(B, T)``.
            targets: optional integer tensor of next-token ids, shape ``(B, T)``.

        Returns:
            ``(logits, loss)``. Without targets, logits have shape
            ``(B, T, vocab_size)`` and loss is ``None``. With targets, logits
            are flattened to ``(B * T, vocab_size)`` and loss is a scalar.
        """
        B, T = idx.shape
        # Build positions and the mask on the input's device rather than a
        # notebook-level global, so the module works wherever it is moved.
        positions = torch.arange(T, device=idx.device)

        x = self.token_embedding_table(idx) + self.position_embedding_table(positions)
        mask = generate_square_subsequent_mask(T).to(idx.device)

        # The encoder layers were built without batch_first, so they take
        # (T, B, C); permute in and back out.
        encoded = self.transformer_encoder(x.permute(1, 0, 2), mask=mask)
        x = x + encoded.permute(1, 0, 2)

        x = self.feed_forward(x)
        logits = self.lm_head(x)

        if targets is None:
            return logits, None

        B, T, C = logits.shape
        # reshape (not view) so a non-contiguous tensor cannot break this.
        logits = logits.reshape(B * T, C)
        loss = F.cross_entropy(logits, targets.reshape(B * T))
        return logits, loss

    @torch.no_grad()
    def generate(self, idx, max_new_tokens, block_size, temperature=1.0):
        """Autoregressively sample ``max_new_tokens`` tokens after ``idx``.

        Sampling runs in eval mode (dropout off) and without autograd; the
        module's previous train/eval mode is restored on exit.

        Args:
            idx: integer tensor of prompt token ids, shape ``(B, T0)``.
            max_new_tokens: number of tokens to append.
            block_size: context window to condition on. Values larger than the
                positional-embedding table are clamped to the table size.
            temperature: divisor applied to the logits before softmax; must be
                positive. Values below 1 sharpen the distribution, values
                above 1 flatten it.

        Returns:
            Tensor of shape ``(B, T0 + max_new_tokens)`` holding the prompt
            followed by the sampled tokens.

        Raises:
            ValueError: if ``temperature`` is not positive.
        """
        if temperature <= 0:
            raise ValueError("temperature must be > 0")

        # Never feed more positions than the positional table can index.
        context = min(block_size, self.position_embedding_table.num_embeddings)

        was_training = self.training
        self.eval()
        try:
            for _ in range(max_new_tokens):
                idx_cond = idx[:, -context:]
                logits, _ = self(idx_cond)

                # Only the last position predicts the next token.
                logits = logits[:, -1, :] / temperature

                probs = F.softmax(logits, dim=-1)
                idx_new = torch.multinomial(probs, num_samples=1)
                idx = torch.cat([idx, idx_new], dim=-1)
        finally:
            self.train(was_training)
        return idx

# Hyperparameters
block_size = 75  # context length in tokens; also sizes the positional-embedding table
vocab_size = tokenizer.get_vocab_size()
n_emb = 300  # embedding / model width
n_layers = 6  # number of encoder layers
n_heads = 5  # 300 / 5 = 60 dimensions per attention head
dropout = 0.2

# Same expression as the assignment in the imports cell; repeated here so
# this cell can be re-run on its own.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = LanguageModel(vocab_size, n_emb, n_layers, n_heads, block_size, dropout).to(device)
# Count only the parameters that receive gradients.
print(f'Number of parameters {sum(p.numel() for p in model.parameters() if p.requires_grad)}')




Number of parameters 12108288


In [18]:
def get_batch(data, block_size, batch_size):
    """Sample ``batch_size`` random training windows from a 1-D token tensor.

    Returns ``(x, y)``, both of shape ``(batch_size, block_size)``, where each
    row of ``y`` is the matching row of ``x`` shifted one token to the right
    in ``data`` (the next-token targets).
    """
    starts = torch.randint(0, len(data) - block_size, (batch_size,)).tolist()
    inputs, targets = [], []
    for start in starts:
        # One slice of block_size + 1 tokens provides both the input window
        # (all but the last token) and the target window (all but the first).
        window = data[start:start + block_size + 1]
        inputs.append(window[:-1])
        targets.append(window[1:])
    return torch.stack(inputs), torch.stack(targets)

# Smoke test: draw one window and print it as text and as ids. The target row
# should read as the input row shifted by one token.
a, b = get_batch(train_ids, block_size, 1)
print(tokenizer.decode(a[0].tolist()))
print(tokenizer.decode(b[0].tolist()))
print(a)
print(b)
print(a.shape, b.shape)

, without being quite sure what the best really was.  Money, position, fashionable accomplishments, and elegant manners were most desirable things in her eyes, and she liked to associate with those who possessed them, often mistaking the false for the true, and admiring what
 without being quite sure what the best really was.  Money, position, fashionable accomplishments, and elegant manners were most desirable things in her eyes, and she liked to associate with those who possessed them, often mistaking the false for the true, and admiring what was
tensor([[  16,  886,  923,  794, 1096,  502,  264, 1165, 1233,  314,   18,  225,
          375, 1110,   16, 2865, 1374,   16,  277, 1211,  377,  620, 2729,  737,
          619, 1909,   16,  275, 1822,   75,  425,  452,   82,  490,  409,  873,
          864,  335,  620,  910,  300,  343,  752,   16,  275,  359, 2164,  282,
          334, 1053, 1272,  540,  345, 1029,  439, 1240, 2217,  429,   16, 1565,
         1835,  868,  264,  277,  357,  

In [22]:


# Training configuration.
batch_size = 16
early_stop = 20        # remaining non-improving validation checks before stopping
last_val_loss = 1e9    # best validation loss seen so far
n_epochs = 1000        # number of optimisation steps (one random batch each)
learning_rate = 3e-4
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Validation runs every 100 steps. With 1000 steps that is 10 checks, so the
# patience of 20 cannot be used up within a single run of this cell.
for steps in range(n_epochs):
    model.train()
    xb, yb = get_batch(train_ids, block_size, batch_size)
    xb, yb = xb.to(device), yb.to(device)

    optimizer.zero_grad()
    logits, loss = model(xb, yb)
    loss.backward()
    optimizer.step()

    # Nothing more to do on steps that are not validation checkpoints.
    if steps % 100 != 0:
        continue

    print('Step:', steps, 'Training Loss:', loss.item())
    val_loss = estimate_loss(model, val_ids, block_size, batch_size)
    print('Validation loss:', val_loss)

    # Improvement (anything that is not ">= best"): reset patience, record best.
    if not (val_loss >= last_val_loss):
        early_stop = 20
        last_val_loss = val_loss
        continue

    # No improvement: spend one unit of patience and stop when it runs out.
    early_stop -= 1
    if early_stop == 0:
        print('Early stop!')
        break

Step: 0 Training Loss: 4.923049449920654
Validation loss: 5.0128021240234375
Step: 100 Training Loss: 4.605337142944336
Validation loss: 4.853498935699463
Step: 200 Training Loss: 4.876150131225586
Validation loss: 4.920846462249756
Step: 300 Training Loss: 4.563838481903076
Validation loss: 4.820635795593262
Step: 400 Training Loss: 4.809290409088135
Validation loss: 4.89808464050293
Step: 500 Training Loss: 4.753746509552002
Validation loss: 4.858625888824463
Step: 600 Training Loss: 4.574955940246582
Validation loss: 4.820096015930176
Step: 700 Training Loss: 4.658062934875488
Validation loss: 4.822627544403076
Step: 800 Training Loss: 4.613270282745361
Validation loss: 4.784586429595947
Step: 900 Training Loss: 4.569982528686523
Validation loss: 4.621920585632324


In [29]:
# Serialises the whole module object, not just its weights.
# NOTE(review): loading this file back presumably needs the LanguageModel class
# to be defined under the same name; saving model.state_dict() would be more
# portable - confirm what the loading side expects before changing it.
torch.save(model, 'datasets/text/clean_tales.pt')

In [24]:
starting_tokens = 'The king'

# The tokenizer has a BertProcessing post-processor installed, so a plain
# encode() wraps the prompt in special tokens (<s> ... </s>). A prompt ending
# in </s> makes the model continue from an end-of-sequence marker, so encode
# the prompt without special tokens.
encoded_start = tokenizer.encode(starting_tokens, add_special_tokens=False).ids
len_starting_tokens = len(encoded_start)

idx = torch.tensor(encoded_start, dtype=torch.long, device=device).reshape(1, len_starting_tokens)

# Training and estimate_loss leave the model in train mode. Switch to eval so
# dropout is off while sampling, and skip autograd bookkeeping. The training
# cell calls model.train() itself, so it is unaffected by this.
model.eval()
with torch.no_grad():
    generation = model.generate(idx, max_new_tokens=2000, block_size=block_size, temperature=1)[0].tolist()
print(tokenizer.decode(generation))


The king you, too liked Oz southiness on him caught in searching the educers lying with a foolishference donkey as the whiteened in all of to tearsest the dimpect, his mind to think her tears of his father commands not hunt a dog.
“Yes, and Jo spoke to die, or myself to see Edward by threw herself and the chief very sorry and to the door.
"A week; a face, even to beachuly upon however, will let me as if Jupitere were taken down to himself a box around as suddenly until understand was the Godsaid one of the futtle, and sides. That's remaining with him.  I'm as another black Christmas. uttered growing middle of a week Aflein. I see her too he climbed, and find him.
The two devanks now, but what do see," said the camp stood through a very much to keep the an exer. Height things where it occasion. Naturred in the dearehold as thou art I amused to himself up the nonsievna like a wakers and everything.
that her, still Nor in Cous!
“I amboria.
“Yes," said doing soft could only seen him to mak

The king you, too liked Oz southiness on him caught in searching the educers lying with a foolishference donkey as the whiteened in all of to tearsest the dimpect, his mind to think her tears of his father commands not hunt a dog.
“Yes, and Jo spoke to die, or myself to see Edward by threw herself and the chief very sorry and to the door.
"A week; a face, even to beachuly upon however, will let me as if Jupitere were taken down to himself a box around as suddenly until understand was the Godsaid one of the futtle, and sides. That's remaining with him.  I'm as another black Christmas. uttered growing middle of a week Aflein. I see her too he climbed, and find him.
The two devanks now, but what do see," said the camp stood through a very much to keep the an exer. Height things where it occasion. Naturred in the dearehold as thou art I amused to himself up the nonsievna like a wakers and everything.
that her, still Nor in Cous!
“I amboria.
“Yes," said doing soft could only seen him to make of froxenuzzled. Now were very rich days, and stood the tid for what a lotted by considered and bones came a large mipped to which is, 'Where knew how dirty, but Ia Ne,” said SWell, let you get bridaomine at Evrestory whatever you're born till morning riddled as she so that he would not watch. Napial time, Mr.  And when he went around him the reasses had been line;"Where’d with me.
“It was out. His teeth, and thenny Whitement, and flew side by a moment a ship, after angehomaid wise to Sunday-morrow, “I amused, but the extross, and a lovely country from his father refused the hooked, ask that ye been visage lies in drahas, he reached the win an owork called Ant.
So they heard, the ground us but witch," said the cliversson’s in their powerful time. Brell to Gania.
He did not approached in one desire in fact that the num.'
The wicked ugrandon sat down like the indignity, Night and subs ever that they like to busy abst easily," said at a than if the river, and close to say, she had usual, but the admitted well break a rapate me all he assens to his rings, now with proud speciers in the survals nest, and their mind; the torment before in the Lion brought him he desertion, and smiling to sit down the water's palace found me killed of pleasant nibrings of her as children, we are you know when thing this personally pictures. She always put it has probably.”
   South figure is always complimplapped himself besides and dartles, and everybody else has got small pore, then, do notice of the window with one of a wonderful years Laurie is twenty fire of thousand times of Rossship," said, and the other bones friend Miss Lake, Bringess and pleasant, which, and beneath the others, each as before.
Justy young form was sitting in order to Stubbish wretched as sheatouds of saying, because they seemed burning this business, then to feel  It was land. It is a sadly hit, and Khains, in the two morning, you only finally were no one had come off to sending but authorlock sea. It's talking where the leakened in men, and screams ab tree unless as if if she had quite black the fire?”
“No time, as bigity of thousand delicious, found anna tied into his forest, still rom slapping, he came home. “Full-habethave with except to come to leintment which, which they were getting what I have no longer. For village of rain. Dust, he go to the seal all liter bloombeelego, who well known the face, “I all all the sight of breaking cat. Sha would seem good vio, crying to tell me. She looked surprised that heave, "No, but a half-bury gave his feet, the woman who continued with a very murming at Queen had a big idea this line my place in theass, with the doing about and tell me.” “RUdays this story!”
“I was mood heart told, and the other less, that Mins be bla's a stoospit you must fields.
Becky.
Then he called the unacle or two womenless, for breakfast-ts that they stood upon them which music, blew slowly straighter.  But there be nothing to no, I wish you must help scarcell pustriced that you would try his cranning. Bid it was in the other slow surve?" inquired in the next day of charoinecucky, much good one for on her foughtical child, and in another, upon the portraphabethble face of him,” she saw a top of ange, moved among the bree. When this:
The girl, and the dark curiest-cobreadgrainverseus burn. The glit King, trave to her with him and fell down on a gallrolly terms of while you know," said walking.
     I, and Cham and will tell me falling believe me the possibly to his committed on the days.” So the garratabaors:
It was won’s no one eye tell?”
"If the wonderful whale ravored thousand. There is three times.”
But looking at poor or no more.
As ever pleasure, ainn't know, see his wrong as a bad ship will do. He then! 
So it. She thought that he found that different for the prince must be able to all? No questions. Wouldered, times cold and of bread and let those over with being fitted, as ever since you see; for you not expected what is not amazara, raisarucation.
HI never heard half-own since they did the left out bargard” and once, we must nearly look out all right, the general for it in generaleed before one whales led his full wid you I have, And had light room and Mrs. By running to behold, was a terrible chor of his grandfather of me,   Elets the flues, and stood a deep her too."
"Oh; and heaven! Do you eat you take his name, before, of his dignity who saw the a fair, and they came into phasmetrolirst manner were about, in heavens, tell the country, and thirtyious face away, for it, commanded up the woodlaring that had found, and face to Snowleding the poor, a drieding about it was placed to myself up there made a great old morted with a post you anybody was some day the blew the Six-day I am very merriblely of Sometimes of the cared a good cove spranging it is much wondered and afternoon my wife at last she saw to Berduously themselvesia walked disposopresassed, says that regarding them than I,     Fin herself of the ris name of a moment, like the objectsday I don't know whether it was on the Palace you to dris lips to this evening to you meanness was last. O Livesmiss me soon began to the wicked was told Amyartieved. 'Bet into the key the weddingment shall be able to seatedred rose to drawstead and on the Fret. Perest, for she raised a cut you?
Megical servant came to us forth, the choved as the house, and reading-cromfast the flucerhaps I saw him, my heartsman looked his spand-bble, “Now I creature.
"It's terribid this, and put all. Brady; A A deep, and Jack in the volk and rode out his old man likewise?”
“Oh, sheking of the flushed at all day,    velopard-morrow!" said Elinor, “Why did not mark.

“Don’ller darkly off she could or even de Dermination went to send.
“Oh, that seemed up every, “Walk to have been away, and pake your grandmother was nearestacmitusly called it is there?"
The Con, and intelligation. ’t care if. But there.
Then help him afterwards that since he would possessed through another sail, and b

