Many thanks to nampdn-ai for the TinyStories (Vietnamese) dataset used in this notebook:

https://huggingface.co/datasets/nampdn-ai/tinystories-vietnamese

Copyright Hoa Vu

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import os
import pandas as pd
import torch.nn as nn
import torch.nn.functional as F
import random
import pickle
import math
from tokenizers.processors import BertProcessing
from transformers import AutoTokenizer, BertTokenizer
import gc

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Show which device was selected, then describe every visible CUDA device.
print(device)

if torch.cuda.is_available():
    # Print the number of CUDA devices
    print(f"Number of CUDA Devices: {torch.cuda.device_count()}\n")

    # One short report per device: name, compute capability and memory in MB.
    for i in range(torch.cuda.device_count()):
        print(f"Device {i}:")
        print(f"    Name: {torch.cuda.get_device_name(i)}")
        print(f"    Computational Capability: {torch.cuda.get_device_capability(i)}")
        print(f"    Total Memory: {torch.cuda.get_device_properties(i).total_memory / (1024**2):.2f} MB\n")
else:
    print("CUDA is not available on your system.")



cuda
Number of CUDA Devices: 1

Device 0:
    Name: NVIDIA GeForce RTX 3090
    Computational Capability: (8, 6)
    Total Memory: 24575.50 MB



In [3]:
# Directory (relative to the working directory) that holds the dataset CSV
# and the saved model checkpoints.
data_path = 'datasets/tiny_stories/'

In [4]:
# Read the stories CSV and discard every row that has a missing value.
filename = 'tiny_stories.csv'
filepath = os.path.join(data_path, filename)

data = pd.read_csv(filepath).dropna()

print(data.head())


   Unnamed: 0                                                 en  \
0           0  Once upon a time, there was a little boy named...   
1           1  Once upon a time, there was a normal boy named...   
2           2  Once upon a time, there was a fast car named S...   
3           3  Once there was a family with a very playful go...   
4           4  Once upon a time, there was a dog named Max. M...   

                                                  vi  
0  Ngày xửa ngày xưa, có một cậu bé tên Tim. Tim ...  
1  Ngày xửa ngày xưa, có một cậu bé bình thường t...  
2  Ngày xửa ngày xưa, có một chiếc xe tốc độ tên ...  
3  Có một gia đình có một con dê rất hay vui đùa....  
4  Ngày xửa ngày xưa, có một con chó tên là Max. ...  


In [5]:
# Sequential (unshuffled) 90/10 split: the first 90% of the rows are used for
# training and the remaining rows for validation.
N = len(data)
train_size = int(0.9 * N)
train_data, val_data = data.iloc[:train_size], data.iloc[train_size:]

print(train_data.shape, val_data.shape)

(2339822, 3) (259981, 3)


In [6]:
from transformers import BertTokenizer, AutoModel, AutoTokenizer

# Tokenizer for the English stories.
# Alternative kept for reference: AutoTokenizer.from_pretrained("vinai/phobert-base-v2")
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

# Special-token ids: pad/unk are used when building batches, sep is used to
# stop generation.
pad_id, unk_id = tokenizer.pad_token_id, tokenizer.unk_token_id
cls_id, sep_id = tokenizer.cls_token_id, tokenizer.sep_token_id

print("PAD Token ID:", pad_id)
print("UNK Token ID:", unk_id)
print("CLS Token ID:", cls_id)
print("SEP Token ID:", sep_id)

PAD Token ID: 0
UNK Token ID: 100
CLS Token ID: 101
SEP Token ID: 102


In [7]:
# BLOCK_SIZE: number of tokens in each training sequence, and the number of
# positions the model's position-embedding table covers.
BLOCK_SIZE = 256
# VOCAB_SIZE: size of the tokenizer vocabulary; it sizes the token embedding
# table and the output layer of the model.
VOCAB_SIZE = tokenizer.vocab_size
print('Vocab size is ', VOCAB_SIZE)


Vocab size is  30522


In [19]:
def get_batch(data, block_size, batch_size):
    """Sample a random batch of next-token-prediction pairs from the English stories.

    For every sampled story the text is tokenized, unknown tokens are dropped,
    and a random window of ``block_size + 1`` consecutive tokens is cut out.
    The first ``block_size`` tokens of the window are the inputs and the last
    ``block_size`` tokens (shifted by one) are the targets. Stories shorter
    than the window are right-padded with ``pad_id``.

    Relies on the notebook-level ``tokenizer``, ``unk_id`` and ``pad_id``.

    Args:
        data: DataFrame with an ``'en'`` column of story texts.
        block_size: Number of tokens per sequence.
        batch_size: Number of stories to sample (without replacement).

    Returns:
        Tuple ``(x, y)`` of ``torch.long`` tensors, each of shape
        ``(batch_size, block_size)``, where ``y`` is ``x`` shifted left by one.
    """
    x = torch.zeros((batch_size, block_size), dtype=torch.long)
    y = torch.zeros((batch_size, block_size), dtype=torch.long)
    samples = data['en'].sample(n=batch_size)

    # One window holds block_size inputs plus the one extra token that is the
    # target of the last input.
    window = block_size + 1

    for i, sample in enumerate(samples):
        token_ids = [tok for tok in tokenizer.encode(sample) if tok != unk_id]
        shortfall = window - len(token_ids)
        if shortfall > 0:
            token_ids = token_ids + [pad_id] * shortfall

        # randint is inclusive on both ends, so the last admissible start is
        # len - window. The previous bound (len - block_size - 2) stopped one
        # position early, so the final token of a long story (its [SEP]) could
        # never appear as a target.
        random_start = random.randint(0, len(token_ids) - window)
        chunk = torch.tensor(token_ids[random_start:random_start + window], dtype=torch.long)

        # Each row is exactly block_size long, so assign whole rows.
        x[i] = chunk[:-1]
        y[i] = chunk[1:]

    return x, y

# Sanity check: draw a single training example and show the input sequence
# followed by its target sequence, special tokens included.
a, b = get_batch(train_data, block_size=BLOCK_SIZE, batch_size=1)
print(a.shape, b.shape)
print(
    *(tokenizer.decode(batch[0].tolist(), skip_special_tokens=False) for batch in (a, b)),
    sep='\n',
)

torch.Size([1, 256]) torch.Size([1, 256])
[CLS] once upon a time, there was a little boy named tim. tim had a fancy bike that he loved to ride. one day, tim went for a ride with his bike in the park. he took a bottle of water with him. while tim was riding, he saw a big tree. tim wanted to take a break, so he stopped under the tree. he put his bottle on the ground and sat down. tim looked up and saw a bird in the tree. the bird was singing a pretty song. suddenly, the bird flew down and picked up tim's bottle with its beak. tim was so surprised! he didn't know birds could do that. the bird flew away with the bottle. tim got on his bike and tried to follow the bird. he wanted his bottle back. the bird led tim to a nest with baby birds. they were thirsty. tim understood that the bird took the bottle to give water to the babies. he smiled and let the bird keep the bottle. tim rode back home feeling happy that he helped the baby birds. [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] 

In [None]:
# Drop the model and the last batch so their GPU memory can be reclaimed.
# The names are removed only if they exist, so this cell also works on a fresh
# kernel and after training (the training loop already deletes xb / yb).
for _name in ("model", "xb", "yb"):
    globals().pop(_name, None)

gc.collect()  # collect unreachable objects first so their tensors are released
torch.cuda.empty_cache()  # then hand PyTorch's cached, now-unused blocks back to the driver

In [22]:
# Model hyperparameters, passed to LanguageModel when the model is created.
N_EMB = 800  # embedding / model width
N_LAYERS = 6  # number of Transformer encoder layers
N_HEADS = 10  # attention heads per layer
DROPOUT = 0.2  # dropout probability inside the Transformer layers

def estimate_loss(model, val_data, block_size, batch_size):
    """Estimate the validation loss on one randomly sampled batch.

    The model is switched to eval mode (no dropout) and run without gradient
    tracking. Afterwards it is put back into whatever mode it was in when the
    function was called, even if the forward pass raises.

    Relies on the notebook-level ``get_batch`` and ``device``.

    Args:
        model: Module whose call ``model(x, y)`` returns ``(logits, loss)``.
        val_data: DataFrame passed to ``get_batch``.
        block_size: Sequence length of the sampled batch.
        batch_size: Number of sequences in the sampled batch.

    Returns:
        The loss of the sampled batch as a Python float.
    """
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            x, y = get_batch(val_data, block_size, batch_size)
            x, y = x.to(device), y.to(device)
            _, loss = model(x, y)
    finally:
        # Restore the caller's mode instead of unconditionally enabling training.
        model.train(was_training)
    return loss.item()

def generate_square_subsequent_mask(sz):
    """Return the additive causal attention mask of shape ``(sz, sz)``.

    Entry ``[i, j]`` is ``0.0`` when ``j <= i`` (position ``i`` may attend to
    ``j``) and ``-inf`` when ``j > i`` (future positions are hidden).
    """
    blocked = torch.full((sz, sz), float('-inf'))
    # Keep -inf strictly above the diagonal; triu zeroes everything else.
    return torch.triu(blocked, diagonal=1)

def get_sine_position_encodings(length, dim):
    """Build a fixed sinusoidal position-encoding table.

    Even columns hold ``sin(pos * w_k)`` and odd columns hold
    ``cos(pos * w_k)`` with ``w_k = 10000 ** (-2k / dim)``.

    Args:
        length: Number of positions (rows).
        dim: Encoding width (columns). May be even or odd.

    Returns:
        Float tensor of shape ``(length, dim)``.
    """
    pos = torch.arange(length, dtype=torch.float32).reshape(-1, 1)
    div_term = torch.exp(torch.arange(0, dim, 2).float() * -(math.log(10000.0) / dim))
    angles = pos * div_term  # one column per sine slot

    pos_encodings = torch.zeros(length, dim)
    pos_encodings[:, 0::2] = torch.sin(angles)
    # For an odd `dim` there is one cosine column fewer than sine columns, so
    # trim the angles to fit (this used to fail with a shape mismatch).
    pos_encodings[:, 1::2] = torch.cos(angles[:, :dim // 2])
    return pos_encodings

class LanguageModel(nn.Module):
    """Causally masked Transformer language model.

    Token embeddings and learned position embeddings are summed, passed through
    a stack of ``nn.TransformerEncoderLayer`` blocks under a causal mask, added
    back to the embeddings (outer residual connection), run through a small
    feed-forward block and projected to vocabulary logits.

    Args:
        vocab_size: Number of token ids the model reads and predicts.
        n_emb: Embedding / model width.
        block_size: Maximum sequence length (size of the position table).
        n_layers: Number of Transformer encoder layers.
        n_heads: Attention heads per layer.
        dropout: Dropout probability inside the Transformer layers.
    """

    def __init__(self, vocab_size, n_emb, block_size, n_layers, n_heads, dropout=0.2):
        super().__init__()

        self.token_embedding_table = nn.Embedding(vocab_size, n_emb)
        self.position_embedding_table = nn.Embedding(block_size, n_emb)
        self.block_size = block_size

        encoder_layer = nn.TransformerEncoderLayer(d_model=n_emb, nhead=n_heads, dropout=dropout, layer_norm_eps=1e-6)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=n_layers)

        self.feed_forward = nn.Sequential(
            nn.Linear(n_emb, 2 * n_emb),
            nn.ReLU(),
            nn.Linear(2 * n_emb, n_emb)
        )

        self.lm_head = nn.Linear(n_emb, vocab_size)

    @staticmethod
    def _causal_mask(size, device):
        """Additive ``(size, size)`` mask: 0 on/below the diagonal, -inf above it."""
        return torch.triu(torch.full((size, size), float('-inf'), device=device), diagonal=1)

    def forward(self, idx, targets=None):
        """Compute logits and, when targets are given, the cross-entropy loss.

        Args:
            idx: Long tensor of token ids, shape ``(B, T)`` with
                ``T <= block_size``.
            targets: Optional long tensor of shape ``(B, T)`` with the
                next-token ids for every position.

        Returns:
            ``(logits, None)`` with logits of shape ``(B, T, vocab_size)`` when
            ``targets`` is None, otherwise ``(logits, loss)`` with logits
            flattened to ``(B * T, vocab_size)`` and a scalar loss.

        Raises:
            ValueError: If the sequence is longer than ``block_size``.
        """
        B, T = idx.shape
        if T > self.block_size:
            # Fail early with a clear message instead of an out-of-range
            # lookup in the position embedding table.
            raise ValueError(f"sequence length {T} exceeds block_size {self.block_size}")

        # Build positions and the mask on the input's own device so the model
        # does not depend on a notebook-level `device` variable.
        token_emb = self.token_embedding_table(idx)
        position_emb = self.position_embedding_table(torch.arange(T, device=idx.device))
        x = token_emb + position_emb
        mask = self._causal_mask(T, idx.device)

        # The encoder layers were built without batch_first, so they expect
        # (T, B, C); permute in and back out.
        x_transform = self.transformer_encoder(x.permute(1, 0, 2), mask=mask)
        x = x + x_transform.permute(1, 0, 2)

        x = self.feed_forward(x)
        logits = self.lm_head(x)

        if targets is None:
            return logits, None

        B, T, C = logits.shape
        logits = logits.view(B * T, C)
        loss = F.cross_entropy(logits, targets.reshape(B * T))
        return logits, loss

    @torch.no_grad()
    def generate(self, idx, max_new_tokens, temperature=1.0, stop_token=False):
        """Sample a continuation of ``idx`` one token at a time.

        Runs without gradient tracking. Only the last ``block_size`` tokens are
        fed to the model at each step.

        Args:
            idx: Long tensor of prompt token ids, shape ``(B, T)``.
            max_new_tokens: Upper bound on the number of sampled tokens.
            temperature: Logits are divided by this value before the softmax.
            stop_token: When True, stop as soon as every sequence in the batch
                has sampled the notebook-level ``sep_id`` at least once.
                Sequences that finish early keep receiving sampled tokens until
                the whole batch is done.

        Returns:
            Long tensor of shape ``(B, T + n)`` with ``n <= max_new_tokens``.
        """
        finished = torch.zeros(idx.shape[0], dtype=torch.bool, device=idx.device)
        for _ in range(max_new_tokens):
            idx_cond = idx[:, -self.block_size:]
            logits, _ = self(idx_cond)

            # Scale logits by the temperature
            logits = logits[:, -1, :] / temperature

            probs = F.softmax(logits, dim=-1)
            idx_new = torch.multinomial(probs, num_samples=1)
            idx = torch.cat([idx, idx_new], dim=-1)
            if stop_token:
                # Track completion per sequence so batches larger than one work.
                finished |= (idx_new.squeeze(-1) == sep_id)
                if finished.all():
                    break
        return idx

# Build the model from the hyperparameters above and move it to the selected
# device, then report how many trainable parameters it has.
model = LanguageModel(
    vocab_size=VOCAB_SIZE,
    block_size=BLOCK_SIZE,
    n_emb=N_EMB,
    n_layers=N_LAYERS,
    n_heads=N_HEADS,
    dropout=DROPOUT,
).to(device)

print(f'Number of parameters {sum(p.numel() for p in model.parameters() if p.requires_grad)}')

Number of parameters 86709210


In [13]:
# Resume from a previously saved checkpoint, if one exists.
model_path = os.path.join(data_path, "english_tiny_stories_6layers.pth")
print(model_path)

# The checkpoint is a fully pickled module (it is written with
# torch.save(model, ...)), therefore:
#   * map_location=device lets a checkpoint saved on a GPU load on a CPU-only machine;
#   * weights_only=False is needed on recent PyTorch releases to unpickle a whole
#     module. Unpickling can execute arbitrary code, so only load files you created.
if os.path.exists(model_path):
    model = torch.load(model_path, map_location=device, weights_only=False)
else:
    print('No checkpoint found at', model_path, '- keeping the current model')

datasets/tiny_stories/vietnamese_tiny_stories_6layers.pth


In [31]:
# Training configuration.
EARLY_STOP = 50  # not used by the loop below
N_EPOCHS = 20000  # number of optimizer steps (one random batch per step)
BATCH_SIZE = 32
LEARNING_RATE = 3e-4
SAVE_FREQ = 500  # save a checkpoint every SAVE_FREQ steps
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

model_path = os.path.join(data_path, "english_tiny_stories_6layers.pth")

for steps in range(N_EPOCHS):
    model.train()
    xb, yb = get_batch(train_data, block_size=BLOCK_SIZE, batch_size=BATCH_SIZE)
    xb = xb.to(device)
    yb = yb.to(device)
    logits, loss = model(xb, yb)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # Drop the batch and the (large) logits tensor so their memory can be
    # reused by the validation pass. torch.cuda.empty_cache() is deliberately
    # not called here: it does not give PyTorch more usable memory, and calling
    # it every step forces costly re-allocations.
    del xb, yb, logits

    if steps % 100 == 0:
        print('Step:', steps, 'Training Loss:', loss.item())
        val_loss = estimate_loss(model, val_data, block_size=BLOCK_SIZE, batch_size=BATCH_SIZE)
        print('Validation loss:', val_loss)
    if steps % SAVE_FREQ == 0:
        # Saves the whole module object; the checkpoint-loading cell expects this format.
        torch.save(model, model_path)
        print('Model saved at', model_path)


Step: 0 Training Loss: 1.1610132455825806
Validation loss: 1.0325729846954346
Model saved at datasets/tiny_stories/english_tiny_stories_6layers.pth
Step: 100 Training Loss: 1.030817985534668
Validation loss: 0.9163889288902283
Step: 200 Training Loss: 1.059973120689392
Validation loss: 1.0165120363235474
Step: 300 Training Loss: 1.1376601457595825
Validation loss: 1.1046128273010254
Step: 400 Training Loss: 1.1464565992355347
Validation loss: 1.1399215459823608
Step: 500 Training Loss: 1.1293452978134155
Validation loss: 1.0974719524383545
Model saved at datasets/tiny_stories/english_tiny_stories_6layers.pth
Step: 600 Training Loss: 1.0733214616775513
Validation loss: 1.0089744329452515
Step: 700 Training Loss: 1.0040119886398315
Validation loss: 1.0509716272354126
Step: 800 Training Loss: 1.1812925338745117
Validation loss: 1.0350391864776611
Step: 900 Training Loss: 1.1636146306991577
Validation loss: 1.0675441026687622
Step: 1000 Training Loss: 1.072442650794983
Validation loss: 1.0

In [32]:
# Save the trained model under a separate "_f" filename so it does not
# overwrite the periodic checkpoint written by the training loop.
# torch.save(model, ...) stores the whole module object, not just a state_dict.
model_path = os.path.join(data_path, "english_tiny_stories_6layers_f.pth")
torch.save(model, model_path)

In [35]:
# Prompt that every sampled story starts from.
starting_tokens = 'Alice wandered around and met a puppy. She was very happy.'
# Lower-case the prompt before encoding and drop the final token the tokenizer
# appends ([SEP]) so the model continues the prompt instead of seeing a
# finished sequence.
encoded_start = tokenizer.encode(starting_tokens.lower())[:-1]
len_starting_tokens = len(encoded_start)

idx = torch.tensor(encoded_start).reshape(1, len_starting_tokens).to(device)
model.eval()
N_SAMPLES = 20

# Sampling needs no gradients; disabling them saves memory and time.
with torch.no_grad():
    # A named loop variable is used instead of `_`, which IPython reserves for
    # the last cell output.
    for sample_no in range(1, N_SAMPLES + 1):
        generation = model.generate(idx, max_new_tokens=2000, temperature=0.5, stop_token=True)[0].tolist()
        story = tokenizer.decode(generation, skip_special_tokens=False)

        print('Story ', sample_no, ':')
        print(story)
        print('\n')



Story  1 :
[CLS] alice wandered around and met a puppy. she was very happy. the puppy was so friendly and friendly. alice and the puppy became best friends. one day, alice and the puppy were walking in the park. alice saw a big tree and said, " let's climb that tree! " the puppy barked and wagged his tail. alice was so excited and said, " let's climb up there! " alice and the puppy started to climb. alice was so happy, she jumped up and down. she felt like she was flying! the puppy barked and barked until they reached the top. alice said, " look at me! i'm flying! " the puppy barked and wagged his tail. alice smiled and said, " you're so nice, puppy! " alice then climbed down the tree and waved goodbye. alice and the puppy had a great day in the park. [SEP]


Story  2 :
[CLS] alice wandered around and met a puppy. she was very happy. she asked the puppy, " do you want to play with me? " the puppy barked and wagged his tail. alice was so excited. she jumped around, and the puppy barked 