Data set https://huggingface.co/datasets/roneneldan/TinyStories

Copyright Hoa Vu

In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import os
import pandas as pd
import torch.nn as nn
import torch.nn.functional as F
import random
import pickle
import math
from transformers import AutoTokenizer, BertTokenizer
import gc

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [4]:
if not torch.cuda.is_available():
    print("CUDA is not available on your system.")
else:
    # Print the number of CUDA devices
    print(f"Number of CUDA Devices: {torch.cuda.device_count()}\n")

    for i in range(torch.cuda.device_count()):
        print(f"Device {i}:")
        print(f"    Name: {torch.cuda.get_device_name(i)}")
        print(f"    Computational Capability: {torch.cuda.get_device_capability(i)}")
        print(f"    Total Memory: {torch.cuda.get_device_properties(i).total_memory / (1024**2):.2f} MB\n")


Number of CUDA Devices: 1

Device 0:
    Name: NVIDIA GeForce RTX 3090
    Computational Capability: (8, 6)
    Total Memory: 24575.50 MB



In [5]:
data_path = 'datasets/tiny_stories/'

In [6]:
filename = 'TinyStoriesV2-GPT4-train.txt'
val_filename = 'TinyStoriesV2-GPT4-valid.txt'

filepath = os.path.join(data_path, filename)
with open(filepath, 'r', encoding='utf-8') as file:
    train_data = file.read()
train_data = train_data.replace('\n', '')
train_data = train_data.split('<|endoftext|>')

val_filepath = os.path.join(data_path, val_filename)
with open(val_filepath, 'r', encoding='utf-8') as file:
    val_data = file.read()
val_data = val_data.replace('\n', '')
val_data = val_data.split('<|endoftext|>')

print(train_data[0])


Once upon a time there was a little boy named Ben. Ben loved to explore the world around him. He saw many amazing things, like beautiful vases that were on display in a store. One day, Ben was walking through the store when he came across a very special vase. When Ben saw it he was amazed!  He said, “Wow, that is a really amazing vase! Can I buy it?” The shopkeeper smiled and said, “Of course you can. You can take it home and show all your friends how amazing it is!”So Ben took the vase home and he was so proud of it! He called his friends over and showed them the amazing vase. All his friends thought the vase was beautiful and couldn't believe how lucky Ben was. And that's how Ben found an amazing vase in the store!


In [7]:
train_data = pd.DataFrame(train_data, columns=['content'])
val_data = pd.DataFrame(val_data, columns=['content'])

print(train_data.shape)
print(val_data.shape)

print(train_data.head())


(2717700, 1)
(27631, 1)
                                             content
0  Once upon a time there was a little boy named ...
1  Once upon a time, there was a reliable otter n...
2  One day, a little boy named Tim went to the pa...
3  Once upon a time there was a friendly little b...
4  Once upon a time, in a small house, there live...


In [8]:
from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Get the IDs
cls_id = tokenizer.cls_token_id
sep_id = tokenizer.sep_token_id
unk_id = tokenizer.unk_token_id
pad_id = tokenizer.pad_token_id

print("PAD Token ID:", pad_id)
print("UNK Token ID:", unk_id)
print("CLS Token ID:", cls_id)
print("SEP Token ID:", sep_id)

PAD Token ID: 0
UNK Token ID: 100
CLS Token ID: 101
SEP Token ID: 102


In [9]:
BLOCK_SIZE = 256
VOCAB_SIZE = tokenizer.vocab_size
print('Vocab size is ', VOCAB_SIZE)


Vocab size is  30522


In [12]:
def get_batch(data, block_size, batch_size):
    x = torch.zeros((batch_size, block_size), dtype=torch.long)
    y = torch.zeros((batch_size, block_size), dtype=torch.long)
    samples = data['content'].sample(n=batch_size)

    for i, sample in enumerate(samples):
        token_ids = tokenizer.encode(sample)
        token_ids = [id for id in token_ids if id != unk_id]
        if len(token_ids) < block_size + 2:
            token_ids = token_ids + [pad_id] * (block_size + 2 - len(token_ids))
        random_start = random.randint(0, len(token_ids) - block_size - 2)
        x[i, :len(token_ids)] = torch.tensor(token_ids[random_start:random_start + block_size], dtype=torch.long)
        y[i, :len(token_ids)] = torch.tensor(token_ids[random_start + 1:random_start + block_size + 1], dtype=torch.long)

    return x, y

a, b = get_batch(train_data, block_size=BLOCK_SIZE, batch_size=1)
print(a.shape, b.shape)
print(tokenizer.decode(a[0].tolist(), skip_special_tokens=False))
print(tokenizer.decode(b[0].tolist(), skip_special_tokens=False))

torch.Size([1, 256]) torch.Size([1, 256])
[CLS] once upon a time, there was a little boy named tim. tim had an old toy car that he loved a lot. the toy car was very old and didn't go as fast as it used to. tim wanted to improve his toy car so it could go fast again. one day, tim asked his mom to help him improve his toy car. they worked together in silence, fixing the old toy car. they put new wheels on it and painted it a bright color. tim was very happy with the changes they made. now, tim's old toy car was fast again. he played with it every day, racing it around the house. the silence was gone as tim laughed and had fun with his improved toy car. he was so thankful to his mom for helping him make his old toy car better. [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [P

In [15]:
N_EMB = 800
N_LAYERS = 6
N_HEADS = 10
DROPOUT = 0.2


def estimate_loss(model, val_data, block_size, batch_size):
    model.eval()
    with torch.no_grad():
        x, y = get_batch(val_data, block_size, batch_size)
        x, y = x.to(device), y.to(device)
        _, loss = model(x, y)
    model.train()
    return loss.item()

def generate_square_subsequent_mask(sz):
    mask = (torch.tril(torch.ones(sz, sz)) == 1).float()
    mask = mask.masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
    return mask

def get_sine_position_encodings(length, dim):
    pos = torch.arange(length, dtype=torch.float32).reshape(-1, 1)
    div_term = torch.exp(torch.arange(0, dim, 2).float() * -(math.log(10000.0) / dim))
    pos_encodings = torch.zeros(length, dim)
    pos_encodings[:, 0::2] = torch.sin(pos * div_term)
    pos_encodings[:, 1::2] = torch.cos(pos * div_term)
    return pos_encodings

class LanguageModel(nn.Module):

    def __init__(self, vocab_size, n_emb, block_size, n_layers, n_heads, dropout=0.2):
        super(LanguageModel, self).__init__()

        self.token_embedding_table = nn.Embedding(vocab_size, n_emb)
        self.position_embedding_table = nn.Embedding(block_size, n_emb)
        self.block_size = block_size

        encoder_layer = nn.TransformerEncoderLayer(d_model=n_emb, nhead=n_heads, dropout=dropout, layer_norm_eps=1e-6)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=n_layers)

        self.feed_forward = nn.Sequential(
            nn.Linear(n_emb, 2 * n_emb),
            nn.ReLU(),
            nn.Linear(2 * n_emb, n_emb)
        )

        self.lm_head = nn.Linear(n_emb, vocab_size)

    def forward(self, idx, targets=None):
        B, T = idx.shape

        token_emb = self.token_embedding_table(idx)
        position_emb = self.position_embedding_table(torch.arange(T, device=device))

        x = token_emb + position_emb
        x_transform = x.clone()
        mask = generate_square_subsequent_mask(T).to(device)

        x_transform = self.transformer_encoder(x_transform.permute(1, 0, 2), mask=mask)
        x_transform = x_transform.permute(1, 0, 2)
        x = x + x_transform

        x = self.feed_forward(x)
        logits = self.lm_head(x)

        if targets is not None:
            B, T, C = logits.shape
            logits = logits.view(B * T, C)
            targets = targets.view(B * T)
            loss = F.cross_entropy(logits, targets)
            return logits, loss
        else:
            return logits, None


    def generate(self, idx, max_new_tokens, temperature=1.0, stop_token=False):
        for _ in range(max_new_tokens):
            idx_cond = idx[:, -self.block_size:]
            logits, _ = self.forward(idx_cond)

            # Scale logits by the temperature
            logits = logits[:, -1, :] / temperature

            probs = F.softmax(logits, dim=-1)
            idx_new = torch.multinomial(probs, num_samples=1)
            idx = torch.cat([idx, idx_new], dim=-1)
            if stop_token and idx_new.item() == sep_id:
                break
        return idx

# Create model, optimizer
model = LanguageModel(vocab_size=VOCAB_SIZE, block_size=BLOCK_SIZE, n_emb=N_EMB, n_layers=N_LAYERS, \
    n_heads=N_HEADS, dropout=DROPOUT).to(device)

print(f'Number of parameters {sum(p.numel() for p in model.parameters() if p.requires_grad)}')



Number of parameters 86709210


In [10]:
model_path = os.path.join(data_path, "english_tiny_stories_6layers.pth")
model = torch.load(model_path)

datasets/tiny_stories/english_tiny_stories_6layers.pth


In [19]:
EARLY_STOP = 50
N_EPOCHS = 20000
BATCH_SIZE = 32
LEARNING_RATE = 3e-4
SAVE_FREQ = 500
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

model_path = os.path.join(data_path, "english_tiny_stories_6layers.pth")

for steps in range(N_EPOCHS):
    model.train()
    xb, yb = get_batch(train_data, block_size=BLOCK_SIZE, batch_size=BATCH_SIZE)
    xb = xb.to(device)
    yb = yb.to(device)
    logits, loss = model(xb, yb)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # Delete xb, yb and free GPU memory
    del xb, yb
    torch.cuda.empty_cache()

    if steps % 100 == 0:
        print('Step:', steps, 'Training Loss:', loss.item())
        val_loss = estimate_loss(model, val_data, block_size=BLOCK_SIZE, batch_size=BATCH_SIZE)
        print('Validation loss:', val_loss)
    if steps % SAVE_FREQ == 0:
        torch.save(model, model_path)


Step: 0 Training Loss: 1.9710575342178345
Validation loss: 2.055588960647583
Step: 100 Training Loss: 1.9694819450378418
Validation loss: 1.8874846696853638
Step: 200 Training Loss: 1.871381163597107
Validation loss: 1.931668996810913
Step: 300 Training Loss: 1.9949411153793335
Validation loss: 1.8256757259368896
Step: 400 Training Loss: 1.8179761171340942
Validation loss: 1.7700958251953125
Step: 500 Training Loss: 1.84648859500885
Validation loss: 2.003999948501587
Step: 600 Training Loss: 2.053264617919922
Validation loss: 1.8363125324249268
Step: 700 Training Loss: 1.6992143392562866
Validation loss: 1.740901231765747
Step: 800 Training Loss: 1.7828682661056519
Validation loss: 1.6546281576156616
Step: 900 Training Loss: 1.6639729738235474
Validation loss: 1.7989543676376343
Step: 1000 Training Loss: 1.7726454734802246
Validation loss: 1.679582118988037
Step: 1100 Training Loss: 1.6442790031433105
Validation loss: 1.7373028993606567
Step: 1200 Training Loss: 1.5917258262634277
Vali

In [13]:
model_path = os.path.join(data_path, "english_tiny_stories_6layers.pth")
torch.save(model, model_path)

In [20]:
starting_tokens = 'Alice lost her chicken. She went outside to look for it. But it was dark.'
encoded_start = tokenizer.encode(starting_tokens.lower())
encoded_start.pop(-1)
len_starting_tokens = len(encoded_start)

idx = torch.tensor(encoded_start).reshape(1, len_starting_tokens).to(device)
model.eval()
N_SAMPLES = 20
for _ in range(N_SAMPLES):
    generation = model.generate(idx, max_new_tokens=2000, temperature=0.7, stop_token=True)[0].tolist()
    story = tokenizer.decode(generation, skip_special_tokens=False)

    print('Story ', _ + 1, ':')
    print(story)
    print('\n')


Story  1 :
[CLS] alice lost her chicken. she went outside to look for it. but it was dark. alice saw a butterfly. she wanted it so she slowly moved the butterfly back to her nest. but the butterfly was too fast. she flew away with the butterfly. alice was sad. she wanted her chicken back. she looked around and spotted a group of birds. they were all alone. alice was so happy! she stopped flying and looked at the butterflies. she smiled and said, " hello! " the butterflies nodded and flew away. alice felt so happy and strong. she thanked the butterflies and headed back home. [SEP]


Story  2 :
[CLS] alice lost her chicken. she went outside to look for it. but it was dark. suddenly, she saw something that made her way back home. it was a big, beautiful horse. she was so happy to find it. alice was even more excited. she had never seen a horse before. she said to the horse, " hello! let's keep the horse and take it home. " the horse was so happy to see alice. they put the horse in a box a