<a href="https://colab.research.google.com/github/jyanivaddi/ERA_V1/blob/master/gpt2_hindi_from_scratch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Install libraries**

In [1]:
!pip install --quiet "torchtext" "datasets" "tokenizers" "transformers"


**Lets import all the dependencies**

In [2]:
from tokenizers.models import WordLevel
from tokenizers.trainers import WordLevelTrainer
from tokenizers.pre_tokenizers import Whitespace
from tokenizers import Tokenizer
from pathlib import Path
import torch
from torch.utils.data import Dataset, DataLoader
import torch
import torch.nn as nn
from torch.nn import functional as F


**Define the dataset paths**

In [3]:
# Path on Kaggle
tokenizer_path = '/kaggle/input/hindiaesthetics/hindi_aesthetics_word_level.json'
train_dataset_path = '/kaggle/input/hindiaesthetics/hindi_train.txt'
val_dataset_path = '/kaggle/input/hindiaesthetics/hindi_val.txt'


**Define Hyperparameters**

In [4]:
# hyperparameters
batch_size = 128 # how many independent sequences will we process in parallel?
block_size = 32 # what is the maximum context length for predictions?
max_iters = 5000
eval_interval = 500
learning_rate = 3e-4
device = 'cuda' if torch.cuda.is_available() else 'cpu'
eval_iters = 200
n_embd = 384
n_head = 6
n_layer = 6
dropout = 0.2
# ------------

torch.manual_seed(1337)
print(device)

cuda


**Load the Vocabulary**

In [5]:
def build_word_level_tokenizer(data_path, tokenizer_path = None):
    if tokenizer_path is None:
        with open(data_path,'r',encoding='UTF-8') as fh:
            all_data = fh.readlines()
        # code inspired from huggingface tokenizers
        tokenizer = Tokenizer(WordLevel(unk_token="[UNK]"))
        tokenizer.pre_tokenizer = Whitespace()
        trainer = WordLevelTrainer(special_tokens=["[UNK]", "[PAD]", "[SOS]", "[EOS]"],
                                    min_frequency=2)
        tokenizer.train_from_iterator(all_data, trainer=trainer)
        #tokenizer.train(files=[all_data_path], vocab_size=52_000, min_frequency=2, special_tokens=["<s>","<pad>","</s>","<unk>","<mask>"])
        tokenizer.save('./hindi_aesthetics_word_level.json')
    else:
        tokenizer = Tokenizer.from_file(str(tokenizer_path))
    return tokenizer

In [6]:
all_data_path = None # for kaggle
tokenizer = build_word_level_tokenizer(all_data_path, tokenizer_path)
vocab_size = tokenizer.get_vocab_size()


**Define dataset class**

In [7]:
class HindiAestheticsDataset(Dataset):

    def __init__(self, ds_path, tokenizer, block_size=64):
        super().__init__()
        self.block_size = block_size
        self.ds_path = ds_path
        self.tokenizer = tokenizer

        self.sos_token = torch.tensor([tokenizer.token_to_id("[SOS]")], dtype=torch.int64)
        self.eos_token = torch.tensor([tokenizer.token_to_id("[EOS]")], dtype=torch.int64)
        self.pad_token = torch.tensor([tokenizer.token_to_id("[PAD]")], dtype=torch.int64)
        with open(ds_path, 'r', encoding='UTF-8') as fh:
            self.ds = fh.readlines()

    def __len__(self):
        return len(self.ds)


    def __getitem__(self, idx):
        # get a src, target pair
        input_text = self.ds[idx]

        # transform the text into tokens
        input_tokens = self.tokenizer.encode(input_text).ids
        max_len_of_sentence = self.block_size - 2
        if len(input_tokens) > max_len_of_sentence:
            input_tokens = input_tokens[:max_len_of_sentence]
            
        # Add sos, eos and padding to each sentence
        num_padding_tokens_input = max(0, max_len_of_sentence - len(input_tokens))  # we will add <s> and </s>
        # we will only add only the <s> token to the decoder
        num_padding_tokens_output = num_padding_tokens_input+1

        # Add <s> and </s> token
        x = torch.cat(
            [
                self.sos_token,
                torch.tensor(input_tokens, dtype=torch.int64),
                self.eos_token,
                torch.tensor([self.pad_token] * num_padding_tokens_input, dtype=torch.int64),
            ],
            dim=0,)

        # Add only the <s>
        y = torch.cat(
            [
                torch.tensor(input_tokens, dtype=torch.int64),
                self.eos_token,
                torch.tensor([self.pad_token] * num_padding_tokens_output, dtype=torch.int64),
            ],
            dim=0,
        )
        #print("inside get item and I am returning the dict list!")
        #print(f"x:{len(x)} y: {len(y)}")

        return {
            "x": x,
            "y": y,
            "input_sentences": input_text,
        }

    def collate_samples(self, batch):
        """
        Perform dynamic batching on the sequences.
        For each batch, we get the length of the longest sentence and pad the remaining sentences according to that.
        """

        #print("inside collate function")
        # max encoder str length
        max_len = max(x["token_len"] for x in batch)
        #print(f"longest encoder input in this batch: {encoder_input_max}")

        x_list = []
        y_list = []
        input_sentences = []

        for cnt, x in enumerate(batch):
            # Add sos, eos and padding to each sentence
            num_padding_tokens_input = max(0, max_len - len(x["input_tokens"]))  # we will add <s> and </s>
            # we will only add only the <s> token to the decoder
            num_padding_tokens_output = num_padding_tokens_input+1

            # Add <s> and </s> token
            batch_x = torch.cat(
                [
                    self.sos_token,
                    torch.tensor(x["input_tokens"], dtype=torch.int64),
                    self.eos_token,
                    torch.tensor([self.pad_token] * num_padding_tokens_input, dtype=torch.int64),
                ],
                dim=0,
            )

            # Add only the <s>
            batch_y = torch.cat(
                [
                    torch.tensor(x["input_tokens"], dtype=torch.int64),
                    self.eos_token,
                    torch.tensor([self.pad_token] * num_padding_tokens_output, dtype=torch.int64),
                ],
                dim=0,
            )
            x_list.append(batch_x)
            y_list.append(batch_y)
            input_sentences.append(x["input_sentence"])

        #print("inside get item and I am returning the dict list!")
        return {
            "x": torch.vstack(x_list),
            "y": torch.vstack(y_list),
            "input_sentences": input_sentences,
        }



In [8]:
train_ds = HindiAestheticsDataset(train_dataset_path, tokenizer, block_size = block_size)
val_ds = HindiAestheticsDataset(val_dataset_path, tokenizer, block_size=block_size)
train_dataloader = DataLoader(dataset = train_ds,
                              batch_size = batch_size,
                              num_workers = 1,
                              collate_fn = None,
                              shuffle = True)
val_dataloader = DataLoader(dataset = val_ds,
                            batch_size = 1,
                            num_workers = 1,
                            collate_fn = None,
                            shuffle = False)

**A couple of support functions**

In [19]:
def get_batch(data_loader):
    vals = next(iter(data_loader))
    x = vals["x"]
    y = vals["y"]
    return x.to(device), y.to(device)


@torch.no_grad()
def estimate_loss(model, data_loader):
    out = {}
    model.eval()
    losses = torch.zeros(eval_iters)
    for k in range(eval_iters):
        X, Y = get_batch(data_loader)
        logits, loss = model.forward(X, Y)
        losses[k] = loss.item()
    out = losses.mean()
    model.train()
    return out

def decode(enc_sec: torch.Tensor, tokenizer: any) -> str:
    """
    Function to decode a sequence of token indices back to a string
    """
    # convert the indices to a list
    enc_sec = enc_sec.tolist()
    # decode the indices to a string
    text = tokenizer.decode(enc_sec)
    return text

**Let's define the decoder model**

In [10]:
class Head(nn.Module):
    """ one head of self-attention """

    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        # input of size (batch, time-step, channels)
        # output of size (batch, time-step, head size)
        B,T,C = x.shape
        k = self.key(x)   # (B,T,hs)
        q = self.query(x) # (B,T,hs)
        # compute attention scores ("affinities")
        wei = q @ k.transpose(-2,-1) * k.shape[-1]**-0.5 # (B, T, hs) @ (B, hs, T) -> (B, T, T)
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf')) # (B, T, T)
        wei = F.softmax(wei, dim=-1) # (B, T, T)
        wei = self.dropout(wei)
        # perform the weighted aggregation of the values
        v = self.value(x) # (B,T,hs)
        out = wei @ v # (B, T, T) @ (B, T, hs) -> (B, T, hs)
        return out

class MultiHeadAttention(nn.Module):
    """ multiple heads of self-attention in parallel """

    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        self.proj = nn.Linear(head_size * num_heads, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        out = self.dropout(self.proj(out))
        return out

class FeedFoward(nn.Module):
    """ a simple linear layer followed by a non-linearity """

    def __init__(self, n_embd):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embd, 4 * n_embd),
            nn.ReLU(),
            nn.Linear(4 * n_embd, n_embd),
            nn.Dropout(dropout),
        )

    def forward(self, x):
        return self.net(x)

class Block(nn.Module):
    """ Transformer block: communication followed by computation """

    def __init__(self, n_embd, n_head):
        # n_embd: embedding dimension, n_head: the number of heads we'd like
        super().__init__()
        head_size = n_embd // n_head
        self.sa = MultiHeadAttention(n_head, head_size)
        self.ffwd = FeedFoward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        x = x + self.sa(self.ln1(x))
        x = x + self.ffwd(self.ln2(x))
        return x

class GPTLanguageModel(nn.Module):

    def __init__(self):
        super().__init__()
        # each token directly reads off the logits for the next token from a lookup table
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd) # final layer norm
        self.lm_head = nn.Linear(n_embd, vocab_size)

        # better init, not covered in the original GPT video, but important, will cover in followup video
        self.apply(self._init_weights)

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, idx, targets=None):
        B, T = idx.shape
        #print(f"idx shape: {idx.shape}")
        # idx and targets are both (B,T) tensor of integers
        tok_emb = self.token_embedding_table(idx) # (B,T,C)
        #print(f"token embedding shape:{tok_emb.shape}")
        pos_emb = self.position_embedding_table(torch.arange(T, device=device)) # (T,C)
        x = tok_emb + pos_emb # (B,T,C)
        x = self.blocks(x) # (B,T,C)
        x = self.ln_f(x) # (B,T,C)
        logits = self.lm_head(x) # (B,T,vocab_size)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, idx, max_new_tokens):
        # idx is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            # crop idx to the last block_size tokens
            idx_cond = idx[:, -block_size:]
            # get the predictions
            logits, loss = self(idx_cond)
            # focus only on the last time step
            logits = logits[:, -1, :] # becomes (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1) # (B, C)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        return idx



**Lets train the model**

In [11]:
model = GPTLanguageModel()
m = model.to(device)
# print the number of parameters in the model
print(sum(p.numel() for p in m.parameters())/1e6, 'M parameters')

# create a PyTorch optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

# generate some output based on the context
context = torch.tensor([tokenizer.token_to_id("[SOS]")], dtype=torch.int64).unsqueeze(0).to(device)

for cnt in range(max_iters):

    # every once in a while evaluate the loss on train and val sets
    if cnt % eval_interval == 0 or cnt == max_iters - 1:
        train_loss = estimate_loss(train_dataloader)
        val_loss = estimate_loss(m, val_dataloader)
        print(f"step {cnt}: train loss {train_loss:.4f}, val loss {val_loss:.4f}")
        print("generated text:")
        print("--------------------------------------------")
        print(decode(enc_sec=m.generate(idx=context, max_new_tokens=100)[0],
            tokenizer=tokenizer,))

    # sample a batch of data
    xb, yb = get_batch(train_dataloader)

    # evaluate the loss
    #print(f"size of xb: {xb.shape}, size of yb:{yb.shape}")
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()



33.722928 M parameters
step 0: train loss 10.2033, val loss 10.2036
generated text:
--------------------------------------------
बचने सांसदों पंक्तियों पानदान खाकान जड़ता प्रवक्ता सालगिरह ढीली अगस्‍त परेशबाबू जुनैद अम अध्यापकों बरखास्त होश बताता अफवाहें नारे पटरियों लकड़ी धारावाहिक आद सरोकार बागची गर्वित मरती पक्षपात विह्वल स्टेला उपादान इन जवार चौखटा खूबचन्द दैवयोग मुमुक्षुओं विषैले बेशक होंडा खूबसूरत पड़ती बस्तियों तोर इस्‍पर अधिसंख्य माशा डायरेक्‍टर तख्‍त खुशकिस्मती अतीकुर सत्‍यता ललकारा चाँद याचिका पांचवां निकम्‍मे कलंक उपजती ग्राह्य अंतरात्मा कीमतों करता कटवा ऐनम् शंबूक भिक्षुक मेरे इतिहासकारों मिलना अनुबंध हिलाया इंतकाम उड़ते उतने ख्वाजासराओं पिए स्पंदनों घूमते निःसंग घुड़सवारी माँगें भिन्नताओं कारिंदों तमाम सुहाने जहाँपनाह जुड़ते बुलाएँगे पचमढ़ी खिंचा ड्रिंक्स कूटा दुश्मनी दफना मानवता गुरदीन अस्‍पताल दफा बोलियाँ
step 500: train loss 2.4435, val loss 2.4316
generated text:
--------------------------------------------
कारण उसके एक इस जालिम प्रदेश में वह जाँघ की सौदा की थी न और आपसै

In [12]:
# generate some output based on the context
context = torch.tensor([tokenizer.token_to_id("[SOS]")], dtype=torch.int64).unsqueeze(0).to(device)
for _ in range(10):
    print(
        decode(
            enc_sec=m.generate(idx=context, max_new_tokens=100)[0],
            tokenizer=tokenizer,
        )
    )


अत्याचारी बुद्धि है इसीलिए मैं किस की

विवश बनते तथा कीट अंगूर उठाकर वहीं देखती रही
एक दासी को उसका खत लिख रहे थे
'' जी , मैं तो वही हूं !'' क्योंकि मैं हूँ , तत्पश्चात मैंने ही तुमको अपनी आकर्षण स्‍वर्ग से लगवाई
वह तो यह तक में ऐसा नहीं करता तो जहाँ इन दिनों का समय नहीं पाए आवागमन की गंध डाल देता है शायद
जेल के लगभग खुले दीपक के बाँका बाल दोहन से
लेकिन आग में भाषाओं का मजदूर शोषित सुरक्षित अवसर उधार देने की तरह जमीन पर नए लोगों का एक कारण बहुत अच्छा हुआ जो अपने अपने शोक में मग्न हो
डॉक्‍टरी में रुकते हुए वह वार्डन के घर गया तो वह ताँगे के सामने जाने पर उसका स्वागत करके बोला तुम कह दो भोजन याद है
क्या ठीक था बड़े कद का आश्रय रहता था यह प्रतीत मेरा नहीं गला भरने लगा


In [13]:
!git clone "https://github.com/jyanivaddi/dl_hub.git"
!git -C dl_hub pull
!git pull

Cloning into 'dl_hub'...
remote: Enumerating objects: 581, done.[K
remote: Counting objects: 100% (333/333), done.[K
remote: Compressing objects: 100% (143/143), done.[K
remote: Total 581 (delta 215), reused 291 (delta 183), pack-reused 248[K
Receiving objects: 100% (581/581), 161.26 KiB | 2.12 MiB/s, done.
Resolving deltas: 100% (359/359), done.
Already up to date.
fatal: not a git repository (or any parent up to mount point /kaggle)
Stopping at filesystem boundary (GIT_DISCOVERY_ACROSS_FILESYSTEM not set).


**Now lets try with a GPT model we wrote from a previous session**

In [15]:
import sys
sys.path.append('/kaggle/working/dl_hub/')
from dl_hub.transformer_models.transformer_models import GPT

In [17]:
# train a new model
gpt_model = GPT(
    vocab_size=vocab_size,
    d_model=n_embd,
    block_size=block_size,
    num_heads=n_head,
    num_layers=n_layer,
    dropout=dropout,
    device = device
)
# load model to GPU if available
gpt_model = gpt_model.to(device)
# Initialize the parameters
for p in gpt_model.parameters():
    if p.dim() > 1:
        nn.init.xavier_uniform_(p)
# print the number of parameters in the model
print(
    "Model with {:.2f}M parameters".format(sum(p.numel() for p in gpt_model.parameters()) / 1e6)
)

Model with 33.71M parameters


**Lets train this model now**

In [20]:
# optimizer takes the model's parameters and the learning rate as input,
# and updates the parameters during the training process in order to
# minimize the loss function.
optimizer = torch.optim.AdamW(m.parameters(), lr=learning_rate)
for step in range(max_iters):

    # every EVAL_INTER evaluate the loss on train and val sets
    if step % eval_iters == 0 or step == max_iters - 1:
        train_loss = estimate_loss(model, train_dataloader)
        val_loss = estimate_loss(model, val_dataloader)
        print(f"step {cnt}: train loss {train_loss:.4f}, val loss {val_loss:.4f}")
        print("generated text:")
        print("--------------------------------------------")
        print(decode(enc_sec=m.generate(idx=context, max_new_tokens=100)[0],
            tokenizer=tokenizer,))

    # sample a batch of data
    xb, yb = get_batch(train_dataloader)
    logits, loss = gpt_model.forward(xb, yb)
    # zero_grad() method sets the gradients of all parameters in the optimizer to zero
    optimizer.zero_grad(set_to_none=True)
    # backward() method on the loss variable calculates the gradients
    # of the loss with respect to the model's parameters.
    loss.backward()
    # step() method on the optimizer updates the model's parameters
    # using the calculated gradients, in order to minimize the loss.
    optimizer.step()

step 4999: train loss 10.2872, val loss 10.3692
generated text:
--------------------------------------------
शायद मैं जानता हूँ कि लोगों ने जल्दी से कहा मगर पटेश्वरी ने तो चले आया


KeyboardInterrupt: 