<a href="https://colab.research.google.com/github/jyanivaddi/ERA_V1/blob/master/gpt2_hindi_from_scratch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/gdrive/', force_remount=True)

Mounted at /content/gdrive/


In [2]:
!git clone "https://github.com/jyanivaddi/dl_hub.git"
!git -C dl_hub pull
!git pull

Cloning into 'dl_hub'...

remote: Enumerating objects: 581, done.[K

remote: Counting objects: 100% (333/333), done.[K

remote: Compressing objects: 100% (143/143), done.[K

remote: Total 581 (delta 215), reused 291 (delta 183), pack-reused 248[K

Receiving objects: 100% (581/581), 161.26 KiB | 860.00 KiB/s, done.

Resolving deltas: 100% (359/359), done.

Already up to date.

fatal: not a git repository (or any of the parent directories): .git


In [1]:
!pip install --quiet "torchtext" "datasets" "tokenizers" "transformers"


Load the train and val datasets

In [2]:
# Path on Google Drive
#train_dataset_path = '/content/gdrive/MyDrive/Datasets/Hindi_Aesthetics/hindi_train.txt'
#val_dataset_path = '/content/gdrive/MyDrive/Datasets/Hindi_Aesthetics/hindi_val.txt'
#all_data_path = '/content/gdrive/MyDrive/Datasets/Hindi_Aesthetics/hindi_asthetics_corpus.txt'

# Path on Kaggle
tokenizer_path = '/kaggle/input/hindiaesthetics/hindi_aesthetics_word_level.json'
train_dataset_path = '/kaggle/input/hindiaesthetics/hindi_train.txt'
val_dataset_path = '/kaggle/input/hindiaesthetics/hindi_val.txt'


First lets build our vocabulary by tokenizing the data using word level encoding from class

In [5]:
from tokenizers.models import WordLevel
from tokenizers.trainers import WordLevelTrainer
from tokenizers.pre_tokenizers import Whitespace
from tokenizers import Tokenizer
from pathlib import Path


def build_word_level_tokenizer(data_path, tokenizer_path = None):
    if tokenizer_path is None:
        with open(data_path,'r',encoding='UTF-8') as fh:
            all_data = fh.readlines()
        # code inspired from huggingface tokenizers
        tokenizer = Tokenizer(WordLevel(unk_token="[UNK]"))
        tokenizer.pre_tokenizer = Whitespace()
        trainer = WordLevelTrainer(special_tokens=["[UNK]", "[PAD]", "[SOS]", "[EOS]"],
                                    min_frequency=2)
        tokenizer.train_from_iterator(all_data, trainer=trainer)
        #tokenizer.train(files=[all_data_path], vocab_size=52_000, min_frequency=2, special_tokens=["<s>","<pad>","</s>","<unk>","<mask>"])
        tokenizer.save('./hindi_aesthetics_word_level.json')
    else:
        tokenizer = Tokenizer.from_file(str(tokenizer_path))
    return tokenizer

In [6]:
all_data_path = None # for kaggle
tokenizer = build_word_level_tokenizer(all_data_path, tokenizer_path)

Now lets build our dataloader

In [7]:
import torch
from torch.utils.data import Dataset, DataLoader

class HindiAestheticsDataset(Dataset):

    def __init__(self, ds_path, tokenizer, seq_len=64):
        super().__init__()
        self.seq_len = seq_len
        self.ds_path = ds_path
        self.tokenizer = tokenizer

        self.sos_token = torch.tensor([tokenizer.token_to_id("[SOS]")], dtype=torch.int64)
        self.eos_token = torch.tensor([tokenizer.token_to_id("[EOS]")], dtype=torch.int64)
        self.pad_token = torch.tensor([tokenizer.token_to_id("[PAD]")], dtype=torch.int64)
        with open(ds_path, 'r', encoding='UTF-8') as fh:
            self.ds = fh.readlines()

    def __len__(self):
        return len(self.ds)


    def __getitem__(self, idx):
        # get a src, target pair
        input_text = self.ds[idx]

        # transform the text into tokens
        input_tokens = self.tokenizer.encode(input_text).ids
        num_input_tokens = len(input_tokens)

        #print("inside get item and I am returning the dict list!")
        return {
            "input_tokens": input_tokens,
            "token_len": num_input_tokens,
            "input_sentence": input_text,
        }

    def collate_samples(self, batch):
        """
        Perform dynamic batching on the sequences.
        For each batch, we get the length of the longest sentence and pad the remaining sentences according to that.
        """

        #print("inside collate function")
        # max encoder str length
        max_len = max(x["token_len"] for x in batch)
        #print(f"longest encoder input in this batch: {encoder_input_max}")

        x_list = []
        y_list = []
        input_sentences = []

        for cnt, x in enumerate(batch):
            # Add sos, eos and padding to each sentence
            num_padding_tokens_input = max(0, max_len - len(x["input_tokens"]))  # we will add <s> and </s>
            # we will only add only the <s> token to the decoder
            num_padding_tokens_output = num_padding_tokens_input+1

            # Add <s> and </s> token
            batch_x = torch.cat(
                [
                    self.sos_token,
                    torch.tensor(x["input_tokens"], dtype=torch.int64),
                    self.eos_token,
                    torch.tensor([self.pad_token] * num_padding_tokens_input, dtype=torch.int64),
                ],
                dim=0,
            )

            # Add only the <s>
            batch_y = torch.cat(
                [
                    torch.tensor(x["input_tokens"], dtype=torch.int64),
                    torch.tensor([self.pad_token] * num_padding_tokens_output, dtype=torch.int64),
                ],
                dim=0,
            )
            x_list.append(batch_x)
            y_list.append(batch_y)
            input_sentences.append(x["input_sentence"])

        #print("inside get item and I am returning the dict list!")
        return {
            "x": torch.vstack(x_list),
            "y": torch.vstack(y_list),
            "input_sentences": input_sentences,
        }



In [8]:
BATCH_SIZE = 32
train_ds = HindiAestheticsDataset(train_dataset_path, tokenizer)
val_ds = HindiAestheticsDataset(val_dataset_path, tokenizer)
train_dataloader = DataLoader(dataset = train_ds,
                              batch_size = BATCH_SIZE,
                              num_workers = 1,
                              collate_fn = train_ds.collate_samples)
val_dataloader = DataLoader(dataset = val_ds,
                            batch_size = 1,
                            num_workers = 1,
                            collate_fn = val_ds.collate_samples)

In [None]:
vals = next(iter(train_dataloader))


In [51]:
print(vals['x'][2])
print(vals['y'][2])
print(vals['input_sentences'][2])

tensor([    2,     6,   425,  3189,   855,    62,   184,   725,  1970,   208,

           68,   190,     5,  7429,     6,    46,    57,     7,     0,   411,

           11,  1363,    26,   122,  1220,    35,  1363,   607, 14573,     0,

           22,  8771,  2236,     6,     3])

tensor([    6,   425,  3189,   855,    62,   184,   725,  1970,   208,    68,

          190,     5,  7429,     6,    46,    57,     7,     0,   411,    11,

         1363,    26,   122,  1220,    35,  1363,   607, 14573,     0,    22,

         8771,  2236,     6,     1])

में इसकी धार सीधे  बहुत बड़े वर्ग समुदाय समाज या देश के संदर्भों में  उस तरह की बृहत सोच को दिशा न दे पाई  जो दिशा सर रिचर्ड एटनबरो ने सिनेमाई अंदाज में




In [11]:
from dl_hub.transformer_models.transformer_models import GPT


In [12]:
# train a new model
vocab_size = tokenizer.get_vocab_size()
NUM_HEAD = 6
NUM_EMBED = NUM_HEAD*128
NUM_LAYER = 6
DROPOUT = 0.2
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
BLOCK_SIZE = 64
model = GPT(
    vocab_size=vocab_size,
    d_model=NUM_EMBED,
    block_size=BLOCK_SIZE,
    num_heads=NUM_HEAD,
    num_layers=NUM_LAYER,
    dropout=DROPOUT,
    device = DEVICE
)
# load model to GPU if available
m = model.to(DEVICE)
# print the number of parameters in the model
print(
    "Model with {:.2f}M parameters".format(sum(p.numel() for p in m.parameters()) / 1e6)
)

Model with 88.65M parameters


In [13]:
def get_batch(data_loader):
    vals = next(iter(train_dataloader))
    x = vals["x"]
    y = vals["y"]
    return x.to(DEVICE), y.to(DEVICE)


@torch.no_grad()
def estimate_loss(
    data_loader,
    model: torch.nn.Module,
    block_size: int,
    batch_size: int,
    eval_iters:int = 200):
    out = {}
    model.eval()
    losses = torch.zeros(eval_iters)
    for k in range(eval_iters):
        X, Y = get_batch(data_loader)
        logits, loss = model.forward(X, Y)
        losses[k] = loss.item()
    out = losses.mean()
    model.train()
    return out

In [15]:
# optimizer takes the model's parameters and the learning rate as input,
# and updates the parameters during the training process in order to
# minimize the loss function.
LEARNING_RATE = 3e-4
EVAL_INTER = 500

optimizer = torch.optim.AdamW(m.parameters(), lr=LEARNING_RATE)
MAX_ITER = 20000
for step in range(MAX_ITER):

    # every EVAL_INTER evaluate the loss on train and val sets
    if step % EVAL_INTER == 0 or step == MAX_ITER - 1:
        loss_train = estimate_loss(
            train_dataloader, model=m, block_size=BLOCK_SIZE, batch_size=BATCH_SIZE
        )
        loss_val = estimate_loss(
            val_dataloader, model=m, block_size=BLOCK_SIZE, batch_size=BATCH_SIZE
        )
        print("step {:10} | train loss {:6.4f} | val loss {:6.4f}".format(step, loss_train, loss_val))

    # sample a batch of data
    xb, yb = get_batch(data=train_dataloader)
    logits, loss = m.forward(xb, yb)
    # zero_grad() method sets the gradients of all parameters in the optimizer to zero
    optimizer.zero_grad(set_to_none=True)
    # backward() method on the loss variable calculates the gradients
    # of the loss with respect to the model's parameters.
    loss.backward()
    # step() method on the optimizer updates the model's parameters
    # using the calculated gradients, in order to minimize the loss.
    optimizer.step()

RuntimeError: ignored