<a href="https://colab.research.google.com/github/jyanivaddi/ERA_V1/blob/master/gpt2_hindi_from_scratch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive

# Mount Google Drive inside the Colab runtime; the dataset paths used later in
# this notebook all live under this mount point.
GDRIVE_MOUNT_POINT = '/content/gdrive/'
drive.mount(GDRIVE_MOUNT_POINT, force_remount=True)

Mounted at /content/gdrive/


In [2]:
!pip install --quiet "torchtext" "datasets" "tokenizers" "transformers"


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m493.7/493.7 kB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m48.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m101.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m15.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m15.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 kB[0m [31m31.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m295.0/295.0 kB[0m [31m33.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m39.8 MB/s[0m eta [36m0:00:00[0m
[?25h

Define the paths to the train, validation, and full-corpus text files

In [5]:
# All three text files sit in the same Google Drive folder.
DATASET_DIR = '/content/gdrive/MyDrive/Datasets/Hindi_Aesthetics'

train_dataset_path = f'{DATASET_DIR}/hindi_train.txt'
val_dataset_path = f'{DATASET_DIR}/hindi_val.txt'
# Full corpus file; this is what the tokenizer vocabulary is built from.
all_data_path = f'{DATASET_DIR}/hindi_asthetics_corpus.txt'

First, let's build our vocabulary by tokenizing the data with the word-level encoding covered in class.

In [23]:
from tokenizers.models import WordLevel
from tokenizers.trainers import WordLevelTrainer
from tokenizers.pre_tokenizers import Whitespace
from tokenizers import Tokenizer


def build_word_level_tokenizer(data_iterator,
                               save_path='./hindi_aesthetics_word_level.json',
                               min_frequency=2):
    """Train a whitespace-split, word-level tokenizer and optionally save it.

    Code inspired by the Hugging Face `tokenizers` examples.

    Args:
        data_iterator: Iterable of text strings to train the vocabulary on.
        save_path: File the trained tokenizer is serialized to. Pass ``None``
            to skip saving. Defaults to the path this notebook has always used.
        min_frequency: Forwarded to ``WordLevelTrainer(min_frequency=...)``.

    Returns:
        The trained ``Tokenizer``. Its special tokens are ``[UNK]``, ``[PAD]``,
        ``[SOS]`` and ``[EOS]``; ``[UNK]`` is also the model's unknown token.
    """
    unk_token = "[UNK]"
    tokenizer = Tokenizer(WordLevel(unk_token=unk_token))
    tokenizer.pre_tokenizer = Whitespace()
    trainer = WordLevelTrainer(
        special_tokens=[unk_token, "[PAD]", "[SOS]", "[EOS]"],
        min_frequency=min_frequency,
    )
    tokenizer.train_from_iterator(data_iterator, trainer=trainer)
    if save_path is not None:
        tokenizer.save(save_path)
    return tokenizer

In [16]:
# Read the whole corpus into memory as a list of lines (newlines are kept).
with open(all_data_path, mode='r', encoding='UTF-8') as corpus_file:
    all_data = list(corpus_file)


649481


In [24]:
# Train the word-level tokenizer on the lines of the full corpus.
tokenizer = build_word_level_tokenizer(data_iterator=all_data)

Now let's build our dataloader.

In [37]:
import torch
from torch.utils.data import Dataset, DataLoader

class HindiAestheticsDataset(Dataset):
    """Line-per-sample text dataset with per-batch (dynamic) padding.

    Every line of the file at ``ds_path`` is one sample. Lines are tokenized
    lazily in ``__getitem__``; ``collate_samples`` is meant to be passed as the
    ``collate_fn`` of a ``DataLoader`` and turns a list of samples into padded
    ``int64`` tensors.

    Args:
        ds_path: Path of a UTF-8 text file, one sample per line.
        tokenizer: Object exposing ``token_to_id(str)`` and ``encode(str).ids``.
            Its vocabulary must contain ``[SOS]``, ``[EOS]`` and ``[PAD]``.
        seq_len: Stored on the instance as ``self.seq_len``.
            NOTE(review): this class never applies it (no truncation happens
            here) -- confirm whether long lines should be clipped.

    Raises:
        ValueError: If the tokenizer has no id for one of the special tokens.
    """

    def __init__(self, ds_path, tokenizer, seq_len=64):
        super().__init__()
        self.seq_len = seq_len
        self.ds_path = ds_path
        self.tokenizer = tokenizer

        # Special tokens are kept as 1-element int64 tensors so they can be
        # concatenated directly with the token tensors in collate_samples.
        self.sos_token = self._special_token_tensor(tokenizer, "[SOS]")
        self.eos_token = self._special_token_tensor(tokenizer, "[EOS]")
        self.pad_token = self._special_token_tensor(tokenizer, "[PAD]")
        with open(ds_path, 'r', encoding='UTF-8') as fh:
            self.ds = fh.readlines()

    @staticmethod
    def _special_token_tensor(tokenizer, token):
        """Return the id of ``token`` as a 1-element int64 tensor, or raise ValueError."""
        token_id = tokenizer.token_to_id(token)
        if token_id is None:
            raise ValueError(
                f"Tokenizer vocabulary does not contain the special token {token!r}"
            )
        return torch.tensor([token_id], dtype=torch.int64)

    def __len__(self):
        """Number of lines read from the dataset file."""
        return len(self.ds)

    def __getitem__(self, idx):
        """Tokenize line ``idx``.

        Returns:
            dict with ``input_tokens`` (list of token ids), ``token_len``
            (length of that list) and ``input_sentence`` (the raw line, exactly
            as read from the file).
        """
        input_text = self.ds[idx]

        # transform the text into tokens
        input_tokens = self.tokenizer.encode(input_text).ids

        return {
            "input_tokens": input_tokens,
            "token_len": len(input_tokens),
            "input_sentence": input_text,
        }

    def collate_samples(self, batch):
        """Pad a list of samples to the longest sample in the batch.

        With ``max_len`` the largest ``token_len`` in ``batch``, each row is:

        * ``x``: ``[SOS] + tokens + [EOS] + [PAD]*k`` -> length ``max_len + 2``
        * ``y``: ``tokens + [PAD]*(k + 1)``           -> length ``max_len + 1``

        where ``k = max_len - len(tokens)``.

        NOTE(review): ``y`` never contains ``[EOS]`` and is one element shorter
        than ``x``; confirm this is what the training code expects.

        Args:
            batch: List of dicts as produced by ``__getitem__``.

        Returns:
            dict with ``x`` and ``y`` (2-D int64 tensors, one row per sample)
            and ``input_sentences`` (list of the raw lines).
        """
        max_len = max(sample["token_len"] for sample in batch)
        pad_id = int(self.pad_token)

        x_list = []
        y_list = []
        input_sentences = []

        for sample in batch:
            tokens = torch.tensor(sample["input_tokens"], dtype=torch.int64)
            num_padding_tokens_input = max(0, max_len - len(sample["input_tokens"]))
            # y carries neither [SOS] nor [EOS], so it gets one more pad than x's
            # padding run.
            num_padding_tokens_output = num_padding_tokens_input + 1

            # [SOS] tokens [EOS] [PAD]...
            batch_x = torch.cat(
                [
                    self.sos_token,
                    tokens,
                    self.eos_token,
                    torch.full((num_padding_tokens_input,), pad_id, dtype=torch.int64),
                ],
                dim=0,
            )

            # tokens [PAD]...
            batch_y = torch.cat(
                [
                    tokens,
                    torch.full((num_padding_tokens_output,), pad_id, dtype=torch.int64),
                ],
                dim=0,
            )
            x_list.append(batch_x)
            y_list.append(batch_y)
            input_sentences.append(sample["input_sentence"])

        return {
            "x": torch.vstack(x_list),
            "y": torch.vstack(y_list),
            "input_sentences": input_sentences,
        }



In [44]:
# One dataset per split; both share the tokenizer trained on the full corpus.
train_ds, val_ds = (
    HindiAestheticsDataset(split_path, tokenizer)
    for split_path in (train_dataset_path, val_dataset_path)
)

# Each loader uses its own dataset's collate_samples so batches are padded to
# the longest sample in that batch.
# NOTE(review): no shuffle argument is passed for the training loader, so
# DataLoader's default applies -- confirm in-order batches are intended.
train_dataloader = DataLoader(
    train_ds,
    batch_size=8,
    num_workers=1,
    collate_fn=train_ds.collate_samples,
)
val_dataloader = DataLoader(
    val_ds,
    batch_size=1,
    num_workers=1,
    collate_fn=val_ds.collate_samples,
)

In [None]:
# Pull the first collated batch from the training loader so it can be inspected.
vals = next(iter(train_dataloader))


In [43]:
# Show the first sample of the batch: padded input ids, padded target ids, and
# the raw sentence they came from.
for key in ('x', 'y', 'input_sentences'):
    print(vals[key][0])

tensor([    2,    25,  5476,    13, 25517,     9,    14,   281,   989,     4,
            3,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1])
tensor([   25,  5476,    13, 25517,     9,    14,   281,   989,     4,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1])
 यह चुटकी तो पहलेवाली से भी ज्यादा गहरी है

