<a href="https://colab.research.google.com/github/hemanthkumar17/Transformer-Workflow/blob/main/Transformer_modelling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os

# Prefer the WANDB_API_KEY environment variable so the secret never has to be
# typed into (and saved with) the notebook; the placeholder string is only a
# fallback for editing locally.
wandb_api = os.environ.get("WANDB_API_KEY") or "Insert API Key here"
# Path of the raw data file that is handed to `load_file` in the next cell.
file_path = "Insert data file path here"

In [None]:
def load_file(filename):
    """Placeholder loader -- define your loading function here.

    Reads `filename` as text and returns its entire content. The context
    manager closes the file handle as soon as the read completes.
    """
    with open(filename) as handle:
        return handle.read()


# NOTE(review): the unpacking below needs `load_file` to return exactly two
# items (functions, true_derivatives). The raw-text placeholder above does no
# parsing, so supply the real loading/parsing logic before running this cell.
functions, true_derivatives = load_file(file_path)

In [None]:
!pip install wandb
import wandb
wandb.login(key=wandb_api)



[34m[1mwandb[0m: Currently logged in as: [33mhj51[0m ([33mopt-prune[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [None]:
import os
# Set the W&B project name through the environment.
# NOTE(review): train() passes project="grad_rnn" to wandb.init explicitly,
# which presumably takes precedence over this value -- confirm which project is intended.
os.environ["WANDB_PROJECT"]="grad"

In [None]:
import torch
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
# This module-level `device` is read by the model and the training/eval loops.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# device = "cpu"

# Tokenizer

In [None]:
# Distinct characters occurring anywhere in the inputs or the targets.
# (Not referenced again in the visible part of the notebook.)
chars = list(set("".join(functions) + "".join(true_derivatives)))

In [None]:
from tokenizers import ByteLevelBPETokenizer
from torchtext.vocab import build_vocab_from_iterator
def batch_iter():
    """Yield, one character at a time, all of `functions` followed by all of `true_derivatives`."""
    corpus = "".join(functions) + "".join(true_derivatives)
    yield from corpus

# Special tokens handed to the tokenizer trainer.
specials = ["<pad>",
    "<unk>",
    "<mask>",]
# Id used for padding throughout the notebook.
# NOTE(review): assumes the trainer assigns ids to special tokens in list order,
# so "<pad>" receives id 0 -- confirm against the saved tokenizer file.
PAD_IDX = 0
# Earlier torchtext-based vocabulary, kept for reference only:
# vocab_tr = build_vocab_from_iterator(
#     batch_iter(),
#     min_freq=1,
#     specials=specials,
#     special_first=True
# )

tok = ByteLevelBPETokenizer()

# Train on the character stream produced by batch_iter(); each yielded item is
# a single character.
tok.train_from_iterator(
    batch_iter(),
    vocab_size=256,
    min_frequency=2,
    special_tokens=specials)
# Written to disk so the next cell can load it through `tokenizer_file=`.
tok.save("vocab.json")

In [None]:
from transformers import PreTrainedTokenizerFast
# Wrap the tokenizer file saved above in the transformers fast-tokenizer interface.
# NOTE(review): several of these keyword arguments (truncation, padding,
# max_length, return_type, return_attention_mask) look like call-time options
# rather than constructor options, and the pad/eos ids are assigned again in
# the next cell -- confirm which of them the constructor actually honours.
tokenizer = PreTrainedTokenizerFast(tokenizer_file="vocab.json",
                                    pad_token_id = PAD_IDX,
                                    eos_token_id = PAD_IDX,
                                    truncation=False,
                                    padding="max_length",
                                    max_length=30,
                                    return_type="pt",
                                   return_attention_mask=True)

In [None]:
# Assign the padding and end-of-sequence ids explicitly on the tokenizer
# instance; both are set to PAD_IDX.
tokenizer.pad_token_id = PAD_IDX
tokenizer.eos_token_id = PAD_IDX

In [None]:
# Sanity check: encode a short string, padded out to 30 ids.
tokenizer.encode("Hello", padding="max_length", max_length=30)

[42,
 71,
 78,
 78,
 81,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0]

In [None]:
# from transformers import T5Tokenizer
# t = T5Tokenizer(vocab_file="vocab.json")

# Data

In [None]:
from torch.nn.functional import pad
import torch

def tokenize(sample):
    """Encode one record into model features.

    Args:
        sample: mapping with the text fields "inp" (model input) and "label"
            (target); both are encoded with the module-level `tokenizer`.

    Returns:
        dict with "input_ids" and "attention_mask" for the input text and
        "labels" holding the ids of the target text. Each value is the first
        row of the tensor returned by the tokenizer, padded to 30 ids.
    """
    encode_options = dict(padding="max_length", max_length=30, return_tensors="pt")

    encoded_input = tokenizer(sample["inp"], **encode_options)
    encoded_label = tokenizer(sample["label"], **encode_options)
    label_ids = encoded_label.input_ids

    features = {}
    features["input_ids"] = encoded_input.input_ids[0]
    features["attention_mask"] = encoded_input.attention_mask[0]
    features["labels"] = label_ids[0]
    return features

In [None]:
!pip install datasets

In [None]:
from datasets import Dataset
# One record per (function, derivative) pair.
ds = Dataset.from_dict({"inp": list(functions), "label": list(true_derivatives)})
# Hold out 10% of the records for evaluation. The seed pins the split so it is
# identical after a kernel restart and evaluation numbers stay comparable
# between runs.
ds = ds.train_test_split(test_size=0.1, seed=21)

In [None]:
# Tokenize every record of both splits, then drop the raw text columns so only
# the features returned by `tokenize` remain.
tok_ds = ds.map(tokenize).remove_columns(["inp", "label"])

Map:   0%|          | 0/900000 [00:00<?, ? examples/s]

Map:   0%|          | 0/100000 [00:00<?, ? examples/s]

In [None]:
# Shuffle each split once with a fixed seed so the order is reproducible.
train_set = tok_ds["train"].shuffle(seed=21)
test_set = tok_ds["test"].shuffle(seed=21)

# HyperParameters

In [None]:
# Samples per DataLoader batch.
batch_size=512
# Number of decoding steps the decoder runs; matches the literal 30 used as
# max_length when tokenizing.
MAX_LENGTH = 30
# Width of the embeddings and of the GRU hidden state.
hidden_size = 128
# Number of stacked GRU layers in both the encoder and the decoder.
n_layers = 4

In [None]:
# Run metadata passed to wandb.init(config=...) by train().
# NOTE(review): epochs=100 here, while the training cell calls
# train(..., n_epochs=10) -- confirm which value should be recorded.
CONFIG = dict (
    model_name = "GRU",
    hidden_size=hidden_size,
    epochs=100,
    batch_size=batch_size,
    n_layers=n_layers
    )



# Model

In [None]:
import torch.nn as nn
import torch.nn.functional as F
class EncoderRNN(nn.Module):
    """GRU encoder: token embedding, dropout, then a stacked GRU.

    The vocabulary size comes from the module-level `tokenizer` and the number
    of GRU layers from the module-level `n_layers`.
    """

    def __init__(self, hidden_size, dropout_p=0.1):
        """
        Args:
            hidden_size: width of the embedding and of the GRU hidden state.
            dropout_p: dropout probability applied to the embedded input.
        """
        super().__init__()
        self.hidden_size = hidden_size

        vocab_size = tokenizer.vocab_size
        self.embedding = nn.Embedding(vocab_size, hidden_size)
        self.gru = nn.GRU(
            input_size=hidden_size,
            hidden_size=hidden_size,
            num_layers=n_layers,
            batch_first=True,
        )
        self.dropout = nn.Dropout(p=dropout_p)

    def forward(self, inp):
        """Embed `inp`, apply dropout and run the GRU.

        Returns:
            (output, hidden) exactly as produced by the GRU.
        """
        dropped = self.dropout(self.embedding(inp))
        gru_output, final_hidden = self.gru(dropped)
        return gru_output, final_hidden

In [None]:
# Instantiate the encoder and move its parameters to the selected device.
encoder = EncoderRNN(hidden_size).to(device)

In [None]:
class DecoderRNN(nn.Module):
    """GRU decoder that emits MAX_LENGTH tokens, one step at a time.

    The vocabulary size comes from the module-level `tokenizer`, the number of
    GRU layers from `n_layers`, the number of steps from `MAX_LENGTH`, and the
    start token from `PAD_IDX`.
    """

    def __init__(self, hidden_size):
        """
        Args:
            hidden_size: width of the embedding and of the GRU hidden state.
        """
        super().__init__()
        self.embedding = nn.Embedding(tokenizer.vocab_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size, batch_first=True, num_layers=n_layers)
        self.out = nn.Linear(hidden_size, tokenizer.vocab_size)

    def forward(self, encoder_outputs, encoder_hidden, target_tensor=None):
        """Decode MAX_LENGTH steps starting from the encoder's final hidden state.

        Args:
            encoder_outputs: encoder activations; only the batch size and the
                device are read from them.
            encoder_hidden: hidden state used to initialise the decoder GRU.
            target_tensor: optional target ids indexed as [batch, step]. When
                given, step i's target is fed as the next input (teacher
                forcing); otherwise the decoder feeds back its own top-1
                prediction.

        Returns:
            (log_probs, hidden, None): the per-step outputs concatenated along
            dim 1 and passed through log_softmax over the last dimension, the
            final GRU hidden state, and `None` for consistency in the training
            loop.
        """
        batch_size = encoder_outputs.size(0)
        # Allocate the start tokens on the same device as the incoming
        # activations instead of the module-level `device`, so the decoder keeps
        # working when the model lives on a different device than that global.
        decoder_input = torch.empty(
            batch_size, 1, dtype=torch.long, device=encoder_outputs.device
        ).fill_(PAD_IDX)
        decoder_hidden = encoder_hidden
        decoder_outputs = []

        for i in range(MAX_LENGTH):
            decoder_output, decoder_hidden = self.forward_step(decoder_input, decoder_hidden)
            decoder_outputs.append(decoder_output)

            if target_tensor is not None:
                # Teacher forcing: feed the target as the next input.
                decoder_input = target_tensor[:, i].unsqueeze(1)
            else:
                # Without teacher forcing: use its own prediction as the next input.
                _, topi = decoder_output.topk(1)
                # Drop only the trailing top-k axis so the batch axis survives
                # for any batch size; detach from history as input.
                decoder_input = topi.squeeze(-1).detach()

        decoder_outputs = torch.cat(decoder_outputs, dim=1)
        decoder_outputs = F.log_softmax(decoder_outputs, dim=-1)
        return decoder_outputs, decoder_hidden, None

    def forward_step(self, inp, hidden):
        """Run one decoding step: embed, ReLU, GRU, project to vocabulary scores.

        Returns:
            (output, hidden): unnormalised vocabulary scores for this step and
            the updated GRU hidden state.
        """
        output = self.embedding(inp)
        output = F.relu(output)
        output, hidden = self.gru(output, hidden)
        output = self.out(output)
        return output, hidden

In [None]:
# Instantiate the decoder and move its parameters to the selected device.
decoder = DecoderRNN(hidden_size).to(device)

In [None]:
# Count the trainable parameters of each module and print
# "<decoder> + <encoder> = <total>".
# sum(p.numel() for p in encoder.parameters() if p.requires_grad)
s1 = sum(p.numel() for p in decoder.parameters() if p.requires_grad)
s2 = sum(p.numel() for p in encoder.parameters() if p.requires_grad)
print(f"{s1} + {s2} = {s1 + s2}")


462851 + 429440 = 892291


# Trainer

In [None]:
from torch.utils.data import DataLoader
from transformers import DataCollatorForSeq2Seq
# Training batches: reshuffled on every pass, final partial batch dropped.
train_loader = DataLoader(
    train_set,
    batch_size=batch_size,
    shuffle=True,
    drop_last=True,
#     collate_fn = DataCollatorForSeq2Seq(tokenizer, padding=False,
#                            label_pad_token_id = PAD_IDX)
)
# Evaluation batches.
# NOTE(review): drop_last=True means a final partial batch is never evaluated,
# and eval_epoch divides by len(dataloader) * batch_size, which relies on every
# batch being full. shuffle=True also changes which example eval_epoch prints.
# Confirm both settings are intended for evaluation.
test_loader = DataLoader(
    test_set,
    batch_size=batch_size,
    shuffle=True,
    drop_last=True,
#     collate_fn = DataCollatorForSeq2Seq(tokenizer, padding=False,
#                            label_pad_token_id = PAD_IDX)
)

In [None]:
from tqdm.auto import tqdm
def train_epoch(dataloader, encoder, decoder, encoder_optimizer,
          decoder_optimizer, criterion):
    """Run one teacher-forced training pass over `dataloader`.

    Args:
        dataloader: yields dict batches with "input_ids" and "labels".
        encoder, decoder: the two halves of the seq2seq model.
        encoder_optimizer, decoder_optimizer: one optimizer per module; both
            are zeroed and stepped on every batch.
        criterion: loss called with (flattened decoder outputs, flattened
            one-hot float targets).

    Returns:
        Sum of the per-batch losses divided by len(dataloader).
    """
    # Put both modules in training mode explicitly rather than relying on a
    # previous eval_epoch call having restored it; otherwise an interrupted
    # evaluation would leave dropout silently disabled.
    encoder.train()
    decoder.train()

    total_loss = 0
    for data in tqdm(dataloader, leave=False):
        # assumes each column arrives as a sequence of per-position tensors, so
        # stacking gives (seq, batch) and the transpose (batch, seq) -- TODO confirm
        input_tensor = torch.stack(data["input_ids"]).transpose(0, 1).to(device)
        target_tensor = torch.stack(data["labels"]).transpose(0, 1).to(device)

        encoder_optimizer.zero_grad()
        decoder_optimizer.zero_grad()

        encoder_outputs, encoder_hidden = encoder(input_tensor)

        # Passing the targets switches the decoder to teacher forcing.
        decoder_outputs, _, _ = decoder(encoder_outputs, encoder_hidden, target_tensor)

        # The criterion receives the targets as one-hot float rows.
        oh = F.one_hot(target_tensor, tokenizer.vocab_size).float()
        loss = criterion(
            decoder_outputs.view(-1, decoder_outputs.size(-1)),
            oh.view(-1, oh.size(-1))
        )
        loss.backward()

        encoder_optimizer.step()
        decoder_optimizer.step()

        total_loss += loss.item()

    return total_loss / len(dataloader)


In [None]:
def compute_metrics(pred):
    """Compute exact-match and per-character accuracy for a set of predictions.

    Args:
        pred: either an object exposing `.predictions` and `.label_ids`
            (sequences of token ids), or a plain `(predictions, label_ids)` pair.

    Returns:
        dict with
        'equality': fraction of pairs whose decoded strings are identical, and
        "accuracy": mean over pairs of (matching characters at the same
            position in the decoded strings) / 30.
        Both are 0.0 when there are no pairs.
    """
    # Take the sequences from the argument: the free names `references` and
    # `generated_texts` are not defined anywhere in the visible notebook.
    if hasattr(pred, "predictions") and hasattr(pred, "label_ids"):
        generated_texts, references = pred.predictions, pred.label_ids
    else:
        generated_texts, references = pred

    eq = []
    ac = []
    for reference, gen_text in zip(references, generated_texts):
        ref = tokenizer.decode(reference, skip_special_tokens=True)
        preds = tokenizer.decode(gen_text, skip_special_tokens=True)
        eq.append(ref == preds)
        # 30 is the fixed sequence length used when tokenizing in this notebook.
        ac.append(sum(x == y for x, y in zip(ref, preds)) / 30)

    if not eq:
        # Nothing to score; avoid dividing by zero.
        return {'equality': 0.0, "accuracy": 0.0}

    return {
        'equality': sum(eq)/len(eq),
        "accuracy": sum(ac)/len(ac)
    }

In [None]:
def eval_epoch(dataloader, encoder, decoder, criterion):
    """Evaluate the encoder/decoder pair on `dataloader` without teacher forcing.

    Both modules are switched to eval mode for the pass and switched back to
    training mode afterwards, even if the pass raises. The decoded prediction
    and target of the first sample of the first batch are printed.

    Args:
        dataloader: yields dict batches with "input_ids" and "labels".
        encoder, decoder: the two halves of the seq2seq model.
        criterion: loss called with (flattened decoder outputs, flattened
            one-hot float targets).

    Returns:
        dict of Python floats:
        "valid_loss": mean of the per-batch losses,
        "equality": fraction of sequences predicted correctly at every position,
        "accuracy": fraction of individual positions predicted correctly.
        All three are 0.0 when the dataloader yields no batch.
    """
    total_loss = 0.0
    n_batches = 0
    n_sequences = 0
    n_positions = 0
    n_exact = 0
    n_correct = 0

    encoder.eval()
    decoder.eval()
    try:
        with torch.no_grad():
            for data in tqdm(dataloader, leave=False):
                # assumes each column arrives as a sequence of per-position
                # tensors, so stacking gives (seq, batch) -- TODO confirm
                input_tensor = torch.stack(data["input_ids"]).transpose(0, 1).to(device)
                target_tensor = torch.stack(data["labels"]).transpose(0, 1).to(device)

                encoder_outputs, encoder_hidden = encoder(input_tensor)

                # No targets passed: the decoder feeds back its own predictions.
                decoder_outputs, _, _ = decoder(encoder_outputs, encoder_hidden)

                oh = F.one_hot(target_tensor, tokenizer.vocab_size).float()
                loss = criterion(
                    decoder_outputs.view(-1, decoder_outputs.size(-1)),
                    oh.view(-1, oh.size(-1))
                )

                _, topi = decoder_outputs.topk(1)
                # Drop only the trailing top-k axis; a bare squeeze() would also
                # collapse the batch axis when a batch holds a single sequence.
                predicted = topi.squeeze(-1)

                if n_batches == 0:
                    # Decode just the one sample that is shown instead of the
                    # whole batch on every iteration.
                    print(tokenizer.batch_decode(predicted[:1], skip_special_tokens=True)[0])
                    print(tokenizer.batch_decode(target_tensor[:1], skip_special_tokens=True)[0])

                matches = predicted == target_tensor
                total_loss += loss.item()
                n_exact += matches.all(dim=-1).sum().item()
                n_correct += matches.sum().item()
                # Count what was actually seen rather than assuming full batches
                # of the global batch_size and a fixed length of 30.
                n_sequences += target_tensor.size(0)
                n_positions += target_tensor.numel()
                n_batches += 1
    finally:
        encoder.train()
        decoder.train()

    return {"valid_loss": total_loss / max(n_batches, 1),
            "equality": n_exact / max(n_sequences, 1),
            "accuracy": n_correct / max(n_positions, 1)}

In [None]:
import time
import math

def asMinutes(s):
    """Format a duration given in seconds as '<minutes>m <seconds>s'."""
    whole_minutes = math.floor(s / 60)
    leftover_seconds = s - whole_minutes * 60
    return '%dm %ds' % (whole_minutes, leftover_seconds)

def timeSince(since, percent):
    """Report elapsed and estimated remaining time as '<elapsed> (- <remaining>)'.

    Args:
        since: start timestamp, on the same clock as time.time().
        percent: fraction of the work already completed; the total duration is
            estimated as elapsed / percent.
    """
    elapsed = time.time() - since
    estimated_total = elapsed / (percent)
    remaining = estimated_total - elapsed
    return '%s (- %s)' % (asMinutes(elapsed), asMinutes(remaining))

In [None]:
import time
import torch.optim as optim

def train(train_dataloader, eval_dataloader, encoder, decoder, n_epochs, learning_rate=0.001,
               print_every=1, plot_every=100):
    """Train the encoder/decoder pair for `n_epochs`, evaluating after every epoch.

    A W&B run is opened for the duration of training. Each epoch logs
    "train_loss" and "train_epoch", then logs and prints the dict returned by
    `eval_epoch`.

    Args:
        train_dataloader: batches passed to `train_epoch`.
        eval_dataloader: batches passed to `eval_epoch`.
        encoder, decoder: the two halves of the seq2seq model.
        n_epochs: number of passes over `train_dataloader`.
        learning_rate: learning rate of both Adam optimizers.
        print_every, plot_every: accepted for backward compatibility; unused.
    """
    # Record the values this run really uses; CONFIG alone carries a fixed
    # "epochs" entry that need not match `n_epochs`.
    run_config = dict(CONFIG, epochs=n_epochs, learning_rate=learning_rate)
    wandb.init(config=run_config, entity="hj51", project="grad_rnn")

    encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate)
    criterion = nn.CrossEntropyLoss()

    try:
        for epoch in (pbar := tqdm(range(1, n_epochs + 1))):

            loss = train_epoch(train_dataloader, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion)
            wandb.log({"train_loss": loss, "train_epoch": epoch})
            pbar.set_description(f"Loss = {float(loss)}")

            res = eval_epoch(eval_dataloader, encoder, decoder, criterion)
            wandb.log(res)
            print(res)
    except BaseException:
        # Close the run (marked as failed) when training raises or is
        # interrupted, so it is not left open in the kernel.
        wandb.finish(exit_code=1)
        raise

    wandb.finish()


In [None]:
# Train for 10 epochs, evaluating on the held-out split after each one.
train(train_loader, test_loader, encoder, decoder, n_epochs=10)

In [None]:
# Persist both trained modules to the working directory.
# NOTE(review): this saves the module objects themselves rather than their
# state_dict(), so loading presumably needs the EncoderRNN / DecoderRNN class
# definitions to be importable -- confirm this is the intended format.
torch.save(encoder, "encoder_model_gru.pth")
torch.save(decoder, "decoder_model_gru.pth")