In [None]:
# Path of the dataset file; replace the placeholder before running.
file_name = "Insert file name here"
# Placeholder only: the real key is read from Kaggle secrets in the next cell.
wandb_api = "Insert Key here"


def load_file(filename):
    """Read the dataset file and return parallel tuples of inputs and targets.

    NOTE(review): the on-disk format is not shown in this notebook. This
    assumes one example per line written as ``<function>=<derivative>``;
    confirm against the real data file.

    Args:
        filename: path of the text file to read.

    Returns:
        A pair ``(inputs, targets)`` of equally long tuples of strings: the
        text before the first ``=`` on each line, and the text after it.

    Raises:
        OSError: if the file cannot be opened.
        ValueError: if a non-blank line contains no ``=`` separator.
    """
    inputs, targets = [], []
    # The context manager closes the handle even if parsing fails.
    with open(filename) as handle:
        for line_no, raw_line in enumerate(handle, start=1):
            line = raw_line.strip()
            if not line:
                continue  # tolerate blank lines, e.g. a trailing newline
            lhs, separator, rhs = line.partition("=")
            if not separator:
                raise ValueError(
                    f"{filename}:{line_no}: expected '<function>=<derivative>', got {line!r}"
                )
            inputs.append(lhs)
            targets.append(rhs)
    return tuple(inputs), tuple(targets)


functions, true_derivatives = load_file(file_name)

In [None]:
!pip install wandb
from kaggle_secrets import UserSecretsClient

user_secrets = UserSecretsClient()

# I have saved my API token with "wandb_api" as Label. 
# If you use some other Label make sure to change the same below. 
wandb_api = user_secrets.get_secret("wandb_api") 

import wandb
wandb.login(key=wandb_api)

In [None]:
import os
# Set the WANDB_PROJECT environment variable for the W&B integration.
os.environ.update(WANDB_PROJECT="grad")

# Tokenizer

In [None]:
# Distinct characters that occur anywhere in the inputs or the targets.
chars = list(set("".join([*functions, *true_derivatives])))

In [None]:
from tokenizers import ByteLevelBPETokenizer
from torchtext.vocab import build_vocab_from_iterator
def batch_iter():
    """Yield the corpus one character at a time: all functions, then all derivatives."""
    yield from "".join(functions)
    yield from "".join(true_derivatives)

# Special tokens handed to the BPE trainer. "<pad>" comes first and PAD_IDX is
# used as its id throughout the notebook (presumably the trainer numbers
# special tokens in this order; verify against the saved vocabulary).
specials = "<pad> <unk> <mask>".split()
PAD_IDX = 0

# Disabled alternative: a torchtext vocabulary built from the same iterator.
# vocab_tr = build_vocab_from_iterator(
#     batch_iter(), 
#     min_freq=1,
#     specials=specials,
#     special_first=True
# )

# Train a byte-level BPE tokenizer on the character stream and save it to disk.
tok = ByteLevelBPETokenizer()
tok.train_from_iterator(
    batch_iter(),
    special_tokens=specials,
    min_frequency=2,
    vocab_size=512,
)
tok.save("vocab.json")

In [None]:
from transformers import PreTrainedTokenizerFast
# Wrap the trained tokenizer file in the transformers fast-tokenizer API.
# Special tokens are declared by their string form; "<pad>" also serves as the
# end-of-sequence token in this notebook.
# Padding, max_length and return_tensors are per-call options, so they are
# passed where the tokenizer is called (see `tokenize`), not to the constructor.
tokenizer = PreTrainedTokenizerFast(
    tokenizer_file="vocab.json",
    pad_token="<pad>",
    eos_token="<pad>",
)

In [None]:
# Padding and end-of-sequence share one id (assigned in that order: pad, then eos).
tokenizer.pad_token_id = tokenizer.eos_token_id = PAD_IDX

In [None]:
# Sanity check: encode a sample string, padded out to 30 ids.
tokenizer.encode(
    "Hello",
    max_length=30,
    padding="max_length",
)

In [None]:
# from transformers import T5Tokenizer
# t = T5Tokenizer(vocab_file="vocab.json")

# Model

In [None]:
from transformers import T5ForConditionalGeneration, T5Config

# Small T5 configuration built from scratch (no pretrained weights are loaded).
config = T5Config(
    # Vocabulary and special ids: padding, end-of-sequence and the decoder
    # start token all share PAD_IDX.
    vocab_size=len(tokenizer.vocab),
    pad_token_id=PAD_IDX,
    eos_token_id=PAD_IDX,
    decoder_start_token_id=PAD_IDX,
    # Architecture sizes.
    num_layers=8,
    num_heads=12,
    d_model=128,
    d_kv=16,
    d_ff=1024,
)
model = T5ForConditionalGeneration(config)

In [None]:
# Report the model's parameter count.
model.num_parameters()

In [None]:
# Echo the model object so the notebook renders its repr.
model

# Data

In [None]:
from torch.nn.functional import pad
import torch

def tokenize(sample):
    """Encode one ``{"inp", "label"}`` example into fixed-length tensors.

    Both strings are padded (truncation is not requested) to 30 token ids.
    In the labels the first padding position is kept as a real target: the
    pad id doubles as the end-of-sequence id in this notebook, so it is the
    only target that teaches the decoder when to stop. Every later padding
    position is set to -100 so the loss ignores it. A label that fills all
    30 positions has no padding and is returned unchanged.

    Args:
        sample: mapping with string fields ``"inp"`` (source text) and
            ``"label"`` (target text).

    Returns:
        dict with 1-D tensors ``"input_ids"``, ``"attention_mask"`` and
        ``"labels"``.
    """
    inp = tokenizer(sample["inp"], padding="max_length", max_length=30, return_tensors="pt")
    # [0] drops the leading batch dimension of the returned tensors.
    labels = tokenizer(
        sample["label"], padding="max_length", max_length=30, return_tensors="pt"
    ).input_ids[0]

    is_pad = labels == tokenizer.pad_token_id
    # Running count of pads seen so far: a count above 1 marks every pad after
    # the first one, which stays in place as the end-of-sequence target.
    trailing_pad = is_pad & (is_pad.long().cumsum(dim=0) > 1)
    labels[trailing_pad] = -100

    return {
        "input_ids": inp.input_ids[0],
        "attention_mask": inp.attention_mask[0],
        "labels": labels,
    }

In [None]:
from datasets import Dataset
# Pair each function with its derivative, then hold out 10% for evaluation.
ds = Dataset.from_dict({"inp": list(functions), "label": list(true_derivatives)})
# A fixed seed keeps the train/test membership identical across runs, so
# evaluation numbers from different runs are comparable.
ds = ds.train_test_split(test_size=0.1, seed=21)

In [None]:
# Tokenize every example, then drop the raw text columns so only model inputs remain.
tok_ds = ds.map(tokenize)
tok_ds = tok_ds.remove_columns(["inp", "label"])

In [None]:
# Shuffle each split with the same fixed seed.
train_set, test_set = (
    tok_ds[split_name].shuffle(seed=21) for split_name in ("train", "test")
)

In [None]:
from transformers import Seq2SeqTrainingArguments

training_args = Seq2SeqTrainingArguments(
    # Output location and W&B logging.
    output_dir="test_trainer",
    run_name="grad_test_t5cond",
    report_to="wandb",
    logging_steps=10,
    # Optimisation schedule and batching.
    learning_rate=6e-4,
    num_train_epochs=10,
    per_device_train_batch_size=1024,
    per_device_eval_batch_size=1024,
    dataloader_drop_last=True,
    # Checkpoint and evaluate every 100 steps.
    save_strategy="steps",
    save_steps=100,
    evaluation_strategy="steps",
    eval_steps=100,
    # Evaluate on generated sequences of at most 30 tokens.
    predict_with_generate=True,
    generation_max_length=30,
)

In [None]:
# Use the GPU when one is present; fall back to CPU so the notebook still runs
# on CPU-only sessions instead of raising.
model = model.to("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
def compute_metrics(pred):
    """Score generated sequences against the reference sequences.

    Every row of token ids is decoded to text (special tokens skipped) after
    replacing the -100 filler with the pad id. The first decoded reference and
    prediction are printed as a quick visual check.

    Args:
        pred: object exposing ``label_ids`` and ``predictions``, each an
            iterable of token-id rows. The inputs are not modified.

    Returns:
        dict with
        ``"equality"``: fraction of rows whose decoded prediction equals the
        decoded reference exactly, and
        ``"accuracy"``: mean per-row fraction of character positions that
        match, measured against the longer of the two strings so an exact
        match scores 1.0 and extra or missing characters count as errors.
        Both are 0.0 when there are no rows.
    """
    pad_id = tokenizer.pad_token_id

    def _decode(row):
        # Build a fresh list rather than writing into the caller's array.
        ids = [pad_id if int(tok_id) == -100 else int(tok_id) for tok_id in row]
        return tokenizer.decode(ids, skip_special_tokens=True)

    ref_texts = [_decode(row) for row in pred.label_ids]
    gen_texts = [_decode(row) for row in pred.predictions]
    pairs = list(zip(ref_texts, gen_texts))
    if not pairs:
        return {'equality': 0.0, "accuracy": 0.0}

    # Show the first example of the evaluation set.
    print(pairs[0][0])
    print(pairs[0][1])

    exact = []
    char_acc = []
    for ref, gen in pairs:
        exact.append(ref == gen)
        longest = max(len(ref), len(gen))
        matches = sum(a == b for a, b in zip(ref, gen))
        # Two empty strings are a perfect match.
        char_acc.append(matches / longest if longest else 1.0)

    return {
        'equality': sum(exact) / len(exact),
        "accuracy": sum(char_acc) / len(char_acc),
    }

In [None]:
from transformers import Seq2SeqTrainer, DataCollatorForSeq2Seq
# Examples are already padded to a fixed length by `tokenize`, so the collator
# adds no input padding (padding=False). Any label padding it might add uses
# -100, the same ignore value `tokenize` writes into the labels, so padded
# label positions never contribute to the loss.
trainer = Seq2SeqTrainer(
    model,
    training_args,
    DataCollatorForSeq2Seq(tokenizer, padding=False, label_pad_token_id=-100),
    train_dataset=train_set,
    eval_dataset=test_set,
    compute_metrics=compute_metrics,
)

In [None]:
# Run training with the trainer configured in the previous cell.
trainer.train()

In [None]:
# Run evaluation with the trainer (it was given test_set as eval_dataset and
# compute_metrics for scoring).
trainer.evaluate()

In [None]:
# trainer.save_model("/kaggle/working/")