In [1]:
! pip install -U accelerate
! pip install -U transformers

# ! pip install transformers
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
import torch
import numpy as np
from torch.utils.data import Dataset



In [3]:
class AdditionDataset(Dataset):
    """
    Returns addition problems of up to some number of digits in the inputs. Recall
    that all GPT cares about are sequences of integers, and completing them according to
    patterns in the data. Therefore, we have to somehow encode addition problems
    as a sequence of integers.

    The sum of two n-digit numbers gives a third up to (n+1)-digit number. So our
    encoding will simply be the n-digit first number, n-digit second number,
    and (n+1)-digit result, all simply concatenated together. Because each addition
    problem is so structured, there is no need to bother the model with encoding
    +, =, or other tokens. Each possible sequence has the same length, and simply
    contains the raw digits of the addition problem.

    As a few examples, the 2-digit problems:
    - 85 + 50 = 135 becomes the sequence [8, 5, 5, 0, 1, 3, 5]
    - 6 + 39 = 45 becomes the sequence [0, 6, 3, 9, 0, 4, 5]
    etc.

    We will also only train GPT on the final (n+1)-digits because the first
    two n-digits are always assumed to be given. So when we give GPT an exam later,
    we will e.g. feed it the sequence [0, 6, 3, 9], which encodes that we'd like
    to add 6 + 39, and hope that the model completes the integer sequence with [0, 4, 5]
    in 3 sequential steps.

    fun exercise: does it help if the result is asked to be produced in reverse order?
    """

    def __init__(self, ndigit, split):
        self.split = split # train/test
        self.ndigit = ndigit
        self.vocab_size = 10 # 10 possible digits 0..9
        # +1 due to potential carry overflow, but then -1 because very last digit doesn't plug back
        self.block_size = ndigit + ndigit + ndigit + 1 - 1

        # split up all addition problems into either training data or test data
        num = (10**self.ndigit)**2 # total number of possible combinations
        r = np.random.RandomState(1337) # make deterministic
        perm = r.permutation(num)
        num_test = min(int(num*0.2), 1000) # 20% of the whole dataset, or only up to 1000
        self.ixes = perm[:num_test] if split == 'test' else perm[num_test:]

    def __len__(self):
        return self.ixes.size

    def __getitem__(self, idx):
        # given a problem index idx, first recover the associated a + b
        idx = self.ixes[idx]
        nd = 10**self.ndigit
        a = idx // nd
        b = idx %  nd
        c = a + b
        render = f'%0{self.ndigit}d%0{self.ndigit}d%0{self.ndigit+1}d' % (a,b,c) # e.g. 03+25=28 becomes "0325028"

        return {"input_ids":tokenizer(render[:-3])["input_ids"], "labels":tokenizer(render[-3:])["input_ids"]}

In [4]:
model_checkpoint = "t5-small"
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
tokenizer(["0", "01", "20"])

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


{'input_ids': [[3, 632, 1], [7088, 1], [460, 1]], 'attention_mask': [[1, 1, 1], [1, 1], [1, 1]]}

In [5]:
batch_size = 128
model_name = model_checkpoint.split("/")[-1]
args = Seq2SeqTrainingArguments(
    f"{model_name}-finetuned-xsum",
    evaluation_strategy = "epoch",
    learning_rate=2e-3,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=50,
    predict_with_generate=True,
    push_to_hub=False,
)

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [6]:
trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=AdditionDataset(2,"train"),
    eval_dataset=AdditionDataset(2,"test"),
    data_collator=data_collator,
    tokenizer=tokenizer,
)

trainer.train()

Epoch,Training Loss,Validation Loss
1,No log,1.238322
2,No log,1.047816
3,No log,0.949335
4,No log,0.895559
5,No log,0.81402
6,No log,0.763756
7,No log,0.691789
8,1.079900,0.682255
9,1.079900,0.609168
10,1.079900,0.583301


TrainOutput(global_step=3550, training_loss=0.42466637006947694, metrics={'train_runtime': 435.1876, 'train_samples_per_second': 1034.037, 'train_steps_per_second': 8.157, 'total_flos': 475811020800000.0, 'train_loss': 0.42466637006947694, 'epoch': 50.0})

In [7]:
test = AdditionDataset(2,"test")
count = 0
for index in range(len(test)):
  device = 'cuda'
  preds = model.generate(input_ids = torch.tensor(test[index]["input_ids"]).to(device).view(1,-1))
  count+=1.0*(int(tokenizer.decode(np.array(preds.cpu()[0]))[5:-4]) == int(tokenizer.decode(np.array(test[index]["labels"]))[:-4]))
  if index%10==0:
    print(tokenizer.decode(np.array(preds.cpu()[0]))[5:-4],tokenizer.decode(np.array(test[index]["labels"]))[:-4])
count/len(test)



 126 126
 104 104
 072 072
 104 104
 027 027
 062 062
 014 014
 167 168
 100 100
 062 062
 082 082
 057 057
 174 174
 123 123
 061 061
 123 123
 027 027
 072 072
 139 139
 069 069
 089 089
 036 036
 130 130
 142 142
 067 067
 122 122
 049 049
 089 089
 112 112
 067 067
 007 007
 121 121
 060 060
 138 138
 137 137
 109 109
 110 110
 096 096
 106 106
 084 084
 063 063
 010 010
 101 101
 032 032
 056 056
 157 157
 160 160
 125 125
 104 104
 132 132
 057 057
 135 135
 034 034
 099 099
 124 124
 021 021
 051 051
 146 146
 108 108
 086 086


ValueError: invalid literal for int() with base 10: ' 03 2020'

In [17]:
type(model)

transformers.models.t5.modeling_t5.T5ForConditionalGeneration

In [None]:
! zip -r mathformer t5-small-finetuned-xsum/

  adding: t5-small-finetuned-xsum/ (stored 0%)
  adding: t5-small-finetuned-xsum/checkpoint-3500/ (stored 0%)
  adding: t5-small-finetuned-xsum/checkpoint-3500/tokenizer_config.json (deflated 80%)
  adding: t5-small-finetuned-xsum/checkpoint-3500/tokenizer.json (deflated 74%)
  adding: t5-small-finetuned-xsum/checkpoint-3500/rng_state.pth (deflated 27%)
  adding: t5-small-finetuned-xsum/checkpoint-3500/trainer_state.json (deflated 82%)
  adding: t5-small-finetuned-xsum/checkpoint-3500/pytorch_model.bin (deflated 9%)
  adding: t5-small-finetuned-xsum/checkpoint-3500/special_tokens_map.json (deflated 83%)
  adding: t5-small-finetuned-xsum/checkpoint-3500/config.json (deflated 62%)
  adding: t5-small-finetuned-xsum/checkpoint-3500/scheduler.pt (deflated 49%)
  adding: t5-small-finetuned-xsum/checkpoint-3500/optimizer.pt (deflated 6%)
  adding: t5-small-finetuned-xsum/checkpoint-3500/training_args.bin (deflated 48%)
  adding: t5-small-finetuned-xsum/checkpoint-3000/ (stored 0%)
  adding: t

In [None]:
from google.colab import drive
drive.mount('/drive')

Mounted at /drive


In [None]:
!cp mathformer.zip /drive/MyDrive/BACHU/mathformer.zip

In [None]:
!ls /drive/MyDrive/BACHU/

mathformer.zip
