## Model and Dataset Inspection

Before preprocessing, we examine the `flan-t5-large` model and the `DeepMind` 1-dimensional linear algebra dataset to confirm they are compatible for finetuning on a mathematical task.

In [0]:
# ensure we have the most recent version of transformers
!pip install -U transformers
dbutils.library.restartPython()

Collecting transformers
  Obtaining dependency information for transformers from https://files.pythonhosted.org/packages/d0/a7/7eedcf6a359e1e1eff3bc204ad022485aa5d88c08e1e3e0e0aee8a2e2235/transformers-4.47.0-py3-none-any.whl.metadata
  Downloading transformers-4.47.0-py3-none-any.whl.metadata (43 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.5 kB[0m [31m?[0m eta [36m-:--:--[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.5/43.5 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.24.0 (from transformers)
  Obtaining dependency information for huggingface-hub<1.0,>=0.24.0 from https://files.pythonhosted.org/packages/44/5a/dc6af87c61f89b23439eb95521e4e99862636cfd538ae12fd36be5483e5f/huggingface_hub-0.26.5-py3-none-any.whl.metadata
  Downloading huggingface_hub-0.26.5-py3-none-any.whl.metadata (13 kB)
Collecting tokenizers<0.22,>=0.21 (from transformers)
  Obtaining dependency information for tokenizers<

In [0]:
# grab alegbra__linear_1d dataset from DeepMind
import datasets

train_examples_1d, eval_examples_1d = datasets.load_dataset('deepmind/math_dataset', 'algebra__linear_1d', split=['train', 'test'])

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


In [0]:
# check split sizes
train_examples_1d, eval_examples_1d

(Dataset({
     features: ['question', 'answer'],
     num_rows: 1999998
 }),
 Dataset({
     features: ['question', 'answer'],
     num_rows: 10000
 }))

In [0]:
# peek the data
train_examples_1d[:5], eval_examples_1d[:5]

({'question': ["b'Solve 24 = 1601*c - 1605*c for c.\\n'",
   "b'Solve 657 = -220*t + 1086*t + 22307 for t.\\n'",
   "b'Solve -11*y - 263*y + 3162 = -88*y for y.\\n'",
   "b'Solve 0 = -11*b - 4148 + 4225 for b.\\n'",
   "b'Solve 65*l - 361 + 881 = 0 for l.\\n'"],
  'answer': ["b'-6\\n'", "b'-25\\n'", "b'17\\n'", "b'7\\n'", "b'-8\\n'"]},
 {'question': ["b'Solve -282*d + 929 - 178 = -1223 for d.\\n'",
   "b'Solve 49*l + 45*l - 125 - 63 = 0 for l.\\n'",
   "b'Solve -64*t + 1387 - 848 + 933 = 0 for t.\\n'",
   "b'Solve 75*g = 192*g - 71*g - 79*g - 264 for g.\\n'",
   "b'Solve -34*v + 232*v + 52351 = 48985 for v.\\n'"],
  'answer': ["b'7\\n'", "b'2\\n'", "b'23\\n'", "b'-8\\n'", "b'-17\\n'"]})

In [0]:
# instantiate our flan-t5-large model with brain float mixed precision - this model appears better for comparing labels with the DeepMind mathematics dataset than the Gemma text generation model 
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration

tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-large")
model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-large", device_map="auto", torch_dtype=torch.bfloat16)

2024-12-11 19:47:53.166943: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [0]:
# double check the precision of our tensors
model.dtype

torch.bfloat16

In [0]:
# test an inference example
input_text = "Solve 24 = 1601*c - 1605*c for c."
input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to("cuda")

outputs = model.generate(input_ids)
print(tokenizer.decode(outputs[0]))

<pad> 4</s>


In [0]:
# another inference example
input_text = "Solve -11*y - 263*y + 3162 = -88*y for y."
input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to("cuda")

outputs = model.generate(input_ids)
print(tokenizer.decode(outputs[0]))

<pad> -23</s>


In [0]:
# this model appears to output a single (albeit often incorrect) answer making it easier to train and compare to the DeepMind mathematics dataset than the Google Gemma text generation model

# we examine the architecture as given by the transformers' T5ForConditionalGeneration class
# we can leverage this later in detail to estimate the amount of memory required for training
model

T5ForConditionalGeneration(
  (shared): Embedding(32128, 1024)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 1024)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=1024, out_features=1024, bias=False)
              (k): Linear(in_features=1024, out_features=1024, bias=False)
              (v): Linear(in_features=1024, out_features=1024, bias=False)
              (o): Linear(in_features=1024, out_features=1024, bias=False)
              (relative_attention_bias): Embedding(32, 16)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseGatedActDense(
              (wi_0): Linear(in_features=1024, out_features=2816, bias=False)
              (wi_1): Linear(in_features=1024, out_features=2816, bias=False)
       

In [0]:
# look at the amount of GPU memory used to load the model in bfloat16
# for this initial inspection and preprocessing we use a single T4 GPU
# we will subsequently beef up our compute for training
def mem_status(): 
    if torch.cuda.is_available():
        gpus = torch.cuda.device_count()
        print("Memory status: ")
        for i in range(gpus):
            properties = torch.cuda.get_device_properties(i)
            total_memory = properties.total_memory / (1024 ** 3)  # Convert to GB
            allocated_memory = torch.cuda.memory_allocated(i) / (1024 ** 3)  # Convert to GB
            reserved_memory = torch.cuda.memory_reserved(i) / (1024 ** 3)  # Convert to GB
            available_memory = total_memory - reserved_memory
            print(f"GPU {i}:")
            print(f"  Total memory: {total_memory:.2f} GB")
            print(f"  Allocated memory: {allocated_memory:.2f} GB")
            print(f"  Reserved memory: {reserved_memory:.2f} GB")
            print(f"  Available memory: {available_memory:.2f} GB")
    else:
        print("No GPU available.")

mem_status()

Memory status: 
GPU 0:
  Total memory: 15.57 GB
  Allocated memory: 1.51 GB
  Reserved memory: 1.55 GB
  Available memory: 14.02 GB


##Preprocessing and Prep for Training

The `alegbra__linear_1d` split of the DeepMind math dataset comes in a friendly raw format but still requires a fair amount of calculated preprocessing for configuring all 2M examples and labels before passing to the `flan-t5-large` seq2seq model during training.  

In [0]:
# we need to clean up some of the dataset's formatting
# it appears all 'questions' and 'answers' are string prefixed with  "b'  and postfixed with  //n'"

In [0]:
# remove unneeded characters
def clean_up_dataset(record):
    record['question'] = record['question'][2:-3]
    record['answer'] = record['answer'][2:-3]
    return record

In [0]:
# map fn to train and eval datsets
train_examples_1d = train_examples_1d.map(clean_up_dataset)
eval_examples_1d = eval_examples_1d.map(clean_up_dataset)

Map:   0%|          | 0/1999998 [00:00<?, ? examples/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

In [0]:
train_examples_1d, eval_examples_1d

(Dataset({
     features: ['question', 'answer'],
     num_rows: 1999998
 }),
 Dataset({
     features: ['question', 'answer'],
     num_rows: 10000
 }))

In [0]:
train_examples_1d[:5], eval_examples_1d[:5]

({'question': ['Solve 24 = 1601*c - 1605*c for c.',
   'Solve 657 = -220*t + 1086*t + 22307 for t.',
   'Solve -11*y - 263*y + 3162 = -88*y for y.',
   'Solve 0 = -11*b - 4148 + 4225 for b.',
   'Solve 65*l - 361 + 881 = 0 for l.'],
  'answer': ['-6', '-25', '17', '7', '-8']},
 {'question': ['Solve -282*d + 929 - 178 = -1223 for d.',
   'Solve 49*l + 45*l - 125 - 63 = 0 for l.',
   'Solve -64*t + 1387 - 848 + 933 = 0 for t.',
   'Solve 75*g = 192*g - 71*g - 79*g - 264 for g.',
   'Solve -34*v + 232*v + 52351 = 48985 for v.'],
  'answer': ['7', '2', '23', '-8', '-17']})

In [0]:
# tokenize function
def preprocess_function(example):
    return tokenizer(example["question"], text_target=example["answer"], return_tensors="pt", padding=True)

In [0]:
# validate preprocess_function works as expected
test_tokenization = preprocess_function(train_examples_1d[0])
test_tokenization

{'input_ids': tensor([[5175,  162,  997, 3274,  898, 4542, 1935,   75,    3,   18,  898, 3076,
         1935,   75,   21,    3,   75,    5,    1]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]), 'labels': tensor([[   3, 5783,    1]])}

In [0]:
# tokenize our datasets
tokenized_train_dataset = train_examples_1d.map(
    preprocess_function,
    batched=True,
    remove_columns=train_examples_1d.column_names, # remove old column names
)
tokenized_eval_dataset = eval_examples_1d.map(
    preprocess_function,
    batched=True,
    remove_columns=eval_examples_1d.column_names,
)

Map:   0%|          | 0/1999998 [00:00<?, ? examples/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

In [0]:
tokenized_train_dataset, tokenized_eval_dataset

(Dataset({
     features: ['input_ids', 'attention_mask', 'labels'],
     num_rows: 1999998
 }),
 Dataset({
     features: ['input_ids', 'attention_mask', 'labels'],
     num_rows: 10000
 }))

In [0]:
# peek a tokenized example
# notice these are not tensors; the map function stores information in Apache Arrow format, and does not include the Python metadata
# see below for setting the output format
tokenized_train_dataset[0]

{'input_ids': [5175,
  162,
  997,
  3274,
  898,
  4542,
  1935,
  75,
  3,
  18,
  898,
  3076,
  1935,
  75,
  21,
  3,
  75,
  5,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 'attention_mask': [1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 'labels': [3, 5783, 1, 0]}

In [0]:
# we can use the .set_format("torch", device="cuda") attribute on the dataset to change the output format, (it will not change the data format - which is still Arrow)
tokenized_train_dataset.set_format("torch", device="cuda")

In [0]:
# now this gives us the tensors we desire
tokenized_train_dataset[0]

{'input_ids': tensor([5175,  162,  997, 3274,  898, 4542, 1935,   75,    3,   18,  898, 3076,
         1935,   75,   21,    3,   75,    5,    1,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0], device='cuda:0'),
 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0], device='cuda:0'),
 'labels': tensor([   3, 5783,    1,    0], device='cuda:0')}

In [0]:
# will having the pad tokens at the end of our labels effect our loss and backpropogation?
tokenizer.convert_ids_to_tokens(tokenized_train_dataset[0]["labels"])

['▁', '-6', '</s>', '<pad>']

The above is likely a non-issue for the `exact_match` metric we will use, but some consideration need be given to possible evaluation of an `f1` metric for partial correctness. Furthermore, we will need to ultimately collate our data using the `DataCollatorForSeq2Seq` class. In doing so, we will pad and square off all of our `input_ids` and `labels` each to the same length across the dataset. Thus, padding tokens at the output of our labels and predictions is inevitable.

In [0]:
# before passing these to the model, we need to unsqueeze the input_ids and the labels to add an extra dimension to our tensors
torch.tensor(tokenized_train_dataset[0]["input_ids"]).unsqueeze(0)

  torch.tensor(tokenized_train_dataset[0]["input_ids"]).unsqueeze(0)


tensor([[5175,  162,  997, 3274,  898, 4542, 1935,   75,    3,   18,  898, 3076,
         1935,   75,   21,    3,   75,    5,    1,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0]],
       device='cuda:0')

In [0]:
# before collating our data, we examine the evaluation of the loss on a specific example
# a good set of metrics are probably exact_match for complete correctness and f1 for partial correctness (which encodes both precision and recall)
import math
from evaluate import load

# tokenized_example = tokenized_train_dataset[257636]
tokenized_example = tokenized_train_dataset[234254]
output = model(input_ids=tokenized_example["input_ids"].unsqueeze(0), labels=tokenized_example["labels"].unsqueeze(0))

example_cross_entropy_loss = output.loss.item()
model_prediction_raw = output.logits.argmax(-1)

# peek model predicition vs label
print("predicted tokens: " + str(model_prediction_raw[0]))
print("label tokens: " + str(tokenized_example['labels']))

# check out the model prediction upon decoding
print("predicition: " + str(tokenizer.decode(model_prediction_raw[0], skip_special_tokens=True)))
# compare the prediction with the label
print("label: " + str(tokenizer.decode(tokenized_example['labels'], skip_special_tokens=True)))

# the loss stored by the model
print(f">>> The model loss as given by model().loss: {example_cross_entropy_loss}")

f1 = load("f1")
exact_match = load("exact_match")

# compute exact match score for total correctness
exact_match.add(predictions=tokenizer.decode(model_prediction_raw[0], skip_special_tokens=True), references=tokenizer.decode(tokenized_example['labels'], skip_special_tokens=True))
print(">>> exact_match score: " + str(exact_match.compute()["exact_match"]))

# compute the f1 score for partial correctness
# TODO what averaging method is best for f1? Micro appears to average on a token-to-token mapping
# NOTE in some instances, the decoded output gives an exact match, where generally one would expect the f1 score to be 1.0, but occasionally the raw tensor output (containing special tokens) can be such that the f1 score is nontrivially not equal to 1.0
# NOTE we need to consider the above and think about a solid method for rectifying this issue in order to properly intrepet the partial correctness score - one could potentially remove special tokens from perdicitons and labels, but this does not guarnatee that we have tensors of equal lenght for comparison
print(">>> f1 score: " + str(f1.compute(predictions=model_prediction_raw[0], references=tokenized_example['labels'], average='micro')["f1"]))

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


predicted tokens: tensor([   3,    1,    1, 1401], device='cuda:0')
label tokens: tensor([1401,    1,    0,    0], device='cuda:0')
predicition: 21
label: 21
>>> The model loss as given by model().loss: 24.75
>>> exact_match score: 1.0
>>> f1 score: 0.25


In [0]:
# the example above using record 234254 outputs the correct answer, but an f1 score of .25
tokenizer.convert_ids_to_tokens([ 3,    1,    1, 1401])

['▁', '</s>', '</s>', '▁21']

In [0]:
# NOTE this method is erroneous - we are not guaranteed predictions and labels of the same length
# to account for the issue of special tokens skeweing the partial correctness f1 score as noted above, we can remove all special tokens in the predictions and labels tensors before computing the f1 score

# first we examine all special tokens
tokenizer.special_tokens_map

{'eos_token': '</s>',
 'unk_token': '<unk>',
 'pad_token': '<pad>',
 'additional_special_tokens': ['<extra_id_0>',
  '<extra_id_1>',
  '<extra_id_2>',
  '<extra_id_3>',
  '<extra_id_4>',
  '<extra_id_5>',
  '<extra_id_6>',
  '<extra_id_7>',
  '<extra_id_8>',
  '<extra_id_9>',
  '<extra_id_10>',
  '<extra_id_11>',
  '<extra_id_12>',
  '<extra_id_13>',
  '<extra_id_14>',
  '<extra_id_15>',
  '<extra_id_16>',
  '<extra_id_17>',
  '<extra_id_18>',
  '<extra_id_19>',
  '<extra_id_20>',
  '<extra_id_21>',
  '<extra_id_22>',
  '<extra_id_23>',
  '<extra_id_24>',
  '<extra_id_25>',
  '<extra_id_26>',
  '<extra_id_27>',
  '<extra_id_28>',
  '<extra_id_29>',
  '<extra_id_30>',
  '<extra_id_31>',
  '<extra_id_32>',
  '<extra_id_33>',
  '<extra_id_34>',
  '<extra_id_35>',
  '<extra_id_36>',
  '<extra_id_37>',
  '<extra_id_38>',
  '<extra_id_39>',
  '<extra_id_40>',
  '<extra_id_41>',
  '<extra_id_42>',
  '<extra_id_43>',
  '<extra_id_44>',
  '<extra_id_45>',
  '<extra_id_46>',
  '<extra_id_47>',
 

In [0]:
# get the ids of the most common special tokens
{k: tokenizer.convert_tokens_to_ids(v) for k, v in tokenizer.special_tokens_map.items() if k != 'additional_special_tokens'}

{'eos_token': 1, 'unk_token': 2, 'pad_token': 0}

In [0]:
# examine the ids of the additional special tokens
[tokenizer.convert_tokens_to_ids(k) for k in tokenizer.special_tokens_map["additional_special_tokens"]]

[32099,
 32098,
 32097,
 32096,
 32095,
 32094,
 32093,
 32092,
 32091,
 32090,
 32089,
 32088,
 32087,
 32086,
 32085,
 32084,
 32083,
 32082,
 32081,
 32080,
 32079,
 32078,
 32077,
 32076,
 32075,
 32074,
 32073,
 32072,
 32071,
 32070,
 32069,
 32068,
 32067,
 32066,
 32065,
 32064,
 32063,
 32062,
 32061,
 32060,
 32059,
 32058,
 32057,
 32056,
 32055,
 32054,
 32053,
 32052,
 32051,
 32050,
 32049,
 32048,
 32047,
 32046,
 32045,
 32044,
 32043,
 32042,
 32041,
 32040,
 32039,
 32038,
 32037,
 32036,
 32035,
 32034,
 32033,
 32032,
 32031,
 32030,
 32029,
 32028,
 32027,
 32026,
 32025,
 32024,
 32023,
 32022,
 32021,
 32020,
 32019,
 32018,
 32017,
 32016,
 32015,
 32014,
 32013,
 32012,
 32011,
 32010,
 32009,
 32008,
 32007,
 32006,
 32005,
 32004,
 32003,
 32002,
 32001,
 32000]

In [0]:
# lets try removing the common special tokens from the tensor to compare f1 scores

# we can use the torch.masked_select() fn to remove elements less than 2 - the eos and padding tokens
mask = model_prediction_raw[0] >= 2
new_tensor = torch.masked_select(model_prediction_raw[0], mask)

new_tensor

tensor([   3, 1401], device='cuda:0')

In [0]:
# NOTE this is a perfect example of where our analysis fails
# now we apply this to the previous example
prediciton_mask = model_prediction_raw[0] >= 2
label_mask = tokenized_example['labels'] >= 2
print(">>> f1 score after masking special tokens: " + str(f1.compute(predictions=torch.masked_select(model_prediction_raw[0], prediciton_mask), references=torch.masked_select(tokenized_example['labels'], label_mask), average="micro")))

[0;31m---------------------------------------------------------------------------[0m
[0;31mValueError[0m                                Traceback (most recent call last)
File [0;32m<command-158712734298444>, line 5[0m
[1;32m      3[0m prediciton_mask [38;5;241m=[39m model_prediction_raw[[38;5;241m0[39m] [38;5;241m>[39m[38;5;241m=[39m [38;5;241m2[39m
[1;32m      4[0m label_mask [38;5;241m=[39m tokenized_example[[38;5;124m'[39m[38;5;124mlabels[39m[38;5;124m'[39m] [38;5;241m>[39m[38;5;241m=[39m [38;5;241m2[39m
[0;32m----> 5[0m [38;5;28mprint[39m([38;5;124m"[39m[38;5;124m>>> f1 score after masking special tokens: [39m[38;5;124m"[39m [38;5;241m+[39m [38;5;28mstr[39m(f1[38;5;241m.[39mcompute(predictions[38;5;241m=[39mtorch[38;5;241m.[39mmasked_select(model_prediction_raw[[38;5;241m0[39m], prediciton_mask), references[38;5;241m=[39mtorch[38;5;241m.[39mmasked_select(tokenized_example[[38;5;124m'[39m[38;5;124mlabels[39m[38;5;124m'

In [0]:
# to even get a notion of partial correctness, we need to keep track of each predicitons f1 score
# one can envision doing this with the test example below
for predictions, references in zip([[1,2,3],[1,2,3]], [[1,2,3],[3,2,1]]):
    print(">>> f1 score after masking special tokens: " + str(f1.compute(predictions=predictions, references=references, average="micro")))

>>> f1 score after masking special tokens: {'f1': 1.0}
>>> f1 score after masking special tokens: {'f1': 0.3333333333333333}


In [0]:
# or one could compute an f1 score of the entire datasets token prediciton as so:
from itertools import chain

f1_test = load("f1")
test_preds = [[1,2,3],[1,2,3]]
test_labels = [[1,2,3],[1,2,4]]
predictions = []
labels  = []
for data in test_preds:
    for i in data:
        predictions.append(i)
for data in test_labels:
    for i in data:
        labels.append(i)
print(predictions)
print(labels)
f1_test.add_batch(predictions=predictions, references=labels)
f1_test.compute(average=None)

[1, 2, 3, 1, 2, 3]
[1, 2, 3, 1, 2, 4]


{'f1': array([1.        , 1.        , 0.66666667, 0.        ])}

**Initial Erroneous Attempt** (see below): This seems like a fine method for comparing the relevant tokens upon decoding. We can apply this masking to each prediction in order to get a measure of the partial correctness relative to the label tokens in the form of an f1 score with a "micro" averaging, independent of the special padding and eos tokens. Yet, one small caveat remains - that of the space token denoted `"_"` with token id `3`. As demonstrated above in the comparison between the prediction and label of record # `234254`  (and perhaps many others) the full prediction including the space token gives an exact match upon decoding, but the token-to-token 'micro' averaged f1 score will still be less than the expected 1.0. We will leave this as such for now, just as we have left the desired unknown token, given that the space token could play a valuable role in decoding outputs. We will have to consider how to intrepret our f1 score with such a subtlety.

**Correction to the above**: During preprocessing, an attempt to mask and remove special tokens from both predictions and labels seemed preliminarily to allow a better f1 score for intrepreting partial correctness of predictions. It turns out that such a method is erroneous given that the number of non-special predicition tokens does not inherently match the number of non-special label tokens leading to tensors or different length during computation of the f1 score - inevitably raising an error. As such, we will leave the f1 score as a comparison inclusive of special tokens and reconsider how one might intrepret such a metric. Perhaps one might learn something interesting about how the model predictions, both before and after decoding, shed light on the learning and mathematical problem solving abilities of the model. Nonetheless, the presence of special tokens in each of our predictions, even when decoded to the correct exact match output, will certainly bias our partial correctness scores.

**TODO**: what remains to be decided is how we will measure the f1 score. There are various ways we can do this: 
- 1) Between each predicition and its label (allowing for a direct intrepretation of the partial correctness for each prediction), giving a total of **len(dataset)** f1 scores
- 2) Across the amalgamated sets of predicted and label tokens (allowing for a large scale interpretation of how well the model predicted all individual tokens), giving a **single** f1 score 
- 3) Both of the above

Such a consideration for measuring the partial correctness of predictions might lead us to also consider how the *precision* and *recall* metrics independetly score. Given the convulted nature of these metrics, perhaps it is best to account for partial correctness only in the benchmarking and evaluation cases where we are truly measuring the effectiveness of the model. The smaller size of the evaluation dataset might similarly allow us to efficiently keep track of each individual partial correctness scores (case #1 above).  After all, the model's inherent cross-entropy loss is the loss which our optimizer will improve the model through backpropogation during training while the exact match score for full correctness remains indeed our primary metric for performance intrepretation. We can retain the exact match score during training as an intermediate evaluation of its performance across training epochs.

Returning to the preprocessing at hand, we note that we will ultimately publish our cleaned dataset to the 🤗 hub. This will ease the data loading process into each of the separate benchmarking, training, and evaluation notebooks - substantially so for the training processes which will utilize 🤗 accelerate for distributed training. In each step above, after downloading our training-ready data, we will configure our data loaders. Before then, and as a final preprocessing step, we must investigate the ultimate shape of our data a bit further.

In [0]:
# preliminarily, we convert our tokenized datasets' data format to numpy
# this will ultimately be required under the hood by the DataCollatorForSeq2Seq class for padding the labels to the same length in each of our batches 
tokenized_train_dataset.set_format("numpy")
tokenized_eval_dataset.set_format("numpy")

In [0]:
# do we need to truncate any of our examples? the max context window of flan-t5-large is 1024
max_length = 0
for seq_tokens in tokenized_train_dataset["input_ids"]:
    if len(seq_tokens) > max_length: max_length = len(seq_tokens)
print(max_length) # 40 tokens

# this cells confirms that we do not have any questions which need to be truncated

40


In [0]:
# similarly for our eval dataset
max_length = 0
for seq_tokens in tokenized_eval_dataset["input_ids"]:
    if len(seq_tokens) > max_length: max_length = len(seq_tokens)
print(max_length) # 38 tokens

38


In [0]:
# now we need to investigate how we will handle labels with 3 tokens (or any number different from the majority of labels with 4 tokens)  
# presumably the data collator will pad the labels in each of our batches to 4, but are there instances where we have either more than or less than 4 tokens?
# let's examine the full datasets for the min and max token lenghts of our labels

max_length = 0
for seq_tokens in tokenized_train_dataset["labels"]:
    if len(seq_tokens) > max_length: max_length = len(seq_tokens)
print("max label length: " + str(max_length))

min_length = 40 # number we previously found for max length
for seq_tokens in tokenized_train_dataset["labels"]:
    if len(seq_tokens) < min_length: min_length = len(seq_tokens)
print("min label length: " + str(min_length))

max label length: 4
min label length: 3


In [0]:
# repeat the same process for our eval set
max_length = 0
for seq_tokens in tokenized_eval_dataset["labels"]:
    if len(seq_tokens) > max_length: max_length = len(seq_tokens)
print("max label length: " + str(max_length))

min_length = 40 # number we previously found for max length
for seq_tokens in tokenized_eval_dataset["labels"]:
    if len(seq_tokens) < min_length: min_length = len(seq_tokens)
print("min label length: " + str(min_length))

max label length: 4
min label length: 4


In [0]:
# all 10k records in the eval dataset have a label of 4 tokens, so we are square there, but
# how many instances of 3 are there in the train dataset?

count_of_3 = 0
for seq_tokens in tokenized_train_dataset["labels"]:
    if len(seq_tokens) == 3: count_of_3 += 1
count_of_3

666000

In [0]:
# this is clearly a highly nontrivial number of labels of length 3
# indeed these labels make up huge percentage of the total train dataset

print(str(666000 / len(tokenized_train_dataset)) +"%")

0.333000333000333%


In recognition of the large presence of tensor labels with length 3, it is necessary for us to pad all of our labels to length 4 at the very least. We utilize the powerful `pad_to_multiple_of` parameter in the `DataCollatorForSeq2Seq` class whose documentation references the following: 


*"This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability >= 7.0 (Volta)."* 

**Note**: before this research project, the above documentation was slightly erroneous and previously listed the the Nvidia Volta architecture as given by compute capability *7.5*. Indeed, we researched this typo further,  put out an issue on the Hugging Face transformers GitHub repo, and fix this error ourselves. The corresponding issue and PR (both now closed) can be found here, respectively: [https://github.com/huggingface/transformers/issues/35174](https://github.com/huggingface/transformers/issues/35174) and [https://github.com/huggingface/transformers/pull/35188](https://github.com/huggingface/transformers/pull/35188).

In passing the `pad_to_multiple_of=2` arg to our data collator, we guarantee that any labels of length 3 are padded to length 4 and can be easily compared with our predictions throughout training and evaluation. With this rectangular padding, we are able then to leverage the tensor cores of either a T4 GPU or a V100 GPU to expedite our training. The Nvidia documentation for the T4 and V100 GPUs states that, respectively, we can achieve up to 65 fp16 TFLOPS and up to 125 TFLOPS during computation.

Furthermore, we cite here a nice article detailing how to improve performance on Nvidia tensor cores: [https://developer.nvidia.com/blog/optimizing-gpu-performance-tensor-cores/](https://developer.nvidia.com/blog/optimizing-gpu-performance-tensor-cores/). This article and the more general NVIDIA Deep Learning Performance Guide detail how configuring the batch size and the number of inputs and outputs as a multiple of 8 will accelerate computation.

Lastly, we require mixed precision to leverage the acceleration of tensor cores. We have already double checked the default types of our tensors with the `.dtype` attribute. The weights in our model should already be `torch.bfloat16` from loading our flan-t5-large model with the `torch_dtype=torch.bfloat16` parameter passed. One can ensure the default tensor type by using the `torch.set_default_device()` command.

With all of this information, we are now ready to properly and wisely configure our data loaders for accelerated benchmarking, training, and evaluation. Rather than doing this here, we will dedicate separate notebooks to each of these causes. We conclude this preprocessing notebook by publishing our cleaned data to the hub at `MarioBarbeque/DeepMind-LinAlg-1D-train` and `MarioBarbeque/DeepMind-LinAlg-1D-eval`. In each of the subsequent notebooks this dataset will be loaded and configured in dataloaders for training.

In [0]:
# Now lets push this model to our Hub

dbutils.widgets.text("hf_token", "", "hf_token")

In [0]:
hf_token = dbutils.widgets.get("hf_token")
!huggingface-cli login --token $hf_token

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
The token `Personal Hub Token` has been saved to /Volumes/workspace_dogfood/jgr/hugging_face_cache/stored_tokens
Your token has been saved to /Volumes/workspace_dogfood/jgr/hugging_face_cache/token
Login successful.
The current active token is: `Personal Hub Token`


In [0]:
tokenized_train_dataset.push_to_hub("DeepMind-LinAlg-1D-train", commit_message="Cleaned, tokenized, and DataLoader-ready 1D linear algebra TRAINING dataset from DeepMind; for use with FLAN-T5")

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/2000 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/MarioBarbeque/DeepMind-LinAlg-1D-train/commit/8620397bf3c0a116747c7c51772a0fcc06bcf45c', commit_message='Cleaned, tokenized, and DataLoader-ready 1D linear algebra TRAINING dataset from DeepMind; for use with FLAN-T5', commit_description='', oid='8620397bf3c0a116747c7c51772a0fcc06bcf45c', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/MarioBarbeque/DeepMind-LinAlg-1D-train', endpoint='https://huggingface.co', repo_type='dataset', repo_id='MarioBarbeque/DeepMind-LinAlg-1D-train'), pr_revision=None, pr_num=None)

In [0]:
tokenized_eval_dataset.push_to_hub("DeepMind-LinAlg-1D-eval", commit_message="Cleaned, tokenized, and DataLoader-ready 1D linear algebra EVAL dataset from DeepMind; for use with FLAN-T5")

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/10 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/MarioBarbeque/DeepMind-LinAlg-1D-eval/commit/a00a253a845116f4469082dc063e1c15ba193bee', commit_message='Cleaned, tokenized, and DataLoader-ready 1D linear algebra EVAL dataset from DeepMind; for use with FLAN-T5', commit_description='', oid='a00a253a845116f4469082dc063e1c15ba193bee', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/MarioBarbeque/DeepMind-LinAlg-1D-eval', endpoint='https://huggingface.co', repo_type='dataset', repo_id='MarioBarbeque/DeepMind-LinAlg-1D-eval'), pr_revision=None, pr_num=None)

See the Benchmarking notebook next.