Figure out how to create evaluation data for 5 operations: multiply 1-digit, carry, mod 10, sum, concat; Share their plan on Slack with us before they run it.
Design* prompts for LLaMA2 and/or FLAN-T5 to answer instances of 5 operations, and share them with us. * Following prior work: they will check what others have done by querying https://aclanthology.org/.
Use prompts to get answers.

Report accuracies for each of the 5 operations.

```
> Overall goal:
  Input:
  "x * y =" st x,y in Z+
  Output:
  "z" st z is the product of x and y

> Suboperations:

  Multiply 1-digit:
    Input:
    "x * y =" st x,y in [0-9]
    Output:
    "z" st z is the product of x and y

  Carry:
    Input:
    "x // 10 =" st x in [0-99]
    Output:
    "z" st z is the digit that is in the tens place of x

  Sum:
    Input:
    "x + y =" st x,y in [0-99]
    Output:
    "z" st that z is the sum of x and y
    
  Concatenate:
    Input:
    "a, b, c =" st a, b, c in Z+
    Output:
    "abc"
```

In [1]:
!pip install accelerate
!pip install transformers
!pip install sentencepiece
!pip install datasets
!pip install accelerate
!pip install torch



In [2]:
# import os
# os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:256"

In [3]:
with open("multiply.csv", "w") as file:
  for x in range(0, 1000):
    for y in range(0, 1000):
      file.write(f'"Multiply two numbers.\n###\n5 * 3 = 15\n###\n{x} * {y} =",{x * y}\n')

In [4]:
with open("multiply_1_digit.csv", "w") as file:
  for x in range(0, 10):
    for y in range(0, 10):
      file.write(f'"Multiply two numbers.\n###\n5 * 3 = 15\n###\n{x} * {y} =",{x * y}\n')

In [5]:
with open("carry.csv", "w") as file:
  for x in range(0, 100):
    file.write(f'"Carry the digit from the tens place.\n###\n15 // 10 = 1\n###\n{x} // 10 =",{x // 10}\n')

In [6]:
with open('summation.csv', 'w') as file:
  for x in range(0, 1000):
    for y in range(0, 1000):
      file.write(f'"Add two numbers.\n###\n5 + 3 = 8\n###\n{x} + {y} =",{x + y}\n')

In [7]:
with open('concatenation.csv', 'w') as file:
  for x in range(0, 100):
    for y in range(0, 100):
      for z in range(0, 100):
        file.write(f'"Concatenate the numbers.\n###\n1, 5, 3 = 153\n###\n{x}, {y}, {z} =",{x}{y}{z}\n')

In [50]:
from datasets import load_dataset

multiply_dataset = load_dataset("csv", data_files="multiply.csv", delimiter=',', column_names=['question', 'answer'])
multiply_1_digit_dataset = load_dataset("csv", data_files="multiply_1_digit.csv", delimiter=',', column_names=['question', 'answer'])
carry_dataset = load_dataset("csv", data_files="carry.csv", delimiter=',', column_names=['question', 'answer'])
summation_dataset = load_dataset("csv", data_files="summation.csv", delimiter=',', column_names=['question', 'answer'])
concatenation_dataset = load_dataset("csv", data_files="concatenation.csv", delimiter=',', column_names=['question', 'answer'])

In [51]:
multiply_dataset["train"][0]["question"]

'Multiply two numbers.\r\n###\r\n5 * 3 = 15\r\n###\r\n0 * 0 ='

In [78]:
import torch
import warnings
from tqdm import tqdm

# def validate_model(model, tokenizer, dataset):
#     correct = 0
#     for index in tqdm(range(len(dataset["question"]))):
#         input_text = dataset["question"][index]
#         input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to("cuda")
#         outputs = model.generate(input_ids, max_length=64)
#         predicted_answer = [int(d) for d in re.findall(r'-?\d+', tokenizer.decode(outputs[0]))]
#         if len(predicted_answer) > 1:
#             warnings.warn("WARNING: Prediction contained multiple integers; resorting to the first integer found.")
#         if int(predicted_answer[0]) == int(dataset["answer"][index]):
#             correct += 1
#     return correct / len(dataset)

def generate_encodings(tokenizer, dataset):
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    encodings = tokenizer(dataset["question"], padding=True, truncation=True, return_tensors="pt").to(device)
    return encodings

def generate_texts(model, encodings):
    with torch.no_grad():
        generated_ids = model.generate(**encodings)
    generated_texts = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
    return generated_texts

def get_accuracy(expected, actual):
    correct = 0
    for index in range(len(expected)):
        prediction = [int(d) for d in re.findall(r'-?\d+', tokenizer.decode(expected[index][0]))]
        if len(prediction) > 1:
            warnings.warn("WARNING: Prediction contained multiple integers; resorting to the first integer found.")
        if int(prediction[0]) == int(actual["answer"][index]):
            correct += 1
    return correct  / len(actual["question"])
        
def validate_model(model, tokenizer, dataset):
    expected_texts = generate_texts(model, tokenizer, dataset)
    return get_accuracy(expected_texts, dataset)

# from transformers import pipeline
# from transformers.pipelines.pt_utils import KeyDataset
# import datasets

# dataset = datasets.load_dataset("imdb", name="plain_text", split="unsupervised")
# pipe = pipeline("text-classification", device=0)
# for out in pipe(KeyDataset(dataset, "text"), batch_size=8, truncation="only_first"):
#     print(out)
#     # [{'label': 'POSITIVE', 'score': 0.9998743534088135}]
#     # Exactly the same output as before, but the content are passed
#     # as batches to the model

In [79]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-base")
model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-base", device_map="auto")

# validate_model(model, tokenizer, multiply_dataset["train"])

encodings = generate_encodings(tokenizer, multiply_dataset["train"])

1000000

In [46]:
tokenized_multiply_datasets

DatasetDict({
    train: Dataset({
        features: ['question', 'answer', 'input_ids', 'attention_mask'],
        num_rows: 1000000
    })
})

In [39]:
import re

s = tokenizer.decode(outputs[0])
result = [int(d) for d in re.findall(r'-?\d+', s)]

print(result)

[-1]


In [25]:
# from transformers import DataCollatorWithPadding
# from transformers import T5Tokenizer

# tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-small")

# def tokenize_function(examples):
#     return tokenizer(examples["question"], text_target=list(map(str, examples["answer"])), padding=True, truncation=True, return_tensors="pt")

# tokenized_multiply_datasets = multiply_datasets.map(tokenize_function, batched=True)
# data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
# tokenized_multiply_datasets = tokenized_multiply_datasets.remove_columns(["question", "answer"])
# tokenized_multiply_datasets

Downloading spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

Map:   0%|          | 0/900000 [00:00<?, ? examples/s]

Map:   0%|          | 0/100000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 900000
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 100000
    })
})

In [59]:
# from torch.utils.data import DataLoader

# train_dataloader = DataLoader(
#     tokenized_multiply_datasets["train"], shuffle=True, batch_size=16, collate_fn=data_collator
# )
# eval_dataloader = DataLoader(
#     tokenized_multiply_datasets["test"], batch_size=16, collate_fn=data_collator
# )

In [60]:
# for batch in train_dataloader:
#     break
# {k: v.shape for k, v in batch.items()}

{'input_ids': torch.Size([16, 22]),
 'attention_mask': torch.Size([16, 22]),
 'labels': torch.Size([16, 5])}

In [61]:
# from transformers import T5ForConditionalGeneration, get_scheduler
# from torch.optim import AdamW
# import torch
# from tqdm.auto import tqdm
# import evaluate
# torch.cuda.empty_cache()

# model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-small")
# optimizer = AdamW(model.parameters(), lr=3e-5)
# metric = evaluate.load("glue", "mrpc")

# device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
# model.to(device)

# num_epochs = 3
# num_training_steps = num_epochs * len(train_dataloader)
# lr_scheduler = get_scheduler(
#     "linear",
#     optimizer=optimizer,
#     num_warmup_steps=0,
#     num_training_steps=num_training_steps,
# )

# progress_bar = tqdm(range(num_training_steps))

# for epoch in range(num_epochs):
#     model.train()
#     for batch in train_dataloader:
#         batch = {k: v.to(device) for k, v in batch.items()}
#         outputs = model(**batch)
#         loss = outputs.loss
#         loss.backward()

#         optimizer.step()
#         lr_scheduler.step()
#         optimizer.zero_grad()
#         progress_bar.update(1)
        
#     model.eval()
#     for batch in eval_dataloader:
#         batch = {k: v.to(device) for k, v in batch.items()}
#         with torch.no_grad():
#             outputs = model(**batch)

#         logits = outputs.logits
#         predictions = torch.argmax(logits, dim=-1)
#         metric.add_batch(predictions=predictions, references=batch["labels"])

#     print(metric.compute())

  0%|          | 0/168750 [00:00<?, ?it/s]

ValueError: Predictions and/or references don't match the expected format.
Expected format: {'predictions': Value(dtype='int64', id=None), 'references': Value(dtype='int64', id=None)},
Input predictions: tensor([[  305,  2266,  4122,     1,     0],
        [  431,  3891, 19337,     1,     0],
        [  997, 24622, 17485,     1,     0],
        [    3,  2079,  2079,     1,     0],
        [    3,  1298,     1,     0,     0],
        [  505,   927,     1,     0,     0],
        [  505,  2079,   755,     1,     0],
        [ 2307,  3651,   591,     1,     0],
        [    3, 24837,  5548,     1,     0],
        [    3,  4440,  3166,  4056,     1],
        [ 2838,  3651,  3651,     1,     0],
        [  220,  1808, 17246,     1,     0],
        [    3, 27730,  3707,     1,     0],
        [ 4678,  2128,  4240,     1,     0],
        [  305, 22335,  3707,     1,     0],
        [  898, 27156, 26320,     1,     0]], device='cuda:0'),
Input references: tensor([[  305, 25946,  4122,     1,     0],
        [  431,  3707, 24274,     1,     0],
        [    3, 20879, 26920,     1,     0],
        [  850,  4018,  2079,     1,     0],
        [22471,  1298,     1,     0,     0],
        [ 2777,   927,     1,     0,     0],
        [  505, 10402,   755,     1,     0],
        [ 2307,  5548,   927,     1,     0],
        [    3, 27184,  9295,     1,     0],
        [    3,  4013,  3436,  4165,     1],
        [ 2059,  4225,  2577,     1,     0],
        [  220,  2658, 17246,     1,     0],
        [    3, 11776,  2668,     1,     0],
        [ 4678,  4165,  4165,     1,     0],
        [    3, 25926,  2122,     1,     0],
        [    3, 27650, 19337,     1,     0]], device='cuda:0')

In [None]:
# from transformers import T5Tokenizer, T5ForConditionalGeneration

# input_ids = tokenizer("translate English to German: The house is wonderful.", return_tensors="pt").input_ids
# labels = tokenizer("Das Haus ist wunderbar.", return_tensors="pt").input_ids

# # the forward function automatically creates the correct decoder_input_ids
# outputs = model(**batch)
# print(outputs.loss, outputs.logits.shape)

# loss = model(input_ids=input_ids, labels=labels).loss
# loss.item()

In [None]:
# from transformers import T5Tokenizer, T5Model

# tokenizer = T5Tokenizer.from_pretrained("t5-base")
# model = T5Model.from_pretrained("t5-base")

# # input_ids = tokenizer(
# #     padding="max_length",
# #     truncation=True,
# #     model_max_length=int(1e30), return_tensors="pt"
# # ).input_ids  # Batch size 1
# decoder_input_ids = tokenizer("", return_tensors="pt").input_ids  # Batch size 1

# # forward pass
# outputs = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids)
# last_hidden_states = outputs.last_hidden_state

ValueError: text input must of type `str` (single example), `List[str]` (batch or single pretokenized example) or `List[List[str]]` (batch of pretokenized examples).