# Train a new tokenizer for ARC

In [1]:
model_to_fine_tune = "google/long-t5-tglobal-base"
training_data_file = 'train_74444_with_letters_none_noise.csv'
from transformers import AutoTokenizer
print(f"loading tokenizer for {model_to_fine_tune}")
tokenizer = AutoTokenizer.from_pretrained(model_to_fine_tune)

loading tokenizer for google/long-t5-tglobal-base


In [2]:
text = "This is a test"
print(f"original text: {text}")
print(f"tokenized text: {tokenizer.tokenize(text)}")

# arc text
arc_text = "train input1 077 777 077 output1 89 9 9 07 000077077 000777777 000077077 077077077 777777777 077077077 000077077 000777777 000077077. input2 404 000 040 output2 89 9 9 40 404000404 000000000 040000040 000000000 000000000 000000000 000404000 000000000 000040000. input3 000 002 202 output3 89 9 9 02 000000000 000000000 000000000 000000000 000000002 000000202 000000000 002000002 202000202. input4 660 600 066 output4 89 9 9 60 660660000 600600000 066066000 660000000 600000000 066000000 000660660 000600600 000066066. input5 222 000 022 output5 89 9 9 20 222222222 000000000 022022022 000000000 000000000 000000000 000222222 000000000 000022022. test tinput1 707 707 770 toutput1 "
print(f"original arc text: {arc_text}")
print(f"tokenized arc text: {tokenizer.tokenize(arc_text)}")

original text: This is a test
tokenized text: ['▁This', '▁is', '▁', 'a', '▁test']
original arc text: train input1 077 777 077 output1 89 9 9 07 000077077 000777777 000077077 077077077 777777777 077077077 000077077 000777777 000077077. input2 404 000 040 output2 89 9 9 40 404000404 000000000 040000040 000000000 000000000 000000000 000404000 000000000 000040000. input3 000 002 202 output3 89 9 9 02 000000000 000000000 000000000 000000000 000000002 000000202 000000000 002000002 202000202. input4 660 600 066 output4 89 9 9 60 660660000 600600000 066066000 660000000 600000000 066000000 000660660 000600600 000066066. input5 222 000 022 output5 89 9 9 20 222222222 000000000 022022022 000000000 000000000 000000000 000222222 000000000 000022022. test tinput1 707 707 770 toutput1 
tokenized arc text: ['▁train', '▁input', '1', '▁07', '7', '▁', '777', '▁07', '7', '▁output', '1', '▁', '89', '▁9', '▁9', '▁07', '▁', '0000', '770', '77', '▁000', '777', '777', '▁', '0000', '770', '77', '▁07', '70', '77

In [3]:
import pandas as pd
def get_training_data(file):
    # read the csv file
    df = pd.read_csv(file)
    for start_idx in range(0, len(df), 1000):
        # combine prompt and correct_answer columns
        df["prompt"] = df["prompt"] + df["correct_answer"]
        samples = df[start_idx : start_idx + 1000]
        yield samples["prompt"].tolist()

In [4]:
training_corpus = get_training_data(training_data_file)
new_tokenizer = tokenizer.train_new_from_iterator(training_corpus, tokenizer.vocab_size)
new_tokenizer.save_pretrained("arc_tokenizer")

('arc_tokenizer\\tokenizer_config.json',
 'arc_tokenizer\\special_tokens_map.json',
 'arc_tokenizer\\tokenizer.json')

In [6]:
arc_text = "train input1 077 777 077 output1 89 9 9 07 000077077 000777777 000077077 077077077 777777777 077077077 000077077 000777777 000077077. input2 404 000 040 output2 89 9 9 40 404000404 000000000 040000040 000000000 000000000 000000000 000404000 000000000 000040000. input3 000 002 202 output3 89 9 9 02 000000000 000000000 000000000 000000000 000000002 000000202 000000000 002000002 202000202. input4 660 600 066 output4 89 9 9 60 660660000 600600000 066066000 660000000 600000000 066000000 000660660 000600600 000066066. input5 222 000 022 output5 89 9 9 20 222222222 000000000 022022022 000000000 000000000 000000000 000222222 000000000 000022022. test tinput1 707 707 770 toutput1 "
print(f"original arc text: {arc_text}")
tokenized_text= new_tokenizer.tokenize(arc_text)
print(f"length of tokenized arc text: {len(tokenized_text)}")
print(f"tokenized arc text: {tokenized_text}")
# old tokenizer
print(f"original arc text: {arc_text}")
print(f"tokenized arc text: {len(tokenizer.tokenize(arc_text))}")

original arc text: train input1 077 777 077 output1 89 9 9 07 000077077 000777777 000077077 077077077 777777777 077077077 000077077 000777777 000077077. input2 404 000 040 output2 89 9 9 40 404000404 000000000 040000040 000000000 000000000 000000000 000404000 000000000 000040000. input3 000 002 202 output3 89 9 9 02 000000000 000000000 000000000 000000000 000000002 000000202 000000000 002000002 202000202. input4 660 600 066 output4 89 9 9 60 660660000 600600000 066066000 660000000 600000000 066000000 000660660 000600600 000066066. input5 222 000 022 output5 89 9 9 20 222222222 000000000 022022022 000000000 000000000 000000000 000222222 000000000 000022022. test tinput1 707 707 770 toutput1 
length of tokenized arc text: 209
tokenized arc text: ['▁', 't', 'r', 'a', 'i', 'n', '▁', 'i', 'n', 'p', 'u', 't', '1', '▁077', '▁777', '▁077', '▁', 'o', 'u', 't', 'p', 'u', 't', '1', '▁89', '▁9', '▁9', '▁07', '▁000077077', '▁000777777', '▁000077077', '▁', '077077077', '▁777777777', '▁', '077077077'