- Load a dataset

- Make it as a iterator

- Load a tokeniser

- Write a processing function 

- Map it to the dataset 

- Create a new tokeniser

- Train it with the dataset

- Write the post processing function

- Run the evaluation 

In [1]:
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    Trainer,
    TrainingArguments,
)
from datasets import load_dataset
from evaluate import load

In [4]:
billsum = load_dataset("billsum", 'default')

Downloading data:   0%|          | 0.00/91.8M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/15.8M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/6.12M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/18949 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3269 [00:00<?, ? examples/s]

Generating ca_test split:   0%|          | 0/1237 [00:00<?, ? examples/s]

In [5]:
billsum

DatasetDict({
    train: Dataset({
        features: ['text', 'summary', 'title'],
        num_rows: 18949
    })
    test: Dataset({
        features: ['text', 'summary', 'title'],
        num_rows: 3269
    })
    ca_test: Dataset({
        features: ['text', 'summary', 'title'],
        num_rows: 1237
    })
})

In [6]:
billsum['ca_test'][0]

{'text': 'The people of the State of California do enact as follows:\n\n\nSECTION 1.\nThe Legislature finds and declares all of the following:\n(a) (1) Since 1899 congressionally chartered veterans’ organizations have provided a valuable service to our nation’s returning service members. These organizations help preserve the memories and incidents of the great hostilities fought by our nation, and preserve and strengthen comradeship among members.\n(2) These veterans’ organizations also own and manage various properties including lodges, posts, and fraternal halls. These properties act as a safe haven where veterans of all ages and their families can gather together to find camaraderie and fellowship, share stories, and seek support from people who understand their unique experiences. This aids in the healing process for these returning veterans, and ensures their health and happiness.\n(b) As a result of congressional chartering of these veterans’ organizations, the United States Inte

In [11]:
books['train'][0]

{'id': '2775',
 'translation': {'ca': "Estava molt malalt: res no l'interessava.",
  'de': 'Er war sehr krank, gleichgültig gegen alles.'}}

In [6]:
model_path = "t5-small"

In [7]:
tokenizer = AutoTokenizer.from_pretrained(model_path)

In [9]:
source_lang = "ca"
tar_lang = "de"
prefix = "translate english to french"

In [12]:
def preprocess(rows):
    inputs = [prefix + example[source_lang] for example in rows['translation']]
    targets = [example[tar_lang] for example in rows['translation']]
    model_inputs = tokenizer(inputs, text_target=targets, max_length=128,
                            truncation=True)
    return model_inputs

In [13]:
tokenized_books = books.map(preprocess, batched=True)

Map:   0%|          | 0/2844 [00:00<?, ? examples/s]

Map:   0%|          | 0/712 [00:00<?, ? examples/s]

In [14]:
tokenized_books

DatasetDict({
    train: Dataset({
        features: ['id', 'translation', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 2844
    })
    test: Dataset({
        features: ['id', 'translation', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 712
    })
})

In [15]:
from transformers import DataCollatorForSeq2Seq
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer,
                                      model=model_path)

In [16]:
metrik = load('sacrebleu')

In [13]:
def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [[label.strip()] for label in labels]

    return preds, labels

In [14]:
def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result

In [15]:
model = AutoModelForSeq2SeqLM.from_pretrained(model_path)

In [16]:
from transformers import (
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer
)

In [17]:
targs = Seq2SeqTrainingArguments(
    output_dir="/home/kamal/training_files/prac/",
    evaluation_strategy="steps",
    eval_steps=500,
    per_device_eval_batch_size=4,
    per_device_train_batch_size=4,
    weight_decay=0.01,
    learning_rate=2e-5,
    num_train_epochs=2,
    push_to_hub=False,
    report_to="none",
    fp16=True,
)

In [18]:
trainer = Seq2SeqTrainer(
    model = model,
    args=targs,
    train_dataset=tokenized_books['train'],
    eval_dataset=tokenized_books['test'],
    tokenizer = tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

In [19]:
trainer.train()

Step,Training Loss,Validation Loss


OutOfMemoryError: CUDA out of memory. Tried to allocate 4.60 GiB (GPU 0; 11.73 GiB total capacity; 5.36 GiB already allocated; 4.57 GiB free; 6.27 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF