In [1]:
%load_ext autoreload
%load_ext dotenv
%dotenv
!huggingface-cli login --token $HUGGING_FACE_TOKEN

Token has not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /home/izlobin/.cache/huggingface/token
Login successful


In [2]:
# %%capture
# %pip install transformers evaluate
# %pip install nltk absl-py rouge_score
# %pip install bleu sacrebleu
# %pip install sacremoses
# %pip install scipy
# %pip install sentencepiece
# %pip install optimum auto-gptq
# %pip install scikit-learn
# %pip install einops
# %pip install bitsandbytes
# %pip install accelerate
# %pip install pynvml
# %pip install tensorboard

In [2]:
%load_ext autoreload
%autoreload 2

import os
import sys
import time
from pprint import pprint

import evaluate
import numpy as np
import pandas as pd
import torch
from datasets import load_dataset
from huggingface_hub import HfApi
from transformers import (
    AutoModelForCausalLM,
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    BartForCausalLM,
    BartModel,
    BartTokenizer,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    T5ForConditionalGeneration,
    T5Tokenizer,
    T5TokenizerFast,
)

current_dir = os.getcwd()
parent_dir = os.path.dirname(current_dir)
sys.path.append(parent_dir)

from utils.dataset import get_iterater_samples_simplified, get_iterater_samples_with_instruction
from utils.metric import calculate_scores
from utils.monitoring import calculate_utilization, format_utilization_narrow, print_utilization

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
pprint(f"Device: {device}")

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
'Device: cuda'


In [3]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

model_name = "gpt2-large"
model_repo = f"openai-community"
model_id = f"{model_repo}/{model_name}"
model_checkpoint = f"{model_repo}/{model_name}"
model_path = f"{model_repo}_{model_name}"
print(f"model_name: {model_name}," f"model_id: {model_id}," f"model_path: {model_path}")

tokenizer = AutoTokenizer.from_pretrained(
    model_id,
    use_fast=True,
)
tokenizer.add_eos_token = True
tokenizer.pad_token_id = tokenizer.eos_token_id
tokenizer.padding_side = "left"
print(type(tokenizer))
# print(tokenizer.add_eos_token)
print(tokenizer.eos_token_id)
print(tokenizer.pad_token_id)
print(tokenizer.padding_side)



# bnb_config = BitsAndBytesConfig(
#     load_in_4bit=True, bnb_4bit_use_double_quant=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch.bfloat16
# )

# model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", max_memory={0: "80GiB"})
model = AutoModelForCausalLM.from_pretrained(model_id, device_map=0)
print(type(model))

model.generation_config.max_new_tokens = 350
# model.generation_config.new_tokens = 350
model.generation_config.pad_token_id = model.generation_config.eos_token_id
model.generation_config.padding_side = "left"

print(model.generation_config)
print(model)

model_name: gpt2-large,model_id: openai-community/gpt2-large,model_path: openai-community_gpt2-large
<class 'transformers.models.gpt2.tokenization_gpt2_fast.GPT2TokenizerFast'>
50256
50256
left
<class 'transformers.models.gpt2.modeling_gpt2.GPT2LMHeadModel'>
GenerationConfig {
  "bos_token_id": 50256,
  "eos_token_id": 50256,
  "max_new_tokens": 350,
  "pad_token_id": 50256,
  "padding_side": "left"
}

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 1280)
    (wpe): Embedding(1024, 1280)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-35): 36 x GPT2Block(
        (ln_1): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (

## Loading dataset

### Grammarly dataset

In [4]:
from datasets import DatasetDict, load_dataset

# full_dataset = load_dataset("grammarly/coedit")
# print(full_dataset)

# train_dataset = load_dataset("grammarly/coedit", split="train[:50000]")
# test_dataset = load_dataset("grammarly/coedit", split="train[10000:]")
# # test_dataset = load_dataset("grammarly/coedit", split="validation")

all_dataset = load_dataset("grammarly/coedit", split="train+validation")
# print(all_dataset)

# print()
# print(f"train set {set(all_dataset['task'])}")
# print(f"total len: {len(all_dataset)}")
# print(f"gec len: {len(all_dataset.filter(lambda x: x['task'] == 'gec'))}")
# print(f"simplification len: {len(all_dataset.filter(lambda x: x['task'] == 'simplification'))}")
# print(f"clarity len: {len(all_dataset.filter(lambda x: x['task'] == 'clarity'))}")
# print(f"coherence len: {len(all_dataset.filter(lambda x: x['task'] == 'coherence'))}")
# print(f"paraphrase len: {len(all_dataset.filter(lambda x: x['task'] == 'paraphrase'))}")
# print(f"neutralize len: {len(all_dataset.filter(lambda x: x['task'] == 'neutralize'))}")
# print()

# train_ratio = 0.01
# test_ratio = 0.001
# train_ratio = 0.1
# test_ratio = 0.01
train_ratio = 0.9
test_ratio = 0.1

gec_dataset = all_dataset.filter(lambda x: x["task"] == "gec")
train_gec_dataset = gec_dataset.select(range(0, int(train_ratio * len(gec_dataset))))
test_gec_dataset = gec_dataset.select(range(int((1 - test_ratio) * len(gec_dataset)), len(gec_dataset)))

simplification_dataset = all_dataset.filter(lambda x: x["task"] == "simplification")
train_simplification_dataset = simplification_dataset.select(range(0, int(train_ratio * len(simplification_dataset))))
test_simplification_dataset = simplification_dataset.select(
    range(int((1 - test_ratio) * len(simplification_dataset)), len(simplification_dataset))
)

clarity_dataset = all_dataset.filter(lambda x: x["task"] == "clarity")
train_clarity_dataset = clarity_dataset.select(range(0, int(train_ratio * len(clarity_dataset))))
test_clarity_dataset = clarity_dataset.select(range(int((1 - test_ratio) * len(clarity_dataset)), len(clarity_dataset)))

coherence_dataset = all_dataset.filter(lambda x: x["task"] == "coherence")
train_coherence_dataset = coherence_dataset.select(range(0, int(train_ratio * len(coherence_dataset))))
test_coherence_dataset = coherence_dataset.select(
    range(int((1 - test_ratio) * len(coherence_dataset)), len(coherence_dataset))
)

paraphrase_dataset = all_dataset.filter(lambda x: x["task"] == "paraphrase")
train_paraphrase_dataset = paraphrase_dataset.select(range(0, int(train_ratio * len(paraphrase_dataset))))
test_paraphrase_dataset = paraphrase_dataset.select(
    range(int((1 - test_ratio) * len(paraphrase_dataset)), len(paraphrase_dataset))
)

neutralize_dataset = all_dataset.filter(lambda x: x["task"] == "neutralize")
neutralize_dataset_split = int(train_ratio * len(neutralize_dataset))
train_neutralize_dataset = neutralize_dataset.select(range(0, int(train_ratio * len(neutralize_dataset))))
test_neutralize_dataset = neutralize_dataset.select(
    range(int((1 - test_ratio) * len(neutralize_dataset)), len(neutralize_dataset))
)

from datasets import concatenate_datasets

train_dataset = concatenate_datasets(
    [
        train_gec_dataset,
        train_simplification_dataset,
        train_clarity_dataset,
        train_coherence_dataset,
        train_paraphrase_dataset,
        train_neutralize_dataset,
    ]
)
train_dataset = train_dataset.map(
    lambda item: {
        "input": item["src"],
        "reference": item["tgt"],
        "references": [item["tgt"]],
    },
    remove_columns=["src", "tgt", "_id"],
)
print(f"train set {set(train_dataset['task'])}")
print(train_dataset)

test_dataset = concatenate_datasets(
    [
        test_gec_dataset,
        test_simplification_dataset,
        test_clarity_dataset,
        test_coherence_dataset,
        test_paraphrase_dataset,
        test_neutralize_dataset,
    ]
)
test_dataset = test_dataset.map(
    lambda item: {
        "input": item["src"],
        "reference": item["tgt"],
        "references": [item["tgt"]],
    },
    remove_columns=["src", "tgt", "_id"],
)
print(f"test set {set(test_dataset['task'])}")
print(test_dataset)


def add_prompt(item):
    return {
        "request": f"{item['input']}\nResponse:",
        "prompt": f"{item['input']}\nResponse:{item['reference']}",
    }


dataset = DatasetDict({"train": train_dataset, "test": test_dataset})
# dataset = dataset.rename_column("task", "label")
dataset = dataset.map(add_prompt)
print(dataset)
print(dataset["train"][0])

train set {'neutralize', 'paraphrase', 'simplification', 'gec', 'clarity', 'coherence'}
Dataset({
    features: ['task', 'input', 'reference', 'references'],
    num_rows: 63703
})
test set {'neutralize', 'paraphrase', 'simplification', 'gec', 'clarity', 'coherence'}
Dataset({
    features: ['task', 'input', 'reference', 'references'],
    num_rows: 7080
})


Map:   0%|          | 0/63703 [00:00<?, ? examples/s]

Map:   0%|          | 0/7080 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['task', 'input', 'reference', 'references', 'request', 'prompt'],
        num_rows: 63703
    })
    test: Dataset({
        features: ['task', 'input', 'reference', 'references', 'request', 'prompt'],
        num_rows: 7080
    })
})
{'task': 'gec', 'input': 'Remove all grammatical errors from this text: For example, countries with a lot of deserts can terraform their desert to increase their habitable land and using irrigation to provide clean water to the desert.', 'reference': 'For example, countries with a lot of deserts can transform their desert to increase their habitable land and use irrigation to provide clean water to the desert.', 'references': ['For example, countries with a lot of deserts can transform their desert to increase their habitable land and use irrigation to provide clean water to the desert.'], 'request': 'Remove all grammatical errors from this text: For example, countries with a lot of deserts can terrafo

In [5]:
# find the longest sequence in the dataset
max_input_length = max(len(tokenizer.encode(item["input"])) for item in dataset["train"])
print(f"max_input_length train: {max_input_length}")
max_input_length = max(len(tokenizer.encode(item["input"])) for item in dataset["test"])
print(f"max_input_length test: {max_input_length}")

max_input_length train: 171
max_input_length test: 239


In [6]:
from datasets import Dataset

train_lists_map = {}

for task in set(train_dataset['task']):
    train_lists_map[task] = []

for item in dataset["train"]:
    train_lists_map[item["task"]].append(item)

train_dataset_map = {}
for task, list in train_lists_map.items():
    train_dataset_map[task] = Dataset.from_list(list)
# print(train_dataset_map)

train_dataset_dict = DatasetDict(train_dataset_map)
# print(train_dataset_dict)

# for task, ds in train_dataset_dict.items():
#     print(f"{task}: {ds}")

for task in set(train_dataset['task']):
    print(f"train/{task}: {len(train_lists_map[task])}")

train/neutralize: 10143
train/paraphrase: 14307
train/simplification: 10296
train/gec: 18277
train/clarity: 1126
train/coherence: 9554


In [7]:
from datasets import Dataset

test_lists_map = {}

for task in set(train_dataset['task']):
    test_lists_map[task] = []

for item in dataset["test"]:
    test_lists_map[item["task"]].append(item)

test_dataset_map = {}
for task, list in test_lists_map.items():
    test_dataset_map[task] = Dataset.from_list(list)
# print(test_dataset_map)

test_dataset_dict = DatasetDict(test_dataset_map)
# print(test_dataset_dict)

# for task, ds in test_dataset_dict.items():
#     print(f"{task}: {ds}")

for task in set(train_dataset['task']):
    print(f"test/{task}: {len(test_lists_map[task])}")

test/neutralize: 1127
test/paraphrase: 1590
test/simplification: 1144
test/gec: 2031
test/clarity: 126
test/coherence: 1062


In [8]:
import numpy as np
from datasets import Dataset
from torch.utils.data import DataLoader
from transformers import DataCollatorForLanguageModeling, DataCollatorForSeq2Seq

tokenizer.padding_side = "left"
print(type(tokenizer))
# print(tokenizer.add_eos_token)
print(tokenizer.pad_token_id)
print(tokenizer.padding_side)

max_length = 350


def process_dataset(batch):
    model_inputs = tokenizer(batch["prompt"], max_length=max_length)
    model_reponses = tokenizer(batch["reference"], max_length=max_length)

    new_input_ids = []
    new_labels = []
    for input_ids, response_ids in zip(model_inputs.input_ids, model_reponses.input_ids):
        # debug_labels = input_ids[-len(response_ids) :]
        # print(tokenizer.decode(input_ids, skip_special_tokens=False))
        # print(tokenizer.decode(debug_labels, skip_special_tokens=False))

        num_tokens_ignore = len(input_ids) - len(response_ids)
        labels = [-100] * num_tokens_ignore + input_ids[-len(response_ids) :]
        # labels.append(-100)
        labels.append(tokenizer.eos_token_id)
        new_labels.append(labels)

        input_ids.append(tokenizer.eos_token_id)
        new_input_ids.append(input_ids)

    new_attention_mask = []
    for attention_mask in model_inputs.attention_mask:
        # attention_mask.append(0)
        attention_mask.append(1)
        new_attention_mask.append(attention_mask)

    # labels = tokenizer(text_target=examples["tgt"], max_length=1024, padding=True).input_ids
    # model_inputs["labels"] = labels

    model_inputs["input_ids"] = new_input_ids
    model_inputs["attention_mask"] = new_attention_mask
    model_inputs["labels"] = new_labels

    # print(
    #     f">> input_ids: {len(model_inputs['input_ids'])},"
    #     "attention_mask: {len(model_inputs['attention_mask'])},"
    #     "labels: {len(model_inputs['labels'])}"
    # )
    # count = 0
    # for input_ids, labels, attention_masks in zip(new_input_ids, new_labels, new_attention_mask):
    #     count += 1
    #     if count > 3:
    #         break
    #     print(f">> input_ids: {len(input_ids)}, attention_mask: {len(attention_masks)}, labels: {len(labels)}")
    #     print(tokenizer.decode(input_ids, skip_special_tokens=False))
    #     print(labels)

    return model_inputs


# processed_dataset = data.map(preprocess_function, batched=True, batch_size=10, remove_columns=data["train"].column_names)
processed_dataset = dataset.map(
    process_dataset, batched=True, batch_size=10, remove_columns=dataset["train"].column_names
)
# processed_dataset = dataset.map(process_dataset, batched=True, batch_size=10)
print(processed_dataset)
print(dataset["train"][0]["prompt"])
# print(dataset["train"][0]["reference"])
print(len(processed_dataset["train"]["input_ids"][0]), processed_dataset["train"]["input_ids"][0])
print(len(processed_dataset["train"]["attention_mask"][0]), processed_dataset["train"]["attention_mask"][0])
print(len(processed_dataset["train"]["labels"][0]), processed_dataset["train"]["labels"][0])
print(dataset["train"][1]["prompt"])
# print(dataset["train"][1]["reference"])
# print(dataset["train"][2]["prompt"])
# print(dataset["train"][2]["reference"])

# print(len(processed_dataset["test"]["input_ids"][0]), processed_dataset["test"]["input_ids"][0])
# print(len(processed_dataset["test"]["attention_mask"][0]), processed_dataset["test"]["attention_mask"][0])
# print(len(processed_dataset["test"]["labels"][0]), processed_dataset["test"]["labels"][0])

data_collator = DataCollatorForSeq2Seq(tokenizer, return_tensors="pt", model=model)
# data_collator = DataCollatorForSeq2Seq(tokenizer, return_tensors="pt", padding=True)

dataloader = DataLoader(processed_dataset["train"], batch_size=2, collate_fn=data_collator)
for batch in dataloader:
    print(batch)
    break

<class 'transformers.models.gpt2.tokenization_gpt2_fast.GPT2TokenizerFast'>
50256
left


Map:   0%|          | 0/63703 [00:00<?, ? examples/s]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Map:   0%|          | 0/7080 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 63703
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 7080
    })
})
Remove all grammatical errors from this text: For example, countries with a lot of deserts can terraform their desert to increase their habitable land and using irrigation to provide clean water to the desert.
Response:For example, countries with a lot of deserts can transform their desert to increase their habitable land and use irrigation to provide clean water to the desert.
73 [27914, 477, 14599, 44935, 8563, 422, 428, 2420, 25, 1114, 1672, 11, 2678, 351, 257, 1256, 286, 45288, 460, 1059, 430, 687, 511, 10326, 284, 2620, 511, 49055, 1956, 290, 1262, 35425, 284, 2148, 3424, 1660, 284, 262, 10326, 13, 198, 31077, 25, 1890, 1672, 11, 2678, 351, 257, 1256, 286, 45288, 460, 6121, 511, 10326, 284, 2620, 511, 49055, 1956, 290, 779, 35425, 284, 2148, 34

## Pre-test inference

In [9]:
# %%script echo skip

max_batch = 2
max_length = 350

print(type(model))
print(model.generation_config.max_new_tokens)

for task, batch in test_dataset_dict.items():
    print()
    print(f">> {task}")
    batch_size = len(batch) if len(batch) < max_batch else max_batch
    input_batch = batch.select(range(batch_size))
    # print(f"input: {input_batch['input']}")
    # print(f"prompt: {input_batch['prompt']}")
    print(f"request: {input_batch['request']}")
    print(f"reference: {input_batch['reference']}")

    input = tokenizer(input_batch["input"], padding=True, return_tensors="pt").to(device)
    outputs = model.generate(input.input_ids, max_length=max_length)
    processed = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    print(f"result: {processed}")

    break

Both `max_new_tokens` (=350) and `max_length`(=350) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


<class 'transformers.models.gpt2.modeling_gpt2.GPT2LMHeadModel'>
350

>> neutralize
request: ['Remove non-neutral POV: new moon received poor reviews from critics.\nResponse:', 'Remove POVs in this text: rhonda shear (born 1954), american television personality, comedienne, and actress\nResponse:']
reference: ['new moon received negative reviews from critics.', 'rhonda shear (born 1954), american television personality, comedian, and actress']
result: ['Remove non-neutral POV: new moon received poor reviews from critics.\n\nThe new moon of Saturn is a very interesting object. It is the first of the new moon cycle, and it is the first of the new moon cycle to be visible from Earth. It is also the first of the new moon cycle to be visible from the Earth.\n\nThe new moon is a very interesting object. It is the first of the new moon cycle, and it is the first of the new moon cycle to be visible from the Earth.\n\nThe new moon is a very interesting object. It is the first of the new moon cy

## Training

In [10]:
%reload_ext autoreload

training_model_repo = f"iliazlobin"
training_model_name: str = f"{model_name}-coedit"
training_model_id = f"{training_model_repo}/{training_model_name}"
training_model_checkpoint = f"{training_model_id}"
training_model_path = f"{training_model_repo}_{training_model_name}"
print(
    f"training_model_name: {training_model_name}, "
    f"training_model_id: {training_model_id}, "
    f"training_model_path: {training_model_path}"
)

utilization = calculate_utilization()
utilization_str = format_utilization_narrow(utilization)
print(
    f"total/used/cuda/res/ram (Gb): {utilization_str['total_memory']}/{utilization_str['memory_used']}/"
    f"{utilization_str['cuda_allocated']}/{utilization_str['cuda_reserved']}/{utilization_str['ram_usage']}"
)


available_memory = utilization["total_memory"] - utilization["memory_used"]
recommended_fraction = available_memory / utilization["total_memory"]

actual_fraction = 0.95
torch.cuda.set_per_process_memory_fraction(actual_fraction, 0)

print(
    f"total/used/available memory (Gb): {utilization['total_memory']/1024**3:.2f}/"
    f"{utilization['memory_used']/1024**3:.2f}/{available_memory/1024**3:.2f}"
)
print(f"recommended/actual fraction: {recommended_fraction:.2f}/{actual_fraction:.2f}")

training_model_name: gpt2-large-coedit, training_model_id: iliazlobin/gpt2-large-coedit, training_model_path: iliazlobin_gpt2-large-coedit
total/used/cuda/res/ram (Gb): 79.15/5.14/3.00/3.79/5.11
total/used/available memory (Gb): 79.15/5.14/74.01
recommended/actual fraction: 0.94/0.95


In [11]:
training_model = model
# print(type(training_model))
# print(training_model.config)
# print(training_model.generation_config)

total_params = sum(p.numel() for p in training_model.parameters())
total_trainable_params = sum(p.numel() for p in training_model.parameters() if p.requires_grad)
print(f"Total/trainable params: {total_params}/{total_trainable_params}")

rouge_metric = evaluate.load("rouge")
# glue_metric = evaluate.load("glue", "stsb")
sacreblue_metric = evaluate.load("sacrebleu")
sari_metric = evaluate.load("sari")
em_metric = evaluate.load("exact_match")

# batch_size = len(batch) if len(batch) < max_batch else max_batch
# input_batch = batch.select(range(batch_size))
# input = tokenizer(input_batch["input"], padding=True, return_tensors="pt").to(device)
# outputs = model.generate(input.input_ids, max_length=max_length)
# processed = tokenizer.batch_decode(outputs, skip_special_tokens=True)


def compute_metrics(eval_pred):
    # print(eval_pred)
    preds = eval_pred.predictions
    # print(preds)
    labels = eval_pred.label_ids
    # print(labels)

    # preds = np.argmax(preds, axis=-1)
    # print(preds)

    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
    # print(preds)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    # print(labels)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    # print(decoded_preds)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    # print(decoded_labels)

    rouge_score = rouge_metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    sacreblue_score = sacreblue_metric.compute(predictions=decoded_preds, references=decoded_labels)
    # sari_score = sari_metric.compute(
    #     sources=processed_samples["input"],
    #     predictions=processed_samples["processed"],
    #     references=processed_samples["references"],
    # )
    em_score = em_metric.compute(predictions=decoded_preds, references=decoded_labels)

    utilization = calculate_utilization()

    # report = {
    #     "metric": 0,
    # }
    report = {
        "rouge1": rouge_score["rouge1"],
        "rouge2": rouge_score["rouge2"],
        "rougeL": rouge_score["rougeL"],
        "rougeLsum": rouge_score["rougeLsum"],
        "sacreblue": sacreblue_score["score"],
        "memory_used": utilization["memory_used"] / 1024**2,
        "cuda_allocated": utilization["cuda_allocated"] / 1024**2,
        "cuda_reserved": utilization["cuda_reserved"] / 1024**2,
        "ram_usage": utilization["ram_usage"] / 1024**2,
        # "sari.sari": 0,
        "em": em_score["exact_match"],
    }

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    report["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in report.items()}


def logits_argmax(logits: torch.Tensor, labels):
    # if isinstance(logits, tuple):
    #     # Depending on the model and config, logits may contain extra tensors,
    #     # like past_key_values, but logits always come first
    #     logits = logits[0]
    # return logits.argmax(dim=-1), labels
    # logits = logits[0]
    return logits.argmax(dim=-1)


# debug
# trainer = Trainer(
#     model=training_model,
#     train_dataset=processed_dataset["train"],
#     eval_dataset=processed_dataset["test"],
#     # eval_dataset=processed_dataset["test"],
#     tokenizer=tokenizer,
#     data_collator=data_collator,
#     compute_metrics=compute_metrics,
#     preprocess_logits_for_metrics=logits_argmax,
#     args=args,
# )

# predictions = trainer.predict(processed_dataset["train"].select(range(2)))
# print(type(predictions))
# # print(predictions)
# print(predictions.predictions)
# print(predictions.label_ids)
# metrics = compute_metrics((predictions.predictions, predictions.label_ids))
# print(metrics)

Total/trainable params: 774030080/774030080


In [12]:
%%capture
%load_ext tensorboard
%tensorboard --logdir "model-{training_model_path}-train/runs"

In [14]:
train_path = f"model-{training_model_path}-train"
print(f"train_path: {train_path}")

train_size = len(processed_dataset["train"])
batch_size = 150
gradient_accumulation_steps = 4
eval_batch_size = 150
eval_accumulation_steps = 4

per_epoch_steps = train_size / (batch_size * gradient_accumulation_steps)

max_steps = 100
epochs = per_epoch_steps / max_steps
print(f"train_size: {train_size}, batch_size: {batch_size}, per_epoch_steps: {per_epoch_steps}, max_steps: {max_steps}, epochs: {epochs}")

# if epochs < 1:
#     raise Exception(f"Training doesn't cover the entire training dataset with {train_size} samples")

epochs = 2
epoch_total_steps = epochs * per_epoch_steps
print(f"train_size: {train_size}, batch_size: {batch_size}, per_epoch_steps: {per_epoch_steps}, epochs: {epochs}, epoch_total_steps: {epoch_total_steps}")

train_path: model-iliazlobin_gpt2-large-coedit-train
train_size: 63703, batch_size: 150, per_epoch_steps: 106.17166666666667, max_steps: 100, epochs: 1.0617166666666666
train_size: 63703, batch_size: 150, per_epoch_steps: 106.17166666666667, epochs: 2, epoch_total_steps: 212.34333333333333


In [15]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments, Trainer, TrainingArguments

args = TrainingArguments(
    output_dir=train_path,
    learning_rate=2e-5,
    weight_decay=0.01,
    # warmup_ratio=0.05,
    optim="paged_adamw_32bit",
    fp16=True,
    gradient_checkpointing=True,
    per_device_train_batch_size=batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    per_device_eval_batch_size=eval_batch_size,
    eval_accumulation_steps=eval_accumulation_steps,
    num_train_epochs=epochs,
    # max_steps=max_steps,
    warmup_steps=1,
    logging_steps=25,
    save_strategy="steps",
    save_steps=50,
    evaluation_strategy="steps",
    eval_steps=50,
    report_to="tensorboard",
    push_to_hub_model_id=training_model_name,
    push_to_hub_organization="iliazlobin",
    push_to_hub=False,
    gradient_checkpointing_kwargs={
        "use_reentrant": False
    },  # Should be false for Lora (https://github.com/kohya-ss/sd-scripts/issues/323#issuecomment-1485073421)
)

trainer = Trainer(
    model=training_model,
    train_dataset=processed_dataset["train"],
    eval_dataset=processed_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    preprocess_logits_for_metrics=logits_argmax,
    args=args,
)
print(type(trainer))

model.config.use_cache = False
# trainer.train(resume_from_checkpoint=True)
trainer.train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


<class 'transformers.trainer.Trainer'>


Step,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Sacreblue,Memory Used,Cuda Allocated,Cuda Reserved,Ram Usage,Em,Gen Len
50,0.8724,1.027417,0.4653,0.3509,0.4382,0.4459,19.0412,68475.5,3082.605,61060.0,5708.957,0.0,82.0895
100,0.7407,0.949904,0.4825,0.3651,0.4557,0.4656,19.2975,68475.5,3082.6152,61060.0,13842.9336,0.0,81.3952
150,0.6964,0.931841,0.4783,0.3627,0.452,0.4605,19.418,68475.5,3082.6182,61060.0,13958.2773,0.0,81.0295
200,0.6846,0.921462,0.4818,0.3649,0.4555,0.4643,19.1714,68475.5,3082.6328,61060.0,13976.5117,0.0,82.1798


TrainOutput(global_step=212, training_loss=1.028745507294277, metrics={'train_runtime': 2094.9714, 'train_samples_per_second': 60.815, 'train_steps_per_second': 0.101, 'total_flos': 7.86108222475776e+16, 'train_loss': 1.028745507294277, 'epoch': 2.0})

In [22]:
%load_ext dotenv
%dotenv ../.env

!huggingface-cli login --token os.getenv(key="HUGGING_FACE_TOKEN")

print(type(trainer))
print(type(trainer.model))

# print(f"saving locally: model-{training_model_path}")
# trainer.save_model(f"model-{training_model_path}")

print(f"pushing to hub: {training_model_name}, {training_model_id}")
trainer.push_to_hub(
    commit_message="complete: train_size: {train_size}, batch_size: {batch_size}, per_epoch_steps: {per_epoch_steps}, epochs: {epochs}, epoch_total_steps: {epoch_total_steps}",
    model_name=training_model_name,
)
# trainer.push_to_hub(commit_message="test")

The dotenv extension is already loaded. To reload it, use:
  %reload_ext dotenv


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


/bin/bash: -c: line 1: syntax error near unexpected token `('
/bin/bash: -c: line 1: `huggingface-cli login --token os.getenv(key="HUGGING_FACE_TOKEN")'
<class 'transformers.trainer.Trainer'>
<class 'transformers.models.gpt2.modeling_gpt2.GPT2LMHeadModel'>
pushing to hub: gpt2-large-coedit, iliazlobin/gpt2-large-coedit


model.safetensors:   0%|          | 0.00/3.10G [00:00<?, ?B/s]

events.out.tfevents.1713658636.workstation.89102.0:   0%|          | 0.00/10.4k [00:00<?, ?B/s]

Upload 3 LFS files:   0%|          | 0/3 [00:00<?, ?it/s]

training_args.bin:   0%|          | 0.00/5.05k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/iliazlobin/gpt2-large-coedit/commit/f9eaa6ae131e0b9fce90ade2cc8fc591b367f123', commit_message='complete: train_size: {train_size}, batch_size: {batch_size}, per_epoch_steps: {per_epoch_steps}, epochs: {epochs}, epoch_total_steps: {epoch_total_steps}', commit_description='', oid='f9eaa6ae131e0b9fce90ade2cc8fc591b367f123', pr_url=None, pr_revision=None, pr_num=None)

## Test inference

In [18]:
model = trainer.model

print(type(tokenizer))
# print(tokenizer.add_eos_token)
print(tokenizer.eos_token_id)
print(tokenizer.pad_token_id)
print(tokenizer.padding_side)


max_length = 350
max_new_tokens = 100
max_batch = 2

print(dataset["train"])

batch_size = len(dataset["train"]) if len(dataset["train"]) < max_batch else max_batch
input_batch = dataset["train"].select(range(batch_size))
# print(f"task: {input_batch['task']}")
# print(f"input: {input_batch['input']}")
print(f"request: {input_batch['request']}")
print(f"reference: {input_batch['reference']}")

inputs = tokenizer(input_batch["request"], return_tensors="pt", padding=True).to(device)
print(inputs)

model.config.use_cache = False
# model.config.pad_token_id = model.config.eos_token_id
# outputs = model.generate(**input_ids, max_new_tokens=128)
# outputs = model.generate(**inputs, max_new_tokens=128, num_return_sequences=1)
outputs = model.generate(
    **inputs,
    # max_length=max_length,
    max_new_tokens=max_new_tokens,
    # pad_token_id=tokenizer.eos_token_id,
    # eos_token_id=tokenizer.eos_token_id,
    num_return_sequences=1,
)
# outputs = model.generate(
#     **inputs,
#     do_sample=True,
#     top_k=10,
#     num_return_sequences=1,
#     pad_token_id=tokenizer.eos_token_id,
#     # return_attention_mask=True,
#     # max_length=max_length,
#     max_new_tokens=max_new_tokens
# )
print(outputs)

trimmed_output = outputs[:, inputs.input_ids.shape[1] :]
# result = tokenizer.batch_decode(trimmed_output, skip_special_tokens=False)
# result = tokenizer.batch_decode(outputs, skip_special_tokens=False)
result = tokenizer.batch_decode(trimmed_output, skip_special_tokens=True)
print(f"result: {result}")

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


<class 'transformers.models.gpt2.tokenization_gpt2_fast.GPT2TokenizerFast'>
50256
50256
left
Dataset({
    features: ['task', 'input', 'reference', 'references', 'request', 'prompt'],
    num_rows: 63703
})
request: ['Remove all grammatical errors from this text: For example, countries with a lot of deserts can terraform their desert to increase their habitable land and using irrigation to provide clean water to the desert.\nResponse:', 'Improve the grammaticality: As the number of people grows, the need of habitable environment is unquestionably essential.\nResponse:']
reference: ['For example, countries with a lot of deserts can transform their desert to increase their habitable land and use irrigation to provide clean water to the desert.', 'As the number of people grows, the need for a habitable environment is unquestionably increasing.']
{'input_ids': tensor([[27914,   477, 14599, 44935,  8563,   422,   428,  2420,    25,  1114,
          1672,    11,  2678,   351,   257,  1256,  

In [53]:
# input
sequence = [27914,   477, 14599, 44935,  8563,   422,   428,  2420,    25,  1114,
          1672,    11,  2678,   351,   257,  1256,   286, 45288,   460,  1059,
           430,   687,   511, 10326,   284,  2620,   511, 49055,  1956,   290,
          1262, 35425,   284,  2148,  3424,  1660,   284,   262, 10326,    13,
           198, 31077,    25]
print(tokenizer.decode(sequence))
# input
sequence = [47531,   262, 14599,
         44935,   414,    25,  1081,   262,  1271,   286,   661, 13676,    11,
           262,   761,   286, 49055,  2858,   318, 38766,  1346,  6393,    13,
           198, 31077,    25]
print(tokenizer.decode(sequence))
# labels
# sequence = [1114,  1672,    11,  2678,   351,   257,  1256,
#            286, 45288,   460,  6121,   511, 10326,   284,  2620,   511, 49055,
#           1956,   290,   779, 35425,   284,  2148,  3424,  1660,   284,   262,
#          10326,    13]
# print(tokenizer.decode(sequence))
# sequence = [27914,   477, 14599, 44935,  8563,   422,   428,  2420,    25,  1114,
#           1672,    11,  2678,   351,   257,  1256,   286, 45288,   460,  1059,
#            430,   687,   511, 10326,   284,  2620,   511, 49055,  1956,   290,
#           1262, 35425,   284,  2148,  3424,  1660,   284,   262, 10326,    13,
#            198, 31077,    25,   220]
# print(tokenizer.decode(token_ids=sequence))
# sequence = [25, 220, 25]
# print(tokenizer.decode(sequence))
# sequence = [1114]
# print(tokenizer.decode(sequence))

Remove all grammatical errors from this text: For example, countries with a lot of deserts can terraform their desert to increase their habitable land and using irrigation to provide clean water to the desert.
Response:
Improve the grammaticality: As the number of people grows, the need of habitable environment is unquestionably essential.
Response:


In [19]:
model = trainer.model

max_batch = 2
max_length = 350
max_new_tokens = 350

count = 0
# for task, batch in test_dataset_dict.items():
for task, batch in train_dataset_dict.items():
    print()
    print(f">> {task}")
    batch_size = len(batch) if len(batch) < max_batch else max_batch
    input_batch = batch.select(range(batch_size))
    print(f"request: {input_batch['request']}")
    print(f"reference: {input_batch['reference']}")

    inputs = tokenizer(input_batch["request"], return_tensors="pt", padding=True).to(device)
    # print(inputs)

    model.config.use_cache = False
    # model.config.pad_token_id = model.config.eos_token_id
    # outputs = model.generate(**input_ids, max_new_tokens=128)
    # outputs = model.generate(**input_ids, max_new_tokens=128, num_return_sequences=1)
    # outputs = model.generate(
    #     **inputs, max_new_tokens=max_new_tokens, num_return_sequences=1
    # )
    outputs = model.generate(
        **inputs,
        do_sample=True,
        top_k=10,
        num_return_sequences=1,
        # pad_token_id=tokenizer.eos_token_id,
        # return_attention_mask=True,
        # max_length=256,
        max_new_tokens=max_new_tokens,
    )

    trimmed_output = outputs[:, inputs.input_ids.shape[1] :]
    result = tokenizer.batch_decode(trimmed_output, skip_special_tokens=True)
    print(f"result: {result}")

    # count += 1
    # if count > 3:
    #     break


>> neutralize
request: ['Make this text more neutral: chloroform "the molecular lifesaver" an article at oxford university providing interesting facts about chloroform.\nResponse:', 'Remove points of view: gaming system, an dice pool system where matched die results determine success.\nResponse:']
reference: ['chloroform "the molecular lifesaver" an article at oxford university providing facts about chloroform.', 'gaming system, a unique dice pool system where matched die results determine success.']
result: ['chloroform "the molecular lifesaver" is an article at oxford university providing interesting facts about chloroform.', 'gaming system, dice pool systems where matched dice results determine success.']

>> paraphrase
request: ['Reword this sentence: Item 5.1.2 shall be amended to read:\nResponse:', 'Reword this text: She stopped when she saw his expression.\nResponse:']
reference: ['Point 5.1.2 is replaced by the following:', 'Seeing the look on his face, she paused.']
result: [