In [1]:
%load_ext autoreload
%load_ext dotenv
%dotenv
!huggingface-cli login --token $HUGGING_FACE_TOKEN

Token has not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /home/izlobin/.cache/huggingface/token
Login successful


In [2]:
# %%capture
# %pip install transformers evaluate
# %pip install nltk absl-py rouge_score
# %pip install bleu sacrebleu
# %pip install sacremoses
# %pip install scipy
# %pip install sentencepiece
# %pip install optimum auto-gptq
# %pip install scikit-learn
# %pip install einops
# %pip install bitsandbytes
# %pip install accelerate
# %pip install pynvml

In [3]:
%load_ext autoreload
%autoreload 2

import os
import sys
import time
from pprint import pprint

import evaluate
import numpy as np
import pandas as pd
import torch
from datasets import load_dataset
from huggingface_hub import HfApi
from transformers import (
    AutoModelForCausalLM,
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    BartForCausalLM,
    BartModel,
    BartTokenizer,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    T5ForConditionalGeneration,
    T5Tokenizer,
    T5TokenizerFast,
)

current_dir = os.getcwd()
parent_dir = os.path.dirname(current_dir)
sys.path.append(parent_dir)

from utils.dataset import get_iterater_samples_simplified, get_iterater_samples_with_instruction
from utils.metric import calculate_scores
from utils.monitoring import calculate_utilization, format_utilization_narrow, print_utilization

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
pprint(f"Device: {device}")

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


'Device: cuda'


In [4]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

model_id = "openai-community/gpt2-large"
model_name = "openai-community/gpt2-large"
model_alias = model_name.replace("/", "_")

tokenizer = AutoTokenizer.from_pretrained(
    model_id,
    use_fast=True,
)
print(type(tokenizer))

tokenizer.add_eos_token = True
tokenizer.pad_token_id = tokenizer.eos_token_id
tokenizer.padding_side = "left"

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True, bnb_4bit_use_double_quant=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch.bfloat16
)


# model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=bnb_config, device_map={"":0})
model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=bnb_config, device_map="auto")

print(type(model))

<class 'transformers.models.gpt2.tokenization_gpt2_fast.GPT2TokenizerFast'>
<class 'transformers.models.gpt2.modeling_gpt2.GPT2LMHeadModel'>


In [5]:
%%script echo skip

tokenizer = AutoTokenizer.from_pretrained(
    model_id,
    # padding_side="right",
    # model_max_length=512,
    # pad_token="!",
    # add_eos_token=True,
    # add_bos_token=True,
    # padding='longest',
    use_fast=True,
    # trust_remote_code=True,
)
print(type(tokenizer))

# tokenizer.pad_token_id = tokenizer.eos_token_id

# print(tokenizer.eos_token_id)
# print(tokenizer.pad_token_id)
# print(tokenizer.special_tokens_map)

# tokenizer.pad_token = tokenizer.eos_token
# tokenizer.add_special_tokens(
#     {
#         "pad_token": "<|endoftext|>",
#         "eos_token": "<|endoftext|>",
#         "bos_token": "<|endoftext|>",
#         "unk_token": "<|endoftext|>",
#     }
# )

tokenizer.add_eos_token = True
# tokenizer.padding_side = "right"

input_text = """Fix grammar: In the case of young people the best way is studying as hard as possible to get better grades so in the future they will have better chance to find better jobs
Response:
""".strip()
tokens = tokenizer(input_text, add_special_tokens=True)
print(tokens.input_ids)

skip


In [6]:
%%script echo skip

input_text = "Fix grammatical errors in this sentence: I goes to school every day."
# input_text = "Fix grammar: In the case of young people the best way is studying as hard as possible to get better grades so in the future they will have better chance to find better jobs ."

# input_ids = tokenizer(input_text, add_special_tokens=False, return_tensors="pt")
inputs = tokenizer(input_text, return_tensors="pt").to(device)

outputs = model.generate(**inputs, max_new_tokens=100)
# outputs = model.generate(
#     **input_ids,
#     do_sample=True,
#     top_k=10,
#     num_return_sequences=1,
#     # eos_token_id=tokenizer.eos_token_id,
#     # return_attention_mask=False,
#     max_length=256,
# )

print(tokenizer.decode(outputs[0], skip_special_tokens=True))

skip


## Loading dataset

In [7]:
%%script echo skip

from datasets import load_dataset

full_ds = load_dataset("Abirate/english_quotes")
print(full_ds)
train_ds = load_dataset("Abirate/english_quotes", split="train[:30%]")
test_ds = load_dataset("Abirate/english_quotes", split="train[-05%:]")

processed_train_ds = train_ds.map(
    lambda samples: tokenizer(samples["quote"], max_length=512), batched=True
)
processed_test_ds = test_ds.map(
    lambda samples: tokenizer(samples["quote"], max_length=512), batched=True
)
print(processed_train_ds)
print(processed_test_ds)

# test_data = train_test_split(
#     test_size=0.1,
# )
# print(test_data)

skip


### Grammarly dataset

In [8]:
from datasets import DatasetDict

# full_dataset = load_dataset("grammarly/coedit")
# print(full_dataset)

# train_data = load_dataset("grammarly/coedit", split="train")
# test_data = load_dataset("grammarly/coedit", split="validation")

train_data = load_dataset("grammarly/coedit", split="train[:1000]")
test_data = load_dataset("grammarly/coedit", split="train[:100]")

dataset = DatasetDict({"train": train_data, "test": test_data})
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['_id', 'task', 'src', 'tgt'],
        num_rows: 1000
    })
    test: Dataset({
        features: ['_id', 'task', 'src', 'tgt'],
        num_rows: 100
    })
})


In [9]:
def generate_grammar_prompt(src: str, tgt: str) -> str:
    return f"""{src}
Response: {tgt}
""".strip()


def prompt_dataset(item):
    return {
        "prompt": generate_grammar_prompt(item["src"], item["tgt"]),
        # "labels": item["tgt"],
    }


prompted_dataset = dataset.map(prompt_dataset, remove_columns=["task", "_id"])
print(prompted_dataset)
print(len(prompted_dataset))
print(prompted_dataset["train"][0]["prompt"])
print(prompted_dataset["train"][1]["prompt"])

DatasetDict({
    train: Dataset({
        features: ['src', 'tgt', 'prompt'],
        num_rows: 1000
    })
    test: Dataset({
        features: ['src', 'tgt', 'prompt'],
        num_rows: 100
    })
})
2
Remove all grammatical errors from this text: For example, countries with a lot of deserts can terraform their desert to increase their habitable land and using irrigation to provide clean water to the desert.
Response: For example, countries with a lot of deserts can transform their desert to increase their habitable land and use irrigation to provide clean water to the desert.
Improve the grammaticality: As the number of people grows, the need of habitable environment is unquestionably essential.
Response: As the number of people grows, the need for a habitable environment is unquestionably increasing.


In [10]:
import numpy as np
from datasets import Dataset
from torch.utils.data import DataLoader
from transformers import DataCollatorForLanguageModeling, DataCollatorForSeq2Seq


def tokenize(samples):
    # model_input = tokenizer(examples["prompt"], max_length=256, padding="max_length", truncation=True)

    model_inputs = tokenizer(samples["prompt"])
    model_reponses = tokenizer(samples["tgt"])

    new_input_ids = []
    new_labels = []
    for input_ids, response_ids in zip(model_inputs.input_ids, model_reponses.input_ids):
        # debug_labels = input_ids[-len(response_ids) :]
        # print(tokenizer.decode(input_ids, skip_special_tokens=False))
        # print(tokenizer.decode(debug_labels, skip_special_tokens=False))

        input_ids.append(tokenizer.eos_token_id)
        new_input_ids.append(input_ids)

        num_tokens_ignore = len(input_ids) - len(response_ids)
        labels = [-100] * num_tokens_ignore + input_ids[-len(response_ids) :]
        new_labels.append(labels)

    new_attention_mask = []
    for attention_mask in model_inputs.attention_mask:
        attention_mask.append(1)
        new_attention_mask.append(attention_mask)

    # labels = tokenizer(text_target=examples["tgt"], max_length=1024, padding=True).input_ids
    # model_inputs["labels"] = labels

    model_inputs["input_ids"] = new_input_ids
    model_inputs["attention_mask"] = new_attention_mask
    model_inputs["labels"] = new_labels

    # print(
    #     f">> input_ids: {len(model_inputs['input_ids'])},"
    #     "attention_mask: {len(model_inputs['attention_mask'])},"
    #     "labels: {len(model_inputs['labels'])}"
    # )
    # count = 0
    # for input_ids, labels, attention_masks in zip(new_input_ids, new_labels, new_attention_mask):
    #     count += 1
    #     if count > 3:
    #         break
    #     print(f">> input_ids: {len(input_ids)}, attention_mask: {len(attention_masks)}, labels: {len(labels)}")
    #     print(tokenizer.decode(input_ids, skip_special_tokens=False))
    #     print(labels)

    return model_inputs


processed_dataset = prompted_dataset.map(tokenize, batched=True, remove_columns=["tgt", "src", "prompt"])
print(processed_dataset)
print(processed_dataset["train"]["input_ids"][0])
print(processed_dataset["train"]["attention_mask"][0])
print(processed_dataset["train"]["labels"][0])

data_collator = DataCollatorForSeq2Seq(tokenizer, return_tensors="pt", padding=True)

# dataloader = DataLoader(processed_dataset["train"], batch_size=4, collate_fn=data_collator)
# for batch in dataloader:
# print(batch)
# break

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 1000
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 100
    })
})
[27914, 477, 14599, 44935, 8563, 422, 428, 2420, 25, 1114, 1672, 11, 2678, 351, 257, 1256, 286, 45288, 460, 1059, 430, 687, 511, 10326, 284, 2620, 511, 49055, 1956, 290, 1262, 35425, 284, 2148, 3424, 1660, 284, 262, 10326, 13, 198, 31077, 25, 1114, 1672, 11, 2678, 351, 257, 1256, 286, 45288, 460, 6121, 511, 10326, 284, 2620, 511, 49055, 1956, 290, 779, 35425, 284, 2148, 3424, 1660, 284, 262, 10326, 13, 50256]
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
[-100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -

## Training

In [11]:
%reload_ext autoreload

utilization = calculate_utilization()
utilization_str = format_utilization_narrow(utilization)
print(
    f"total/used/cuda/res/ram (Gb): {utilization_str['total_memory']}/{utilization_str['memory_used']}/"
    f"{utilization_str['cuda_allocated']}/{utilization_str['cuda_reserved']}/{utilization_str['ram_usage']}"
)


available_memory = utilization["total_memory"] - utilization["memory_used"]
recommended_fraction = available_memory / utilization["total_memory"]

actual_fraction = 0.90
torch.cuda.set_per_process_memory_fraction(actual_fraction, 0)

print(
    f"total/used/available memory (Gb): {utilization['total_memory']/1024**3:.2f}/"
    f"{utilization['memory_used']/1024**3:.2f}/{available_memory/1024**3:.2f}"
)
print(f"recommended/actual fraction: {recommended_fraction:.2f}/{actual_fraction:.2f}")

total/used/cuda/res/ram (Gb): 10.00/1.82/0.51/0.57/14.12
total/used/available memory (Gb): 10.00/1.82/8.18
recommended/actual fraction: 0.82/0.90


In [12]:
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training


def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )


# model.gradient_checkpointing_enable()
perf_model = model
perf_model = prepare_model_for_kbit_training(perf_model, use_gradient_checkpointing=True)


config = LoraConfig(
    r=128,
    lora_alpha=32,
    # target_modules=["query_key_value"],
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM",
)

perf_model = get_peft_model(perf_model, config)
print(type(perf_model))
print_trainable_parameters(perf_model)

<class 'peft.peft_model.PeftModelForCausalLM'>
trainable params: 23592960 || all params: 443728640 || trainable%: 5.316979314204285


In [13]:
%%capture
%load_ext tensorboard
%tensorboard --logdir 'model-gpt2-large-bnb4bit-coedit-train/runs'

In [14]:
from transformers import DataCollatorForLanguageModeling, Trainer, TrainingArguments

training_model = perf_model

args = TrainingArguments(
    output_dir="model-gpt2-large-bnb4bit-coedit-train",
    learning_rate=2e-4,
    # warmup_ratio=0.05,
    # optim="paged_adamw_8bit",
    fp16=True,
    gradient_checkpointing=True,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    logging_steps=10,
    warmup_steps=2,
    # save_steps=50,
    max_steps=100,
    # num_train_epochs=2,
    save_strategy="epoch",
    eval_steps=10,
    # evaluation_strategy="epoch",
    report_to="tensorboard",
)

trainer = Trainer(
    model=training_model,
    train_dataset=processed_dataset["train"],
    # eval_dataset=processed_dataset["test"],
    args=args,
    data_collator=data_collator,
)
print(type(trainer))

model.config.use_cache = False
trainer.train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


<class 'transformers.trainer.Trainer'>




Step,Training Loss
10,2.1836
20,1.389
30,0.7701
40,0.5938
50,0.6089
60,0.5059
70,0.4509
80,0.4395
90,0.4431
100,0.4681


TrainOutput(global_step=100, training_loss=0.7852902460098267, metrics={'train_runtime': 80.1026, 'train_samples_per_second': 9.987, 'train_steps_per_second': 1.248, 'total_flos': 237645717657600.0, 'train_loss': 0.7852902460098267, 'epoch': 0.8})

In [15]:
print(type(trainer))
print(type(trainer.model))

# trainer.save_model(f"model-{model_alias}-bnb4bit-coedit-test")
trained_model = trainer.model

<class 'transformers.trainer.Trainer'>
<class 'peft.peft_model.PeftModelForCausalLM'>


## Eval

In [143]:
# input_text = "Fix grammatical errors in this sentence: I goes to school every day."
# input_text = "Fix grammar: In the case of young people the best way is studying as hard as possible to get better grades so in the future they will have better chance to find better jobs ."

# input_text = f"""Fix grammar: For example, countries with a lot of deserts can terraform their desert to increase their habitable land and using irrigation to provide clean water to the desert.
# Response:
# """.strip()
# input_text = f"""Fix grammar: Remove all grammatical errors from this text: For example, countries with a lot of deserts can terraform their desert to increase their habitable land and using irrigation to provide clean water to the desert.
# Response: For example, countries with a lot of deserts can transform their desert to increase their habitable land and use irrigation to provide clean water to the desert.
# """.strip()
# For example, countries with a lot of deserts can be terraformed to increase their habitable land and to use the irrigation system to provide clean water to the desert
# from scipy import special


# input_text = f"""Remove all grammatical errors from this text: For example, countries with a lot of deserts can terraform their desert to increase their habitable land and using irrigation to provide clean water to the desert.
# Response:
# """.strip()
# input_text = f"""Fix grammar: He drive a amazing car.
# Response:
# """.strip()
input_texts = [
    f"""Remove all grammatical errors from this text: For example, countries with a lot of deserts can terraform their desert to increase their habitable land and using irrigation to provide clean water to the desert.
Response:
""".strip(),
    f"""Fix grammar: He drive a amazing car.
Response:
""".strip(),
    f"""Fix grammar: He drive a amazing car.""",
]
# input_texts = [
#     "Fix grammar:  We don't have enough good Open Source games -- it's a waste to pour all the resources we have into one. :) Wesnoth has dwarves with guns, World of Warcraft'' has gnomes and goblins with explosives and flying machines -- where do you, personally, define the limits of the fantasy genre?",
#     "Fix grammar in this sentence:  In 2001, they successfully nominated Bohemian Hall, still a vibrant community center/beer garden started by Czech immigrants in Astoria, Queens, and the Casa Amadeo Music Store, the oldest, continuously occupied Latin music store in New York City,  as census sites to the National Register of Historic Places.",
# ]

# input_ids = tokenizer(input_text, add_special_tokens=False, return_tensors="pt")
inputs = tokenizer(input_texts, return_tensors="pt", padding=True).to(device)
print(inputs)

trained_model.config.use_cache = False
# trained_model.config.pad_token_id = trained_model.config.eos_token_id
# outputs = trained_model.generate(**input_ids, max_new_tokens=128)
# outputs = trained_model.generate(**input_ids, max_new_tokens=128, num_return_sequences=1)
outputs = trained_model.generate(**inputs, max_new_tokens=128, pad_token_id=tokenizer.eos_token_id, num_return_sequences=1)
# outputs = trained_model.generate(
#     **inputs,
#     do_sample=True,
#     top_k=10,
#     num_return_sequences=1,
#     pad_token_id=tokenizer.eos_token_id,
#     # return_attention_mask=True,
#     max_length=256,
# )

trimmed_output = outputs[:, inputs.input_ids.shape[1] :]
trimmed_outputs = tokenizer.batch_decode(trimmed_output, skip_special_tokens=True)
print(trimmed_outputs)

A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


{'input_ids': tensor([[27914,   477, 14599, 44935,  8563,   422,   428,  2420,    25,  1114,
          1672,    11,  2678,   351,   257,  1256,   286, 45288,   460,  1059,
           430,   687,   511, 10326,   284,  2620,   511, 49055,  1956,   290,
          1262, 35425,   284,  2148,  3424,  1660,   284,   262, 10326,    13,
           198, 31077,    25],
        [22743, 23491,    25,   679,  3708,   257,  4998,  1097,    13,   198,
         31077,    25, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 50256],
        [22743, 23491,    25,   679,  3708,   257,  4998,  1097,    13, 50256,
         50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256, 50256, 50256, 5025



[' For example, countries with a lot of deserts can terraform their deserts to increase their habitable land and use irrigation to provide clean water to the desert.', 'A remarkable car.', '\n\nHe drives a amazing car.']


In [120]:
rouge_metric = evaluate.load("rouge")
# glue_metric = evaluate.load("glue", "stsb")
sacreblue_metric = evaluate.load("sacrebleu")
sari_metric = evaluate.load("sari")
em_metric = evaluate.load("exact_match")

In [149]:
%load_ext autoreload

batch_size = 10
total_samples = 100
samples_map = {
    "fluency": get_iterater_samples_with_instruction(label="fluency", num_samples=total_samples, post_instruction="Response: "),
    "clarity": get_iterater_samples_with_instruction(label="clarity", num_samples=total_samples, post_instruction="Response: "),
    "coherence": get_iterater_samples_with_instruction(label="coherence", num_samples=total_samples, post_instruction="Response: "),
}
print(samples_map)
print(samples_map["fluency"]["task"][0])
print(samples_map["fluency"]["source"][0])
print(samples_map["fluency"]["reference"][0])
print(samples_map["fluency"]["task"][1])

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


Filter:   0%|          | 0/19705 [00:00<?, ? examples/s]

max_samples: 5078, num_samples: 100, selected: 100


Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Filter:   0%|          | 0/19705 [00:00<?, ? examples/s]

max_samples: 5106, num_samples: 100, selected: 100


Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Filter:   0%|          | 0/19705 [00:00<?, ? examples/s]

max_samples: 1676, num_samples: 100, selected: 100


Map:   0%|          | 0/100 [00:00<?, ? examples/s]

{'fluency': Dataset({
    features: ['task', 'source', 'reference', 'references'],
    num_rows: 100
}), 'clarity': Dataset({
    features: ['task', 'source', 'reference', 'references'],
    num_rows: 100
}), 'coherence': Dataset({
    features: ['task', 'source', 'reference', 'references'],
    num_rows: 100
})}
Fix grammar:  We don't have enough good Open Source games -- it's a waste to pour all the resources we have into one. :) Wesnoth has dwarves with guns, World of Warcraft'' has gnomes and goblins with explosives and flying machines -- where do you, personally, define the limits of the fantasy genre? Response: 
 We don't have enough good Open Source games -- it's a waste to pour all the resources we have into one. :) Wesnoth has dwarves with guns, World of Warcraft'' has gnomes and goblins with explosives and flying machines -- where do you, personally, define the limits of the fantasy genre?
 We don't have enough good Open Source games — it's a waste to pour all the resources w

In [155]:
%%time
%load_ext autoreload

model = trained_model
model_name = "iliazlobin/gpt2-large-bnb4bit-coedit"
model_alias = model_name.replace("/", "_")
tokenizer.padding_side = "left"

batch_size = 5
total_samples = 10
samples_map = {
    "fluency": get_iterater_samples_with_instruction(label="fluency", num_samples=total_samples, post_instruction="Response: "),
    "clarity": get_iterater_samples_with_instruction(label="clarity", num_samples=total_samples, post_instruction="Response: "),
    "coherence": get_iterater_samples_with_instruction(label="coherence", num_samples=total_samples, post_instruction="Response: "),
}

def model_process(batch, idx, **kwargs):
    num_samples = len(batch["task"])
    start_time = time.time()

    model = kwargs.get("model")
    tokenizer = kwargs.get("tokenizer")
    total_samples = kwargs.get("total_samples")

    # print(batch["task"])
    inputs = tokenizer(batch["task"], padding=True, return_tensors="pt").to(device)
    # print(inputs)
    # inputs = tokenizer(batch['task'], return_tensors="pt").inputs.to(device)
    # inputs = tokenizer(item['task'], return_tensors="pt").inputs

    # outputs = model.generate(inputs, max_length=512)
    outputs = model.generate(
        **inputs,
        do_sample=True,
        top_k=10,
        num_return_sequences=1,
        pad_token_id=tokenizer.eos_token_id,
        max_length=128,
    )
    # print(outputs)

    trimmed_output = outputs[:, inputs.input_ids.shape[1] :]
    processed = tokenizer.batch_decode(trimmed_output, skip_special_tokens=True)
    # print(processed)

    end_time = time.time()
    elapsed_time = end_time - start_time
    sps = num_samples / elapsed_time
    sps_str = f"{sps:.2f}"

    utilization = calculate_utilization()
    utilization_str = format_utilization_narrow(utilization)
    print(
        f"{idx[0]}-{idx[-1]}/{total_samples} | total/used/cuda/res/ram (Gb): {utilization_str['total_memory']}/{utilization_str['memory_used']}/"
        f"{utilization_str['cuda_allocated']}/{utilization_str['cuda_reserved']}/{utilization_str['ram_usage']} | "
        f"batch/sps: {num_samples}/{sps_str}"
    )

    # return {"processed": processed}
    return {"processed": processed}
    # return {"processed": processed, "utilization": utilization, "tps": tps}
    # return {"processed": processed, "utilization": utilization}


utilization = calculate_utilization()
utilization_str = format_utilization_narrow(utilization)
print(
    f"total/used/cuda/res/ram (Gb): {utilization_str['total_memory']}/{utilization_str['memory_used']}/"
    f"{utilization_str['cuda_allocated']}/{utilization_str['cuda_reserved']}/{utilization_str['ram_usage']}"
)

processed_samples_map = {}

for category, samples in samples_map.items():
    total_samples = len(samples)

    print(f"Processing {total_samples} samples for {category}")
    print(samples)
    print(samples["task"][0])
    print(samples["source"][0])

    start_time = time.time()

    processed_samples = samples.map(
        model_process,
        fn_kwargs={
            "model": model,
            "tokenizer": tokenizer,
            "total_samples": total_samples,
        },
        num_proc=1,
        batched=True,
        batch_size=batch_size,
        with_indices=True,
    )

    end_time = time.time()
    elapsed_time = end_time - start_time
    sps = total_samples / elapsed_time
    sps_str = f"{sps:.2f}"

    print(f"Finished processing {total_samples} samples for {category}.")
    print(processed_samples)
    print(processed_samples["reference"][0])
    print(processed_samples["processed"][0])

    utilization = calculate_utilization()
    utilization_str = format_utilization_narrow(utilization)
    print(
        f"{total_samples} | total/used/cuda/res/ram (Gb): {utilization_str['total_memory']}/{utilization_str['memory_used']}/"
        f"{utilization_str['cuda_allocated']}/{utilization_str['cuda_reserved']}/{utilization_str['ram_usage']} | "
        f"sps: {sps_str}"
    )

    processed_samples_map[category] = {
        # "category": category,
        # "samples": samples,
        "samples": processed_samples,
        "sps": sps,
        "utilization": utilization,
    }

# processed_samples = samples.map(model_process, num_proc=torch.cuda.device_count())
# processed_samples = samples.map(
#     model_process,
#     fn_kwargs={
#         "model": model,
#         "tokenizer": tokenizer,
#         "total_samples": total_samples,
#     },
#     num_proc=1,
#     batched=True,
#     batch_size=batch_size,
#     with_indices=True,
# )

processed_fluency_samples = processed_samples_map["fluency"]["samples"]

pprint(processed_fluency_samples)
pprint(processed_fluency_samples["processed"][:2])

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
max_samples: 5078, num_samples: 10, selected: 10


Map:   0%|          | 0/10 [00:00<?, ? examples/s]

max_samples: 5106, num_samples: 10, selected: 10


Map:   0%|          | 0/10 [00:00<?, ? examples/s]

max_samples: 1676, num_samples: 10, selected: 10


Map:   0%|          | 0/10 [00:00<?, ? examples/s]

total/used/cuda/res/ram (Gb): 10.00/4.00/0.73/2.83/16.04
Processing 10 samples for fluency
Dataset({
    features: ['task', 'source', 'reference', 'references'],
    num_rows: 10
})
Fix grammar errors:  We don't have enough good Open Source games -- it's a waste to pour all the resources we have into one. :) Wesnoth has dwarves with guns, World of Warcraft'' has gnomes and goblins with explosives and flying machines -- where do you, personally, define the limits of the fantasy genre? Response: 
 We don't have enough good Open Source games -- it's a waste to pour all the resources we have into one. :) Wesnoth has dwarves with guns, World of Warcraft'' has gnomes and goblins with explosives and flying machines -- where do you, personally, define the limits of the fantasy genre?


Map:   0%|          | 0/10 [00:00<?, ? examples/s]

0-4/10 | total/used/cuda/res/ram (Gb): 10.00/4.03/0.73/2.83/16.03 | batch/sps: 5/1.18
5-9/10 | total/used/cuda/res/ram (Gb): 10.00/4.03/0.73/2.83/16.01 | batch/sps: 5/1.38
Finished processing 10 samples for fluency.
10 | total/used/cuda/res/ram (Gb): 10.00/4.03/0.73/2.83/16.01 | sps: 0.82
Processing 10 samples for clarity
Dataset({
    features: ['task', 'source', 'reference', 'references'],
    num_rows: 10
})
Make this sentence more readable:  Dr. Farley Stillwell appears in Spider-Man: The Animated  , voiced by Michael Rye. Response: 
 Dr. Farley Stillwell appears in Spider-Man: The Animated  , voiced by Michael Rye.


Map:   0%|          | 0/10 [00:00<?, ? examples/s]

0-4/10 | total/used/cuda/res/ram (Gb): 10.00/4.01/0.73/2.83/16.02 | batch/sps: 5/1.17
5-9/10 | total/used/cuda/res/ram (Gb): 10.00/4.06/0.73/2.83/16.03 | batch/sps: 5/1.38
Finished processing 10 samples for clarity.
10 | total/used/cuda/res/ram (Gb): 10.00/4.06/0.73/2.83/16.03 | sps: 0.88
Processing 10 samples for coherence
Dataset({
    features: ['task', 'source', 'reference', 'references'],
    num_rows: 10
})
Improve the coherence of the text:  The study also exhibits three methodological challenges with respect to assessing of interventions : a) the estimation of true infection dates , b) the usage of several indicators and c) the influence of test volume . In conclusion, the effectiveness of most German interventions remains questionable . Response: 
 The study also exhibits three methodological challenges with respect to assessing of interventions : a) the estimation of true infection dates , b) the usage of several indicators and c) the influence of test volume . In conclusion,

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

0-4/10 | total/used/cuda/res/ram (Gb): 10.00/4.02/0.73/2.83/16.12 | batch/sps: 5/1.55
5-9/10 | total/used/cuda/res/ram (Gb): 10.00/4.01/0.73/2.83/16.12 | batch/sps: 5/1.91
Finished processing 10 samples for coherence.
10 | total/used/cuda/res/ram (Gb): 10.00/4.01/0.73/2.83/16.12 | sps: 1.08
Dataset({
    features: ['task', 'source', 'reference', 'references', 'processed'],
    num_rows: 10
})
[" We don't have enough good Open Source games, as it's a waste of resources "
 'to pour all the resources into one. As for the World of Warcraft, I would '
 'argue that it is a waste of resources to pour all the resources into one '
 'game.',
 '']
CPU times: user 30.2 s, sys: 2.57 s, total: 32.8 s
Wall time: 32.8 s


In [156]:
hardware = "HomeDesktop (RTX3080)"

print(f"model_name: {model_name}")
print(f"model_alias: {model_alias}")

total_params = sum(p.numel() for p in model.parameters())
total_trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Total/trainable params: {total_params}/{total_trainable_params}")

all_flats = []
all_scores = []
if os.path.exists("results/all-scores.csv"):
    all_scores = pd.read_csv("results/all-scores.csv").to_dict("records")

all_fulls = []

for category, obj in processed_samples_map.items():
    samples = obj["samples"]
    total_samples = len(samples)

    all_saved_samples = samples.remove_columns(["references"])
    saved_samples = all_saved_samples[:100] if len(all_saved_samples) > 100 else all_saved_samples
    flats_frame = pd.DataFrame.from_records(saved_samples)
    flats_frame.to_json(f"samples/{model_alias}_{category}.json", orient="records")

    scores = calculate_scores(samples)
    # pprint(scores)

    score_paths = [
        "rouge.rouge1",
        # "rouge.rouge2",
        # "rouge.rougeL",
        # "rouge.rougeLsum",
        "sacreblue.score",
        "sari.sari",
        "em.exact_match",
    ]

    normalized_scores = {}
    for k, v in scores.items():
        for k2, v2 in v.items():
            if not isinstance(v2, list):
                # normalized_scores[f"score.{k}.{k2}"] = v2
                path = f"{k}.{k2}"
                if path in score_paths:
                    normalized_scores[f"score.{k}.{k2}"] = v2
    # pprint(normalized_scores)

    normalized_utilization = {}
    for k, v in obj["utilization"].items():
        if not isinstance(v, list):
            normalized_utilization[f"utilization.{k}"] = v
    # print(normalized_utilization)

    flat_dict = {
        "model": model_name,
        "hardware": hardware,
        "total_params": total_params,
        "category": category,
        "total_samples": total_samples,
        "sps": obj["sps"],
    }
    flat_dict.update(normalized_scores)
    flat_dict.update(normalized_utilization)
    # pprint(frame)

    all_flats.append(flat_dict)
    all_scores.append(flat_dict)

    fulls_frame = {
        "category": category,
        "total_samples": total_samples,
        "sps": obj["sps"],
    }
    fulls_frame.update(
        {
            "scores": scores,
            "utilization": obj["utilization"],
        }
    )
    all_fulls.append(fulls_frame)

flats_frame = pd.DataFrame.from_records(all_flats)
flats_frame.to_csv(f"results/{model_alias}.csv", index=False)

scores_frame = pd.DataFrame.from_records(all_scores)
scores_frame.to_csv(f"results/all-scores.csv", index=False)

fulls_dict = {
    "model": model_name,
    "hardware": hardware,
    "total_params": total_params,
}
for full in all_fulls:
    fulls_dict[full["category"]] = full

print(fulls_dict)
fulls_frame = pd.DataFrame.from_records([fulls_dict])
fulls_frame.to_json(f"results/{model_alias}.json", orient="records", index=False)

model_name: iliazlobin/gpt2-large-bnb4bit-coedit
model_alias: iliazlobin_gpt2-large-bnb4bit-coedit
Total/trainable params: 443728640/23592960
{'model': 'iliazlobin/gpt2-large-bnb4bit-coedit', 'hardware': 'HomeDesktop (RTX3080)', 'total_params': 443728640, 'fluency': {'category': 'fluency', 'total_samples': 10, 'sps': 0.8247413760947063, 'scores': {'rouge': {'rouge1': 0.42422309885257353, 'rouge2': 0.3108144350176073, 'rougeL': 0.3976087074227543, 'rougeLsum': 0.4029114324560925}, 'sacreblue': {'score': 25.24792575861845, 'counts': [141, 101, 77, 59], 'totals': [345, 336, 327, 318], 'precisions': [40.869565217391305, 30.05952380952381, 23.547400611620795, 18.553459119496857], 'bp': 0.9327992966640325, 'sys_len': 345, 'ref_len': 369}, 'sari': {'sari': 14.228726961829649}, 'em': {'exact_match': 0.0}}, 'utilization': {'total_memory': 10736893952, 'memory_used': 4325888000, 'cuda_allocated': 789037568, 'cuda_reserved': 3036676096, 'ram_usage': 17190219776}}, 'clarity': {'category': 'clarity