In [2]:
%load_ext autoreload
%load_ext dotenv
%dotenv
!huggingface-cli login --token $HUGGING_FACE_TOKEN

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
The dotenv extension is already loaded. To reload it, use:
  %reload_ext dotenv
Token has not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /home/izlobin/.cache/huggingface/token
Login successful


In [None]:
%%capture
%pip install transformers evaluate
%pip install nltk absl-py rouge_score
%pip install bleu sacrebleu
%pip install sacremoses
%pip install scipy
%pip install sentencepiece
%pip install optimum auto-gptq
%pip install scikit-learn
%pip install einops
%pip install bitsandbytes
%pip install accelerate

In [4]:
%load_ext autoreload

import os
import sys
import time
from pprint import pprint

import evaluate
import numpy as np
import pandas as pd
import torch
from datasets import load_dataset
from huggingface_hub import HfApi
from transformers import (
    AutoModelForCausalLM,
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    BartForCausalLM,
    BartForConditionalGeneration,
    BartModel,
    BartTokenizer,
    T5ForConditionalGeneration,
    T5Tokenizer,
    T5TokenizerFast,
)

current_dir = os.getcwd()
parent_dir = os.path.dirname(current_dir)
sys.path.append(parent_dir)

from utils.dataset import get_iterater_samples_simplified
from utils.metric import calculate_scores
from utils.monitoring import calculate_utilization, format_utilization_narrow, print_utilization

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
pprint(f"Device: {device}")

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
'Device: cuda'


## Loading models

### Loading BART
* https://huggingface.co/facebook/bart-large

In [3]:
model_name = "facebook/bart-large"
model_alias = model_name.replace("/", "_")
tokenizer = BartTokenizer.from_pretrained(model_name)
# model = BartModel.from_pretrained(model_name)
# model = BartForCausalLM.from_pretrained(model_name, device_map=0)
model = BartForConditionalGeneration.from_pretrained(model_name, device_map=0)

print(type(tokenizer))
print(type(model))
print(model.config)

<class 'transformers.models.bart.tokenization_bart.BartTokenizer'>
<class 'transformers.models.bart.modeling_bart.BartForConditionalGeneration'>
BartConfig {
  "_name_or_path": "facebook/bart-large",
  "activation_dropout": 0.1,
  "activation_function": "gelu",
  "add_bias_logits": false,
  "add_final_layer_norm": false,
  "architectures": [
    "BartModel"
  ],
  "attention_dropout": 0.1,
  "bos_token_id": 0,
  "classif_dropout": 0.1,
  "classifier_dropout": 0.0,
  "d_model": 1024,
  "decoder_attention_heads": 16,
  "decoder_ffn_dim": 4096,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 12,
  "decoder_start_token_id": 2,
  "dropout": 0.1,
  "early_stopping": true,
  "encoder_attention_heads": 16,
  "encoder_ffn_dim": 4096,
  "encoder_layerdrop": 0.0,
  "encoder_layers": 12,
  "eos_token_id": 2,
  "forced_bos_token_id": 0,
  "forced_eos_token_id": 2,
  "gradient_checkpointing": false,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2"
  },
  "init_std": 0.02,
 

In [3]:
model_name = "iliazlobin/bart-grammarly"
model_alias = model_name.replace("/", "_")
tokenizer = BartTokenizer.from_pretrained("facebook/bart-large")
# model = BartModel.from_pretrained(model_name)
# model = BartForCausalLM.from_pretrained(model_name, device_map=0)
model = BartForConditionalGeneration.from_pretrained(model_name, device_map=0)

print(type(tokenizer))
print(type(model))
print(model.config)

<class 'transformers.models.bart.tokenization_bart.BartTokenizer'>
<class 'transformers.models.bart.modeling_bart.BartForConditionalGeneration'>
BartConfig {
  "_name_or_path": "iliazlobin/bart-grammarly",
  "activation_dropout": 0.1,
  "activation_function": "gelu",
  "add_bias_logits": false,
  "add_final_layer_norm": false,
  "architectures": [
    "BartForConditionalGeneration"
  ],
  "attention_dropout": 0.1,
  "bos_token_id": 0,
  "classif_dropout": 0.1,
  "classifier_dropout": 0.0,
  "d_model": 1024,
  "decoder_attention_heads": 16,
  "decoder_ffn_dim": 4096,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 12,
  "decoder_start_token_id": 2,
  "dropout": 0.1,
  "early_stopping": true,
  "encoder_attention_heads": 16,
  "encoder_ffn_dim": 4096,
  "encoder_layerdrop": 0.0,
  "encoder_layers": 12,
  "eos_token_id": 2,
  "forced_bos_token_id": 0,
  "forced_eos_token_id": 2,
  "gradient_checkpointing": false,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2"
 

### Loading t5
* https://huggingface.co/google-t5/t5-large

In [3]:
model_name = "google-t5/t5-large"
model_alias = model_name.replace("/", "_")

model = T5ForConditionalGeneration.from_pretrained(model_name, device_map=0)
tokenizer = T5TokenizerFast.from_pretrained(model_name, model_max_length=512)

print(type(tokenizer))
print(type(model))
print(model.config)

<class 'transformers.models.t5.tokenization_t5_fast.T5TokenizerFast'>
<class 'transformers.models.t5.modeling_t5.T5ForConditionalGeneration'>
T5Config {
  "_name_or_path": "google-t5/t5-large",
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "classifier_dropout": 0.0,
  "d_ff": 4096,
  "d_kv": 64,
  "d_model": 1024,
  "decoder_start_token_id": 0,
  "dense_act_fn": "relu",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "relu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": false,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 24,
  "num_heads": 16,
  "num_layers": 24,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalty": 2.0,
      "max_length": 200,
      "min_length": 30,
      "no_repeat_ngram_siz

### Loading coedit / flan-t5

In [6]:
model_name = "google/flan-t5-large"
model_alias = model_name.replace("/", "_")

model = T5ForConditionalGeneration.from_pretrained(model_name, device_map=0)
tokenizer = AutoTokenizer.from_pretrained(model_name)

print(type(tokenizer))
print(type(model))
print(model.config)

<class 'transformers.models.t5.tokenization_t5_fast.T5TokenizerFast'>
<class 'transformers.models.t5.modeling_t5.T5ForConditionalGeneration'>
T5Config {
  "_name_or_path": "google/flan-t5-large",
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "classifier_dropout": 0.0,
  "d_ff": 2816,
  "d_kv": 64,
  "d_model": 1024,
  "decoder_start_token_id": 0,
  "dense_act_fn": "gelu_new",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "gated-gelu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": true,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 24,
  "num_heads": 16,
  "num_layers": 24,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "tie_word_embeddings": false,
  "transformers_version": "4.39.3",
  "use_cache": true,
  "vocab_size": 32128
}



### Loading grammarly/coedit

In [3]:
model_name = "grammarly/coedit-large"
model_alias = model_name.replace("/", "_")

model = T5ForConditionalGeneration.from_pretrained(model_name, device_map=0)
tokenizer = AutoTokenizer.from_pretrained(model_name)

print(type(tokenizer))
print(type(model))
print(model.config)

<class 'transformers.models.t5.tokenization_t5_fast.T5TokenizerFast'>
<class 'transformers.models.t5.modeling_t5.T5ForConditionalGeneration'>
T5Config {
  "_name_or_path": "grammarly/coedit-large",
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "classifier_dropout": 0.0,
  "d_ff": 2816,
  "d_kv": 64,
  "d_model": 1024,
  "decoder_start_token_id": 0,
  "dense_act_fn": "gelu_new",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "gated-gelu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": true,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 24,
  "num_heads": 16,
  "num_layers": 24,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "tie_word_embeddings": false,
  "torch_dtype": "float32",
  "transformers_version": "4.39.3",
  "use_cache": true,
  "vocab_size": 32100
}



### Loading Gemma

In [18]:
model_name = "google/gemma-2b-it"
# model_name = "google/gemma-7b-it"
# model_name = "google/gemma-7b"
model_alias = model_name.replace("/", "_")

torch.cuda.empty_cache()

tokenizer = AutoTokenizer.from_pretrained(model_name)

model = T5ForConditionalGeneration.from_pretrained(model_name, device_map=0)
# model = AutoModelForCausalLM.from_pretrained(model_name)
# model = AutoModelForCausalLM.from_pretrained(
#     model_name,
#     torch_dtype=torch.bfloat16,
#     # torch_dtype=torch.float16,
#     # revision="float16",
#     device_map="auto",
# )

print(model.config)

T5Config {
  "_name_or_path": "google/flan-t5-large",
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "classifier_dropout": 0.0,
  "d_ff": 2816,
  "d_kv": 64,
  "d_model": 1024,
  "decoder_start_token_id": 0,
  "dense_act_fn": "gelu_new",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "gated-gelu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": true,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 24,
  "num_heads": 16,
  "num_layers": 24,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "tie_word_embeddings": false,
  "transformers_version": "4.39.3",
  "use_cache": true,
  "vocab_size": 32128
}



### Loading Phi-2
* https://huggingface.co/microsoft/phi-2
* https://huggingface.co/TheBloke/phi-2-GPTQ

In [3]:
# model_name = 'microsoft/phi-2'
model_name = 'TheBloke/phi-2-GPTQ'
model_alias = model_name.replace('/', '_')

torch.cuda.empty_cache()

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map=0,
    trust_remote_code=True,
)

# print(f"model.config.eos_token_id: {model.config.eos_token_id}")
# eos_token_id = 50256 # https://huggingface.co/microsoft/phi-2/blob/main/config.json

tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    trust_remote_code=True,
)

print(f"tokenizer.eos_token: {tokenizer.eos_token}")
tokenizer.pad_token = tokenizer.eos_token

# model = AutoModelForCausalLM.from_pretrained(model_name)
# model = AutoModelForCausalLM.from_pretrained(
#     model_name,
#     torch_dtype=torch.bfloat16,
#     # torch_dtype=torch.float16,
#     # revision="float16",
#     device_map="auto",
# )

print(model.config)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


tokenizer.eos_token: <|endoftext|>
PhiConfig {
  "_name_or_path": "TheBloke/phi-2-GPTQ",
  "activation_function": "gelu_new",
  "architectures": [
    "PhiForCausalLM"
  ],
  "attn_pdrop": 0.0,
  "auto_map": {
    "AutoConfig": "TheBloke/phi-2-GPTQ--configuration_phi.PhiConfig",
    "AutoModelForCausalLM": "TheBloke/phi-2-GPTQ--modeling_phi.PhiForCausalLM"
  },
  "embd_pdrop": 0.0,
  "flash_attn": false,
  "flash_rotary": false,
  "fused_dense": false,
  "img_processor": null,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "phi-msft",
  "n_embd": 2560,
  "n_head": 32,
  "n_head_kv": null,
  "n_inner": null,
  "n_layer": 32,
  "n_positions": 2048,
  "pad_token_id": 0,
  "pretraining_tp": 1,
  "quantization_config": {
    "batch_size": 1,
    "bits": 4,
    "block_name_to_quantize": null,
    "cache_block_outputs": true,
    "damp_percent": 0.1,
    "dataset": null,
    "desc_act": true,
    "exllama_config": {
      "version": 1
    },
    "group_size": 128,

### Loading mixtral-8x7B
* https://huggingface.co/mistralai/Mixtral-8x7B-v0.1
* https://huggingface.co/TheBloke/Mixtral-8x7B-v0.1-GPTQ

Quantization
* https://medium.com/@rakeshrajpurohit/model-quantization-with-hugging-face-transformers-and-bitsandbytes-integration-b4c9983e8996

In [7]:
# model_name = "mistralai/Mixtral-8x7B-v0.1"
model_name = 'TheBloke/Mixtral-8x7B-v0.1-GPTQ'
model_alias = model_name.replace('/', '_')

torch.cuda.empty_cache()

# from transformers import BitsAndBytesConfig
# quantization_config = BitsAndBytesConfig(load_in_4bit=4)

# model = AutoModelForCausalLM.from_pretrained(
#     model_name,
#     device_map=0,
#     # torch_dtype=torch.float16,
#     # load_in_4bit=True,
#     # quantization_config=quantization_config,
# )

# print(f"model.config.eos_token_id: {model.config.eos_token_id}")
# eos_token_id = 50256 # https://huggingface.co/microsoft/phi-2/blob/main/config.json

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map=0,
    load_in_4bit=True,
    trust_remote_code=False,
)

tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    use_fast=True,
)

# print(f"tokenizer.eos_token: {tokenizer.eos_token}")
# tokenizer.pad_token = tokenizer.eos_token


print(model.config)

# text = 'Hello my name is'
# inputs = tokenizer(text, return_tensors='pt')
# outputs = model.generate(**inputs, max_new_tokens=20)
# print(tokenizer.decode(outputs[0], skip_special_tokens=True))

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


AttributeError: 'BitsAndBytesConfig' object has no attribute 'get_loading_attributes'

### Loading llama-2

In [3]:
# Loading llama-2

from transformers import LlamaForCausalLM, LlamaTokenizer
import sentencepiece as spm

# model_name = "meta-llama/Llama-2-7b-hf"
model_name = "TheBloke/Llama-2-7B-GPTQ"
# model_name = "TheBloke/Nous-Hermes-Llama-2-7B-GPTQ"
model_alias = model_name.replace("/", "_")

tokenizer = LlamaTokenizer.from_pretrained(model_name, padding_side='left')
model = LlamaForCausalLM.from_pretrained(model_name, device_map=0)

print(type(tokenizer))
print(type(model))
print(model.config)

from auto_gptq import exllama_set_max_input_length
model = exllama_set_max_input_length(model, max_input_length=2400)

You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
The cos_cached attribute will be removed in 4.39. Bear in mind that its contents changed in v4.38. Use the forward method of RoPE from now on instead. It is not used in the `LlamaAttention` class
The sin_cached attribute will be removed in 4.39. Bear in mind that its contents changed in v4.38. Use the forward method of RoPE from now on instead. It is not used in the `LlamaAttention` class


LlamaConfig {
  "_name_or_path": "TheBloke/Llama-2-7B-GPTQ",
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 11008,
  "max_length": 4096,
  "max_position_embeddings": 4096,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "pad_token_id": 0,
  "pretraining_tp": 1,
  "quantization_config": {
    "batch_size": 1,
    "bits": 4,
    "block_name_to_quantize": null,
    "cache_block_outputs": true,
    "damp_percent": 0.01,
    "dataset": null,
    "desc_act": false,
    "exllama_config": {
      "version": 1
    },
    "group_size": 128,
    "max_input_length": null,
    "model_seqlen": null,
    "module_name_preceding_first_block": null,
    "modules_in_block_to_quantize": null,
    "pad_token_id": null,
    "quant_method": "gptq",
  

## Configuring memory (VRAM)

In [4]:
%reload_ext autoreload

from utils.monitoring import calculate_utilization, format_utilization_narrow, print_utilization

total_params = sum(p.numel() for p in model.parameters())
total_trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f'Total/trainable params: {total_params}/{total_trainable_params}')

utilization = calculate_utilization()
print(utilization)
utilization_str = format_utilization_narrow(utilization)
print(utilization_str)
print(
    f"total/used/cuda/res/ram(Gb): {utilization_str['total_memory']}/{utilization_str['memory_used']}/"
    f"{utilization_str['cuda_allocated']}/{utilization_str['cuda_reserved']}/{utilization_str['ram_usage']}"
)

actual_fraction = 0.90
available_memory = utilization['total_memory'] - utilization['memory_used']
recommended_fraction = available_memory / utilization['total_memory']
torch.cuda.set_per_process_memory_fraction(actual_fraction, 0)

print(
    f"Total/used/available memory (Gb): {utilization['total_memory']/1024**3:.2f}/"
    "{utilization['memory_used']/1024**3:.2f}/{available_memory/1024**3:.2f}"
)
print(f'Recommended/actual fraction: {recommended_fraction:.2f}/{actual_fraction:.2f}')

# torch.cuda.empty_cache()
# torch.empty(utilization['total_memory'] // 2, dtype=torch.int8, device='cuda')
# print_utilization()

Total/trainable params: 406291456/406291456
{'total_memory': 10736893952, 'memory_used': 2749890560, 'cuda_allocated': 1625367040, 'cuda_reserved': 1639972864, 'ram_usage': 15176204288}
{'total_memory': '10.00', 'memory_used': '2.56', 'cuda_allocated': '1.51', 'cuda_reserved': '1.53', 'ram_usage': '14.13'}
total/used/cuda/res/ram(Gb): 10.00/2.56/1.51/1.53/14.13
Total/used/available memory (Gb): 10.00/{utilization['memory_used']/1024**3:.2f}/{available_memory/1024**3:.2f}
Recommended/actual fraction: 0.74/0.90


## Test inference

In [6]:
# input_text = "Fix grammatical errors in this sentence: I goes to school every day."
input_text = "Fix grammar: In the case of young people the best way is studying as hard as possible to get better grades so in the future they will have better chance to find better jobs ."

# input_ids = tokenizer(input_text, add_special_tokens=False, return_tensors="pt")
input_ids = tokenizer(input_text, return_tensors="pt").to(device)

outputs = model.generate(**input_ids, max_new_tokens=100)
# outputs = model.generate(
#     **input_ids,
#     do_sample=True,
#     top_k=10,
#     num_return_sequences=1,
#     # eos_token_id=tokenizer.eos_token_id,
#     # return_attention_mask=False,
#     max_length=256,
# )

print(tokenizer.decode(outputs[0], skip_special_tokens=True))

In the case of young people, the best way is to study as hard as possible to get better grades so that in the future, they will have a better chance to find better jobs.


### Metrics

In [5]:
rouge_metric = evaluate.load("rouge")
# glue_metric = evaluate.load("glue", "stsb")
sacreblue_metric = evaluate.load("sacrebleu")
sari_metric = evaluate.load("sari")
em_metric = evaluate.load("exact_match")

In [10]:
# api = HfApi()
# coedit_info = api.dataset_info("grammarly/coedit")
# pprint(coedit_info)

grammarly_dataset = load_dataset("grammarly/coedit")
pprint(grammarly_dataset)

unique_categories = set(grammarly_dataset)
pprint(unique_categories)

unique_tasks = set(grammarly_dataset["train"]["task"])
pprint(unique_tasks)

DatasetDict({
    train: Dataset({
        features: ['_id', 'task', 'src', 'tgt'],
        num_rows: 69071
    })
    validation: Dataset({
        features: ['_id', 'task', 'src', 'tgt'],
        num_rows: 1712
    })
})
{'train', 'validation'}
{'coherence', 'clarity', 'paraphrase', 'simplification', 'gec', 'neutralize'}


In [6]:
def get_samples(dataset, category="validation", task="gec", num_samples=1, seed=42):
    return dataset[category].shuffle(seed=seed).filter(lambda item: item["task"] == task).select(range(num_samples))

def print_samples(samples) -> None:
    for item in samples:
        pfx, src = item["src"].split(": ", 1)
        print(f"[{item['task']}] {pfx}")
        print(f"src: {src}")
        print(f"tgt: {item['tgt']}")


print_samples(get_samples(grammarly_dataset, num_samples=2))

# input_ids = tokenizer(item["task"], return_tensors="pt").input_ids.to(device)
# outputs = model.generate(input_ids, max_length=256)
# corrected = tokenizer.decode(outputs[0], skip_special_tokens=True)
# return {"processed": corrected}


[gec] Fix grammar in this sentence
src: Be careful man!
tgt: Be careful, man!
[gec] Fix grammaticality in this sentence
src: Dear friends, I hope you should correctly but I can gives you some opinion, I guess that is a good idea if you go to a small schools, under you can met a lot on people and there are more closed friend of course you cannot like that opcion if you like the biggest once, so in that ways you can go from the other school.
tgt: Dear friend, I hope you choose correctly but I can give you my opinion. I guess that it's a good idea if you go to a small school, because you can meet a lot of people and make more close friends of course you won't like that option if you like the bigger one, so in that case you should go to the other school.


#### Rouge metric

In [7]:
samples = get_samples(grammarly_dataset, task="gec", num_samples=100)
pprint(samples)
print_samples([samples[0]])

score = rouge_metric.compute(
    predictions=samples['src'], references=samples['tgt']
)
pprint(score)

Dataset({
    features: ['_id', 'task', 'src', 'tgt'],
    num_rows: 100
})
[gec] Fix grammar in this sentence
src: Be careful man!
tgt: Be careful, man!
{'rouge1': 0.7150747966331101,
 'rouge2': 0.5167050375929942,
 'rougeL': 0.7005677840072502,
 'rougeLsum': 0.7007083564381789}


#### _GLUE metric_

In [8]:
samples = get_samples(grammarly_dataset, task="gec", num_samples=2)
pprint(object=samples)
print_samples([samples[0]])

src_input_ids = tokenizer(samples["src"][0], return_tensors="pt", padding=True).input_ids
tgt_input_ids = tokenizer(samples["tgt"][0], return_tensors="pt", padding=True).input_ids
pprint(src_input_ids[0])
pprint(tgt_input_ids[0])

# score = glue_metric.compute(predictions=src_input_ids[0], references=tgt_input_ids[0])
# score = glue_metric.compute(predictions=samples["src"], references=samples["tgt"])
# pprint(score)

Dataset({
    features: ['_id', 'task', 'src', 'tgt'],
    num_rows: 2
})
[gec] Fix grammar in this sentence
src: Be careful man!
tgt: Be careful, man!
tensor([14269, 19519,    16,    48,  7142,    10,   493,  6195,   388,    55,
            1])
tensor([ 493, 6195,    6,  388,   55,    1])


#### SacreBLEU metric

In [12]:
samples = get_samples(grammarly_dataset, task="gec", num_samples=100)
pprint(samples)
print_samples([samples[0]])

score = sacreblue_metric.compute(predictions=samples["src"], references=samples["tgt"])
pprint(score)

Dataset({
    features: ['_id', 'task', 'src', 'tgt'],
    num_rows: 100
})
[gec] Fix grammar in this sentence
src: Be careful man!
tgt: Be careful, man!
{'bp': 1.0,
 'counts': [3886, 2743, 1965, 1419],
 'precisions': [70.79613773000547,
                50.899981443681575,
                37.152580828133864,
                27.346309500867218],
 'ref_len': 5090,
 'score': 43.74251258938969,
 'sys_len': 5489,
 'totals': [5489, 5389, 5289, 5189]}


#### SARI metric

In [14]:
samples = get_samples(grammarly_dataset, task="gec", num_samples=100)
pprint(samples)
print_samples([samples[0]])

new_samples = samples.map(lambda item: {"tgts": [item["tgt"]]})
new_samples["tgts"][:5]

# sources=["About 95 species are currently accepted.","About 95 species are currently accepted."]
# predictions=["About 95 you now get in.","About 95 you now get in."]
# references=[["About 95 species are currently known.","About 95 species are now accepted.","95 species are now accepted."],["About 95 species are currently known.","About 95 species are now accepted.","95 species are now accepted."]]

score = sari_metric.compute(
  sources=new_samples['src'],
  predictions=new_samples['src'],
  references=new_samples['tgts']
)
pprint(score)

NameError: name 'get_samples' is not defined

#### Exact match (EM) metric

In [13]:
samples = get_samples(grammarly_dataset, task="gec", num_samples=100)
pprint(samples)
print_samples([samples[0]])

score = em_metric.compute(
    predictions=samples['tgt'], references=samples['tgt']
)
pprint(score)

NameError: name 'get_samples' is not defined

## Evaluation (IteraTeR)

### IteraTeR
* https://huggingface.co/datasets/wanyu/IteraTeR_v2
* https://huggingface.co/datasets/wanyu/IteraTeR_full_sent

In [7]:
%load_ext autoreload

batch_size = 10
total_samples = 100
samples_map = {
    "fluency": get_iterater_samples_simplified(label="fluency", num_samples=total_samples),
    "clarity": get_iterater_samples_simplified(label="clarity", num_samples=total_samples),
    "coherence": get_iterater_samples_simplified(label="coherence", num_samples=total_samples),
}
print(samples_map)
print(samples_map["fluency"]["task"][0])
print(samples_map["fluency"]["source"][0])
print(samples_map["fluency"]["reference"][0])
print(samples_map["fluency"]["task"][1])

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
max_samples: 5078, num_samples: 100, selected: 100
max_samples: 5106, num_samples: 100, selected: 100


Map:   0%|          | 0/100 [00:00<?, ? examples/s]

max_samples: 1676, num_samples: 100, selected: 100


Map:   0%|          | 0/100 [00:00<?, ? examples/s]

{'fluency': Dataset({
    features: ['task', 'source', 'reference', 'references'],
    num_rows: 100
}), 'clarity': Dataset({
    features: ['task', 'source', 'reference', 'references'],
    num_rows: 100
}), 'coherence': Dataset({
    features: ['task', 'source', 'reference', 'references'],
    num_rows: 100
})}
Fix all grammatical errors:  We don't have enough good Open Source games -- it's a waste to pour all the resources we have into one. :) Wesnoth has dwarves with guns, World of Warcraft'' has gnomes and goblins with explosives and flying machines -- where do you, personally, define the limits of the fantasy genre?
 We don't have enough good Open Source games -- it's a waste to pour all the resources we have into one. :) Wesnoth has dwarves with guns, World of Warcraft'' has gnomes and goblins with explosives and flying machines -- where do you, personally, define the limits of the fantasy genre?
 We don't have enough good Open Source games — it's a waste to pour all the resourc

In [6]:
%%time
%load_ext autoreload

batch_size = 10
total_samples = 100
samples_map = {
    "fluency": get_iterater_samples_simplified(label="fluency", num_samples=total_samples),
    "clarity": get_iterater_samples_simplified(label="clarity", num_samples=total_samples),
    "coherence": get_iterater_samples_simplified(label="coherence", num_samples=total_samples),
}

def model_process(batch, idx, **kwargs):
    num_samples = len(batch["task"])
    start_time = time.time()

    model = kwargs.get("model")
    tokenizer = kwargs.get("tokenizer")
    total_samples = kwargs.get("total_samples")

    input_ids = tokenizer(batch["task"], padding=True, return_tensors="pt").input_ids.to(device)
    # input_ids = tokenizer(batch['task'], return_tensors="pt").input_ids.to(device)
    # input_ids = tokenizer(item['task'], return_tensors="pt").input_ids
    # outputs = model.generate(input_ids, max_length=512)
    outputs = model.generate(input_ids, max_length=312)
    # print(f"outputs: {outputs}")
    processed = tokenizer.batch_decode(outputs, skip_special_tokens=True)

    end_time = time.time()
    elapsed_time = end_time - start_time
    sps = num_samples / elapsed_time
    sps_str = f"{sps:.2f}"

    utilization = calculate_utilization()
    utilization_str = format_utilization_narrow(utilization)
    print(
        f"{idx[0]}-{idx[-1]}/{total_samples} | total/used/cuda/res/ram (Gb): {utilization_str['total_memory']}/{utilization_str['memory_used']}/"
        f"{utilization_str['cuda_allocated']}/{utilization_str['cuda_reserved']}/{utilization_str['ram_usage']} | "
        f"batch/sps: {num_samples}/{sps_str}"
    )

    # return {"processed": processed}
    return {"processed": processed}
    # return {"processed": processed, "utilization": utilization, "tps": tps}
    # return {"processed": processed, "utilization": utilization}


utilization = calculate_utilization()
utilization_str = format_utilization_narrow(utilization)
print(
    f"total/used/cuda/res/ram (Gb): {utilization_str['total_memory']}/{utilization_str['memory_used']}/"
    f"{utilization_str['cuda_allocated']}/{utilization_str['cuda_reserved']}/{utilization_str['ram_usage']}"
)

processed_samples_map = {}

for category, samples in samples_map.items():
    total_samples = len(samples)

    print(f"Processing {total_samples} samples for {category}")

    start_time = time.time()

    processed_samples = samples.map(
        model_process,
        fn_kwargs={
            "model": model,
            "tokenizer": tokenizer,
            "total_samples": total_samples,
        },
        num_proc=1,
        batched=True,
        batch_size=batch_size,
        with_indices=True,
    )

    end_time = time.time()
    elapsed_time = end_time - start_time
    sps = total_samples / elapsed_time
    sps_str = f"{sps:.2f}"

    print(f"Finished processing {total_samples} samples for {category}.")

    utilization = calculate_utilization()
    utilization_str = format_utilization_narrow(utilization)
    print(
        f"{total_samples} | total/used/cuda/res/ram (Gb): {utilization_str['total_memory']}/{utilization_str['memory_used']}/"
        f"{utilization_str['cuda_allocated']}/{utilization_str['cuda_reserved']}/{utilization_str['ram_usage']} | "
        f"sps: {sps_str}"
    )

    processed_samples_map[category] = {
        # "category": category,
        # "samples": samples,
        "samples": processed_samples,
        "sps": sps,
        "utilization": utilization,
    }

# processed_samples = samples.map(model_process, num_proc=torch.cuda.device_count())
# processed_samples = samples.map(
#     model_process,
#     fn_kwargs={
#         "model": model,
#         "tokenizer": tokenizer,
#         "total_samples": total_samples,
#     },
#     num_proc=1,
#     batched=True,
#     batch_size=batch_size,
#     with_indices=True,
# )

processed_fluency_samples = processed_samples_map["fluency"]["samples"]

pprint(processed_fluency_samples)
pprint(processed_fluency_samples["processed"][:2])

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
max_samples: 5078, num_samples: 100, selected: 100
max_samples: 5106, num_samples: 100, selected: 100
max_samples: 1676, num_samples: 100, selected: 100
total/used/cuda/res/ram (Gb): 10.00/2.51/1.51/1.53/14.20
Processing 100 samples for fluency


Map:   0%|          | 0/100 [00:00<?, ? examples/s]

0-9/100 | total/used/cuda/res/ram (Gb): 10.00/4.14/1.52/3.10/17.00 | batch/sps: 10/5.51
10-19/100 | total/used/cuda/res/ram (Gb): 10.00/5.51/1.52/4.53/17.14 | batch/sps: 10/4.31
20-29/100 | total/used/cuda/res/ram (Gb): 10.00/4.56/1.52/3.79/17.01 | batch/sps: 10/2.66
30-39/100 | total/used/cuda/res/ram (Gb): 10.00/4.76/1.52/3.86/17.00 | batch/sps: 10/6.41
40-49/100 | total/used/cuda/res/ram (Gb): 10.00/4.84/1.52/3.86/17.01 | batch/sps: 10/5.89
50-59/100 | total/used/cuda/res/ram (Gb): 10.00/4.90/1.52/3.92/17.02 | batch/sps: 10/3.58
60-69/100 | total/used/cuda/res/ram (Gb): 10.00/7.59/1.52/6.84/16.99 | batch/sps: 10/0.51
70-79/100 | total/used/cuda/res/ram (Gb): 10.00/7.79/1.52/6.91/16.99 | batch/sps: 10/2.00
80-89/100 | total/used/cuda/res/ram (Gb): 10.00/7.78/1.52/6.91/17.00 | batch/sps: 10/3.81
90-99/100 | total/used/cuda/res/ram (Gb): 10.00/7.78/1.52/6.91/17.00 | batch/sps: 10/4.59
Finished processing 100 samples for fluency.
100 | total/used/cuda/res/ram (Gb): 10.00/7.78/1.52/6.91/

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

0-9/100 | total/used/cuda/res/ram (Gb): 10.00/7.81/1.52/6.91/17.32 | batch/sps: 10/7.21
10-19/100 | total/used/cuda/res/ram (Gb): 10.00/7.81/1.52/6.91/17.31 | batch/sps: 10/7.48
20-29/100 | total/used/cuda/res/ram (Gb): 10.00/7.91/1.52/6.91/17.32 | batch/sps: 10/5.87
30-39/100 | total/used/cuda/res/ram (Gb): 10.00/7.90/1.52/6.91/17.35 | batch/sps: 10/4.86
40-49/100 | total/used/cuda/res/ram (Gb): 10.00/7.90/1.52/6.91/17.36 | batch/sps: 10/5.42
50-59/100 | total/used/cuda/res/ram (Gb): 10.00/7.89/1.52/6.91/17.36 | batch/sps: 10/4.43
60-69/100 | total/used/cuda/res/ram (Gb): 10.00/7.89/1.52/6.91/17.38 | batch/sps: 10/5.47
70-79/100 | total/used/cuda/res/ram (Gb): 10.00/7.99/1.52/6.91/17.38 | batch/sps: 10/4.72
80-89/100 | total/used/cuda/res/ram (Gb): 10.00/7.99/1.52/6.91/17.38 | batch/sps: 10/6.99
90-99/100 | total/used/cuda/res/ram (Gb): 10.00/7.99/1.52/6.91/17.38 | batch/sps: 10/8.19
Finished processing 100 samples for clarity.
100 | total/used/cuda/res/ram (Gb): 10.00/7.99/1.52/6.91/

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

0-9/100 | total/used/cuda/res/ram (Gb): 10.00/7.94/1.52/6.91/17.37 | batch/sps: 10/4.56
10-19/100 | total/used/cuda/res/ram (Gb): 10.00/7.90/1.52/6.91/17.34 | batch/sps: 10/2.34
20-29/100 | total/used/cuda/res/ram (Gb): 10.00/7.93/1.52/6.91/17.32 | batch/sps: 10/3.29
30-39/100 | total/used/cuda/res/ram (Gb): 10.00/7.89/1.52/6.91/17.32 | batch/sps: 10/2.72
40-49/100 | total/used/cuda/res/ram (Gb): 10.00/7.92/1.52/6.91/17.19 | batch/sps: 10/1.90
50-59/100 | total/used/cuda/res/ram (Gb): 10.00/7.97/1.52/6.91/17.19 | batch/sps: 10/1.87
60-69/100 | total/used/cuda/res/ram (Gb): 10.00/7.97/1.52/6.91/17.20 | batch/sps: 10/3.67
70-79/100 | total/used/cuda/res/ram (Gb): 10.00/7.96/1.52/6.91/17.20 | batch/sps: 10/2.37
80-89/100 | total/used/cuda/res/ram (Gb): 10.00/7.95/1.52/6.91/17.20 | batch/sps: 10/3.47
90-99/100 | total/used/cuda/res/ram (Gb): 10.00/7.95/1.52/6.91/17.21 | batch/sps: 10/2.36
Finished processing 100 samples for coherence.
100 | total/used/cuda/res/ram (Gb): 10.00/7.95/1.52/6.9

In [11]:
hardware = "HomeDesktop (RTX3080)"

print(f"model_name: {model_name}")
print(f"model_alias: {model_alias}")

total_params = sum(p.numel() for p in model.parameters())
total_trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Total/trainable params: {total_params}/{total_trainable_params}")

all_flats = []
all_scores = []
if os.path.exists("results/all-scores.csv"):
    all_scores = pd.read_csv("results/all-scores.csv").to_dict("records")

all_fulls = []

for category, obj in processed_samples_map.items():
    samples = obj["samples"]
    total_samples = len(samples)

    all_saved_samples = samples.remove_columns(["references"])
    saved_samples = all_saved_samples[:100] if len(all_saved_samples) > 100 else all_saved_samples
    flats_frame = pd.DataFrame.from_records(saved_samples)
    flats_frame.to_json(f"samples/{model_alias}_{category}.json", orient="records")

    scores = calculate_scores(samples)
    # pprint(scores)

    score_paths = [
        "rouge.rouge1",
        # "rouge.rouge2",
        # "rouge.rougeL",
        # "rouge.rougeLsum",
        "sacreblue.score",
        "sari.sari",
        "em.exact_match",
    ]

    normalized_scores = {}
    for k, v in scores.items():
        for k2, v2 in v.items():
            if not isinstance(v2, list):
                # normalized_scores[f"score.{k}.{k2}"] = v2
                path = f"{k}.{k2}"
                if path in score_paths:
                    normalized_scores[f"score.{k}.{k2}"] = v2
    # pprint(normalized_scores)

    normalized_utilization = {}
    for k, v in obj["utilization"].items():
        if not isinstance(v, list):
            normalized_utilization[f"utilization.{k}"] = v
    # print(normalized_utilization)

    flat_dict = {
        "model": model_name,
        "hardware": hardware,
        "total_params": total_params,
        "category": category,
        "total_samples": total_samples,
        "sps": obj["sps"],
    }
    flat_dict.update(normalized_scores)
    flat_dict.update(normalized_utilization)
    # pprint(frame)

    all_flats.append(flat_dict)
    all_scores.append(flat_dict)

    fulls_frame = {
        "category": category,
        "total_samples": total_samples,
        "sps": obj["sps"],
    }
    fulls_frame.update(
        {
            "scores": scores,
            "utilization": obj["utilization"],
        }
    )
    all_fulls.append(fulls_frame)

flats_frame = pd.DataFrame.from_records(all_flats)
flats_frame.to_csv(f"results/{model_alias}.csv", index=False)

scores_frame = pd.DataFrame.from_records(all_scores)
scores_frame.to_csv(f"results/all-scores.csv", index=False)

fulls_dict = {
    "model": model_name,
    "hardware": hardware,
    "total_params": total_params,
}
for full in all_fulls:
    fulls_dict[full["category"]] = full

print(fulls_dict)
fulls_frame = pd.DataFrame.from_records([fulls_dict])
fulls_frame.to_json(f"results/{model_alias}.json", orient="records", index=False)

model_name: iliazlobin/bart-grammarly
model_alias: iliazlobin_bart-grammarly
Total/trainable params: 406291456/406291456
{'model': 'iliazlobin/bart-grammarly', 'hardware': 'HomeDesktop (RTX3080)', 'total_params': 406291456, 'fluency': {'category': 'fluency', 'total_samples': 100, 'sps': 2.1616074151551734, 'scores': {'rouge': {'rouge1': 0.9580054732590608, 'rouge2': 0.931971857420296, 'rougeL': 0.9562312872862376, 'rougeLsum': 0.9560444747346676}, 'sacreblue': {'score': 87.7132314163189, 'counts': [4028, 3754, 3497, 3259], 'totals': [4260, 4160, 4060, 3960], 'precisions': [94.55399061032864, 90.24038461538461, 86.13300492610837, 82.29797979797979], 'bp': 0.9946154877022089, 'sys_len': 4260, 'ref_len': 4283}, 'sari': {'sari': 55.969051747569964}, 'em': {'exact_match': 0.0}}, 'utilization': {'total_memory': 10736893952, 'memory_used': 8351469568, 'cuda_allocated': 1633886720, 'cuda_reserved': 7417626624, 'ram_usage': 18248675328}}, 'clarity': {'category': 'clarity', 'total_samples': 100,