In [1]:
%load_ext autoreload
%load_ext dotenv
%dotenv
!huggingface-cli login --token $HUGGING_FACE_TOKEN

Token has not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /home/izlobin/.cache/huggingface/token
Login successful


In [2]:
%%capture
%pip install transformers evaluate
%pip install nltk absl-py rouge_score
%pip install bleu sacrebleu
%pip install sacremoses
%pip install scipy
%pip install sentencepiece
%pip install optimum auto-gptq
%pip install scikit-learn
%pip install einops
%pip install bitsandbytes
%pip install accelerate

In [2]:
%load_ext autoreload
%autoreload 2

import os
import sys
import time
from pprint import pprint

import evaluate
import numpy as np
import pandas as pd
import torch
from datasets import load_dataset
from huggingface_hub import HfApi
from transformers import (
    AutoModelForCausalLM,
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    BartForCausalLM,
    BartModel,
    BartTokenizer,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    T5ForConditionalGeneration,
    T5Tokenizer,
    T5TokenizerFast,
)

current_dir = os.getcwd()
parent_dir = os.path.dirname(current_dir)
sys.path.append(parent_dir)

from utils.dataset import get_iterater_samples_simplified, get_iterater_samples_with_instruction
from utils.metric import calculate_scores
from utils.monitoring import calculate_utilization, format_utilization_narrow, print_utilization

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
pprint(f"Device: {device}")

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
'Device: cuda'


## Loading models

### Loading custom gpt2 / gpt2-large

In [3]:
import torch
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

model_name = "gpt2-large"
model_repo = f"openai-community"
model_id = f"{model_repo}/{model_name}"
model_checkpoint = f"{model_repo}/{model_name}"
model_path = f"{model_repo}_{model_name}"
print(f"model_name: {model_name}," f"model_id: {model_id}," f"model_path: {model_path}")


tokenizer = AutoTokenizer.from_pretrained(
    model_id,
    use_fast=True,
)
tokenizer.add_eos_token = True
tokenizer.pad_token_id = tokenizer.eos_token_id
tokenizer.padding_side = "left"

print(type(tokenizer))


bnb_config = BitsAndBytesConfig(
    load_in_4bit=True, bnb_4bit_use_double_quant=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch.bfloat16
)

# model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=bnb_config, device_map={"":0})
# model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=bnb_config, device_map=0)
model = AutoModelForCausalLM.from_pretrained(model_id, device_map=0)
print(type(model))
# print(model.config)
# print(model)

trained_model_name = "gpt2-large-bnb8-coedit"
trained_model_repo = f"iliazlobin"
trained_model_id = f"{trained_model_repo}/{trained_model_name}"
trained_model_checkpoint = f"{trained_model_repo}/{trained_model_name}"
trained_model_path = f"{trained_model_repo}_{trained_model_name}"
print(
    f"trained_model_name: {trained_model_name},"
    f"trained_model_id: {trained_model_id},"
    f"trained_model_path: {trained_model_path}"
)


adapters_path = f"../model-train/model-{trained_model_repo}_{trained_model_name}"
peft_model = PeftModel.from_pretrained(model, adapters_path)

print(type(peft_model))
print(peft_model.config)
print(peft_model)

origin_model = model
model = peft_model
model_name = trained_model_name
model_repo = trained_model_repo
model_id = trained_model_id
model_checkpoint = trained_model_checkpoint
model_path = trained_model_path
print(f"model_name: {model_name}," f"model_id: {model_id}," f"model_path: {model_path}")

model_name: gpt2-large,model_id: openai-community/gpt2-large,model_path: openai-community_gpt2-large
<class 'transformers.models.gpt2.tokenization_gpt2_fast.GPT2TokenizerFast'>
<class 'transformers.models.gpt2.modeling_gpt2.GPT2LMHeadModel'>
trained_model_name: gpt2-large-bnb8-coedit,trained_model_id: iliazlobin/gpt2-large-bnb8-coedit,trained_model_path: iliazlobin_gpt2-large-bnb8-coedit
<class 'peft.peft_model.PeftModelForCausalLM'>
GPT2Config {
  "_name_or_path": "openai-community/gpt2-large",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 1280,
  "n_head": 20,
  "n_inner": null,
  "n_layer": 36,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": tr



### Loading BART
* https://huggingface.co/facebook/bart-large

In [3]:
model_name = "facebook/bart-large"
model_alias = model_name.replace("/", "_")
tokenizer = BartTokenizer.from_pretrained(model_name)
# model = BartModel.from_pretrained(model_name)
# model = BartForCausalLM.from_pretrained(model_name, device_map=0)
model = BartForConditionalGeneration.from_pretrained(model_name, device_map=0)

print(type(tokenizer))
print(type(model))
print(model.config)

<class 'transformers.models.bart.tokenization_bart.BartTokenizer'>
<class 'transformers.models.bart.modeling_bart.BartForConditionalGeneration'>
BartConfig {
  "_name_or_path": "facebook/bart-large",
  "activation_dropout": 0.1,
  "activation_function": "gelu",
  "add_bias_logits": false,
  "add_final_layer_norm": false,
  "architectures": [
    "BartModel"
  ],
  "attention_dropout": 0.1,
  "bos_token_id": 0,
  "classif_dropout": 0.1,
  "classifier_dropout": 0.0,
  "d_model": 1024,
  "decoder_attention_heads": 16,
  "decoder_ffn_dim": 4096,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 12,
  "decoder_start_token_id": 2,
  "dropout": 0.1,
  "early_stopping": true,
  "encoder_attention_heads": 16,
  "encoder_ffn_dim": 4096,
  "encoder_layerdrop": 0.0,
  "encoder_layers": 12,
  "eos_token_id": 2,
  "forced_bos_token_id": 0,
  "forced_eos_token_id": 2,
  "gradient_checkpointing": false,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2"
  },
  "init_std": 0.02,
 

In [3]:
model_name = "iliazlobin/bart-grammarly"
model_alias = model_name.replace("/", "_")
tokenizer = BartTokenizer.from_pretrained("facebook/bart-large")
# model = BartModel.from_pretrained(model_name)
# model = BartForCausalLM.from_pretrained(model_name, device_map=0)
model = BartForConditionalGeneration.from_pretrained(model_name, device_map=0)

print(type(tokenizer))
print(type(model))
print(model.config)

<class 'transformers.models.bart.tokenization_bart.BartTokenizer'>
<class 'transformers.models.bart.modeling_bart.BartForConditionalGeneration'>
BartConfig {
  "_name_or_path": "iliazlobin/bart-grammarly",
  "activation_dropout": 0.1,
  "activation_function": "gelu",
  "add_bias_logits": false,
  "add_final_layer_norm": false,
  "architectures": [
    "BartForConditionalGeneration"
  ],
  "attention_dropout": 0.1,
  "bos_token_id": 0,
  "classif_dropout": 0.1,
  "classifier_dropout": 0.0,
  "d_model": 1024,
  "decoder_attention_heads": 16,
  "decoder_ffn_dim": 4096,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 12,
  "decoder_start_token_id": 2,
  "dropout": 0.1,
  "early_stopping": true,
  "encoder_attention_heads": 16,
  "encoder_ffn_dim": 4096,
  "encoder_layerdrop": 0.0,
  "encoder_layers": 12,
  "eos_token_id": 2,
  "forced_bos_token_id": 0,
  "forced_eos_token_id": 2,
  "gradient_checkpointing": false,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2"
 

### Loading t5
* https://huggingface.co/google-t5/t5-large

In [3]:
model_name = "google-t5/t5-large"
model_alias = model_name.replace("/", "_")

model = T5ForConditionalGeneration.from_pretrained(model_name, device_map=0)
tokenizer = T5TokenizerFast.from_pretrained(model_name, model_max_length=512)

print(type(tokenizer))
print(type(model))
print(model.config)

<class 'transformers.models.t5.tokenization_t5_fast.T5TokenizerFast'>
<class 'transformers.models.t5.modeling_t5.T5ForConditionalGeneration'>
T5Config {
  "_name_or_path": "google-t5/t5-large",
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "classifier_dropout": 0.0,
  "d_ff": 4096,
  "d_kv": 64,
  "d_model": 1024,
  "decoder_start_token_id": 0,
  "dense_act_fn": "relu",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "relu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": false,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 24,
  "num_heads": 16,
  "num_layers": 24,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalty": 2.0,
      "max_length": 200,
      "min_length": 30,
      "no_repeat_ngram_siz

### Loading coedit / flan-t5

In [6]:
model_name = "google/flan-t5-large"
model_alias = model_name.replace("/", "_")

model = T5ForConditionalGeneration.from_pretrained(model_name, device_map=0)
tokenizer = AutoTokenizer.from_pretrained(model_name)

print(type(tokenizer))
print(type(model))
print(model.config)

<class 'transformers.models.t5.tokenization_t5_fast.T5TokenizerFast'>
<class 'transformers.models.t5.modeling_t5.T5ForConditionalGeneration'>
T5Config {
  "_name_or_path": "google/flan-t5-large",
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "classifier_dropout": 0.0,
  "d_ff": 2816,
  "d_kv": 64,
  "d_model": 1024,
  "decoder_start_token_id": 0,
  "dense_act_fn": "gelu_new",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "gated-gelu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": true,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 24,
  "num_heads": 16,
  "num_layers": 24,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "tie_word_embeddings": false,
  "transformers_version": "4.39.3",
  "use_cache": true,
  "vocab_size": 32128
}



### Loading grammarly/coedit

In [3]:
model_name = "grammarly/coedit-large"
model_alias = model_name.replace("/", "_")

model = T5ForConditionalGeneration.from_pretrained(model_name, device_map=0)
tokenizer = AutoTokenizer.from_pretrained(model_name)

print(type(tokenizer))
print(type(model))
print(model.config)

<class 'transformers.models.t5.tokenization_t5_fast.T5TokenizerFast'>
<class 'transformers.models.t5.modeling_t5.T5ForConditionalGeneration'>
T5Config {
  "_name_or_path": "grammarly/coedit-large",
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "classifier_dropout": 0.0,
  "d_ff": 2816,
  "d_kv": 64,
  "d_model": 1024,
  "decoder_start_token_id": 0,
  "dense_act_fn": "gelu_new",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "gated-gelu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": true,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 24,
  "num_heads": 16,
  "num_layers": 24,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "tie_word_embeddings": false,
  "torch_dtype": "float32",
  "transformers_version": "4.39.3",
  "use_cache": true,
  "vocab_size": 32100
}



### Loading Gemma

In [18]:
model_name = "google/gemma-2b-it"
# model_name = "google/gemma-7b-it"
# model_name = "google/gemma-7b"
model_alias = model_name.replace("/", "_")

torch.cuda.empty_cache()

tokenizer = AutoTokenizer.from_pretrained(model_name)

model = T5ForConditionalGeneration.from_pretrained(model_name, device_map=0)
# model = AutoModelForCausalLM.from_pretrained(model_name)
# model = AutoModelForCausalLM.from_pretrained(
#     model_name,
#     torch_dtype=torch.bfloat16,
#     # torch_dtype=torch.float16,
#     # revision="float16",
#     device_map="auto",
# )

print(model.config)

T5Config {
  "_name_or_path": "google/flan-t5-large",
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "classifier_dropout": 0.0,
  "d_ff": 2816,
  "d_kv": 64,
  "d_model": 1024,
  "decoder_start_token_id": 0,
  "dense_act_fn": "gelu_new",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "gated-gelu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": true,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 24,
  "num_heads": 16,
  "num_layers": 24,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "tie_word_embeddings": false,
  "transformers_version": "4.39.3",
  "use_cache": true,
  "vocab_size": 32128
}



### Loading Phi-2
* https://huggingface.co/microsoft/phi-2
* https://huggingface.co/TheBloke/phi-2-GPTQ

In [3]:
# model_name = 'microsoft/phi-2'
model_name = 'TheBloke/phi-2-GPTQ'
model_alias = model_name.replace('/', '_')

torch.cuda.empty_cache()

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map=0,
    trust_remote_code=True,
)

# print(f"model.config.eos_token_id: {model.config.eos_token_id}")
# eos_token_id = 50256 # https://huggingface.co/microsoft/phi-2/blob/main/config.json

tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    trust_remote_code=True,
)

print(f"tokenizer.eos_token: {tokenizer.eos_token}")
tokenizer.pad_token = tokenizer.eos_token

# model = AutoModelForCausalLM.from_pretrained(model_name)
# model = AutoModelForCausalLM.from_pretrained(
#     model_name,
#     torch_dtype=torch.bfloat16,
#     # torch_dtype=torch.float16,
#     # revision="float16",
#     device_map="auto",
# )

print(model.config)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


tokenizer.eos_token: <|endoftext|>
PhiConfig {
  "_name_or_path": "TheBloke/phi-2-GPTQ",
  "activation_function": "gelu_new",
  "architectures": [
    "PhiForCausalLM"
  ],
  "attn_pdrop": 0.0,
  "auto_map": {
    "AutoConfig": "TheBloke/phi-2-GPTQ--configuration_phi.PhiConfig",
    "AutoModelForCausalLM": "TheBloke/phi-2-GPTQ--modeling_phi.PhiForCausalLM"
  },
  "embd_pdrop": 0.0,
  "flash_attn": false,
  "flash_rotary": false,
  "fused_dense": false,
  "img_processor": null,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "phi-msft",
  "n_embd": 2560,
  "n_head": 32,
  "n_head_kv": null,
  "n_inner": null,
  "n_layer": 32,
  "n_positions": 2048,
  "pad_token_id": 0,
  "pretraining_tp": 1,
  "quantization_config": {
    "batch_size": 1,
    "bits": 4,
    "block_name_to_quantize": null,
    "cache_block_outputs": true,
    "damp_percent": 0.1,
    "dataset": null,
    "desc_act": true,
    "exllama_config": {
      "version": 1
    },
    "group_size": 128,

### Loading mixtral-8x7B
* https://huggingface.co/mistralai/Mixtral-8x7B-v0.1
* https://huggingface.co/TheBloke/Mixtral-8x7B-v0.1-GPTQ

Quantization
* https://medium.com/@rakeshrajpurohit/model-quantization-with-hugging-face-transformers-and-bitsandbytes-integration-b4c9983e8996

In [7]:
# model_name = "mistralai/Mixtral-8x7B-v0.1"
model_name = 'TheBloke/Mixtral-8x7B-v0.1-GPTQ'
model_alias = model_name.replace('/', '_')

torch.cuda.empty_cache()

# from transformers import BitsAndBytesConfig
# quantization_config = BitsAndBytesConfig(load_in_4bit=4)

# model = AutoModelForCausalLM.from_pretrained(
#     model_name,
#     device_map=0,
#     # torch_dtype=torch.float16,
#     # load_in_4bit=True,
#     # quantization_config=quantization_config,
# )

# print(f"model.config.eos_token_id: {model.config.eos_token_id}")
# eos_token_id = 50256 # https://huggingface.co/microsoft/phi-2/blob/main/config.json

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map=0,
    load_in_4bit=True,
    trust_remote_code=False,
)

tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    use_fast=True,
)

# print(f"tokenizer.eos_token: {tokenizer.eos_token}")
# tokenizer.pad_token = tokenizer.eos_token


print(model.config)

# text = 'Hello my name is'
# inputs = tokenizer(text, return_tensors='pt')
# outputs = model.generate(**inputs, max_new_tokens=20)
# print(tokenizer.decode(outputs[0], skip_special_tokens=True))

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


AttributeError: 'BitsAndBytesConfig' object has no attribute 'get_loading_attributes'

### Loading llama-2

In [3]:
# model_name = "meta-llama/Llama-2-7b-hf"
model_name = "TheBloke/Llama-2-7B-GPTQ"
# model_name = "TheBloke/Nous-Hermes-Llama-2-7B-GPTQ"
model_alias = model_name.replace("/", "_")

from transformers import LlamaTokenizer, LlamaForCausalLM

tokenizer = LlamaTokenizer.from_pretrained(model_name, padding_side='left')
model = LlamaForCausalLM.from_pretrained(model_name, device_map=0)

tokenizer.pad_token = tokenizer.eos_token

print(type(tokenizer))
print(type(model))
print(model.config)

# from auto_gptq import exllama_set_max_input_length
# model = exllama_set_max_input_length(model, max_input_length=2400)

You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
The cos_cached attribute will be removed in 4.39. Bear in mind that its contents changed in v4.38. Use the forward method of RoPE from now on instead. It is not used in the `LlamaAttention` class
The sin_cached attribute will be removed in 4.39. Bear in mind that its contents changed in v4.38. Use the forward method of RoPE from now on instead. It is not used in the `LlamaAttention` class


<class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>
<class 'transformers.models.llama.modeling_llama.LlamaForCausalLM'>
LlamaConfig {
  "_name_or_path": "TheBloke/Llama-2-7B-GPTQ",
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 11008,
  "max_length": 4096,
  "max_position_embeddings": 4096,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "pad_token_id": 0,
  "pretraining_tp": 1,
  "quantization_config": {
    "batch_size": 1,
    "bits": 4,
    "block_name_to_quantize": null,
    "cache_block_outputs": true,
    "damp_percent": 0.01,
    "dataset": null,
    "desc_act": false,
    "exllama_config": {
      "version": 1
    },
    "group_size": 128,
    "max_input_length": null,
    "model_seqlen": null,
    "mo

## Configuring memory (VRAM)

In [4]:
%reload_ext autoreload

from utils.monitoring import calculate_utilization, format_utilization_narrow, print_utilization

total_params = sum(p.numel() for p in model.parameters())
total_trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f'Total/trainable params: {total_params}/{total_trainable_params}')

utilization = calculate_utilization()
print(utilization)
utilization_str = format_utilization_narrow(utilization)
print(utilization_str)
print(
    f"total/used/cuda/res/ram(Gb): {utilization_str['total_memory']}/{utilization_str['memory_used']}/"
    f"{utilization_str['cuda_allocated']}/{utilization_str['cuda_reserved']}/{utilization_str['ram_usage']}"
)

actual_fraction = 0.90
available_memory = utilization['total_memory'] - utilization['memory_used']
recommended_fraction = available_memory / utilization['total_memory']
torch.cuda.set_per_process_memory_fraction(actual_fraction, 0)

print(
    f"Total/used/available memory (Gb): {utilization['total_memory']/1024**3:.2f}/"
    "{utilization['memory_used']/1024**3:.2f}/{available_memory/1024**3:.2f}"
)
print(f'Recommended/actual fraction: {recommended_fraction:.2f}/{actual_fraction:.2f}')

# torch.cuda.empty_cache()
# torch.empty(utilization['total_memory'] // 2, dtype=torch.int8, device='cuda')
# print_utilization()

Total/trainable params: 785826560/0
{'total_memory': 10736893952, 'memory_used': 5864493056, 'cuda_allocated': 3257204736, 'cuda_reserved': 3391094784, 'ram_usage': 15431069696}
{'total_memory': '10.00', 'memory_used': '5.46', 'cuda_allocated': '3.03', 'cuda_reserved': '3.16', 'ram_usage': '14.37'}
total/used/cuda/res/ram(Gb): 10.00/5.46/3.03/3.16/14.37
Total/used/available memory (Gb): 10.00/{utilization['memory_used']/1024**3:.2f}/{available_memory/1024**3:.2f}
Recommended/actual fraction: 0.45/0.90


## Test Inference

In [5]:
# input_text = "Fix grammatical errors in this sentence: I goes to school every day."
input_text = "Fix grammar: In the case of young people the best way is studying as hard as possible to get better grades so in the future they will have better chance to find better jobs ."

# input_ids = tokenizer(input_text, add_special_tokens=False, return_tensors="pt")
input_ids = tokenizer(input_text, return_tensors="pt").to(device)

outputs = model.generate(**input_ids, max_new_tokens=100)
# outputs = model.generate(
#     **input_ids,
#     do_sample=True,
#     top_k=10,
#     num_return_sequences=1,
#     # eos_token_id=tokenizer.eos_token_id,
#     # return_attention_mask=False,
#     max_length=256,
# )

print(tokenizer.decode(outputs[0], skip_special_tokens=True))

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Fix grammar: In the case of young people the best way is studying as hard as possible to get better grades so in the future they will have better chance to find better jobs.


In [5]:
input_texts = [
    f"""Remove all grammatical errors from this text: For example, countries with a lot of deserts can terraform their desert to increase their habitable land and using irrigation to provide clean water to the desert.
Response:
""".strip(),
    f"""Fix grammar: He drive a amazing car.
Response:
""".strip(),
]
# input_texts = [
#     "Fix grammar:  We don't have enough good Open Source games -- it's a waste to pour all the resources we have into one. :) Wesnoth has dwarves with guns, World of Warcraft'' has gnomes and goblins with explosives and flying machines -- where do you, personally, define the limits of the fantasy genre?",
#     "Fix grammar in this sentence:  In 2001, they successfully nominated Bohemian Hall, still a vibrant community center/beer garden started by Czech immigrants in Astoria, Queens, and the Casa Amadeo Music Store, the oldest, continuously occupied Latin music store in New York City,  as census sites to the National Register of Historic Places.",
# ]

# input_ids = tokenizer(input_text, add_special_tokens=False, return_tensors="pt")
inputs = tokenizer(input_texts, return_tensors="pt", padding=True).to(device)
print(inputs)

model.config.use_cache = False
# model.config.pad_token_id = model.config.eos_token_id
# outputs = model.generate(**input_ids, max_new_tokens=128)
# outputs = model.generate(**input_ids, max_new_tokens=128, num_return_sequences=1)
outputs = model.generate(**inputs, max_new_tokens=128, pad_token_id=tokenizer.eos_token_id, num_return_sequences=1)
# outputs = model.generate(
#     **inputs,
#     do_sample=True,
#     top_k=10,
#     num_return_sequences=1,
#     pad_token_id=tokenizer.eos_token_id,
#     # return_attention_mask=True,
#     max_length=256,
# )

trimmed_output = outputs[:, inputs.input_ids.shape[1] :]
trimmed_outputs = tokenizer.batch_decode(trimmed_output, skip_special_tokens=True)
print(trimmed_outputs)

{'input_ids': tensor([[27914,   477, 14599, 44935,  8563,   422,   428,  2420,    25,  1114,
          1672,    11,  2678,   351,   257,  1256,   286, 45288,   460,  1059,
           430,   687,   511, 10326,   284,  2620,   511, 49055,  1956,   290,
          1262, 35425,   284,  2148,  3424,  1660,   284,   262, 10326,    13,
           198, 31077,    25],
        [50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 22743, 23491,    25,   679,  3708,   257,  4998,  1097,    13,
           198, 31077,    25]], device='cuda:0'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0,

### Metrics

In [6]:
rouge_metric = evaluate.load("rouge")
# glue_metric = evaluate.load("glue", "stsb")
sacreblue_metric = evaluate.load("sacrebleu")
sari_metric = evaluate.load("sari")
em_metric = evaluate.load("exact_match")

In [8]:
# api = HfApi()
# coedit_info = api.dataset_info("grammarly/coedit")
# pprint(coedit_info)

grammarly_dataset = load_dataset("grammarly/coedit")
pprint(grammarly_dataset)

unique_categories = set(grammarly_dataset)
pprint(unique_categories)

unique_tasks = set(grammarly_dataset["train"]["task"])
pprint(unique_tasks)

DatasetDict({
    train: Dataset({
        features: ['_id', 'task', 'src', 'tgt'],
        num_rows: 69071
    })
    validation: Dataset({
        features: ['_id', 'task', 'src', 'tgt'],
        num_rows: 1712
    })
})
{'validation', 'train'}
{'clarity', 'coherence', 'simplification', 'gec', 'neutralize', 'paraphrase'}


In [18]:
def get_samples(dataset, category="validation", task="gec", num_samples=1, seed=42):
    return dataset[category].shuffle(seed=seed).filter(lambda item: item["task"] == task).select(range(num_samples))

def print_samples(samples) -> None:
    for item in samples:
        pfx, src = item["src"].split(": ", 1)
        print(f"[{item['task']}] {pfx}")
        print(f"src: {src}")
        print(f"tgt: {item['tgt']}")


print_samples(get_samples(grammarly_dataset, num_samples=2))

# input_ids = tokenizer(item["task"], return_tensors="pt").input_ids.to(device)
# outputs = model.generate(input_ids, max_length=256)
# corrected = tokenizer.decode(outputs[0], skip_special_tokens=True)
# return {"processed": corrected}


[gec] Fix grammar in this sentence
src: Be careful man!
tgt: Be careful, man!
[gec] Fix grammaticality in this sentence
src: Dear friends, I hope you should correctly but I can gives you some opinion, I guess that is a good idea if you go to a small schools, under you can met a lot on people and there are more closed friend of course you cannot like that opcion if you like the biggest once, so in that ways you can go from the other school.
tgt: Dear friend, I hope you choose correctly but I can give you my opinion. I guess that it's a good idea if you go to a small school, because you can meet a lot of people and make more close friends of course you won't like that option if you like the bigger one, so in that case you should go to the other school.


#### Rouge metric

In [7]:
samples = get_samples(grammarly_dataset, task="gec", num_samples=100)
pprint(samples)
print_samples([samples[0]])

score = rouge_metric.compute(
    predictions=samples['src'], references=samples['tgt']
)
pprint(score)

Dataset({
    features: ['_id', 'task', 'src', 'tgt'],
    num_rows: 100
})
[gec] Fix grammar in this sentence
src: Be careful man!
tgt: Be careful, man!
{'rouge1': 0.7150747966331101,
 'rouge2': 0.5167050375929942,
 'rougeL': 0.7005677840072502,
 'rougeLsum': 0.7007083564381789}


#### _GLUE metric_

In [8]:
samples = get_samples(grammarly_dataset, task="gec", num_samples=2)
pprint(object=samples)
print_samples([samples[0]])

src_input_ids = tokenizer(samples["src"][0], return_tensors="pt", padding=True).input_ids
tgt_input_ids = tokenizer(samples["tgt"][0], return_tensors="pt", padding=True).input_ids
pprint(src_input_ids[0])
pprint(tgt_input_ids[0])

# score = glue_metric.compute(predictions=src_input_ids[0], references=tgt_input_ids[0])
# score = glue_metric.compute(predictions=samples["src"], references=samples["tgt"])
# pprint(score)

Dataset({
    features: ['_id', 'task', 'src', 'tgt'],
    num_rows: 2
})
[gec] Fix grammar in this sentence
src: Be careful man!
tgt: Be careful, man!
tensor([14269, 19519,    16,    48,  7142,    10,   493,  6195,   388,    55,
            1])
tensor([ 493, 6195,    6,  388,   55,    1])


#### SacreBLEU metric

In [12]:
samples = get_samples(grammarly_dataset, task="gec", num_samples=100)
pprint(samples)
print_samples([samples[0]])

score = sacreblue_metric.compute(predictions=samples["src"], references=samples["tgt"])
pprint(score)

Dataset({
    features: ['_id', 'task', 'src', 'tgt'],
    num_rows: 100
})
[gec] Fix grammar in this sentence
src: Be careful man!
tgt: Be careful, man!
{'bp': 1.0,
 'counts': [3886, 2743, 1965, 1419],
 'precisions': [70.79613773000547,
                50.899981443681575,
                37.152580828133864,
                27.346309500867218],
 'ref_len': 5090,
 'score': 43.74251258938969,
 'sys_len': 5489,
 'totals': [5489, 5389, 5289, 5189]}


#### SARI metric

In [14]:
samples = get_samples(grammarly_dataset, task="gec", num_samples=100)
pprint(samples)
print_samples([samples[0]])

new_samples = samples.map(lambda item: {"tgts": [item["tgt"]]})
new_samples["tgts"][:5]

# sources=["About 95 species are currently accepted.","About 95 species are currently accepted."]
# predictions=["About 95 you now get in.","About 95 you now get in."]
# references=[["About 95 species are currently known.","About 95 species are now accepted.","95 species are now accepted."],["About 95 species are currently known.","About 95 species are now accepted.","95 species are now accepted."]]

score = sari_metric.compute(
  sources=new_samples['src'],
  predictions=new_samples['src'],
  references=new_samples['tgts']
)
pprint(score)

NameError: name 'get_samples' is not defined

#### Exact match (EM) metric

In [13]:
samples = get_samples(grammarly_dataset, task="gec", num_samples=100)
pprint(samples)
print_samples([samples[0]])

score = em_metric.compute(
    predictions=samples['tgt'], references=samples['tgt']
)
pprint(score)

NameError: name 'get_samples' is not defined

## Evaluation
Datasets
* https://huggingface.co/datasets/wanyu/IteraTeR_v2
* https://huggingface.co/datasets/wanyu/IteraTeR_full_sent
* https://huggingface.co/datasets/grammarly/coedit

### Iterater dataset

In [25]:
%%script echo skip
%load_ext autoreload

batch_size = 10
total_samples = 100
samples_map = {
    "fluency": get_iterater_samples_simplified(label="fluency", num_samples=total_samples),
    "clarity": get_iterater_samples_simplified(label="clarity", num_samples=total_samples),
    "coherence": get_iterater_samples_simplified(label="coherence", num_samples=total_samples),
}
print(samples_map)
print(samples_map["fluency"]["task"][0])
print(samples_map["fluency"]["source"][0])
print(samples_map["fluency"]["reference"][0])
print(samples_map["fluency"]["task"][1])

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
max_samples: 5078, num_samples: 100, selected: 100
max_samples: 5106, num_samples: 100, selected: 100
max_samples: 1676, num_samples: 100, selected: 100
{'fluency': Dataset({
    features: ['task', 'source', 'reference', 'references'],
    num_rows: 100
}), 'clarity': Dataset({
    features: ['task', 'source', 'reference', 'references'],
    num_rows: 100
}), 'coherence': Dataset({
    features: ['task', 'source', 'reference', 'references'],
    num_rows: 100
})}
Fix all grammatical errors:  We don't have enough good Open Source games -- it's a waste to pour all the resources we have into one. :) Wesnoth has dwarves with guns, World of Warcraft'' has gnomes and goblins with explosives and flying machines -- where do you, personally, define the limits of the fantasy genre?
 We don't have enough good Open Source games -- it's a waste to pour all the resources we have into one. :) Wesnoth has dwarves w

### Grammarly/coedit dataset

In [9]:
from datasets import DatasetDict, load_dataset

# full_dataset = load_dataset("grammarly/coedit")
# print(full_dataset)

# train_dataset = load_dataset("grammarly/coedit", split="train[:50000]")
# test_dataset = load_dataset("grammarly/coedit", split="train[10000:]")
# # test_dataset = load_dataset("grammarly/coedit", split="validation")

all_dataset = load_dataset("grammarly/coedit", split="train+validation")
print(all_dataset)

# print()
# print(f"train set {set(all_dataset['task'])}")
# print(f"total len: {len(all_dataset)}")
# print(f"gec len: {len(all_dataset.filter(lambda x: x['task'] == 'gec'))}")
# print(f"simplification len: {len(all_dataset.filter(lambda x: x['task'] == 'simplification'))}")
# print(f"clarity len: {len(all_dataset.filter(lambda x: x['task'] == 'clarity'))}")
# print(f"coherence len: {len(all_dataset.filter(lambda x: x['task'] == 'coherence'))}")
# print(f"paraphrase len: {len(all_dataset.filter(lambda x: x['task'] == 'paraphrase'))}")
# print(f"neutralize len: {len(all_dataset.filter(lambda x: x['task'] == 'neutralize'))}")
# print()

gec_dataset = all_dataset.filter(lambda x: x["task"] == "gec")
gec_dataset_split = int(0.9 * len(gec_dataset))
train_gec_dataset = gec_dataset.select(range(0, gec_dataset_split))
test_gec_dataset = gec_dataset.select(range(gec_dataset_split, len(gec_dataset)))

simplification_dataset = all_dataset.filter(lambda x: x["task"] == "simplification")
simplification_dataset_split = int(0.9 * len(simplification_dataset))
train_simplification_dataset = simplification_dataset.select(range(0, simplification_dataset_split))
test_simplification_dataset = simplification_dataset.select(
    range(simplification_dataset_split, len(simplification_dataset))
)

clarity_dataset = all_dataset.filter(lambda x: x["task"] == "clarity")
clarity_dataset_split = int(0.9 * len(clarity_dataset))
train_clarity_dataset = clarity_dataset.select(range(0, clarity_dataset_split))
test_clarity_dataset = clarity_dataset.select(range(clarity_dataset_split, len(clarity_dataset)))

coherence_dataset = all_dataset.filter(lambda x: x["task"] == "coherence")
coherence_dataset_split = int(0.9 * len(coherence_dataset))
train_coherence_dataset = coherence_dataset.select(range(0, coherence_dataset_split))
test_coherence_dataset = coherence_dataset.select(range(coherence_dataset_split, len(coherence_dataset)))

paraphrase_dataset = all_dataset.filter(lambda x: x["task"] == "paraphrase")
paraphrase_dataset_split = int(0.9 * len(paraphrase_dataset))
train_paraphrase_dataset = paraphrase_dataset.select(range(0, paraphrase_dataset_split))
test_paraphrase_dataset = paraphrase_dataset.select(range(paraphrase_dataset_split, len(paraphrase_dataset)))

neutralize_dataset = all_dataset.filter(lambda x: x["task"] == "neutralize")
neutralize_dataset_split = int(0.9 * len(neutralize_dataset))
train_neutralize_dataset = neutralize_dataset.select(range(0, neutralize_dataset_split))
test_neutralize_dataset = neutralize_dataset.select(range(neutralize_dataset_split, len(neutralize_dataset)))

from datasets import concatenate_datasets

train_dataset = concatenate_datasets(
    [
        train_gec_dataset,
        train_simplification_dataset,
        train_clarity_dataset,
        train_coherence_dataset,
        train_paraphrase_dataset,
        train_neutralize_dataset,
    ]
)
print(train_dataset)
print(f"train set {set(train_dataset['task'])}")

test_dataset = concatenate_datasets(
    [
        test_gec_dataset,
        test_simplification_dataset,
        test_clarity_dataset,
        test_coherence_dataset,
        test_paraphrase_dataset,
        test_neutralize_dataset,
    ]
)
print(test_dataset)
print(f"test set {set(test_dataset['task'])}")

dataset = DatasetDict({"train": train_dataset, "test": test_dataset})
# dataset = dataset.rename_column("task", "label")
dataset = dataset.map(
    lambda item: {
        "input": item["src"],
        "reference": item["tgt"],
        "references": [item["tgt"]],
    },
    remove_columns=["src", "tgt", "_id"],
)
print(dataset)
print(dataset["train"][0])

Dataset({
    features: ['_id', 'task', 'src', 'tgt'],
    num_rows: 70783
})
Dataset({
    features: ['_id', 'task', 'src', 'tgt'],
    num_rows: 63703
})
train set {'clarity', 'coherence', 'simplification', 'gec', 'neutralize', 'paraphrase'}
Dataset({
    features: ['_id', 'task', 'src', 'tgt'],
    num_rows: 7080
})
test set {'clarity', 'coherence', 'simplification', 'gec', 'neutralize', 'paraphrase'}
DatasetDict({
    train: Dataset({
        features: ['task', 'input', 'reference', 'references'],
        num_rows: 63703
    })
    test: Dataset({
        features: ['task', 'input', 'reference', 'references'],
        num_rows: 7080
    })
})
{'task': 'gec', 'input': 'Remove all grammatical errors from this text: For example, countries with a lot of deserts can terraform their desert to increase their habitable land and using irrigation to provide clean water to the desert.', 'reference': 'For example, countries with a lot of deserts can transform their desert to increase their hab

In [26]:
# find the longest sequence in the dataset
max_input_length = max(len(tokenizer.encode(item["input"])) for item in dataset["train"])
print(f"max_input_length: {max_input_length}")

max_input_length: 171


In [24]:
from datasets import Dataset

test_lists_map = {}

for task in set(train_dataset['task']):
    test_lists_map[task] = []

for item in dataset["test"]:
    test_lists_map[item["task"]].append(item)

test_dataset_map = {}
for task, list in test_lists_map.items():
    test_dataset_map[task] = Dataset.from_list(list)
# print(test_dataset_map)

test_dataset_dict = DatasetDict(test_dataset_map)
# print(test_dataset_dict)

# for task, ds in test_dataset_dict.items():
#     print(f"{task}: {ds}")

for task in set(train_dataset['task']):
    print(f"test/{task}: {len(test_lists_map[task])}")

clarity - Dataset({
    features: ['task', 'input', 'reference', 'references'],
    num_rows: 126
})
coherence - Dataset({
    features: ['task', 'input', 'reference', 'references'],
    num_rows: 1062
})
simplification - Dataset({
    features: ['task', 'input', 'reference', 'references'],
    num_rows: 1144
})
gec - Dataset({
    features: ['task', 'input', 'reference', 'references'],
    num_rows: 2031
})
neutralize - Dataset({
    features: ['task', 'input', 'reference', 'references'],
    num_rows: 1127
})
paraphrase - Dataset({
    features: ['task', 'input', 'reference', 'references'],
    num_rows: 1590
})
test/clarity: 126
test/coherence: 1062
test/simplification: 1144
test/gec: 2031
test/neutralize: 1127
test/paraphrase: 1590


### Eval encoder-decoder models

In [28]:
%%time
%load_ext autoreload

batch_size = 5
max_length = 256

def model_process(batch, idx, **kwargs):
    num_samples = len(batch["input"])
    start_time = time.time()

    model = kwargs.get("model")
    tokenizer = kwargs.get("tokenizer")
    total_samples = kwargs.get("total_samples")

    input_ids = tokenizer(batch["input"], padding=True, return_tensors="pt").input_ids.to(device)
    # input_ids = tokenizer(batch['task'], return_tensors="pt").input_ids.to(device)
    # input_ids = tokenizer(item['task'], return_tensors="pt").input_ids
    # outputs = model.generate(input_ids, max_length=512)
    outputs = model.generate(input_ids, max_length=max_length)
    # print(f"outputs: {outputs}")
    processed = tokenizer.batch_decode(outputs, skip_special_tokens=True)

    end_time = time.time()
    elapsed_time = end_time - start_time
    sps = num_samples / elapsed_time
    sps_str = f"{sps:.2f}"

    utilization = calculate_utilization()
    utilization_str = format_utilization_narrow(utilization)
    print(
        f"{idx[0]}-{idx[-1]}/{total_samples} | total/used/cuda/res/ram (Gb): {utilization_str['total_memory']}/{utilization_str['memory_used']}/"
        f"{utilization_str['cuda_allocated']}/{utilization_str['cuda_reserved']}/{utilization_str['ram_usage']} | "
        f"batch/sps: {num_samples}/{sps_str}"
    )

    # return {"processed": processed}
    return {"processed": processed}
    # return {"processed": processed, "utilization": utilization, "tps": tps}
    # return {"processed": processed, "utilization": utilization}


utilization = calculate_utilization()
utilization_str = format_utilization_narrow(utilization)
print(
    f"total/used/cuda/res/ram (Gb): {utilization_str['total_memory']}/{utilization_str['memory_used']}/"
    f"{utilization_str['cuda_allocated']}/{utilization_str['cuda_reserved']}/{utilization_str['ram_usage']}"
)

processed_samples_map = {}

for task, samples in test_dataset_dict.items():
    total_samples = len(samples)

    print(f"Processing {total_samples} samples for {task}")

    start_time = time.time()

    processed_samples = samples.map(
        model_process,
        fn_kwargs={
            "model": model,
            "tokenizer": tokenizer,
            "total_samples": total_samples,
        },
        num_proc=1,
        batched=True,
        batch_size=batch_size,
        with_indices=True,
    )

    end_time = time.time()
    elapsed_time = end_time - start_time
    sps = total_samples / elapsed_time
    sps_str = f"{sps:.2f}"

    print(f"Finished processing {total_samples} samples for {task}.")

    utilization = calculate_utilization()
    utilization_str = format_utilization_narrow(utilization)
    print(
        f"{total_samples} | total/used/cuda/res/ram (Gb): {utilization_str['total_memory']}/{utilization_str['memory_used']}/"
        f"{utilization_str['cuda_allocated']}/{utilization_str['cuda_reserved']}/{utilization_str['ram_usage']} | "
        f"sps: {sps_str}"
    )

    processed_samples_map[task] = {
        # "task": task,
        # "samples": samples,
        "samples": processed_samples,
        "sps": sps,
        "utilization": utilization,
    }

# processed_samples = samples.map(model_process, num_proc=torch.cuda.device_count())
# processed_samples = samples.map(
#     model_process,
#     fn_kwargs={
#         "model": model,
#         "tokenizer": tokenizer,
#         "total_samples": total_samples,
#     },
#     num_proc=1,
#     batched=True,
#     batch_size=batch_size,
#     with_indices=True,
# )

processed_fluency_samples = processed_samples_map["fluency"]["samples"]

pprint(processed_fluency_samples)
pprint(processed_fluency_samples["processed"][:2])

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
total/used/cuda/res/ram (Gb): 10.00/7.00/3.04/4.46/20.02
Processing 126 samples for clarity


Map:   0%|          | 0/126 [00:00<?, ? examples/s]

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


0-4/126 | total/used/cuda/res/ram (Gb): 10.00/7.02/3.04/4.46/20.05 | batch/sps: 5/3.91


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


5-9/126 | total/used/cuda/res/ram (Gb): 10.00/7.01/3.04/4.46/20.01 | batch/sps: 5/1.47


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


10-14/126 | total/used/cuda/res/ram (Gb): 10.00/6.93/3.04/4.46/20.02 | batch/sps: 5/1.34


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


15-19/126 | total/used/cuda/res/ram (Gb): 10.00/6.90/3.04/4.46/20.01 | batch/sps: 5/2.80


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


20-24/126 | total/used/cuda/res/ram (Gb): 10.00/6.84/3.04/4.46/20.02 | batch/sps: 5/1.20


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


25-29/126 | total/used/cuda/res/ram (Gb): 10.00/6.84/3.04/4.46/20.01 | batch/sps: 5/1.32
30-34/126 | total/used/cuda/res/ram (Gb): 10.00/6.84/3.04/4.46/20.01 | batch/sps: 5/123.53


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


35-39/126 | total/used/cuda/res/ram (Gb): 10.00/6.83/3.04/4.46/20.02 | batch/sps: 5/1.11


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


40-44/126 | total/used/cuda/res/ram (Gb): 10.00/6.83/3.04/4.46/20.01 | batch/sps: 5/3.38


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


45-49/126 | total/used/cuda/res/ram (Gb): 10.00/6.83/3.04/4.46/20.02 | batch/sps: 5/4.27


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


50-54/126 | total/used/cuda/res/ram (Gb): 10.00/6.78/3.04/4.46/20.07 | batch/sps: 5/1.16


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


55-59/126 | total/used/cuda/res/ram (Gb): 10.00/6.80/3.04/4.46/20.08 | batch/sps: 5/10.09


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


60-64/126 | total/used/cuda/res/ram (Gb): 10.00/6.82/3.04/4.46/20.06 | batch/sps: 5/3.36


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


65-69/126 | total/used/cuda/res/ram (Gb): 10.00/6.84/3.04/4.46/20.06 | batch/sps: 5/5.44


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


70-74/126 | total/used/cuda/res/ram (Gb): 10.00/6.86/3.04/4.46/20.06 | batch/sps: 5/1.28


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


75-79/126 | total/used/cuda/res/ram (Gb): 10.00/6.84/3.04/4.46/20.07 | batch/sps: 5/1.38


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


80-84/126 | total/used/cuda/res/ram (Gb): 10.00/6.90/3.04/4.46/20.07 | batch/sps: 5/1.28


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


85-89/126 | total/used/cuda/res/ram (Gb): 10.00/6.85/3.04/4.46/20.09 | batch/sps: 5/3.54


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


90-94/126 | total/used/cuda/res/ram (Gb): 10.00/6.85/3.04/4.46/20.06 | batch/sps: 5/3.60


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


95-99/126 | total/used/cuda/res/ram (Gb): 10.00/6.85/3.04/4.46/20.06 | batch/sps: 5/5.88


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


100-104/126 | total/used/cuda/res/ram (Gb): 10.00/6.88/3.04/4.46/20.06 | batch/sps: 5/1.12


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


105-109/126 | total/used/cuda/res/ram (Gb): 10.00/6.90/3.04/4.46/20.06 | batch/sps: 5/3.22


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


110-114/126 | total/used/cuda/res/ram (Gb): 10.00/6.88/3.04/4.46/20.06 | batch/sps: 5/1.32


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


115-119/126 | total/used/cuda/res/ram (Gb): 10.00/6.80/3.04/4.46/20.06 | batch/sps: 5/1.20


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


120-124/126 | total/used/cuda/res/ram (Gb): 10.00/6.82/3.04/4.46/20.11 | batch/sps: 5/1.34
125-125/126 | total/used/cuda/res/ram (Gb): 10.00/6.82/3.04/4.46/20.11 | batch/sps: 1/43.74
Finished processing 126 samples for clarity.
126 | total/used/cuda/res/ram (Gb): 10.00/6.82/3.04/4.46/20.11 | sps: 1.82
Processing 1062 samples for coherence


Map:   0%|          | 0/1062 [00:00<?, ? examples/s]

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


0-4/1062 | total/used/cuda/res/ram (Gb): 10.00/6.80/3.04/4.46/20.14 | batch/sps: 5/2.96


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


5-9/1062 | total/used/cuda/res/ram (Gb): 10.00/6.86/3.04/4.46/20.14 | batch/sps: 5/1.11


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


10-14/1062 | total/used/cuda/res/ram (Gb): 10.00/6.86/3.04/4.46/20.14 | batch/sps: 5/5.65


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


15-19/1062 | total/used/cuda/res/ram (Gb): 10.00/6.80/3.04/4.46/20.14 | batch/sps: 5/1.15


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


20-24/1062 | total/used/cuda/res/ram (Gb): 10.00/6.78/3.04/4.46/20.14 | batch/sps: 5/3.41


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


25-29/1062 | total/used/cuda/res/ram (Gb): 10.00/6.79/3.04/4.46/20.14 | batch/sps: 5/2.30


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


30-34/1062 | total/used/cuda/res/ram (Gb): 10.00/6.79/3.04/4.46/20.16 | batch/sps: 5/3.39


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


35-39/1062 | total/used/cuda/res/ram (Gb): 10.00/6.80/3.04/4.46/20.48 | batch/sps: 5/3.35


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


40-44/1062 | total/used/cuda/res/ram (Gb): 10.00/6.79/3.04/4.46/20.39 | batch/sps: 5/2.27


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


45-49/1062 | total/used/cuda/res/ram (Gb): 10.00/6.81/3.04/4.46/20.45 | batch/sps: 5/5.02


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


50-54/1062 | total/used/cuda/res/ram (Gb): 10.00/6.80/3.04/4.46/20.45 | batch/sps: 5/5.00


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


55-59/1062 | total/used/cuda/res/ram (Gb): 10.00/6.80/3.04/4.46/20.45 | batch/sps: 5/3.92


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


60-64/1062 | total/used/cuda/res/ram (Gb): 10.00/6.84/3.04/4.46/20.45 | batch/sps: 5/2.61


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


65-69/1062 | total/used/cuda/res/ram (Gb): 10.00/6.84/3.04/4.46/20.43 | batch/sps: 5/1.17


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


70-74/1062 | total/used/cuda/res/ram (Gb): 10.00/6.85/3.04/4.46/20.44 | batch/sps: 5/1.14


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


75-79/1062 | total/used/cuda/res/ram (Gb): 10.00/6.86/3.04/4.46/20.47 | batch/sps: 5/4.31


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


80-84/1062 | total/used/cuda/res/ram (Gb): 10.00/6.86/3.04/4.46/20.44 | batch/sps: 5/4.41


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


85-89/1062 | total/used/cuda/res/ram (Gb): 10.00/6.83/3.04/4.46/20.43 | batch/sps: 5/1.18


KeyboardInterrupt: 

### Eval decoder-only models

In [29]:
%%time
%load_ext autoreload

batch_size = 20
max_length = 256

def model_process(batch, idx, **kwargs):
    num_samples = len(batch["input"])
    start_time = time.time()

    model = kwargs.get("model")
    tokenizer = kwargs.get("tokenizer")
    total_samples = kwargs.get("total_samples")

    # print(batch["input"])
    inputs = tokenizer(batch["input"], padding=True, return_tensors="pt").to(device)
    # print(inputs)
    # inputs = tokenizer(batch["input"], return_tensors="pt").inputs.to(device)
    # inputs = tokenizer(item["input"], return_tensors="pt").inputs

    # outputs = model.generate(inputs, max_length=512)
    outputs = model.generate(
        **inputs,
        do_sample=True,
        top_k=10,
        num_return_sequences=1,
        pad_token_id=tokenizer.eos_token_id,
        max_length=max_length,
    )
    # print(outputs)

    trimmed_output = outputs[:, inputs.input_ids.shape[1] :]
    processed = tokenizer.batch_decode(trimmed_output, skip_special_tokens=True)
    # print(processed)

    end_time = time.time()
    elapsed_time = end_time - start_time
    sps = num_samples / elapsed_time
    sps_str = f"{sps:.2f}"

    utilization = calculate_utilization()
    utilization_str = format_utilization_narrow(utilization)
    print(
        f"{idx[0]}-{idx[-1]}/{total_samples} | total/used/cuda/res/ram (Gb): {utilization_str['total_memory']}/{utilization_str['memory_used']}/"
        f"{utilization_str['cuda_allocated']}/{utilization_str['cuda_reserved']}/{utilization_str['ram_usage']} | "
        f"batch/sps: {num_samples}/{sps_str}"
    )

    # return {"processed": processed}
    return {"processed": processed}
    # return {"processed": processed, "utilization": utilization, "tps": tps}
    # return {"processed": processed, "utilization": utilization}


utilization = calculate_utilization()
utilization_str = format_utilization_narrow(utilization)
print(
    f"total/used/cuda/res/ram (Gb): {utilization_str['total_memory']}/{utilization_str['memory_used']}/"
    f"{utilization_str['cuda_allocated']}/{utilization_str['cuda_reserved']}/{utilization_str['ram_usage']}"
)

processed_samples_map = {}

for task, samples in test_dataset_dict.items():
    total_samples = len(samples)

    print(f"Processing {total_samples} samples for {task}")

    start_time = time.time()

    processed_samples = samples.map(
        model_process,
        fn_kwargs={
            "model": model,
            "tokenizer": tokenizer,
            "total_samples": total_samples,
        },
        num_proc=1,
        batched=True,
        batch_size=batch_size,
        with_indices=True,
    )

    end_time = time.time()
    elapsed_time = end_time - start_time
    sps = total_samples / elapsed_time
    sps_str = f"{sps:.2f}"

    print(f"Finished processing {total_samples} samples for {task}.")
    print(processed_samples)
    print(processed_samples["reference"][0])
    print(processed_samples["processed"][0])

    utilization = calculate_utilization()
    utilization_str = format_utilization_narrow(utilization)
    print(
        f"{total_samples} | total/used/cuda/res/ram (Gb): {utilization_str['total_memory']}/{utilization_str['memory_used']}/"
        f"{utilization_str['cuda_allocated']}/{utilization_str['cuda_reserved']}/{utilization_str['ram_usage']} | "
        f"sps: {sps_str}"
    )

    processed_samples_map[task] = {
        # "task": task,
        # "samples": samples,
        "samples": processed_samples,
        "sps": sps,
        "utilization": utilization,
    }

# processed_samples = samples.map(model_process, num_proc=torch.cuda.device_count())
# processed_samples = samples.map(
#     model_process,
#     fn_kwargs={
#         "model": model,
#         "tokenizer": tokenizer,
#         "total_samples": total_samples,
#     },
#     num_proc=1,
#     batched=True,
#     batch_size=batch_size,
#     with_indices=True,
# )

processed_fluency_samples = processed_samples_map["fluency"]["samples"]

pprint(processed_fluency_samples)
pprint(processed_fluency_samples["processed"][:2])

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
total/used/cuda/res/ram (Gb): 10.00/6.80/3.31/4.46/20.26
Processing 126 samples for clarity


Parameter 'fn_kwargs'={'model': PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): GPT2LMHeadModel(
      (transformer): GPT2Model(
        (wte): Embedding(50257, 1280)
        (wpe): Embedding(1024, 1280)
        (drop): Dropout(p=0.1, inplace=False)
        (h): ModuleList(
          (0-35): 36 x GPT2Block(
            (ln_1): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
            (attn): GPT2Attention(
              (c_attn): lora.Linear(
                (base_layer): Conv1D()
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.1, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=1280, out_features=64, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=64, out_features=3840, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): Para

Map:   0%|          | 0/126 [00:00<?, ? examples/s]

0-4/126 | total/used/cuda/res/ram (Gb): 10.00/6.81/3.31/4.46/20.27 | batch/sps: 5/22.75
5-9/126 | total/used/cuda/res/ram (Gb): 10.00/6.79/3.31/4.46/20.26 | batch/sps: 5/6.75
10-14/126 | total/used/cuda/res/ram (Gb): 10.00/6.80/3.31/4.46/20.28 | batch/sps: 5/8.73
15-19/126 | total/used/cuda/res/ram (Gb): 10.00/6.78/3.31/4.46/20.32 | batch/sps: 5/7.71
20-24/126 | total/used/cuda/res/ram (Gb): 10.00/6.78/3.31/4.46/20.33 | batch/sps: 5/3.66
25-29/126 | total/used/cuda/res/ram (Gb): 10.00/6.78/3.31/4.46/20.33 | batch/sps: 5/6.32
30-34/126 | total/used/cuda/res/ram (Gb): 10.00/6.78/3.31/4.46/20.33 | batch/sps: 5/115.38
35-39/126 | total/used/cuda/res/ram (Gb): 10.00/6.79/3.31/4.46/20.33 | batch/sps: 5/94.47
40-44/126 | total/used/cuda/res/ram (Gb): 10.00/6.79/3.31/4.46/20.33 | batch/sps: 5/7.65
45-49/126 | total/used/cuda/res/ram (Gb): 10.00/6.79/3.31/4.46/20.33 | batch/sps: 5/119.70
50-54/126 | total/used/cuda/res/ram (Gb): 10.00/6.79/3.31/4.46/20.33 | batch/sps: 5/5.03
55-59/126 | total/u

Map:   0%|          | 0/1062 [00:00<?, ? examples/s]

0-4/1062 | total/used/cuda/res/ram (Gb): 10.00/6.79/3.31/4.46/20.19 | batch/sps: 5/3.78
5-9/1062 | total/used/cuda/res/ram (Gb): 10.00/6.79/3.31/4.46/20.27 | batch/sps: 5/18.27
10-14/1062 | total/used/cuda/res/ram (Gb): 10.00/6.79/3.31/4.46/20.21 | batch/sps: 5/10.30
15-19/1062 | total/used/cuda/res/ram (Gb): 10.00/6.79/3.31/4.46/20.20 | batch/sps: 5/91.49
20-24/1062 | total/used/cuda/res/ram (Gb): 10.00/6.79/3.31/4.46/20.20 | batch/sps: 5/5.73
25-29/1062 | total/used/cuda/res/ram (Gb): 10.00/6.79/3.31/4.46/20.20 | batch/sps: 5/104.25
30-34/1062 | total/used/cuda/res/ram (Gb): 10.00/6.82/3.31/4.46/20.22 | batch/sps: 5/4.27
35-39/1062 | total/used/cuda/res/ram (Gb): 10.00/6.82/3.31/4.46/20.20 | batch/sps: 5/4.85
40-44/1062 | total/used/cuda/res/ram (Gb): 10.00/6.83/3.31/4.46/20.20 | batch/sps: 5/12.24
45-49/1062 | total/used/cuda/res/ram (Gb): 10.00/6.83/3.31/4.46/20.20 | batch/sps: 5/8.38
50-54/1062 | total/used/cuda/res/ram (Gb): 10.00/6.84/3.31/4.46/20.20 | batch/sps: 5/12.83
55-59/1

### Saving eval results

In [27]:
hardware = "HomeDesktop (RTX3080)"
print(f"model_name: {model_name}," f"model_id: {model_id}," f"model_path: {model_path}")

total_params = sum(p.numel() for p in model.parameters())
total_trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Total/trainable params: {total_params}/{total_trainable_params}")

all_flats = []
all_scores = []
if os.path.exists("results/all-scores.csv"):
    all_scores = pd.read_csv("results/all-scores.csv").to_dict("records")

all_fulls = []

for split, obj in processed_samples_map.items():
    samples = obj["samples"]
    total_samples = len(samples)

    all_saved_samples = samples.remove_columns(["references"])
    saved_samples = all_saved_samples[:100] if len(all_saved_samples) > 100 else all_saved_samples
    flats_frame = pd.DataFrame.from_records(saved_samples)
    flats_frame.to_json(f"samples/{model_path}_{split}.json", orient="records")

    scores = calculate_scores(samples)
    # pprint(scores)

    score_paths = [
        "rouge.rouge1",
        # "rouge.rouge2",
        # "rouge.rougeL",
        # "rouge.rougeLsum",
        "sacreblue.score",
        "sari.sari",
        "em.exact_match",
    ]

    normalized_scores = {}
    for task, ds in scores.items():
        for k2, v2 in ds.items():
            if not isinstance(v2, list):
                # normalized_scores[f"score.{k}.{k2}"] = v2
                path = f"{task}.{k2}"
                if path in score_paths:
                    normalized_scores[f"score.{task}.{k2}"] = v2
    # pprint(normalized_scores)

    normalized_utilization = {}
    for task, ds in obj["utilization"].items():
        if not isinstance(ds, list):
            normalized_utilization[f"utilization.{task}"] = ds
    # print(normalized_utilization)

    flat_dict = {
        "model": model_name,
        "hardware": hardware,
        "total_params": total_params,
        "category": split,
        "total_samples": total_samples,
        "sps": obj["sps"],
    }
    flat_dict.update(normalized_scores)
    flat_dict.update(normalized_utilization)
    # pprint(frame)

    all_flats.append(flat_dict)
    all_scores.append(flat_dict)

    fulls_frame = {
        "category": split,
        "total_samples": total_samples,
        "sps": obj["sps"],
    }
    fulls_frame.update(
        {
            "scores": scores,
            "utilization": obj["utilization"],
        }
    )
    all_fulls.append(fulls_frame)

flats_frame = pd.DataFrame.from_records(all_flats)
flats_frame.to_csv(f"results/{model_path}.csv", index=False)

scores_frame = pd.DataFrame.from_records(all_scores)
scores_frame.to_csv(f"results/all-scores.csv", index=False)

fulls_dict = {
    "model": model_name,
    "hardware": hardware,
    "total_params": total_params,
}
for full in all_fulls:
    fulls_dict[full["category"]] = full

print(fulls_dict)
fulls_frame = pd.DataFrame.from_records([fulls_dict])
fulls_frame.to_json(f"results/{model_path}.json", orient="records", index=False)

model_name: gpt2-large-bnb8-coedit,model_id: iliazlobin/gpt2-large-bnb8-coedit,model_path: iliazlobin_gpt2-large-bnb8-coedit
Total/trainable params: 785826560/0
{'model': 'gpt2-large-bnb8-coedit', 'hardware': 'HomeDesktop (RTX3080)', 'total_params': 785826560, 'fluency': {'category': 'fluency', 'total_samples': 10, 'sps': 1.4038752382088802, 'scores': {'rouge': {'rouge1': 0.6285879338544873, 'rouge2': 0.5650044282777548, 'rougeL': 0.6110223542727786, 'rougeLsum': 0.6128952302178108}, 'sacreblue': {'score': 36.89335999040467, 'counts': [182, 160, 143, 126], 'totals': [219, 211, 203, 195], 'precisions': [83.10502283105023, 75.82938388625593, 70.44334975369458, 64.61538461538461], 'bp': 0.5041247574853277, 'sys_len': 219, 'ref_len': 369}, 'sari': {'sari': 29.490016265428093}, 'em': {'exact_match': 0.0}}, 'utilization': {'total_memory': 10736893952, 'memory_used': 8572403712, 'cuda_allocated': 3266646528, 'cuda_reserved': 6719275008, 'ram_usage': 24726765568}}, 'clarity': {'category': 'cla