In [1]:
%load_ext autoreload
%load_ext dotenv
%dotenv
!huggingface-cli login --token $HUGGING_FACE_TOKEN

Token has not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /home/izlobin/.cache/huggingface/token
Login successful


In [2]:
# %%capture
# %pip install nltk absl-py rouge_score
# %pip install bleu sacrebleu
# %pip install sacremoses
# %pip install scipy
# %pip install sentencepiece
# %pip install optimum auto-gptq
# %pip install scikit-learn
# %pip install einops
# %pip install bitsandbytes
# %pip install huggingface_hub
# %pip install transformers evaluate gradio datasets chardet cchardet librosa ipython sentencepiece plotly phonemizer
# %pip install accelerate
# %pip install pynvml

In [2]:
%load_ext autoreload
%autoreload 2

import os
import sys
import time
from pprint import pprint

import evaluate
import numpy as np
import pandas as pd
import torch
from datasets import load_dataset
from huggingface_hub import HfApi
from transformers import (
    AutoModelForCausalLM,
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    BartForCausalLM,
    BartModel,
    BartTokenizer,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    T5ForConditionalGeneration,
    T5Tokenizer,
    T5TokenizerFast,
)

current_dir = os.getcwd()
parent_dir = os.path.dirname(current_dir)
sys.path.append(parent_dir)

from utils.dataset import get_iterater_samples_simplified, get_iterater_samples_with_instruction
from utils.metric import calculate_scores
from utils.monitoring import calculate_utilization, format_utilization_narrow, print_utilization

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
pprint(f"Device: {device}")

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
'Device: cuda'


## Loading models

### Loading custom gpt2 / gpt2-large

In [3]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

model_name = "gpt2-large"
model_repo = f"openai-community"
model_id = f"{model_repo}/{model_name}"
model_checkpoint = f"{model_repo}/{model_name}"
model_path = f"{model_repo}_{model_name}"
print(f"model_name: {model_name}," f"model_id: {model_id}," f"model_path: {model_path}")

tokenizer = AutoTokenizer.from_pretrained(
    model_id,
    use_fast=True,
)
tokenizer.add_eos_token = True
tokenizer.pad_token_id = tokenizer.eos_token_id
tokenizer.padding_side = "left"
print(type(tokenizer))
# print(tokenizer.add_eos_token)
print(tokenizer.eos_token_id)
print(tokenizer.pad_token_id)
print(tokenizer.padding_side)



# bnb_config = BitsAndBytesConfig(
#     load_in_4bit=True, bnb_4bit_use_double_quant=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch.bfloat16
# )

# model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", max_memory={0: "80GiB"})
model = AutoModelForCausalLM.from_pretrained(model_id, device_map=0)
print(type(model))

model.generation_config.max_new_tokens = 0
# model.generation_config.new_tokens = 350
model.generation_config.pad_token_id = model.generation_config.eos_token_id
model.generation_config.padding_side = "left"

print(model.generation_config)
print(model)

model_name: gpt2-large,model_id: openai-community/gpt2-large,model_path: openai-community_gpt2-large
<class 'transformers.models.gpt2.tokenization_gpt2_fast.GPT2TokenizerFast'>
50256
50256
left
<class 'transformers.models.gpt2.modeling_gpt2.GPT2LMHeadModel'>
GenerationConfig {
  "bos_token_id": 50256,
  "eos_token_id": 50256,
  "pad_token_id": 50256,
  "padding_side": "left"
}

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 1280)
    (wpe): Embedding(1024, 1280)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-35): 36 x GPT2Block(
        (ln_1): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
         

In [3]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

model_name = "gpt2-large-coedit"
model_repo = f"iliazlobin"
model_id = f"{model_repo}/{model_name}"
model_checkpoint = f"{model_repo}/{model_name}"
model_path = f"{model_repo}_{model_name}"
print(f"model_name: {model_name}," f"model_id: {model_id}," f"model_path: {model_path}")

tokenizer = AutoTokenizer.from_pretrained(
    model_id,
    use_fast=True,
)
tokenizer.add_eos_token = True
tokenizer.pad_token_id = tokenizer.eos_token_id
tokenizer.padding_side = "left"
print(type(tokenizer))
# print(tokenizer.add_eos_token)
print(tokenizer.eos_token_id)
print(tokenizer.pad_token_id)
print(tokenizer.padding_side)



# bnb_config = BitsAndBytesConfig(
#     load_in_4bit=True, bnb_4bit_use_double_quant=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch.bfloat16
# )

# model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", max_memory={0: "80GiB"})
model = AutoModelForCausalLM.from_pretrained(model_id, device_map=0)
print(type(model))
print(model.config)

# model.generation_config.max_new_tokens = 350
# model.generation_config.new_tokens = 350
model.generation_config.pad_token_id = model.generation_config.eos_token_id
model.generation_config.padding_side = "left"

print(model.generation_config)
print(model)

model_name: gpt2-large-coedit,model_id: iliazlobin/gpt2-large-coedit,model_path: iliazlobin_gpt2-large-coedit
<class 'transformers.models.gpt2.tokenization_gpt2_fast.GPT2TokenizerFast'>
50256
50256
left
<class 'transformers.models.gpt2.modeling_gpt2.GPT2LMHeadModel'>
GPT2Config {
  "_name_or_path": "iliazlobin/gpt2-large-coedit",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 1280,
  "n_head": 20,
  "n_inner": null,
  "n_layer": 36,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_sp

In [None]:
import torch
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

model_name = "gpt2-large"
model_repo = f"openai-community"
model_id = f"{model_repo}/{model_name}"
model_checkpoint = f"{model_repo}/{model_name}"
model_path = f"{model_repo}_{model_name}"
print(f"model_name: {model_name}," f"model_id: {model_id}," f"model_path: {model_path}")


tokenizer = AutoTokenizer.from_pretrained(
    model_id,
    use_fast=True,
)
tokenizer.add_eos_token = True
tokenizer.pad_token_id = tokenizer.eos_token_id
tokenizer.padding_side = "left"

print(type(tokenizer))


bnb_config = BitsAndBytesConfig(
    load_in_4bit=True, bnb_4bit_use_double_quant=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch.bfloat16
)

# model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=bnb_config, device_map={"":0})
# model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=bnb_config, device_map=0)
model = AutoModelForCausalLM.from_pretrained(model_id, device_map=0)
print(type(model))
# print(model.config)
# print(model)

trained_model_name = "gpt2-large-bnb8-coedit"
trained_model_repo = f"iliazlobin"
trained_model_id = f"{trained_model_repo}/{trained_model_name}"
trained_model_checkpoint = f"{trained_model_repo}/{trained_model_name}"
trained_model_path = f"{trained_model_repo}_{trained_model_name}"
print(
    f"trained_model_name: {trained_model_name},"
    f"trained_model_id: {trained_model_id},"
    f"trained_model_path: {trained_model_path}"
)


adapters_path = f"../model-train/model-{trained_model_repo}_{trained_model_name}"
peft_model = PeftModel.from_pretrained(model, adapters_path)

print(type(peft_model))
print(peft_model.config)
print(peft_model)

origin_model = model
model = peft_model
model_name = trained_model_name
model_repo = trained_model_repo
model_id = trained_model_id
model_checkpoint = trained_model_checkpoint
model_path = trained_model_path
print(f"model_name: {model_name}," f"model_id: {model_id}," f"model_path: {model_path}")

### Loading BART
* https://huggingface.co/facebook/bart-large

In [3]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BartForConditionalGeneration, BitsAndBytesConfig

model_name = "bart-large"
model_repo = f"facebook"
model_id = f"{model_repo}/{model_name}"
model_checkpoint = f"{model_repo}/{model_name}"
model_path = f"{model_repo}_{model_name}"
print(f"model_name: {model_name}," f"model_id: {model_id}," f"model_path: {model_path}")

tokenizer = BartTokenizer.from_pretrained(model_id)
tokenizer.add_eos_token = True
# tokenizer.pad_token_id = tokenizer.eos_token_id
tokenizer.padding_side = "right"
print(type(tokenizer))

model = AutoModelForSeq2SeqLM.from_pretrained(model_id, device_map=0)
print(type(model))

model_name: bart-large,model_id: facebook/bart-large,model_path: facebook_bart-large
<class 'transformers.models.bart.tokenization_bart.BartTokenizer'>
<class 'transformers.models.bart.modeling_bart.BartForConditionalGeneration'>


In [3]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BartForConditionalGeneration, BitsAndBytesConfig

model_name = "bart-large-coedit"
model_repo = f"iliazlobin"
model_id = f"{model_repo}/{model_name}"
model_checkpoint = f"{model_repo}/{model_name}"
model_path = f"{model_repo}_{model_name}"
print(f"model_name: {model_name}," f"model_id: {model_id}," f"model_path: {model_path}")

tokenizer = BartTokenizer.from_pretrained(model_id)
tokenizer.add_eos_token = True
# tokenizer.pad_token_id = tokenizer.eos_token_id
tokenizer.padding_side = "right"
print(type(tokenizer))

# model = AutoModelForSeq2SeqLM.from_pretrained(model_id, device_map=0, max_memory={0: "20GIB"})
model = AutoModelForSeq2SeqLM.from_pretrained(model_id, device_map=0)
# model = AutoModelForSeq2SeqLM.from_pretrained(model_id, device_map="auto")
print(type(model))

model_name: bart-large-coedit,model_id: iliazlobin/bart-large-coedit,model_path: iliazlobin_bart-large-coedit
<class 'transformers.models.bart.tokenization_bart.BartTokenizer'>
<class 'transformers.models.bart.modeling_bart.BartForConditionalGeneration'>


In [3]:
model_name = "iliazlobin/bart-grammarly"
model_alias = model_name.replace("/", "_")
tokenizer = BartTokenizer.from_pretrained("facebook/bart-large")
# model = BartModel.from_pretrained(model_name)
# model = BartForCausalLM.from_pretrained(model_name, device_map=0)
model = BartForConditionalGeneration.from_pretrained(model_name, device_map=0)

print(type(tokenizer))
print(type(model))
print(model.config)

<class 'transformers.models.bart.tokenization_bart.BartTokenizer'>
<class 'transformers.models.bart.modeling_bart.BartForConditionalGeneration'>
BartConfig {
  "_name_or_path": "iliazlobin/bart-grammarly",
  "activation_dropout": 0.1,
  "activation_function": "gelu",
  "add_bias_logits": false,
  "add_final_layer_norm": false,
  "architectures": [
    "BartForConditionalGeneration"
  ],
  "attention_dropout": 0.1,
  "bos_token_id": 0,
  "classif_dropout": 0.1,
  "classifier_dropout": 0.0,
  "d_model": 1024,
  "decoder_attention_heads": 16,
  "decoder_ffn_dim": 4096,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 12,
  "decoder_start_token_id": 2,
  "dropout": 0.1,
  "early_stopping": true,
  "encoder_attention_heads": 16,
  "encoder_ffn_dim": 4096,
  "encoder_layerdrop": 0.0,
  "encoder_layers": 12,
  "eos_token_id": 2,
  "forced_bos_token_id": 0,
  "forced_eos_token_id": 2,
  "gradient_checkpointing": false,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2"
 

### Loading google/t5-large
* https://huggingface.co/google-t5/t5-large

In [3]:
model_name = "t5-large"
model_repo = f"google-t5"
model_id = f"{model_repo}/{model_name}"
model_checkpoint = f"{model_repo}/{model_name}"
model_path = f"{model_repo}_{model_name}"
print(f"model_name: {model_name}," f"model_id: {model_id}," f"model_path: {model_path}")

tokenizer = T5TokenizerFast.from_pretrained(model_id)
# tokenizer.add_eos_token = True
# tokenizer.pad_token_id = tokenizer.eos_token_id
# tokenizer.padding_side = "right"
print(type(tokenizer))

# model = AutoModelForSeq2SeqLM.from_pretrained(model_id, device_map=0, max_memory={0: "20GIB"})
model = T5ForConditionalGeneration.from_pretrained(model_id, device_map=0)
# model = AutoModelForSeq2SeqLM.from_pretrained(model_id, device_map="auto")
print(type(model))
print(model.config)

model_name: t5-large,model_id: google-t5/t5-large,model_path: google-t5_t5-large


For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on google-t5/t5-large automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


<class 'transformers.models.t5.tokenization_t5_fast.T5TokenizerFast'>
<class 'transformers.models.t5.modeling_t5.T5ForConditionalGeneration'>
T5Config {
  "_name_or_path": "google-t5/t5-large",
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "classifier_dropout": 0.0,
  "d_ff": 4096,
  "d_kv": 64,
  "d_model": 1024,
  "decoder_start_token_id": 0,
  "dense_act_fn": "relu",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "relu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": false,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 24,
  "num_heads": 16,
  "num_layers": 24,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalty": 2.0,
      "max_length": 200,
      "min_length": 30,
      "no_repeat_ngram_siz

In [3]:
from transformers import BartTokenizer, AutoModelForSeq2SeqLM

model_name = "t5-large-coedit"
model_repo = f"iliazlobin"
model_id = f"{model_repo}/{model_name}"
model_checkpoint = f"{model_repo}/{model_name}"
model_path = f"{model_repo}_{model_name}"
print(f"model_name: {model_name}," f"model_id: {model_id}," f"model_path: {model_path}")

tokenizer = T5TokenizerFast.from_pretrained(model_id)
# tokenizer.add_eos_token = True
# tokenizer.pad_token_id = tokenizer.eos_token_id
# tokenizer.padding_side = "right"
print(type(tokenizer))

model = T5ForConditionalGeneration.from_pretrained(model_id, device_map=0)
# model = AutoModelForSeq2SeqLM.from_pretrained(model_id, device_map=0)
print(type(model))
print(model.config)

model_name: t5-large-coedit,model_id: iliazlobin/t5-large-coedit,model_path: iliazlobin_t5-large-coedit


tokenizer_config.json:   0%|          | 0.00/20.7k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

<class 'transformers.models.t5.tokenization_t5_fast.T5TokenizerFast'>


config.json:   0%|          | 0.00/1.52k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.95G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

<class 'transformers.models.t5.modeling_t5.T5ForConditionalGeneration'>
T5Config {
  "_name_or_path": "iliazlobin/t5-large-coedit",
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "classifier_dropout": 0.0,
  "d_ff": 4096,
  "d_kv": 64,
  "d_model": 1024,
  "decoder_start_token_id": 0,
  "dense_act_fn": "relu",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "relu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": false,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 24,
  "num_heads": 16,
  "num_layers": 24,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalty": 2.0,
      "max_length": 200,
      "min_length": 30,
      "no_repeat_ngram_size": 3,
      "num_beams": 4,
      "prefix": "summarize: "
   

### Loading grammarly/coedit

In [4]:
model_name = "coedit-large"
model_repo = f"grammarly"
model_id = f"{model_repo}/{model_name}"
model_checkpoint = f"{model_repo}/{model_name}"
model_path = f"{model_repo}_{model_name}"
print(f"model_name: {model_name}," f"model_id: {model_id}," f"model_path: {model_path}")


tokenizer = AutoTokenizer.from_pretrained(model_id)
print(type(tokenizer))

model = T5ForConditionalGeneration.from_pretrained(model_id, device_map=0)
print(type(model))
print(model.config)

model_name: coedit-large,model_id: grammarly/coedit-large,model_path: grammarly_coedit-large
<class 'transformers.models.t5.tokenization_t5_fast.T5TokenizerFast'>
<class 'transformers.models.t5.modeling_t5.T5ForConditionalGeneration'>
T5Config {
  "_name_or_path": "grammarly/coedit-large",
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "classifier_dropout": 0.0,
  "d_ff": 2816,
  "d_kv": 64,
  "d_model": 1024,
  "decoder_start_token_id": 0,
  "dense_act_fn": "gelu_new",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "gated-gelu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": true,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 24,
  "num_heads": 16,
  "num_layers": 24,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "tie_word_embeddings": false,
  "torch_dtype": "float32",
  "transformers_ver

### Loading Gemma

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

model_name = "gemma-2b"
# model_name = "gemma-7b-it"
# model_name = "gemma-7b"
model_repo = f"google"
model_id = f"{model_repo}/{model_name}"
model_checkpoint = f"{model_repo}/{model_name}"
model_path = f"{model_repo}_{model_name}"
print(f"model_name: {model_name}," f"model_id: {model_id}," f"model_path: {model_path}")

tokenizer = AutoTokenizer.from_pretrained(
    model_id,
    use_fast=True,
)
# tokenizer.add_eos_token = True
# tokenizer.pad_token_id = tokenizer.eos_token_id
# tokenizer.padding_side = "left"
print(type(tokenizer))
# print(tokenizer.add_eos_token)
print(tokenizer.eos_token_id)
print(tokenizer.pad_token_id)
print(tokenizer.padding_side)


# bnb_config = BitsAndBytesConfig(
#     load_in_4bit=True, bnb_4bit_use_double_quant=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch.bfloat16
# )

# model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", max_memory={0: "80GiB"})
model = AutoModelForCausalLM.from_pretrained(model_id, device_map=0)
print(type(model))

# model.generation_config.max_new_tokens = 350
# # model.generation_config.new_tokens = 350
# model.generation_config.pad_token_id = model.generation_config.eos_token_id
# model.generation_config.padding_side = "left"

print(model.generation_config)
print(model)

In [3]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

from AutoGPTQ.auto_gptq import quantization

model_name = "gemma-2b-coedit"
model_repo = f"iliazlobin"
model_id = f"{model_repo}/{model_name}"
model_checkpoint = f"{model_repo}/{model_name}"
model_path = f"{model_repo}_{model_name}"
print(f"model_name: {model_name}," f"model_id: {model_id}," f"model_path: {model_path}")

tokenizer = AutoTokenizer.from_pretrained(
    model_id,
    use_fast=True,
)
# tokenizer.add_eos_token = True
# tokenizer.pad_token_id = tokenizer.eos_token_id
# tokenizer.padding_side = "left"
print(type(tokenizer))
# print(tokenizer.add_eos_token)
print(tokenizer.eos_token_id)
print(tokenizer.pad_token_id)
print(tokenizer.padding_side)


# bnb_config = BitsAndBytesConfig(
#     load_in_4bit=True, bnb_4bit_use_double_quant=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch.bfloat16
# )
bnb_config = BitsAndBytesConfig(load_in_8bit=True, bnb_4bit_compute_dtype=torch.bfloat16)

# model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", max_memory={0: "80GiB"})
# model = AutoModelForCausalLM.from_pretrained(model_id, device_map=0)
model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=bnb_config, device_map=0)
print(type(model))

# model.generation_config.max_new_tokens = 350
# # model.generation_config.new_tokens = 350
# model.generation_config.pad_token_id = model.generation_config.eos_token_id
# model.generation_config.padding_side = "left"

print(model.generation_config)
print(model)

model_name: gemma-2b-coedit,model_id: iliazlobin/gemma-2b-coedit,model_path: iliazlobin_gemma-2b-coedit


Gemma's activation function should be approximate GeLU and not exact GeLU.
Changing the activation function to `gelu_pytorch_tanh`.if you want to use the legacy `gelu`, edit the `model.config` to set `hidden_activation=gelu`   instead of `hidden_act`. See https://github.com/huggingface/transformers/pull/29402 for more details.


<class 'transformers.models.gemma.tokenization_gemma_fast.GemmaTokenizerFast'>
1
0
left


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

<class 'transformers.models.gemma.modeling_gemma.GemmaForCausalLM'>
GenerationConfig {
  "bos_token_id": 2,
  "eos_token_id": 1,
  "pad_token_id": 0
}

GemmaForCausalLM(
  (model): GemmaModel(
    (embed_tokens): Embedding(256000, 2048, padding_idx=0)
    (layers): ModuleList(
      (0-17): 18 x GemmaDecoderLayer(
        (self_attn): GemmaSdpaAttention(
          (q_proj): Linear8bitLt(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear8bitLt(in_features=2048, out_features=256, bias=False)
          (v_proj): Linear8bitLt(in_features=2048, out_features=256, bias=False)
          (o_proj): Linear8bitLt(in_features=2048, out_features=2048, bias=False)
          (rotary_emb): GemmaRotaryEmbedding()
        )
        (mlp): GemmaMLP(
          (gate_proj): Linear8bitLt(in_features=2048, out_features=16384, bias=False)
          (up_proj): Linear8bitLt(in_features=2048, out_features=16384, bias=False)
          (down_proj): Linear8bitLt(in_features=16384, out_featur

### Loading Phi-2
* https://huggingface.co/microsoft/phi-2
* https://huggingface.co/TheBloke/phi-2-GPTQ

In [3]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

model_name = "phi-2"
model_repo = f"microsoft"
model_id = f"{model_repo}/{model_name}"
model_checkpoint = f"{model_repo}/{model_name}"
model_path = f"{model_repo}_{model_name}"
print(f"model_name: {model_name}," f"model_id: {model_id}," f"model_path: {model_path}")

tokenizer = AutoTokenizer.from_pretrained(
    model_id,
    use_fast=True,
)
# tokenizer.add_eos_token = True
tokenizer.pad_token_id = tokenizer.eos_token_id
# tokenizer.padding_side = "left"
print(type(tokenizer))
# print(tokenizer.add_eos_token)
print(tokenizer.eos_token_id)
print(tokenizer.pad_token_id)
print(tokenizer.padding_side)


# bnb_config = BitsAndBytesConfig(
#     load_in_4bit=True, bnb_4bit_use_double_quant=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch.bfloat16
# )
bnb_config = BitsAndBytesConfig(load_in_8bit=True, bnb_4bit_compute_dtype=torch.bfloat16)

# model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", max_memory={0: "80GiB"})
model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=bnb_config, device_map=0)
print(type(model))

model.generation_config.max_new_tokens = 350
# # model.generation_config.new_tokens = 350
model.generation_config.pad_token_id = model.generation_config.eos_token_id
model.generation_config.padding_side = "left"

print(model.generation_config)
print(model.config)
print(model)

total_params = sum(p.numel() for p in model.parameters())
total_trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Total/trainable params: {total_params}/{total_trainable_params}")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


model_name: phi-2,model_id: microsoft/phi-2,model_path: microsoft_phi-2
<class 'transformers.models.codegen.tokenization_codegen_fast.CodeGenTokenizerFast'>
50256
50256
right


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

<class 'transformers.models.phi.modeling_phi.PhiForCausalLM'>
GenerationConfig {
  "bos_token_id": 50256,
  "eos_token_id": 50256,
  "max_new_tokens": 350,
  "pad_token_id": 50256,
  "padding_side": "left"
}

PhiConfig {
  "_name_or_path": "microsoft/phi-2",
  "architectures": [
    "PhiForCausalLM"
  ],
  "attention_dropout": 0.0,
  "auto_map": {
    "AutoConfig": "microsoft/phi-2--configuration_phi.PhiConfig",
    "AutoModelForCausalLM": "microsoft/phi-2--modeling_phi.PhiForCausalLM"
  },
  "bos_token_id": 50256,
  "embd_pdrop": 0.0,
  "eos_token_id": 50256,
  "hidden_act": "gelu_new",
  "hidden_size": 2560,
  "initializer_range": 0.02,
  "intermediate_size": 10240,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 2048,
  "model_type": "phi",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "partial_rotary_factor": 0.4,
  "qk_layernorm": false,
  "quantization_config": {
    "_load_in_4bit": false,
    "_load_in_8bit": true,
    "bnb_4bit_

In [3]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

model_name = "phi-2-coedit"
model_repo = f"iliazlobin"
model_id = f"{model_repo}/{model_name}"
model_checkpoint = f"{model_repo}/{model_name}"
model_path = f"{model_repo}_{model_name}"
print(f"model_name: {model_name}," f"model_id: {model_id}," f"model_path: {model_path}")

tokenizer = AutoTokenizer.from_pretrained(
    model_id,
    use_fast=True,
)
# tokenizer.add_eos_token = True
tokenizer.pad_token_id = tokenizer.eos_token_id
# tokenizer.padding_side = "left"
print(type(tokenizer))
# print(tokenizer.add_eos_token)
print(tokenizer.eos_token_id)
print(tokenizer.pad_token_id)
print(tokenizer.padding_side)


# bnb_config = BitsAndBytesConfig(
#     load_in_4bit=True, bnb_4bit_use_double_quant=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch.bfloat16
# )
bnb_config = BitsAndBytesConfig(load_in_8bit=True, torch_dtype=torch.bfloat16)

# model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", max_memory={0: "80GiB"})
model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=bnb_config, torch_dtype=torch.bfloat16, device_map=0)
print(type(model))

model.generation_config.max_new_tokens = 350
# # model.generation_config.new_tokens = 350
model.generation_config.pad_token_id = model.generation_config.eos_token_id
model.generation_config.padding_side = "left"

print(model.generation_config)
print(model.config)
print(model)

total_params = sum(p.numel() for p in model.parameters())
total_trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Total/trainable params: {total_params}/{total_trainable_params}")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


model_name: phi-2-coedit,model_id: iliazlobin/phi-2-coedit,model_path: iliazlobin_phi-2-coedit
<class 'transformers.models.codegen.tokenization_codegen_fast.CodeGenTokenizerFast'>
50256
50256
left


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

<class 'transformers.models.phi.modeling_phi.PhiForCausalLM'>
GenerationConfig {
  "bos_token_id": 50256,
  "eos_token_id": 50256,
  "max_new_tokens": 350,
  "pad_token_id": 50256,
  "padding_side": "left"
}

PhiConfig {
  "_name_or_path": "iliazlobin/phi-2-coedit",
  "architectures": [
    "PhiForCausalLM"
  ],
  "attention_dropout": 0.0,
  "auto_map": {
    "AutoConfig": "microsoft/phi-2--configuration_phi.PhiConfig",
    "AutoModelForCausalLM": "microsoft/phi-2--modeling_phi.PhiForCausalLM"
  },
  "bos_token_id": 50256,
  "embd_pdrop": 0.0,
  "eos_token_id": 50256,
  "hidden_act": "gelu_new",
  "hidden_size": 2560,
  "initializer_range": 0.02,
  "intermediate_size": 10240,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 2048,
  "model_type": "phi",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "partial_rotary_factor": 0.4,
  "qk_layernorm": false,
  "quantization_config": {
    "_load_in_4bit": false,
    "_load_in_8bit": true,
    "b

### Loading mixtral-8x7B
* https://huggingface.co/mistralai/Mixtral-8x7B-v0.1
* https://huggingface.co/TheBloke/Mixtral-8x7B-v0.1-GPTQ

Quantization
* https://medium.com/@rakeshrajpurohit/model-quantization-with-hugging-face-transformers-and-bitsandbytes-integration-b4c9983e8996

In [7]:
# model_name = "mistralai/Mixtral-8x7B-v0.1"
model_name = 'TheBloke/Mixtral-8x7B-v0.1-GPTQ'
model_alias = model_name.replace('/', '_')

torch.cuda.empty_cache()

# from transformers import BitsAndBytesConfig
# quantization_config = BitsAndBytesConfig(load_in_4bit=4)

# model = AutoModelForCausalLM.from_pretrained(
#     model_name,
#     device_map=0,
#     # torch_dtype=torch.float16,
#     # load_in_4bit=True,
#     # quantization_config=quantization_config,
# )

# print(f"model.config.eos_token_id: {model.config.eos_token_id}")
# eos_token_id = 50256 # https://huggingface.co/microsoft/phi-2/blob/main/config.json

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map=0,
    load_in_4bit=True,
    trust_remote_code=False,
)

tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    use_fast=True,
)

# print(f"tokenizer.eos_token: {tokenizer.eos_token}")
# tokenizer.pad_token = tokenizer.eos_token


print(model.config)

# text = 'Hello my name is'
# inputs = tokenizer(text, return_tensors='pt')
# outputs = model.generate(**inputs, max_new_tokens=20)
# print(tokenizer.decode(outputs[0], skip_special_tokens=True))

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


AttributeError: 'BitsAndBytesConfig' object has no attribute 'get_loading_attributes'

### Loading llama-2

In [3]:
# model_name = "meta-llama/Llama-2-7b-hf"
model_name = "TheBloke/Llama-2-7B-GPTQ"
# model_name = "TheBloke/Nous-Hermes-Llama-2-7B-GPTQ"
model_alias = model_name.replace("/", "_")

from transformers import LlamaTokenizer, LlamaForCausalLM

tokenizer = LlamaTokenizer.from_pretrained(model_name, padding_side='left')
model = LlamaForCausalLM.from_pretrained(model_name, device_map=0)

tokenizer.pad_token = tokenizer.eos_token

print(type(tokenizer))
print(type(model))
print(model.config)

# from auto_gptq import exllama_set_max_input_length
# model = exllama_set_max_input_length(model, max_input_length=2400)

You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
The cos_cached attribute will be removed in 4.39. Bear in mind that its contents changed in v4.38. Use the forward method of RoPE from now on instead. It is not used in the `LlamaAttention` class
The sin_cached attribute will be removed in 4.39. Bear in mind that its contents changed in v4.38. Use the forward method of RoPE from now on instead. It is not used in the `LlamaAttention` class


<class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>
<class 'transformers.models.llama.modeling_llama.LlamaForCausalLM'>
LlamaConfig {
  "_name_or_path": "TheBloke/Llama-2-7B-GPTQ",
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 11008,
  "max_length": 4096,
  "max_position_embeddings": 4096,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "pad_token_id": 0,
  "pretraining_tp": 1,
  "quantization_config": {
    "batch_size": 1,
    "bits": 4,
    "block_name_to_quantize": null,
    "cache_block_outputs": true,
    "damp_percent": 0.01,
    "dataset": null,
    "desc_act": false,
    "exllama_config": {
      "version": 1
    },
    "group_size": 128,
    "max_input_length": null,
    "model_seqlen": null,
    "mo

## Evaluation

### Grammarly/coedit dataset
* https://huggingface.co/datasets/grammarly/coedit

In [4]:
from datasets import DatasetDict, load_dataset

# full_dataset = load_dataset("grammarly/coedit")
# print(full_dataset)

# train_dataset = load_dataset("grammarly/coedit", split="train[:50000]")
# test_dataset = load_dataset("grammarly/coedit", split="train[10000:]")
# # test_dataset = load_dataset("grammarly/coedit", split="validation")

all_dataset = load_dataset("grammarly/coedit", split="train+validation")
# print(all_dataset)

# print()
# print(f"train set {set(all_dataset['task'])}")
# print(f"total len: {len(all_dataset)}")
# print(f"gec len: {len(all_dataset.filter(lambda x: x['task'] == 'gec'))}")
# print(f"simplification len: {len(all_dataset.filter(lambda x: x['task'] == 'simplification'))}")
# print(f"clarity len: {len(all_dataset.filter(lambda x: x['task'] == 'clarity'))}")
# print(f"coherence len: {len(all_dataset.filter(lambda x: x['task'] == 'coherence'))}")
# print(f"paraphrase len: {len(all_dataset.filter(lambda x: x['task'] == 'paraphrase'))}")
# print(f"neutralize len: {len(all_dataset.filter(lambda x: x['task'] == 'neutralize'))}")
# print()

# train_ratio = 0.01
# test_ratio = 0.001
# train_ratio = 0.1
# test_ratio = 0.01
train_ratio = 0.9
test_ratio = 0.1

gec_dataset = all_dataset.filter(lambda x: x["task"] == "gec")
train_gec_dataset = gec_dataset.select(range(0, int(train_ratio * len(gec_dataset))))
test_gec_dataset = gec_dataset.select(range(int((1 - test_ratio) * len(gec_dataset)), len(gec_dataset)))

simplification_dataset = all_dataset.filter(lambda x: x["task"] == "simplification")
train_simplification_dataset = simplification_dataset.select(range(0, int(train_ratio * len(simplification_dataset))))
test_simplification_dataset = simplification_dataset.select(
    range(int((1 - test_ratio) * len(simplification_dataset)), len(simplification_dataset))
)

clarity_dataset = all_dataset.filter(lambda x: x["task"] == "clarity")
train_clarity_dataset = clarity_dataset.select(range(0, int(train_ratio * len(clarity_dataset))))
test_clarity_dataset = clarity_dataset.select(range(int((1 - test_ratio) * len(clarity_dataset)), len(clarity_dataset)))

coherence_dataset = all_dataset.filter(lambda x: x["task"] == "coherence")
train_coherence_dataset = coherence_dataset.select(range(0, int(train_ratio * len(coherence_dataset))))
test_coherence_dataset = coherence_dataset.select(
    range(int((1 - test_ratio) * len(coherence_dataset)), len(coherence_dataset))
)

paraphrase_dataset = all_dataset.filter(lambda x: x["task"] == "paraphrase")
train_paraphrase_dataset = paraphrase_dataset.select(range(0, int(train_ratio * len(paraphrase_dataset))))
test_paraphrase_dataset = paraphrase_dataset.select(
    range(int((1 - test_ratio) * len(paraphrase_dataset)), len(paraphrase_dataset))
)

neutralize_dataset = all_dataset.filter(lambda x: x["task"] == "neutralize")
neutralize_dataset_split = int(train_ratio * len(neutralize_dataset))
train_neutralize_dataset = neutralize_dataset.select(range(0, int(train_ratio * len(neutralize_dataset))))
test_neutralize_dataset = neutralize_dataset.select(
    range(int((1 - test_ratio) * len(neutralize_dataset)), len(neutralize_dataset))
)

from datasets import concatenate_datasets

train_dataset = concatenate_datasets(
    [
        train_gec_dataset,
        train_simplification_dataset,
        train_clarity_dataset,
        train_coherence_dataset,
        train_paraphrase_dataset,
        train_neutralize_dataset,
    ]
)
train_dataset = train_dataset.map(
    lambda item: {
        "input": item["src"],
        "reference": item["tgt"],
        "references": [item["tgt"]],
    },
    remove_columns=["src", "tgt", "_id"],
)
print(f"train set {set(train_dataset['task'])}")
print(train_dataset)

test_dataset = concatenate_datasets(
    [
        test_gec_dataset,
        test_simplification_dataset,
        test_clarity_dataset,
        test_coherence_dataset,
        test_paraphrase_dataset,
        test_neutralize_dataset,
    ]
)
test_dataset = test_dataset.map(
    lambda item: {
        "input": item["src"],
        "reference": item["tgt"],
        "references": [item["tgt"]],
    },
    remove_columns=["src", "tgt", "_id"],
)
print(f"test set {set(test_dataset['task'])}")
print(test_dataset)


def add_prompt(item):
    return {
        "request": f"{item['input']}\nResponse:",
        "prompt": f"{item['input']}\nResponse:{item['reference']}",
    }


dataset = DatasetDict({"train": train_dataset, "test": test_dataset})
# dataset = dataset.rename_column("task", "label")
dataset = dataset.map(add_prompt)
print(dataset)
print(dataset["train"][0])

train set {'neutralize', 'simplification', 'paraphrase', 'coherence', 'clarity', 'gec'}
Dataset({
    features: ['task', 'input', 'reference', 'references'],
    num_rows: 63703
})
test set {'neutralize', 'simplification', 'paraphrase', 'coherence', 'clarity', 'gec'}
Dataset({
    features: ['task', 'input', 'reference', 'references'],
    num_rows: 7080
})
DatasetDict({
    train: Dataset({
        features: ['task', 'input', 'reference', 'references', 'request', 'prompt'],
        num_rows: 63703
    })
    test: Dataset({
        features: ['task', 'input', 'reference', 'references', 'request', 'prompt'],
        num_rows: 7080
    })
})
{'task': 'gec', 'input': 'Remove all grammatical errors from this text: For example, countries with a lot of deserts can terraform their desert to increase their habitable land and using irrigation to provide clean water to the desert.', 'reference': 'For example, countries with a lot of deserts can transform their desert to increase their habitable

In [5]:
# find the longest sequence in the dataset
max_input_length = max(len(tokenizer.encode(item["input"])) for item in dataset["train"])
print(f"max_input_length train: {max_input_length}")
max_input_length = max(len(tokenizer.encode(item["input"])) for item in dataset["test"])
print(f"max_input_length test: {max_input_length}")

max_input_length train: 171
max_input_length test: 239


In [5]:
from datasets import Dataset

train_lists_map = {}

for task in set(train_dataset['task']):
    train_lists_map[task] = []

for item in dataset["train"]:
    train_lists_map[item["task"]].append(item)

train_dataset_map = {}
for task, l in train_lists_map.items():
    train_dataset_map[task] = Dataset.from_list(l)
# print(train_dataset_map)

train_dataset_dict = DatasetDict(train_dataset_map)
# print(train_dataset_dict)

# for task, ds in train_dataset_dict.items():
#     print(f"{task}: {ds}")

for task in set(train_dataset['task']):
    print(f"train/{task}: {len(train_lists_map[task])}")

train/neutralize: 10143
train/simplification: 10296
train/paraphrase: 14307
train/coherence: 9554
train/clarity: 1126
train/gec: 18277


In [6]:
from datasets import Dataset

test_lists_map = {}

for task in set(train_dataset['task']):
    test_lists_map[task] = []

for item in dataset["test"]:
    test_lists_map[item["task"]].append(item)

test_dataset_map = {}
for task, l in test_lists_map.items():
    test_dataset_map[task] = Dataset.from_list(l)
# print(test_dataset_map)

test_dataset_dict = DatasetDict(test_dataset_map)
# print(test_dataset_dict)

# for task, ds in test_dataset_dict.items():
#     print(f"{task}: {ds}")

for task in set(train_dataset['task']):
    print(f"test/{task}: {len(test_lists_map[task])}")

test/neutralize: 1127
test/simplification: 1144
test/paraphrase: 1590
test/coherence: 1062
test/clarity: 126
test/gec: 2031


In [8]:
print(dataset["test"][0]["task"])
print(dataset["test"][0]["input"])
print(dataset["test"][0]["reference"])
print(dataset["test"][0]["prompt"])
print(dataset["test"][0]["request"])

gec
Fix grammaticality in this sentence: Despite strict japanese society, I feel happy when I had dinner with my family.
Despite the strict Japanese society, I feel happy when I have dinner with my family.
Fix grammaticality in this sentence: Despite strict japanese society, I feel happy when I had dinner with my family.
Response:Despite the strict Japanese society, I feel happy when I have dinner with my family.
Fix grammaticality in this sentence: Despite strict japanese society, I feel happy when I had dinner with my family.
Response:


### Test inference

In [7]:
max_batch = 2
max_length = 350
max_new_tokens = 100

for task, batch in test_dataset_dict.items():
    print()
    print(f">> {task}")
    batch_size = len(batch) if len(batch) < max_batch else max_batch
    input_batch = batch.select(range(batch_size))
    print(f"input: {input_batch['input']}")

    input = tokenizer(input_batch["input"], padding=True, return_tensors="pt").to(device)
    # outputs = model.generate(input.input_ids, max_length=max_length)
    outputs = model.generate(input.input_ids, max_new_tokens=max_new_tokens)
    result = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    print(f"result: {result}")

    break


>> neutralize
input: ['Remove non-neutral POV: new moon received poor reviews from critics.', 'Remove POVs in this text: rhonda shear (born 1954), american television personality, comedienne, and actress']
result: ['new moon received poor reviews from critics.', 'rhonda shear (born 1954), american television personality, comedian, and actress']


In [8]:
max_batch = 2
max_length = 350
max_new_tokens = 350

for task, batch in test_dataset_dict.items():
    print()
    print(f">> {task}")
    batch_size = len(batch) if len(batch) < max_batch else max_batch
    input_batch = batch.select(range(batch_size))
    print(f"request: {input_batch['request']}")
    print(f"reference: {input_batch['reference']}")

    inputs = tokenizer(input_batch['request'], return_tensors="pt", padding=True).to(device)

    model.config.use_cache = False
    # outputs = model.generate(**inputs, max_new_tokens=max_new_tokens, num_return_sequences=1)
    outputs = model.generate(**inputs, num_return_sequences=1)
    # outputs = model.generate(**inputs, pad_token_id=tokenizer.eos_token_id, num_return_sequences=1)
    # outputs = model.generate(
    #     **inputs,
    #     do_sample=True,
    #     top_k=10,
    #     num_return_sequences=1,
    #     pad_token_id=tokenizer.eos_token_id,
    #     # return_attention_mask=True,
    #     max_length=256,
    # )

    trimmed_output = outputs[:, inputs.input_ids.shape[1] :]
    result = tokenizer.batch_decode(trimmed_output, skip_special_tokens=True)
    print(f"result: {result}")

    break


>> neutralize
request: ['Remove non-neutral POV: new moon received poor reviews from critics.\nResponse:', 'Remove POVs in this text: rhonda shear (born 1954), american television personality, comedienne, and actress\nResponse:']
reference: ['new moon received negative reviews from critics.', 'rhonda shear (born 1954), american television personality, comedian, and actress']
result: ['new moon received mixed reviews from critics.', 'rhonda shear (born 1954), american television personality, comedian, and actress']


### Eval prep

In [8]:
%reload_ext autoreload

from utils.monitoring import calculate_utilization, format_utilization_narrow, print_utilization

total_params = sum(p.numel() for p in model.parameters())
total_trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f'Total/trainable params: {total_params}/{total_trainable_params}')

utilization = calculate_utilization()
print(utilization)
utilization_str = format_utilization_narrow(utilization)
print(utilization_str)
print(
    f"total/used/cuda/res/ram(Gb): {utilization_str['total_memory']}/{utilization_str['memory_used']}/"
    f"{utilization_str['cuda_allocated']}/{utilization_str['cuda_reserved']}/{utilization_str['ram_usage']}"
)


torch.cuda.empty_cache()

actual_fraction = 0.95
available_memory = utilization['total_memory'] - utilization['memory_used']
recommended_fraction = available_memory / utilization['total_memory']
torch.cuda.set_per_process_memory_fraction(actual_fraction, 0)

print(
    f"Total/used/available memory (Gb): {utilization['total_memory']/1024**3:.2f}/"
    "{utilization['memory_used']/1024**3:.2f}/{available_memory/1024**3:.2f}"
)
print(f'Recommended/actual fraction: {recommended_fraction:.2f}/{actual_fraction:.2f}')

# torch.cuda.empty_cache()
# torch.empty(utilization['total_memory'] // 2, dtype=torch.int8, device='cuda')
# print_utilization()

Total/trainable params: 737668096/737668096
{'total_memory': 10736893952, 'memory_used': 5470830592, 'cuda_allocated': 2959717888, 'cuda_reserved': 3001024512, 'ram_usage': 15106895872}
{'total_memory': '10.00', 'memory_used': '5.10', 'cuda_allocated': '2.76', 'cuda_reserved': '2.79', 'ram_usage': '14.07'}
total/used/cuda/res/ram(Gb): 10.00/5.10/2.76/2.79/14.07
Total/used/available memory (Gb): 10.00/{utilization['memory_used']/1024**3:.2f}/{available_memory/1024**3:.2f}
Recommended/actual fraction: 0.49/0.95


### Eval encoder-decoder models

In [9]:
%%time
%load_ext autoreload

batch_size = 20 # home: t5, grammarly/coedit
# batch_size = 100
max_length = 350


def model_process(batch, idx, **kwargs):
    num_samples = len(batch["input"])
    start_time = time.time()

    model = kwargs.get("model")
    tokenizer = kwargs.get("tokenizer")
    total_samples = kwargs.get("total_samples")

    inputs = tokenizer(batch["input"], max_length=max_length, padding=True, return_tensors="pt").to(device)
    # input_ids = tokenizer(batch['task'], return_tensors="pt").input_ids.to(device)
    # input_ids = tokenizer(item['task'], return_tensors="pt").input_ids
    # outputs = model.generate(input_ids, max_length=512)
    outputs = model.generate(inputs.input_ids, max_length=max_length)
    # print(f"outputs: {outputs}")
    processed = tokenizer.batch_decode(outputs, skip_special_tokens=True)

    end_time = time.time()
    elapsed_time = end_time - start_time
    sps = num_samples / elapsed_time
    sps_str = f"{sps:.2f}"

    utilization = calculate_utilization()
    utilization_str = format_utilization_narrow(utilization)
    print(
        f"{idx[0]}-{idx[-1]}/{total_samples} | total/used/cuda/res/ram (Gb): {utilization_str['total_memory']}/{utilization_str['memory_used']}/"
        f"{utilization_str['cuda_allocated']}/{utilization_str['cuda_reserved']}/{utilization_str['ram_usage']} | "
        f"batch/sps: {num_samples}/{sps_str}"
    )

    return {"processed": processed}


utilization = calculate_utilization()
utilization_str = format_utilization_narrow(utilization)
print(
    f"total/used/cuda/res/ram (Gb): {utilization_str['total_memory']}/{utilization_str['memory_used']}/"
    f"{utilization_str['cuda_allocated']}/{utilization_str['cuda_reserved']}/{utilization_str['ram_usage']}"
)

processed_samples_map = {}

for task, samples in test_dataset_dict.items():
    total_samples = len(samples)

    print(f"Processing {total_samples} samples for {task}")

    start_time = time.time()

    processed_samples = samples.map(
        model_process,
        fn_kwargs={
            "model": model,
            "tokenizer": tokenizer,
            "total_samples": total_samples,
        },
        num_proc=1,
        batched=True,
        batch_size=batch_size,
        with_indices=True,
    )

    end_time = time.time()
    elapsed_time = end_time - start_time
    sps = total_samples / elapsed_time
    sps_str = f"{sps:.2f}"

    print(f"Finished processing {total_samples} samples for {task}.")

    utilization = calculate_utilization()
    utilization_str = format_utilization_narrow(utilization)
    print(
        f"{total_samples} | total/used/cuda/res/ram (Gb): {utilization_str['total_memory']}/{utilization_str['memory_used']}/"
        f"{utilization_str['cuda_allocated']}/{utilization_str['cuda_reserved']}/{utilization_str['ram_usage']} | "
        f"sps: {sps_str}"
    )

    processed_samples_map[task] = {
        # "task": task,
        # "samples": samples,
        "samples": processed_samples,
        "total_samples": total_samples,
        "elapsed_time": elapsed_time,
        "sps": sps,
        "batch_size": batch_size,
        "max_length": max_length,
        "utilization": utilization,
    }

# processed_samples = samples.map(model_process, num_proc=torch.cuda.device_count())
# processed_samples = samples.map(
#     model_process,
#     fn_kwargs={
#         "model": model,
#         "tokenizer": tokenizer,
#         "total_samples": total_samples,
#     },
#     num_proc=1,
#     batched=True,
#     batch_size=batch_size,
#     with_indices=True,
# )

processed_gec_samples = processed_samples_map["gec"]["samples"]

pprint(processed_gec_samples)
pprint(processed_gec_samples["processed"][:2])

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
total/used/cuda/res/ram (Gb): 10.00/5.10/2.76/2.77/14.08
Processing 1127 samples for neutralize


Map:   0%|          | 0/1127 [00:00<?, ? examples/s]



0-19/1127 | total/used/cuda/res/ram (Gb): 10.00/5.57/2.76/3.25/16.72 | batch/sps: 20/20.54
20-39/1127 | total/used/cuda/res/ram (Gb): 10.00/5.65/2.76/3.33/16.72 | batch/sps: 20/27.30
40-59/1127 | total/used/cuda/res/ram (Gb): 10.00/5.65/2.76/3.33/16.72 | batch/sps: 20/25.34
60-79/1127 | total/used/cuda/res/ram (Gb): 10.00/5.65/2.76/3.33/16.72 | batch/sps: 20/32.37
80-99/1127 | total/used/cuda/res/ram (Gb): 10.00/5.65/2.76/3.33/16.72 | batch/sps: 20/30.04
100-119/1127 | total/used/cuda/res/ram (Gb): 10.00/5.66/2.76/3.33/16.73 | batch/sps: 20/26.21
120-139/1127 | total/used/cuda/res/ram (Gb): 10.00/5.66/2.76/3.33/16.73 | batch/sps: 20/28.13
140-159/1127 | total/used/cuda/res/ram (Gb): 10.00/5.66/2.76/3.33/16.73 | batch/sps: 20/31.43
160-179/1127 | total/used/cuda/res/ram (Gb): 10.00/5.66/2.76/3.33/16.74 | batch/sps: 20/32.94
180-199/1127 | total/used/cuda/res/ram (Gb): 10.00/5.66/2.76/3.33/16.73 | batch/sps: 20/32.22
200-219/1127 | total/used/cuda/res/ram (Gb): 10.00/5.66/2.76/3.33/16.73

Map:   0%|          | 0/1144 [00:00<?, ? examples/s]

0-19/1144 | total/used/cuda/res/ram (Gb): 10.00/7.84/2.76/4.90/19.29 | batch/sps: 20/11.24
20-39/1144 | total/used/cuda/res/ram (Gb): 10.00/7.80/2.76/4.90/19.31 | batch/sps: 20/14.74
40-59/1144 | total/used/cuda/res/ram (Gb): 10.00/7.79/2.76/4.90/19.30 | batch/sps: 20/15.45
60-79/1144 | total/used/cuda/res/ram (Gb): 10.00/7.81/2.76/4.90/19.30 | batch/sps: 20/14.41
80-99/1144 | total/used/cuda/res/ram (Gb): 10.00/7.87/2.76/4.90/19.30 | batch/sps: 20/15.20
100-119/1144 | total/used/cuda/res/ram (Gb): 10.00/7.87/2.76/4.90/19.30 | batch/sps: 20/18.12
120-139/1144 | total/used/cuda/res/ram (Gb): 10.00/7.87/2.76/4.90/19.30 | batch/sps: 20/12.99
140-159/1144 | total/used/cuda/res/ram (Gb): 10.00/7.83/2.76/4.90/19.30 | batch/sps: 20/17.44
160-179/1144 | total/used/cuda/res/ram (Gb): 10.00/7.83/2.76/4.90/19.30 | batch/sps: 20/24.91
180-199/1144 | total/used/cuda/res/ram (Gb): 10.00/7.83/2.76/4.90/19.30 | batch/sps: 20/22.61
200-219/1144 | total/used/cuda/res/ram (Gb): 10.00/7.81/2.76/4.90/19.30

Map:   0%|          | 0/1590 [00:00<?, ? examples/s]

0-19/1590 | total/used/cuda/res/ram (Gb): 10.00/8.03/2.76/5.10/19.30 | batch/sps: 20/18.14
20-39/1590 | total/used/cuda/res/ram (Gb): 10.00/8.03/2.76/5.10/19.31 | batch/sps: 20/21.11
40-59/1590 | total/used/cuda/res/ram (Gb): 10.00/8.03/2.76/5.10/19.31 | batch/sps: 20/16.20
60-79/1590 | total/used/cuda/res/ram (Gb): 10.00/8.03/2.76/5.10/19.30 | batch/sps: 20/13.87
80-99/1590 | total/used/cuda/res/ram (Gb): 10.00/8.03/2.76/5.10/19.30 | batch/sps: 20/14.37
100-119/1590 | total/used/cuda/res/ram (Gb): 10.00/8.03/2.76/5.10/19.30 | batch/sps: 20/19.97
120-139/1590 | total/used/cuda/res/ram (Gb): 10.00/8.04/2.76/5.10/19.32 | batch/sps: 20/15.89
140-159/1590 | total/used/cuda/res/ram (Gb): 10.00/8.07/2.76/5.10/19.30 | batch/sps: 20/19.68
160-179/1590 | total/used/cuda/res/ram (Gb): 10.00/8.07/2.76/5.10/19.30 | batch/sps: 20/15.78
180-199/1590 | total/used/cuda/res/ram (Gb): 10.00/8.10/2.76/5.10/19.29 | batch/sps: 20/22.55
200-219/1590 | total/used/cuda/res/ram (Gb): 10.00/8.11/2.76/5.10/19.30

Map:   0%|          | 0/1062 [00:00<?, ? examples/s]

0-19/1062 | total/used/cuda/res/ram (Gb): 10.00/8.03/2.76/5.10/19.29 | batch/sps: 20/13.08
20-39/1062 | total/used/cuda/res/ram (Gb): 10.00/8.03/2.76/5.10/19.30 | batch/sps: 20/15.56
40-59/1062 | total/used/cuda/res/ram (Gb): 10.00/8.03/2.76/5.10/19.29 | batch/sps: 20/17.91
60-79/1062 | total/used/cuda/res/ram (Gb): 10.00/8.02/2.76/5.10/19.30 | batch/sps: 20/12.71
80-99/1062 | total/used/cuda/res/ram (Gb): 10.00/8.02/2.76/5.10/19.29 | batch/sps: 20/13.12
100-119/1062 | total/used/cuda/res/ram (Gb): 10.00/8.02/2.76/5.10/19.29 | batch/sps: 20/11.31
120-139/1062 | total/used/cuda/res/ram (Gb): 10.00/8.03/2.76/5.10/19.29 | batch/sps: 20/13.72
140-159/1062 | total/used/cuda/res/ram (Gb): 10.00/8.03/2.76/5.10/19.31 | batch/sps: 20/13.91
160-179/1062 | total/used/cuda/res/ram (Gb): 10.00/8.03/2.76/5.10/19.29 | batch/sps: 20/12.78
180-199/1062 | total/used/cuda/res/ram (Gb): 10.00/8.03/2.76/5.10/19.30 | batch/sps: 20/15.90
200-219/1062 | total/used/cuda/res/ram (Gb): 10.00/8.02/2.76/5.10/19.29

Map:   0%|          | 0/126 [00:00<?, ? examples/s]

0-19/126 | total/used/cuda/res/ram (Gb): 10.00/8.01/2.76/5.10/19.29 | batch/sps: 20/9.06
20-39/126 | total/used/cuda/res/ram (Gb): 10.00/8.02/2.76/5.10/19.28 | batch/sps: 20/8.71
40-59/126 | total/used/cuda/res/ram (Gb): 10.00/8.02/2.76/5.10/19.29 | batch/sps: 20/10.34
60-79/126 | total/used/cuda/res/ram (Gb): 10.00/8.02/2.76/5.10/19.29 | batch/sps: 20/9.08
80-99/126 | total/used/cuda/res/ram (Gb): 10.00/8.02/2.76/5.10/19.29 | batch/sps: 20/10.33
100-119/126 | total/used/cuda/res/ram (Gb): 10.00/8.02/2.76/5.10/19.29 | batch/sps: 20/9.76
120-125/126 | total/used/cuda/res/ram (Gb): 10.00/8.02/2.76/5.10/19.32 | batch/sps: 6/3.23
Finished processing 126 samples for clarity.
126 | total/used/cuda/res/ram (Gb): 10.00/8.02/2.76/5.10/19.32 | sps: 6.98
Processing 2031 samples for gec


Map:   0%|          | 0/2031 [00:00<?, ? examples/s]

0-19/2031 | total/used/cuda/res/ram (Gb): 10.00/8.02/2.76/5.10/19.31 | batch/sps: 20/20.47
20-39/2031 | total/used/cuda/res/ram (Gb): 10.00/8.02/2.76/5.10/19.29 | batch/sps: 20/18.85
40-59/2031 | total/used/cuda/res/ram (Gb): 10.00/8.01/2.76/5.10/19.29 | batch/sps: 20/23.79
60-79/2031 | total/used/cuda/res/ram (Gb): 10.00/8.01/2.76/5.10/19.29 | batch/sps: 20/40.78
80-99/2031 | total/used/cuda/res/ram (Gb): 10.00/8.01/2.76/5.10/19.29 | batch/sps: 20/36.09
100-119/2031 | total/used/cuda/res/ram (Gb): 10.00/8.01/2.76/5.10/19.31 | batch/sps: 20/34.27
120-139/2031 | total/used/cuda/res/ram (Gb): 10.00/8.01/2.76/5.10/19.30 | batch/sps: 20/27.51
140-159/2031 | total/used/cuda/res/ram (Gb): 10.00/8.01/2.76/5.10/19.29 | batch/sps: 20/30.90
160-179/2031 | total/used/cuda/res/ram (Gb): 10.00/8.01/2.76/5.10/19.29 | batch/sps: 20/25.17
180-199/2031 | total/used/cuda/res/ram (Gb): 10.00/8.01/2.76/5.10/19.29 | batch/sps: 20/21.73
200-219/2031 | total/used/cuda/res/ram (Gb): 10.00/8.01/2.76/5.10/19.29

### Eval decoder-only models

In [10]:
%%time
%load_ext autoreload

# batch_size = 8  # gpt2-large
batch_size = 10  # gemma-2b, phi-2
max_length = 350
max_new_tokens = 350

def model_process(batch, idx, **kwargs):
    num_samples = len(batch["input"])
    start_time = time.time()

    model = kwargs.get("model")
    tokenizer = kwargs.get("tokenizer")
    total_samples = kwargs.get("total_samples")

    # print(batch["request"])
    inputs = tokenizer(batch["request"], padding=True, return_tensors="pt").to(device)
    # inputs = tokenizer(batch["request"], return_tensors="pt").inputs.to(device)
    # inputs = tokenizer(item["request"], return_tensors="pt").inputs
    # print(inputs)

    model.config.use_cache = False
    # outputs = model.generate(**inputs, max_new_tokens=max_new_tokens, num_return_sequences=1)
    outputs = model.generate(**inputs, num_return_sequences=1)
    # outputs = model.generate(
    #     **inputs,
    #     do_sample=True,
    #     top_k=10,
    #     num_return_sequences=1,
    #     pad_token_id=tokenizer.eos_token_id,
    #     max_length=max_length,
    # )
    # print(outputs)

    trimmed_output = outputs[:, inputs.input_ids.shape[1] :]
    processed = tokenizer.batch_decode(trimmed_output, skip_special_tokens=True)
    # print(processed)

    end_time = time.time()
    elapsed_time = end_time - start_time
    sps = num_samples / elapsed_time
    sps_str = f"{sps:.2f}"

    utilization = calculate_utilization()
    utilization_str = format_utilization_narrow(utilization)
    print(
        f"{idx[0]}-{idx[-1]}/{total_samples} | total/used/cuda/res/ram (Gb): {utilization_str['total_memory']}/{utilization_str['memory_used']}/"
        f"{utilization_str['cuda_allocated']}/{utilization_str['cuda_reserved']}/{utilization_str['ram_usage']} | "
        f"batch/sps: {num_samples}/{sps_str}"
    )

    # return {"processed": processed}
    return {"processed": processed}
    # return {"processed": processed, "utilization": utilization, "tps": tps}
    # return {"processed": processed, "utilization": utilization}


utilization = calculate_utilization()
utilization_str = format_utilization_narrow(utilization)
print(
    f"total/used/cuda/res/ram (Gb): {utilization_str['total_memory']}/{utilization_str['memory_used']}/"
    f"{utilization_str['cuda_allocated']}/{utilization_str['cuda_reserved']}/{utilization_str['ram_usage']}"
)

processed_samples_map = {}

for task, samples in test_dataset_dict.items():
    total_samples = len(samples)

    print(f"Processing {total_samples} samples for {task}")

    start_time = time.time()

    processed_samples = samples.map(
        model_process,
        fn_kwargs={
            "model": model,
            "tokenizer": tokenizer,
            "total_samples": total_samples,
        },
        num_proc=1,
        batched=True,
        batch_size=batch_size,
        with_indices=True,
    )

    end_time = time.time()
    elapsed_time = end_time - start_time
    sps = total_samples / elapsed_time
    sps_str = f"{sps:.2f}"

    print(f"Finished processing {total_samples} samples for {task}.")
    print(processed_samples)
    print(processed_samples["reference"][0])
    print(processed_samples["processed"][0])

    utilization = calculate_utilization()
    utilization_str = format_utilization_narrow(utilization)
    print(
        f"{total_samples} | total/used/cuda/res/ram (Gb): {utilization_str['total_memory']}/{utilization_str['memory_used']}/"
        f"{utilization_str['cuda_allocated']}/{utilization_str['cuda_reserved']}/{utilization_str['ram_usage']} | "
        f"sps: {sps_str}"
    )

    processed_samples_map[task] = {
        # "task": task,
        # "samples": samples,
        "samples": processed_samples,
        "total_samples": total_samples,
        "elapsed_time": elapsed_time,
        "sps": sps,
        "batch_size": batch_size,
        "max_length": max_length,
        "utilization": utilization,
    }

# processed_samples = samples.map(model_process, num_proc=torch.cuda.device_count())
# processed_samples = samples.map(
#     model_process,
#     fn_kwargs={
#         "model": model,
#         "tokenizer": tokenizer,
#         "total_samples": total_samples,
#     },
#     num_proc=1,
#     batched=True,
#     batch_size=batch_size,
#     with_indices=True,
# )

processed_gec_samples = processed_samples_map["gec"]["samples"]

pprint(processed_gec_samples)
pprint(processed_gec_samples["processed"][:2])

Parameter 'fn_kwargs'={'model': PhiForCausalLM(
  (model): PhiModel(
    (embed_tokens): Embedding(51200, 2560)
    (embed_dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-31): 32 x PhiDecoderLayer(
        (self_attn): PhiSdpaAttention(
          (q_proj): Linear8bitLt(in_features=2560, out_features=2560, bias=True)
          (k_proj): Linear8bitLt(in_features=2560, out_features=2560, bias=True)
          (v_proj): Linear8bitLt(in_features=2560, out_features=2560, bias=True)
          (dense): Linear8bitLt(in_features=2560, out_features=2560, bias=True)
          (rotary_emb): PhiRotaryEmbedding()
        )
        (mlp): PhiMLP(
          (activation_fn): NewGELUActivation()
          (fc1): Linear8bitLt(in_features=2560, out_features=10240, bias=True)
          (fc2): Linear8bitLt(in_features=10240, out_features=2560, bias=True)
        )
        (input_layernorm): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)
        (resid_dropout): Dropout(p=0.1

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
total/used/cuda/res/ram (Gb): 10.00/4.82/2.92/3.02/13.85
Processing 1127 samples for neutralize


Map:   0%|          | 0/1127 [00:00<?, ? examples/s]

0-9/1127 | total/used/cuda/res/ram (Gb): 10.00/5.20/2.92/3.38/13.85 | batch/sps: 10/2.54
10-19/1127 | total/used/cuda/res/ram (Gb): 10.00/5.26/2.92/3.45/13.84 | batch/sps: 10/2.50
20-29/1127 | total/used/cuda/res/ram (Gb): 10.00/5.24/2.92/3.45/13.84 | batch/sps: 10/3.08
30-39/1127 | total/used/cuda/res/ram (Gb): 10.00/5.30/2.92/3.53/13.84 | batch/sps: 10/2.44
40-49/1127 | total/used/cuda/res/ram (Gb): 10.00/5.25/2.92/3.53/13.84 | batch/sps: 10/2.27
50-59/1127 | total/used/cuda/res/ram (Gb): 10.00/5.25/2.92/3.53/13.85 | batch/sps: 10/2.65
60-69/1127 | total/used/cuda/res/ram (Gb): 10.00/5.24/2.92/3.53/13.84 | batch/sps: 10/2.82
70-79/1127 | total/used/cuda/res/ram (Gb): 10.00/5.22/2.92/3.53/13.84 | batch/sps: 10/3.23
80-89/1127 | total/used/cuda/res/ram (Gb): 10.00/5.22/2.92/3.53/13.84 | batch/sps: 10/2.52
90-99/1127 | total/used/cuda/res/ram (Gb): 10.00/5.22/2.92/3.53/13.85 | batch/sps: 10/3.39
100-109/1127 | total/used/cuda/res/ram (Gb): 10.00/5.22/2.92/3.53/13.84 | batch/sps: 10/3.47

Map:   0%|          | 0/1144 [00:00<?, ? examples/s]

0-9/1144 | total/used/cuda/res/ram (Gb): 10.00/7.20/2.92/5.46/13.86 | batch/sps: 10/2.21
10-19/1144 | total/used/cuda/res/ram (Gb): 10.00/7.20/2.92/5.46/13.85 | batch/sps: 10/2.30
20-29/1144 | total/used/cuda/res/ram (Gb): 10.00/7.20/2.92/5.46/13.86 | batch/sps: 10/3.72
30-39/1144 | total/used/cuda/res/ram (Gb): 10.00/7.20/2.92/5.46/13.85 | batch/sps: 10/1.58
40-49/1144 | total/used/cuda/res/ram (Gb): 10.00/7.20/2.92/5.46/13.86 | batch/sps: 10/3.11
50-59/1144 | total/used/cuda/res/ram (Gb): 10.00/7.20/2.92/5.46/13.86 | batch/sps: 10/2.28
60-69/1144 | total/used/cuda/res/ram (Gb): 10.00/7.19/2.92/5.46/13.86 | batch/sps: 10/1.30
70-79/1144 | total/used/cuda/res/ram (Gb): 10.00/7.20/2.92/5.46/13.85 | batch/sps: 10/1.90
80-89/1144 | total/used/cuda/res/ram (Gb): 10.00/7.21/2.92/5.46/13.86 | batch/sps: 10/2.07
90-99/1144 | total/used/cuda/res/ram (Gb): 10.00/7.20/2.92/5.46/13.86 | batch/sps: 10/2.08
100-109/1144 | total/used/cuda/res/ram (Gb): 10.00/7.18/2.92/5.46/13.85 | batch/sps: 10/1.85

Map:   0%|          | 0/126 [00:00<?, ? examples/s]

0-9/126 | total/used/cuda/res/ram (Gb): 10.00/7.33/2.92/5.56/13.91 | batch/sps: 10/0.66
10-19/126 | total/used/cuda/res/ram (Gb): 10.00/7.32/2.92/5.56/13.92 | batch/sps: 10/0.69
20-29/126 | total/used/cuda/res/ram (Gb): 10.00/7.31/2.92/5.56/13.91 | batch/sps: 10/0.72
30-39/126 | total/used/cuda/res/ram (Gb): 10.00/7.30/2.92/5.56/13.91 | batch/sps: 10/1.27
40-49/126 | total/used/cuda/res/ram (Gb): 10.00/7.29/2.92/5.56/13.89 | batch/sps: 10/1.44
50-59/126 | total/used/cuda/res/ram (Gb): 10.00/7.33/2.92/5.56/13.94 | batch/sps: 10/0.75
60-69/126 | total/used/cuda/res/ram (Gb): 10.00/7.33/2.92/5.56/13.92 | batch/sps: 10/1.23
70-79/126 | total/used/cuda/res/ram (Gb): 10.00/7.34/2.92/5.56/13.91 | batch/sps: 10/0.86
80-89/126 | total/used/cuda/res/ram (Gb): 10.00/7.33/2.92/5.56/13.92 | batch/sps: 10/0.93
90-99/126 | total/used/cuda/res/ram (Gb): 10.00/7.34/2.92/5.56/13.91 | batch/sps: 10/1.13
100-109/126 | total/used/cuda/res/ram (Gb): 10.00/7.31/2.92/5.56/13.91 | batch/sps: 10/1.31
110-119/12

Map:   0%|          | 0/1590 [00:00<?, ? examples/s]

0-9/1590 | total/used/cuda/res/ram (Gb): 10.00/7.31/2.92/5.56/13.92 | batch/sps: 10/1.93
10-19/1590 | total/used/cuda/res/ram (Gb): 10.00/7.28/2.92/5.56/13.92 | batch/sps: 10/2.11
20-29/1590 | total/used/cuda/res/ram (Gb): 10.00/7.28/2.92/5.56/13.92 | batch/sps: 10/1.91
30-39/1590 | total/used/cuda/res/ram (Gb): 10.00/7.33/2.92/5.56/13.92 | batch/sps: 10/2.01
40-49/1590 | total/used/cuda/res/ram (Gb): 10.00/7.35/2.92/5.56/13.92 | batch/sps: 10/2.11
50-59/1590 | total/used/cuda/res/ram (Gb): 10.00/7.29/2.92/5.56/13.92 | batch/sps: 10/2.14
60-69/1590 | total/used/cuda/res/ram (Gb): 10.00/7.32/2.92/5.56/13.92 | batch/sps: 10/2.36
70-79/1590 | total/used/cuda/res/ram (Gb): 10.00/7.28/2.92/5.56/13.92 | batch/sps: 10/2.11
80-89/1590 | total/used/cuda/res/ram (Gb): 10.00/7.28/2.92/5.56/13.92 | batch/sps: 10/2.36
90-99/1590 | total/used/cuda/res/ram (Gb): 10.00/7.32/2.92/5.56/13.92 | batch/sps: 10/2.22
100-109/1590 | total/used/cuda/res/ram (Gb): 10.00/7.35/2.92/5.56/13.93 | batch/sps: 10/1.77

Map:   0%|          | 0/1062 [00:00<?, ? examples/s]

0-9/1062 | total/used/cuda/res/ram (Gb): 10.00/7.28/2.92/5.56/13.98 | batch/sps: 10/1.35
10-19/1062 | total/used/cuda/res/ram (Gb): 10.00/7.32/2.92/5.56/14.00 | batch/sps: 10/1.15
20-29/1062 | total/used/cuda/res/ram (Gb): 10.00/7.34/2.92/5.56/13.97 | batch/sps: 10/1.24
30-39/1062 | total/used/cuda/res/ram (Gb): 10.00/7.36/2.92/5.56/13.96 | batch/sps: 10/1.35
40-49/1062 | total/used/cuda/res/ram (Gb): 10.00/7.33/2.92/5.56/13.97 | batch/sps: 10/1.34
50-59/1062 | total/used/cuda/res/ram (Gb): 10.00/7.33/2.92/5.56/13.97 | batch/sps: 10/1.42
60-69/1062 | total/used/cuda/res/ram (Gb): 10.00/7.34/2.92/5.56/13.98 | batch/sps: 10/1.07
70-79/1062 | total/used/cuda/res/ram (Gb): 10.00/7.33/2.92/5.56/13.97 | batch/sps: 10/1.15
80-89/1062 | total/used/cuda/res/ram (Gb): 10.00/7.28/2.92/5.56/13.97 | batch/sps: 10/1.14
90-99/1062 | total/used/cuda/res/ram (Gb): 10.00/7.32/2.92/5.56/13.97 | batch/sps: 10/1.12
100-109/1062 | total/used/cuda/res/ram (Gb): 10.00/7.29/2.92/5.56/13.97 | batch/sps: 10/1.04

Map:   0%|          | 0/2031 [00:00<?, ? examples/s]

0-9/2031 | total/used/cuda/res/ram (Gb): 10.00/7.40/2.92/5.56/13.97 | batch/sps: 10/2.68
10-19/2031 | total/used/cuda/res/ram (Gb): 10.00/7.35/2.92/5.56/13.98 | batch/sps: 10/1.86
20-29/2031 | total/used/cuda/res/ram (Gb): 10.00/7.34/2.92/5.56/13.96 | batch/sps: 10/1.51
30-39/2031 | total/used/cuda/res/ram (Gb): 10.00/7.40/2.92/5.56/13.99 | batch/sps: 10/2.02
40-49/2031 | total/used/cuda/res/ram (Gb): 10.00/7.38/2.92/5.56/14.01 | batch/sps: 10/1.52
50-59/2031 | total/used/cuda/res/ram (Gb): 10.00/7.40/2.92/5.56/14.00 | batch/sps: 10/1.70
60-69/2031 | total/used/cuda/res/ram (Gb): 10.00/7.42/2.92/5.56/14.00 | batch/sps: 10/3.72
70-79/2031 | total/used/cuda/res/ram (Gb): 10.00/7.41/2.92/5.56/14.00 | batch/sps: 10/3.08
80-89/2031 | total/used/cuda/res/ram (Gb): 10.00/7.39/2.92/5.56/13.97 | batch/sps: 10/2.50
90-99/2031 | total/used/cuda/res/ram (Gb): 10.00/7.35/2.92/5.56/13.98 | batch/sps: 10/3.37
100-109/2031 | total/used/cuda/res/ram (Gb): 10.00/7.35/2.92/5.56/13.98 | batch/sps: 10/2.83

### Saving eval results

In [10]:
hardware = "HomeDesktop (RTX3080)"
# hardware = "NC24 (A100)"
print(f"model_name: {model_name}, model_id: {model_id}, model_path: {model_path}")

total_params = sum(p.numel() for p in model.parameters())
total_trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Total/trainable params: {total_params}/{total_trainable_params}")

if list in globals():
    del list

all_flats = []
all_scores = []
if os.path.exists("results/all-scores.csv"):
    all_scores = pd.read_csv("results/all-scores.csv").to_dict("records")

all_fulls = []

rouge_metric = evaluate.load("rouge")
# glue_metric = evaluate.load("glue", "stsb")
sacreblue_metric = evaluate.load("sacrebleu")
sari_metric = evaluate.load("sari")
em_metric = evaluate.load("exact_match")


def calculate_scores(processed_samples):
    rouge_score = rouge_metric.compute(
        predictions=processed_samples["processed"], references=processed_samples["references"]
    )
    # pprint(rouge_score)

    sacreblue_score = sacreblue_metric.compute(
        predictions=processed_samples["processed"], references=processed_samples["references"]
    )
    # pprint(sacreblue_score)

    sari_score = sari_metric.compute(
        sources=processed_samples["input"],
        predictions=processed_samples["processed"],
        references=processed_samples["references"],
    )
    # pprint(sari_score)

    em_score = em_metric.compute(predictions=processed_samples["processed"], references=processed_samples["reference"])
    # pprint(em_score)

    return {
        "rouge": rouge_score,
        "sacreblue": sacreblue_score,
        "sari": sari_score,
        "em": em_score,
    }


for task, obj in processed_samples_map.items():
    print(f"task: {task}, samples: {len(obj['samples'])}")

    batch = obj["samples"]
    total_samples = len(batch)

    all_saved_samples = batch.remove_columns(["references"])
    saved_samples = all_saved_samples[:100] if len(all_saved_samples) > 100 else all_saved_samples
    flats_frame = pd.DataFrame.from_records(saved_samples)
    flats_frame.to_json(f"samples/{model_path}_{task}.json", orient="records")

    scores = calculate_scores(batch)
    # pprint(scores)

    score_paths = [
        "rouge.rouge1",
        # "rouge.rouge2",
        # "rouge.rougeL",
        # "rouge.rougeLsum",
        "sacreblue.score",
        "sari.sari",
        "em.exact_match",
    ]

    normalized_scores = {}
    for s, v in scores.items():
        print(f"s: {s}, v: {v}")
        for ss, vv in v.items():
            if not isinstance(vv, list):
                # normalized_scores[f"score.{k}.{ss}"] = vv
                path = f"{s}.{ss}"
                if path in score_paths:
                    normalized_scores[f"score.{s}.{ss}"] = vv
    # pprint(normalized_scores)

    normalized_utilization = {}
    for s, v in obj["utilization"].items():
        if not isinstance(v, list):
            normalized_utilization[f"utilization.{s}"] = v
    # print(normalized_utilization)

    flat_dict = {
        "model": model_id,
        "hardware": hardware,
        "total_params": total_params,
        "total_samples": total_samples,
        "elapsed_time": obj["elapsed_time"],
        "sps": obj["sps"],
        "batch_size": obj["batch_size"],
        "max_length": obj["max_length"],
        "task": task,
    }
    flat_dict.update(normalized_scores)
    flat_dict.update(normalized_utilization)
    # pprint(frame)

    all_flats.append(flat_dict)
    all_scores.append(flat_dict)

    fulls_frame = {
        "task": task,
        "total_samples": total_samples,
        "elapsed_time": obj["elapsed_time"],
        "sps": obj["sps"],
        "batch_size": obj["batch_size"],
        "max_length": obj["max_length"],
    }
    fulls_frame.update(
        {
            "scores": scores,
            "utilization": obj["utilization"],
        }
    )
    all_fulls.append(fulls_frame)

flats_frame = pd.DataFrame.from_records(all_flats)
flats_frame.to_csv(f"results/{model_path}.csv", index=False)

scores_frame = pd.DataFrame.from_records(all_scores)
scores_frame.to_csv(f"results/all-scores.csv", index=False)

fulls_dict = {
    "model": model_id,
    "hardware": hardware,
    "total_params": total_params,
}
for full in all_fulls:
    fulls_dict[full["task"]] = full

print(fulls_dict)
fulls_frame = pd.DataFrame.from_records([fulls_dict])
fulls_frame.to_json(f"results/{model_path}.json", orient="records", index=False)

model_name: t5-large-coedit, model_id: iliazlobin/t5-large-coedit, model_path: iliazlobin_t5-large-coedit
Total/trainable params: 737668096/737668096
task: neutralize, samples: 1127
s: rouge, v: {'rouge1': 0.9345984440771747, 'rouge2': 0.8660910861945696, 'rougeL': 0.9342957494577149, 'rougeLsum': 0.9343557800065598}
s: sacreblue, v: {'score': 88.88925200809757, 'counts': [24720, 22581, 20618, 18795], 'totals': [25905, 24778, 23651, 22526], 'precisions': [95.4255935147655, 91.13326337880378, 87.17601792736036, 83.4369173399627], 'bp': 0.9967241574770993, 'sys_len': 25905, 'ref_len': 25990}
s: sari, v: {'sari': 69.22160529964609}
s: em, v: {'exact_match': 0.07808340727595386}
task: simplification, samples: 1144
s: rouge, v: {'rouge1': 0.5756454028137281, 'rouge2': 0.3750410604150481, 'rougeL': 0.5300814613166529, 'rougeLsum': 0.529782834578091}
s: sacreblue, v: {'score': 28.159721395031095, 'counts': [11777, 6757, 4337, 2838], 'totals': [21409, 20265, 19121, 17977], 'precisions': [55.00