In [1]:
%load_ext autoreload
%load_ext dotenv
%dotenv
!huggingface-cli login --token $HUGGING_FACE_TOKEN

Token has not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /home/izlobin/.cache/huggingface/token
Login successful


In [2]:
# %%capture
# %pip install nltk absl-py rouge_score
# %pip install bleu sacrebleu
# %pip install sacremoses
# %pip install scipy
# %pip install sentencepiece
# %pip install optimum auto-gptq
# %pip install scikit-learn
# %pip install einops
# %pip install bitsandbytes
# %pip install huggingface_hub
# %pip install transformers evaluate gradio datasets chardet cchardet librosa ipython sentencepiece plotly phonemizer
# %pip install accelerate
# %pip install pynvml

In [2]:
%load_ext autoreload
%autoreload 2

import os
import sys
import time
from pprint import pprint

import evaluate
import numpy as np
import pandas as pd
import torch
from datasets import load_dataset
from huggingface_hub import HfApi
from transformers import (
    AutoModelForCausalLM,
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    BartForCausalLM,
    BartModel,
    BartTokenizer,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    T5ForConditionalGeneration,
    T5Tokenizer,
    T5TokenizerFast,
)

current_dir = os.getcwd()
parent_dir = os.path.dirname(current_dir)
sys.path.append(parent_dir)

from utils.dataset import get_iterater_samples_simplified, get_iterater_samples_with_instruction
from utils.metric import calculate_scores
from utils.monitoring import calculate_utilization, format_utilization_narrow, print_utilization

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
pprint(f"Device: {device}")

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
'Device: cuda'


## Loading models

### Loading custom gpt2 / gpt2-large

In [3]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

model_name = "gpt2-large"
model_repo = f"openai-community"
model_id = f"{model_repo}/{model_name}"
model_checkpoint = f"{model_repo}/{model_name}"
model_path = f"{model_repo}_{model_name}"
print(f"model_name: {model_name}," f"model_id: {model_id}," f"model_path: {model_path}")

tokenizer = AutoTokenizer.from_pretrained(
    model_id,
    use_fast=True,
)
tokenizer.add_eos_token = True
tokenizer.pad_token_id = tokenizer.eos_token_id
tokenizer.padding_side = "left"
print(type(tokenizer))
# print(tokenizer.add_eos_token)
print(tokenizer.eos_token_id)
print(tokenizer.pad_token_id)
print(tokenizer.padding_side)



# bnb_config = BitsAndBytesConfig(
#     load_in_4bit=True, bnb_4bit_use_double_quant=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch.bfloat16
# )

# model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", max_memory={0: "80GiB"})
model = AutoModelForCausalLM.from_pretrained(model_id, device_map=0)
print(type(model))

model.generation_config.max_new_tokens = 0
# model.generation_config.new_tokens = 350
model.generation_config.pad_token_id = model.generation_config.eos_token_id
model.generation_config.padding_side = "left"

print(model.generation_config)
print(model)

model_name: gpt2-large,model_id: openai-community/gpt2-large,model_path: openai-community_gpt2-large
<class 'transformers.models.gpt2.tokenization_gpt2_fast.GPT2TokenizerFast'>
50256
50256
left
<class 'transformers.models.gpt2.modeling_gpt2.GPT2LMHeadModel'>
GenerationConfig {
  "bos_token_id": 50256,
  "eos_token_id": 50256,
  "pad_token_id": 50256,
  "padding_side": "left"
}

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 1280)
    (wpe): Embedding(1024, 1280)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-35): 36 x GPT2Block(
        (ln_1): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
         

In [3]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

model_name = "gpt2-large-coedit"
model_repo = f"iliazlobin"
model_id = f"{model_repo}/{model_name}"
model_checkpoint = f"{model_repo}/{model_name}"
model_path = f"{model_repo}_{model_name}"
print(f"model_name: {model_name}," f"model_id: {model_id}," f"model_path: {model_path}")

tokenizer = AutoTokenizer.from_pretrained(
    model_id,
    use_fast=True,
)
tokenizer.add_eos_token = True
tokenizer.pad_token_id = tokenizer.eos_token_id
tokenizer.padding_side = "left"
print(type(tokenizer))
# print(tokenizer.add_eos_token)
print(tokenizer.eos_token_id)
print(tokenizer.pad_token_id)
print(tokenizer.padding_side)



# bnb_config = BitsAndBytesConfig(
#     load_in_4bit=True, bnb_4bit_use_double_quant=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch.bfloat16
# )

# model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", max_memory={0: "80GiB"})
model = AutoModelForCausalLM.from_pretrained(model_id, device_map=0)
print(type(model))
print(model.config)

# model.generation_config.max_new_tokens = 350
# model.generation_config.new_tokens = 350
model.generation_config.pad_token_id = model.generation_config.eos_token_id
model.generation_config.padding_side = "left"

print(model.generation_config)
print(model)

model_name: gpt2-large-coedit,model_id: iliazlobin/gpt2-large-coedit,model_path: iliazlobin_gpt2-large-coedit
<class 'transformers.models.gpt2.tokenization_gpt2_fast.GPT2TokenizerFast'>
50256
50256
left
<class 'transformers.models.gpt2.modeling_gpt2.GPT2LMHeadModel'>
GPT2Config {
  "_name_or_path": "iliazlobin/gpt2-large-coedit",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 1280,
  "n_head": 20,
  "n_inner": null,
  "n_layer": 36,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_sp

In [None]:
import torch
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

model_name = "gpt2-large"
model_repo = f"openai-community"
model_id = f"{model_repo}/{model_name}"
model_checkpoint = f"{model_repo}/{model_name}"
model_path = f"{model_repo}_{model_name}"
print(f"model_name: {model_name}," f"model_id: {model_id}," f"model_path: {model_path}")


tokenizer = AutoTokenizer.from_pretrained(
    model_id,
    use_fast=True,
)
tokenizer.add_eos_token = True
tokenizer.pad_token_id = tokenizer.eos_token_id
tokenizer.padding_side = "left"

print(type(tokenizer))


bnb_config = BitsAndBytesConfig(
    load_in_4bit=True, bnb_4bit_use_double_quant=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch.bfloat16
)

# model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=bnb_config, device_map={"":0})
# model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=bnb_config, device_map=0)
model = AutoModelForCausalLM.from_pretrained(model_id, device_map=0)
print(type(model))
# print(model.config)
# print(model)

trained_model_name = "gpt2-large-bnb8-coedit"
trained_model_repo = f"iliazlobin"
trained_model_id = f"{trained_model_repo}/{trained_model_name}"
trained_model_checkpoint = f"{trained_model_repo}/{trained_model_name}"
trained_model_path = f"{trained_model_repo}_{trained_model_name}"
print(
    f"trained_model_name: {trained_model_name},"
    f"trained_model_id: {trained_model_id},"
    f"trained_model_path: {trained_model_path}"
)


adapters_path = f"../model-train/model-{trained_model_repo}_{trained_model_name}"
peft_model = PeftModel.from_pretrained(model, adapters_path)

print(type(peft_model))
print(peft_model.config)
print(peft_model)

origin_model = model
model = peft_model
model_name = trained_model_name
model_repo = trained_model_repo
model_id = trained_model_id
model_checkpoint = trained_model_checkpoint
model_path = trained_model_path
print(f"model_name: {model_name}," f"model_id: {model_id}," f"model_path: {model_path}")

### Loading BART
* https://huggingface.co/facebook/bart-large

In [3]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BartForConditionalGeneration, BitsAndBytesConfig

model_name = "bart-large"
model_repo = f"facebook"
model_id = f"{model_repo}/{model_name}"
model_checkpoint = f"{model_repo}/{model_name}"
model_path = f"{model_repo}_{model_name}"
print(f"model_name: {model_name}," f"model_id: {model_id}," f"model_path: {model_path}")

tokenizer = BartTokenizer.from_pretrained(model_id)
tokenizer.add_eos_token = True
# tokenizer.pad_token_id = tokenizer.eos_token_id
tokenizer.padding_side = "right"
print(type(tokenizer))

model = AutoModelForSeq2SeqLM.from_pretrained(model_id, device_map=0)
print(type(model))

model_name: bart-large,model_id: facebook/bart-large,model_path: facebook_bart-large
<class 'transformers.models.bart.tokenization_bart.BartTokenizer'>
<class 'transformers.models.bart.modeling_bart.BartForConditionalGeneration'>


In [3]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BartForConditionalGeneration, BitsAndBytesConfig

model_name = "bart-large-coedit"
model_repo = f"iliazlobin"
model_id = f"{model_repo}/{model_name}"
model_checkpoint = f"{model_repo}/{model_name}"
model_path = f"{model_repo}_{model_name}"
print(f"model_name: {model_name}," f"model_id: {model_id}," f"model_path: {model_path}")

tokenizer = BartTokenizer.from_pretrained(model_id)
tokenizer.add_eos_token = True
# tokenizer.pad_token_id = tokenizer.eos_token_id
tokenizer.padding_side = "right"
print(type(tokenizer))

# model = AutoModelForSeq2SeqLM.from_pretrained(model_id, device_map=0, max_memory={0: "20GIB"})
model = AutoModelForSeq2SeqLM.from_pretrained(model_id, device_map=0)
# model = AutoModelForSeq2SeqLM.from_pretrained(model_id, device_map="auto")
print(type(model))

model_name: bart-large-coedit,model_id: iliazlobin/bart-large-coedit,model_path: iliazlobin_bart-large-coedit
<class 'transformers.models.bart.tokenization_bart.BartTokenizer'>
<class 'transformers.models.bart.modeling_bart.BartForConditionalGeneration'>


In [3]:
model_name = "iliazlobin/bart-grammarly"
model_alias = model_name.replace("/", "_")
tokenizer = BartTokenizer.from_pretrained("facebook/bart-large")
# model = BartModel.from_pretrained(model_name)
# model = BartForCausalLM.from_pretrained(model_name, device_map=0)
model = BartForConditionalGeneration.from_pretrained(model_name, device_map=0)

print(type(tokenizer))
print(type(model))
print(model.config)

<class 'transformers.models.bart.tokenization_bart.BartTokenizer'>
<class 'transformers.models.bart.modeling_bart.BartForConditionalGeneration'>
BartConfig {
  "_name_or_path": "iliazlobin/bart-grammarly",
  "activation_dropout": 0.1,
  "activation_function": "gelu",
  "add_bias_logits": false,
  "add_final_layer_norm": false,
  "architectures": [
    "BartForConditionalGeneration"
  ],
  "attention_dropout": 0.1,
  "bos_token_id": 0,
  "classif_dropout": 0.1,
  "classifier_dropout": 0.0,
  "d_model": 1024,
  "decoder_attention_heads": 16,
  "decoder_ffn_dim": 4096,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 12,
  "decoder_start_token_id": 2,
  "dropout": 0.1,
  "early_stopping": true,
  "encoder_attention_heads": 16,
  "encoder_ffn_dim": 4096,
  "encoder_layerdrop": 0.0,
  "encoder_layers": 12,
  "eos_token_id": 2,
  "forced_bos_token_id": 0,
  "forced_eos_token_id": 2,
  "gradient_checkpointing": false,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2"
 

### Loading google/t5-large
* https://huggingface.co/google-t5/t5-large

In [3]:
model_name = "t5-large"
model_repo = f"google-t5"
model_id = f"{model_repo}/{model_name}"
model_checkpoint = f"{model_repo}/{model_name}"
model_path = f"{model_repo}_{model_name}"
print(f"model_name: {model_name}," f"model_id: {model_id}," f"model_path: {model_path}")

tokenizer = T5TokenizerFast.from_pretrained(model_id)
# tokenizer.add_eos_token = True
# tokenizer.pad_token_id = tokenizer.eos_token_id
# tokenizer.padding_side = "right"
print(type(tokenizer))

# model = AutoModelForSeq2SeqLM.from_pretrained(model_id, device_map=0, max_memory={0: "20GIB"})
model = T5ForConditionalGeneration.from_pretrained(model_id, device_map=0)
# model = AutoModelForSeq2SeqLM.from_pretrained(model_id, device_map="auto")
print(type(model))
print(model.config)

model_name: t5-large,model_id: google-t5/t5-large,model_path: google-t5_t5-large


For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on google-t5/t5-large automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


<class 'transformers.models.t5.tokenization_t5_fast.T5TokenizerFast'>
<class 'transformers.models.t5.modeling_t5.T5ForConditionalGeneration'>
T5Config {
  "_name_or_path": "google-t5/t5-large",
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "classifier_dropout": 0.0,
  "d_ff": 4096,
  "d_kv": 64,
  "d_model": 1024,
  "decoder_start_token_id": 0,
  "dense_act_fn": "relu",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "relu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": false,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 24,
  "num_heads": 16,
  "num_layers": 24,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalty": 2.0,
      "max_length": 200,
      "min_length": 30,
      "no_repeat_ngram_siz

### Loading grammarly/coedit

In [4]:
model_name = "coedit-large"
model_repo = f"grammarly"
model_id = f"{model_repo}/{model_name}"
model_checkpoint = f"{model_repo}/{model_name}"
model_path = f"{model_repo}_{model_name}"
print(f"model_name: {model_name}," f"model_id: {model_id}," f"model_path: {model_path}")


tokenizer = AutoTokenizer.from_pretrained(model_id)
print(type(tokenizer))

model = T5ForConditionalGeneration.from_pretrained(model_id, device_map=0)
print(type(model))
print(model.config)

model_name: coedit-large,model_id: grammarly/coedit-large,model_path: grammarly_coedit-large
<class 'transformers.models.t5.tokenization_t5_fast.T5TokenizerFast'>
<class 'transformers.models.t5.modeling_t5.T5ForConditionalGeneration'>
T5Config {
  "_name_or_path": "grammarly/coedit-large",
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "classifier_dropout": 0.0,
  "d_ff": 2816,
  "d_kv": 64,
  "d_model": 1024,
  "decoder_start_token_id": 0,
  "dense_act_fn": "gelu_new",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "gated-gelu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": true,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 24,
  "num_heads": 16,
  "num_layers": 24,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "tie_word_embeddings": false,
  "torch_dtype": "float32",
  "transformers_ver

### Loading Gemma

In [3]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

model_name = "gemma-2b-it"
# model_name = "gemma-7b-it"
# model_name = "gemma-7b"
model_repo = f"google"
model_id = f"{model_repo}/{model_name}"
model_checkpoint = f"{model_repo}/{model_name}"
model_path = f"{model_repo}_{model_name}"
print(f"model_name: {model_name}," f"model_id: {model_id}," f"model_path: {model_path}")

tokenizer = AutoTokenizer.from_pretrained(
    model_id,
    use_fast=True,
)
# tokenizer.add_eos_token = True
# tokenizer.pad_token_id = tokenizer.eos_token_id
# tokenizer.padding_side = "left"
print(type(tokenizer))
# print(tokenizer.add_eos_token)
print(tokenizer.eos_token_id)
print(tokenizer.pad_token_id)
print(tokenizer.padding_side)


# bnb_config = BitsAndBytesConfig(
#     load_in_4bit=True, bnb_4bit_use_double_quant=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch.bfloat16
# )

# model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", max_memory={0: "80GiB"})
model = AutoModelForCausalLM.from_pretrained(model_id, device_map=0)
print(type(model))

# model.generation_config.max_new_tokens = 350
# # model.generation_config.new_tokens = 350
# model.generation_config.pad_token_id = model.generation_config.eos_token_id
# model.generation_config.padding_side = "left"

print(model.generation_config)
print(model)

model_name: gemma-2b-it,model_id: google/gemma-2b-it,model_path: google_gemma-2b-it


Gemma's activation function should be approximate GeLU and not exact GeLU.
Changing the activation function to `gelu_pytorch_tanh`.if you want to use the legacy `gelu`, edit the `model.config` to set `hidden_activation=gelu`   instead of `hidden_act`. See https://github.com/huggingface/transformers/pull/29402 for more details.


<class 'transformers.models.gemma.tokenization_gemma_fast.GemmaTokenizerFast'>
1
0
left


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

<class 'transformers.models.gemma.modeling_gemma.GemmaForCausalLM'>
GenerationConfig {
  "bos_token_id": 2,
  "eos_token_id": 1,
  "pad_token_id": 0
}

GemmaForCausalLM(
  (model): GemmaModel(
    (embed_tokens): Embedding(256000, 2048, padding_idx=0)
    (layers): ModuleList(
      (0-17): 18 x GemmaDecoderLayer(
        (self_attn): GemmaSdpaAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=256, bias=False)
          (v_proj): Linear(in_features=2048, out_features=256, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (rotary_emb): GemmaRotaryEmbedding()
        )
        (mlp): GemmaMLP(
          (gate_proj): Linear(in_features=2048, out_features=16384, bias=False)
          (up_proj): Linear(in_features=2048, out_features=16384, bias=False)
          (down_proj): Linear(in_features=16384, out_features=2048, bias=False)
          (act_fn): P

### Loading Phi-2
* https://huggingface.co/microsoft/phi-2
* https://huggingface.co/TheBloke/phi-2-GPTQ

In [3]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

model_name = "phi-2"
model_repo = f"microsoft"
model_id = f"{model_repo}/{model_name}"
model_checkpoint = f"{model_repo}/{model_name}"
model_path = f"{model_repo}_{model_name}"
print(f"model_name: {model_name}," f"model_id: {model_id}," f"model_path: {model_path}")

tokenizer = AutoTokenizer.from_pretrained(
    model_id,
    use_fast=True,
)
# tokenizer.add_eos_token = True
# tokenizer.pad_token_id = tokenizer.eos_token_id
# tokenizer.padding_side = "left"
print(type(tokenizer))
# print(tokenizer.add_eos_token)
print(tokenizer.eos_token_id)
print(tokenizer.pad_token_id)
print(tokenizer.padding_side)


# bnb_config = BitsAndBytesConfig(
#     load_in_4bit=True, bnb_4bit_use_double_quant=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch.bfloat16
# )

# model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", max_memory={0: "80GiB"})
model = AutoModelForCausalLM.from_pretrained(model_id, device_map=0)
print(type(model))

# model.generation_config.max_new_tokens = 350
# # model.generation_config.new_tokens = 350
# model.generation_config.pad_token_id = model.generation_config.eos_token_id
# model.generation_config.padding_side = "left"

print(model.generation_config)
print(model.config)
print(model)

total_params = sum(p.numel() for p in model.parameters())
total_trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Total/trainable params: {total_params}/{total_trainable_params}")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


tokenizer.eos_token: <|endoftext|>
PhiConfig {
  "_name_or_path": "TheBloke/phi-2-GPTQ",
  "activation_function": "gelu_new",
  "architectures": [
    "PhiForCausalLM"
  ],
  "attn_pdrop": 0.0,
  "auto_map": {
    "AutoConfig": "TheBloke/phi-2-GPTQ--configuration_phi.PhiConfig",
    "AutoModelForCausalLM": "TheBloke/phi-2-GPTQ--modeling_phi.PhiForCausalLM"
  },
  "embd_pdrop": 0.0,
  "flash_attn": false,
  "flash_rotary": false,
  "fused_dense": false,
  "img_processor": null,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "phi-msft",
  "n_embd": 2560,
  "n_head": 32,
  "n_head_kv": null,
  "n_inner": null,
  "n_layer": 32,
  "n_positions": 2048,
  "pad_token_id": 0,
  "pretraining_tp": 1,
  "quantization_config": {
    "batch_size": 1,
    "bits": 4,
    "block_name_to_quantize": null,
    "cache_block_outputs": true,
    "damp_percent": 0.1,
    "dataset": null,
    "desc_act": true,
    "exllama_config": {
      "version": 1
    },
    "group_size": 128,

### Loading mixtral-8x7B
* https://huggingface.co/mistralai/Mixtral-8x7B-v0.1
* https://huggingface.co/TheBloke/Mixtral-8x7B-v0.1-GPTQ

Quantization
* https://medium.com/@rakeshrajpurohit/model-quantization-with-hugging-face-transformers-and-bitsandbytes-integration-b4c9983e8996

In [7]:
# model_name = "mistralai/Mixtral-8x7B-v0.1"
model_name = 'TheBloke/Mixtral-8x7B-v0.1-GPTQ'
model_alias = model_name.replace('/', '_')

torch.cuda.empty_cache()

# from transformers import BitsAndBytesConfig
# quantization_config = BitsAndBytesConfig(load_in_4bit=4)

# model = AutoModelForCausalLM.from_pretrained(
#     model_name,
#     device_map=0,
#     # torch_dtype=torch.float16,
#     # load_in_4bit=True,
#     # quantization_config=quantization_config,
# )

# print(f"model.config.eos_token_id: {model.config.eos_token_id}")
# eos_token_id = 50256 # https://huggingface.co/microsoft/phi-2/blob/main/config.json

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map=0,
    load_in_4bit=True,
    trust_remote_code=False,
)

tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    use_fast=True,
)

# print(f"tokenizer.eos_token: {tokenizer.eos_token}")
# tokenizer.pad_token = tokenizer.eos_token


print(model.config)

# text = 'Hello my name is'
# inputs = tokenizer(text, return_tensors='pt')
# outputs = model.generate(**inputs, max_new_tokens=20)
# print(tokenizer.decode(outputs[0], skip_special_tokens=True))

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


AttributeError: 'BitsAndBytesConfig' object has no attribute 'get_loading_attributes'

### Loading llama-2

In [3]:
# model_name = "meta-llama/Llama-2-7b-hf"
model_name = "TheBloke/Llama-2-7B-GPTQ"
# model_name = "TheBloke/Nous-Hermes-Llama-2-7B-GPTQ"
model_alias = model_name.replace("/", "_")

from transformers import LlamaTokenizer, LlamaForCausalLM

tokenizer = LlamaTokenizer.from_pretrained(model_name, padding_side='left')
model = LlamaForCausalLM.from_pretrained(model_name, device_map=0)

tokenizer.pad_token = tokenizer.eos_token

print(type(tokenizer))
print(type(model))
print(model.config)

# from auto_gptq import exllama_set_max_input_length
# model = exllama_set_max_input_length(model, max_input_length=2400)

You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
The cos_cached attribute will be removed in 4.39. Bear in mind that its contents changed in v4.38. Use the forward method of RoPE from now on instead. It is not used in the `LlamaAttention` class
The sin_cached attribute will be removed in 4.39. Bear in mind that its contents changed in v4.38. Use the forward method of RoPE from now on instead. It is not used in the `LlamaAttention` class


<class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>
<class 'transformers.models.llama.modeling_llama.LlamaForCausalLM'>
LlamaConfig {
  "_name_or_path": "TheBloke/Llama-2-7B-GPTQ",
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 11008,
  "max_length": 4096,
  "max_position_embeddings": 4096,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "pad_token_id": 0,
  "pretraining_tp": 1,
  "quantization_config": {
    "batch_size": 1,
    "bits": 4,
    "block_name_to_quantize": null,
    "cache_block_outputs": true,
    "damp_percent": 0.01,
    "dataset": null,
    "desc_act": false,
    "exllama_config": {
      "version": 1
    },
    "group_size": 128,
    "max_input_length": null,
    "model_seqlen": null,
    "mo

## Evaluation

### Grammarly/coedit dataset
* https://huggingface.co/datasets/grammarly/coedit

In [5]:
from datasets import DatasetDict, load_dataset

# full_dataset = load_dataset("grammarly/coedit")
# print(full_dataset)

# train_dataset = load_dataset("grammarly/coedit", split="train[:50000]")
# test_dataset = load_dataset("grammarly/coedit", split="train[10000:]")
# # test_dataset = load_dataset("grammarly/coedit", split="validation")

all_dataset = load_dataset("grammarly/coedit", split="train+validation")
# print(all_dataset)

# print()
# print(f"train set {set(all_dataset['task'])}")
# print(f"total len: {len(all_dataset)}")
# print(f"gec len: {len(all_dataset.filter(lambda x: x['task'] == 'gec'))}")
# print(f"simplification len: {len(all_dataset.filter(lambda x: x['task'] == 'simplification'))}")
# print(f"clarity len: {len(all_dataset.filter(lambda x: x['task'] == 'clarity'))}")
# print(f"coherence len: {len(all_dataset.filter(lambda x: x['task'] == 'coherence'))}")
# print(f"paraphrase len: {len(all_dataset.filter(lambda x: x['task'] == 'paraphrase'))}")
# print(f"neutralize len: {len(all_dataset.filter(lambda x: x['task'] == 'neutralize'))}")
# print()

# train_ratio = 0.01
# test_ratio = 0.001
# train_ratio = 0.1
# test_ratio = 0.01
train_ratio = 0.9
test_ratio = 0.1

gec_dataset = all_dataset.filter(lambda x: x["task"] == "gec")
train_gec_dataset = gec_dataset.select(range(0, int(train_ratio * len(gec_dataset))))
test_gec_dataset = gec_dataset.select(range(int((1 - test_ratio) * len(gec_dataset)), len(gec_dataset)))

simplification_dataset = all_dataset.filter(lambda x: x["task"] == "simplification")
train_simplification_dataset = simplification_dataset.select(range(0, int(train_ratio * len(simplification_dataset))))
test_simplification_dataset = simplification_dataset.select(
    range(int((1 - test_ratio) * len(simplification_dataset)), len(simplification_dataset))
)

clarity_dataset = all_dataset.filter(lambda x: x["task"] == "clarity")
train_clarity_dataset = clarity_dataset.select(range(0, int(train_ratio * len(clarity_dataset))))
test_clarity_dataset = clarity_dataset.select(range(int((1 - test_ratio) * len(clarity_dataset)), len(clarity_dataset)))

coherence_dataset = all_dataset.filter(lambda x: x["task"] == "coherence")
train_coherence_dataset = coherence_dataset.select(range(0, int(train_ratio * len(coherence_dataset))))
test_coherence_dataset = coherence_dataset.select(
    range(int((1 - test_ratio) * len(coherence_dataset)), len(coherence_dataset))
)

paraphrase_dataset = all_dataset.filter(lambda x: x["task"] == "paraphrase")
train_paraphrase_dataset = paraphrase_dataset.select(range(0, int(train_ratio * len(paraphrase_dataset))))
test_paraphrase_dataset = paraphrase_dataset.select(
    range(int((1 - test_ratio) * len(paraphrase_dataset)), len(paraphrase_dataset))
)

neutralize_dataset = all_dataset.filter(lambda x: x["task"] == "neutralize")
neutralize_dataset_split = int(train_ratio * len(neutralize_dataset))
train_neutralize_dataset = neutralize_dataset.select(range(0, int(train_ratio * len(neutralize_dataset))))
test_neutralize_dataset = neutralize_dataset.select(
    range(int((1 - test_ratio) * len(neutralize_dataset)), len(neutralize_dataset))
)

from datasets import concatenate_datasets

train_dataset = concatenate_datasets(
    [
        train_gec_dataset,
        train_simplification_dataset,
        train_clarity_dataset,
        train_coherence_dataset,
        train_paraphrase_dataset,
        train_neutralize_dataset,
    ]
)
train_dataset = train_dataset.map(
    lambda item: {
        "input": item["src"],
        "reference": item["tgt"],
        "references": [item["tgt"]],
    },
    remove_columns=["src", "tgt", "_id"],
)
print(f"train set {set(train_dataset['task'])}")
print(train_dataset)

test_dataset = concatenate_datasets(
    [
        test_gec_dataset,
        test_simplification_dataset,
        test_clarity_dataset,
        test_coherence_dataset,
        test_paraphrase_dataset,
        test_neutralize_dataset,
    ]
)
test_dataset = test_dataset.map(
    lambda item: {
        "input": item["src"],
        "reference": item["tgt"],
        "references": [item["tgt"]],
    },
    remove_columns=["src", "tgt", "_id"],
)
print(f"test set {set(test_dataset['task'])}")
print(test_dataset)


def add_prompt(item):
    return {
        "request": f"{item['input']}\nResponse:",
        "prompt": f"{item['input']}\nResponse:{item['reference']}",
    }


dataset = DatasetDict({"train": train_dataset, "test": test_dataset})
# dataset = dataset.rename_column("task", "label")
dataset = dataset.map(add_prompt)
print(dataset)
print(dataset["train"][0])

train set {'gec', 'clarity', 'neutralize', 'coherence', 'simplification', 'paraphrase'}
Dataset({
    features: ['task', 'input', 'reference', 'references'],
    num_rows: 63703
})
test set {'gec', 'clarity', 'neutralize', 'coherence', 'simplification', 'paraphrase'}
Dataset({
    features: ['task', 'input', 'reference', 'references'],
    num_rows: 7080
})
DatasetDict({
    train: Dataset({
        features: ['task', 'input', 'reference', 'references', 'request', 'prompt'],
        num_rows: 63703
    })
    test: Dataset({
        features: ['task', 'input', 'reference', 'references', 'request', 'prompt'],
        num_rows: 7080
    })
})
{'task': 'gec', 'input': 'Remove all grammatical errors from this text: For example, countries with a lot of deserts can terraform their desert to increase their habitable land and using irrigation to provide clean water to the desert.', 'reference': 'For example, countries with a lot of deserts can transform their desert to increase their habitable

In [6]:
# find the longest sequence in the dataset
max_input_length = max(len(tokenizer.encode(item["input"])) for item in dataset["train"])
print(f"max_input_length train: {max_input_length}")
max_input_length = max(len(tokenizer.encode(item["input"])) for item in dataset["test"])
print(f"max_input_length test: {max_input_length}")

max_input_length train: 171
max_input_length test: 239


In [7]:
from datasets import Dataset

train_lists_map = {}

for task in set(train_dataset['task']):
    train_lists_map[task] = []

for item in dataset["train"]:
    train_lists_map[item["task"]].append(item)

train_dataset_map = {}
for task, l in train_lists_map.items():
    train_dataset_map[task] = Dataset.from_list(l)
# print(train_dataset_map)

train_dataset_dict = DatasetDict(train_dataset_map)
# print(train_dataset_dict)

# for task, ds in train_dataset_dict.items():
#     print(f"{task}: {ds}")

for task in set(train_dataset['task']):
    print(f"train/{task}: {len(train_lists_map[task])}")

train/gec: 18277
train/clarity: 1126
train/neutralize: 10143
train/coherence: 9554
train/simplification: 10296
train/paraphrase: 14307


In [8]:
from datasets import Dataset

test_lists_map = {}

for task in set(train_dataset['task']):
    test_lists_map[task] = []

for item in dataset["test"]:
    test_lists_map[item["task"]].append(item)

test_dataset_map = {}
for task, l in test_lists_map.items():
    test_dataset_map[task] = Dataset.from_list(l)
# print(test_dataset_map)

test_dataset_dict = DatasetDict(test_dataset_map)
# print(test_dataset_dict)

# for task, ds in test_dataset_dict.items():
#     print(f"{task}: {ds}")

for task in set(train_dataset['task']):
    print(f"test/{task}: {len(test_lists_map[task])}")

test/gec: 2031
test/clarity: 126
test/neutralize: 1127
test/coherence: 1062
test/simplification: 1144
test/paraphrase: 1590


In [9]:
print(dataset["test"][0]["task"])
print(dataset["test"][0]["input"])
print(dataset["test"][0]["reference"])
print(dataset["test"][0]["prompt"])
print(dataset["test"][0]["request"])

gec
Fix grammaticality in this sentence: Despite strict japanese society, I feel happy when I had dinner with my family.
Despite the strict Japanese society, I feel happy when I have dinner with my family.
Fix grammaticality in this sentence: Despite strict japanese society, I feel happy when I had dinner with my family.
Response:Despite the strict Japanese society, I feel happy when I have dinner with my family.
Fix grammaticality in this sentence: Despite strict japanese society, I feel happy when I had dinner with my family.
Response:


### Test inference

In [10]:
max_batch = 2
max_length = 350

for task, batch in test_dataset_dict.items():
    print()
    print(f">> {task}")
    batch_size = len(batch) if len(batch) < max_batch else max_batch
    input_batch = batch.select(range(batch_size))
    print(f"input: {input_batch['input']}")

    input = tokenizer(input_batch["input"], padding=True, return_tensors="pt").to(device)
    outputs = model.generate(input.input_ids, max_length=max_length)
    result = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    print(f"result: {result}")

    break

Both `max_new_tokens` (=350) and `max_length`(=350) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)



>> gec
input: ['Fix grammaticality in this sentence: Despite strict japanese society, I feel happy when I had dinner with my family.', 'Fix grammaticality in this sentence: They are increasing rapidly in Japan for a couple of years.']
result: ['Fix grammaticality in this sentence: Despite strict japanese society, I feel happy when I had dinner with my family.', 'Fix grammaticality in this sentence: They are increasing rapidly in Japan for a couple of years.']


In [11]:
max_batch = 2
max_length = 350

for task, batch in test_dataset_dict.items():
    print()
    print(f">> {task}")
    batch_size = len(batch) if len(batch) < max_batch else max_batch
    input_batch = batch.select(range(batch_size))
    print(f"request: {input_batch['request']}")
    print(f"reference: {input_batch['reference']}")

    inputs = tokenizer(input_batch['request'], return_tensors="pt", padding=True).to(device)

    model.config.use_cache = False
    # outputs = model.generate(**inputs, max_new_tokens=128, num_return_sequences=1)
    outputs = model.generate(**inputs, num_return_sequences=1)
    # outputs = model.generate(
    #     **inputs,
    #     do_sample=True,
    #     top_k=10,
    #     num_return_sequences=1,
    #     pad_token_id=tokenizer.eos_token_id,
    #     # return_attention_mask=True,
    #     max_length=256,
    # )

    trimmed_output = outputs[:, inputs.input_ids.shape[1] :]
    result = tokenizer.batch_decode(trimmed_output, skip_special_tokens=True)
    print(f"result: {result}")

    # break


>> gec
request: ['Fix grammaticality in this sentence: Despite strict japanese society, I feel happy when I had dinner with my family.\nResponse:', 'Fix grammaticality in this sentence: They are increasing rapidly in Japan for a couple of years.\nResponse:']
reference: ['Despite the strict Japanese society, I feel happy when I have dinner with my family.', 'They have been increasing rapidly in Japan for over a couple of years.']
result: ['Despite strict Japanese society, I feel happy when I had dinner with my family.', 'They are increasing rapidly in Japan for a couple of years.']

>> clarity
request: ['Use clearer wording: Canals are waterways channels, or artificial waterways, for water conveyance, or to service water transport vehicles.\nResponse:', 'Clarify this text: Canals are waterways channels, or artificial waterways, for water conveyance, or to service water transport vehicles.\nResponse:']
reference: ['Canals are waterway channels, or artificial waterways, for water conveya

### Eval prep

In [12]:
%reload_ext autoreload

from utils.monitoring import calculate_utilization, format_utilization_narrow, print_utilization

total_params = sum(p.numel() for p in model.parameters())
total_trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f'Total/trainable params: {total_params}/{total_trainable_params}')

utilization = calculate_utilization()
print(utilization)
utilization_str = format_utilization_narrow(utilization)
print(utilization_str)
print(
    f"total/used/cuda/res/ram(Gb): {utilization_str['total_memory']}/{utilization_str['memory_used']}/"
    f"{utilization_str['cuda_allocated']}/{utilization_str['cuda_reserved']}/{utilization_str['ram_usage']}"
)


torch.cuda.empty_cache()

actual_fraction = 0.95
available_memory = utilization['total_memory'] - utilization['memory_used']
recommended_fraction = available_memory / utilization['total_memory']
torch.cuda.set_per_process_memory_fraction(actual_fraction, 0)

print(
    f"Total/used/available memory (Gb): {utilization['total_memory']/1024**3:.2f}/"
    "{utilization['memory_used']/1024**3:.2f}/{available_memory/1024**3:.2f}"
)
print(f'Recommended/actual fraction: {recommended_fraction:.2f}/{actual_fraction:.2f}')

# torch.cuda.empty_cache()
# torch.empty(utilization['total_memory'] // 2, dtype=torch.int8, device='cuda')
# print_utilization()

Total/trainable params: 774030080/774030080
{'total_memory': 10736893952, 'memory_used': 4979257344, 'cuda_allocated': 3218543104, 'cuda_reserved': 3552575488, 'ram_usage': 18353610752}
{'total_memory': '10.00', 'memory_used': '4.64', 'cuda_allocated': '3.00', 'cuda_reserved': '3.31', 'ram_usage': '17.09'}
total/used/cuda/res/ram(Gb): 10.00/4.64/3.00/3.31/17.09
Total/used/available memory (Gb): 10.00/{utilization['memory_used']/1024**3:.2f}/{available_memory/1024**3:.2f}
Recommended/actual fraction: 0.54/0.95


### Eval encoder-decoder models

In [None]:
%%time
%load_ext autoreload

batch_size = 20 # home: t5, grammarly/coedit
# batch_size = 100
max_length = 350


def model_process(batch, idx, **kwargs):
    num_samples = len(batch["input"])
    start_time = time.time()

    model = kwargs.get("model")
    tokenizer = kwargs.get("tokenizer")
    total_samples = kwargs.get("total_samples")

    inputs = tokenizer(batch["input"], max_length=max_length, padding=True, return_tensors="pt").to(device)
    # input_ids = tokenizer(batch['task'], return_tensors="pt").input_ids.to(device)
    # input_ids = tokenizer(item['task'], return_tensors="pt").input_ids
    # outputs = model.generate(input_ids, max_length=512)
    outputs = model.generate(inputs.input_ids, max_length=max_length)
    # print(f"outputs: {outputs}")
    processed = tokenizer.batch_decode(outputs, skip_special_tokens=True)

    end_time = time.time()
    elapsed_time = end_time - start_time
    sps = num_samples / elapsed_time
    sps_str = f"{sps:.2f}"

    utilization = calculate_utilization()
    utilization_str = format_utilization_narrow(utilization)
    print(
        f"{idx[0]}-{idx[-1]}/{total_samples} | total/used/cuda/res/ram (Gb): {utilization_str['total_memory']}/{utilization_str['memory_used']}/"
        f"{utilization_str['cuda_allocated']}/{utilization_str['cuda_reserved']}/{utilization_str['ram_usage']} | "
        f"batch/sps: {num_samples}/{sps_str}"
    )

    return {"processed": processed}


utilization = calculate_utilization()
utilization_str = format_utilization_narrow(utilization)
print(
    f"total/used/cuda/res/ram (Gb): {utilization_str['total_memory']}/{utilization_str['memory_used']}/"
    f"{utilization_str['cuda_allocated']}/{utilization_str['cuda_reserved']}/{utilization_str['ram_usage']}"
)

processed_samples_map = {}

for task, samples in test_dataset_dict.items():
    total_samples = len(samples)

    print(f"Processing {total_samples} samples for {task}")

    start_time = time.time()

    processed_samples = samples.map(
        model_process,
        fn_kwargs={
            "model": model,
            "tokenizer": tokenizer,
            "total_samples": total_samples,
        },
        num_proc=1,
        batched=True,
        batch_size=batch_size,
        with_indices=True,
    )

    end_time = time.time()
    elapsed_time = end_time - start_time
    sps = total_samples / elapsed_time
    sps_str = f"{sps:.2f}"

    print(f"Finished processing {total_samples} samples for {task}.")

    utilization = calculate_utilization()
    utilization_str = format_utilization_narrow(utilization)
    print(
        f"{total_samples} | total/used/cuda/res/ram (Gb): {utilization_str['total_memory']}/{utilization_str['memory_used']}/"
        f"{utilization_str['cuda_allocated']}/{utilization_str['cuda_reserved']}/{utilization_str['ram_usage']} | "
        f"sps: {sps_str}"
    )

    processed_samples_map[task] = {
        # "task": task,
        # "samples": samples,
        "samples": processed_samples,
        "total_samples": total_samples,
        "elapsed_time": elapsed_time,
        "sps": sps,
        "batch_size": batch_size,
        "max_length": max_length,
        "utilization": utilization,
    }

# processed_samples = samples.map(model_process, num_proc=torch.cuda.device_count())
# processed_samples = samples.map(
#     model_process,
#     fn_kwargs={
#         "model": model,
#         "tokenizer": tokenizer,
#         "total_samples": total_samples,
#     },
#     num_proc=1,
#     batched=True,
#     batch_size=batch_size,
#     with_indices=True,
# )

processed_gec_samples = processed_samples_map["gec"]["samples"]

pprint(processed_gec_samples)
pprint(processed_gec_samples["processed"][:2])

### Eval decoder-only models

In [13]:
%%time
%load_ext autoreload

batch_size = 8  # gpt2-large
max_length = 350
max_new_tokens = 350

def model_process(batch, idx, **kwargs):
    num_samples = len(batch["input"])
    start_time = time.time()

    model = kwargs.get("model")
    tokenizer = kwargs.get("tokenizer")
    total_samples = kwargs.get("total_samples")

    # print(batch["request"])
    inputs = tokenizer(batch["request"], padding=True, return_tensors="pt").to(device)
    # inputs = tokenizer(batch["request"], return_tensors="pt").inputs.to(device)
    # inputs = tokenizer(item["request"], return_tensors="pt").inputs
    # print(inputs)

    model.config.use_cache = False
    # outputs = model.generate(**inputs, max_new_tokens=max_new_tokens, num_return_sequences=1)
    outputs = model.generate(**inputs, num_return_sequences=1)
    # outputs = model.generate(
    #     **inputs,
    #     do_sample=True,
    #     top_k=10,
    #     num_return_sequences=1,
    #     pad_token_id=tokenizer.eos_token_id,
    #     max_length=max_length,
    # )
    # print(outputs)

    trimmed_output = outputs[:, inputs.input_ids.shape[1] :]
    processed = tokenizer.batch_decode(trimmed_output, skip_special_tokens=True)
    print(processed)

    end_time = time.time()
    elapsed_time = end_time - start_time
    sps = num_samples / elapsed_time
    sps_str = f"{sps:.2f}"

    utilization = calculate_utilization()
    utilization_str = format_utilization_narrow(utilization)
    print(
        f"{idx[0]}-{idx[-1]}/{total_samples} | total/used/cuda/res/ram (Gb): {utilization_str['total_memory']}/{utilization_str['memory_used']}/"
        f"{utilization_str['cuda_allocated']}/{utilization_str['cuda_reserved']}/{utilization_str['ram_usage']} | "
        f"batch/sps: {num_samples}/{sps_str}"
    )

    # return {"processed": processed}
    return {"processed": processed}
    # return {"processed": processed, "utilization": utilization, "tps": tps}
    # return {"processed": processed, "utilization": utilization}


utilization = calculate_utilization()
utilization_str = format_utilization_narrow(utilization)
print(
    f"total/used/cuda/res/ram (Gb): {utilization_str['total_memory']}/{utilization_str['memory_used']}/"
    f"{utilization_str['cuda_allocated']}/{utilization_str['cuda_reserved']}/{utilization_str['ram_usage']}"
)

processed_samples_map = {}

for task, samples in test_dataset_dict.items():
    total_samples = len(samples)

    print(f"Processing {total_samples} samples for {task}")

    start_time = time.time()

    processed_samples = samples.map(
        model_process,
        fn_kwargs={
            "model": model,
            "tokenizer": tokenizer,
            "total_samples": total_samples,
        },
        num_proc=1,
        batched=True,
        batch_size=batch_size,
        with_indices=True,
    )

    end_time = time.time()
    elapsed_time = end_time - start_time
    sps = total_samples / elapsed_time
    sps_str = f"{sps:.2f}"

    print(f"Finished processing {total_samples} samples for {task}.")
    print(processed_samples)
    print(processed_samples["reference"][0])
    print(processed_samples["processed"][0])

    utilization = calculate_utilization()
    utilization_str = format_utilization_narrow(utilization)
    print(
        f"{total_samples} | total/used/cuda/res/ram (Gb): {utilization_str['total_memory']}/{utilization_str['memory_used']}/"
        f"{utilization_str['cuda_allocated']}/{utilization_str['cuda_reserved']}/{utilization_str['ram_usage']} | "
        f"sps: {sps_str}"
    )

    processed_samples_map[task] = {
        # "task": task,
        # "samples": samples,
        "samples": processed_samples,
        "total_samples": total_samples,
        "elapsed_time": elapsed_time,
        "sps": sps,
        "batch_size": batch_size,
        "max_length": max_length,
        "utilization": utilization,
    }

# processed_samples = samples.map(model_process, num_proc=torch.cuda.device_count())
# processed_samples = samples.map(
#     model_process,
#     fn_kwargs={
#         "model": model,
#         "tokenizer": tokenizer,
#         "total_samples": total_samples,
#     },
#     num_proc=1,
#     batched=True,
#     batch_size=batch_size,
#     with_indices=True,
# )

processed_gec_samples = processed_samples_map["gec"]["samples"]

pprint(processed_gec_samples)
pprint(processed_gec_samples["processed"][:2])

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
total/used/cuda/res/ram (Gb): 10.00/4.45/3.00/3.07/17.09
Processing 2031 samples for gec


Map:   0%|          | 0/2031 [00:00<?, ? examples/s]

['Despite strict Japanese society, I feel happy when I had dinner with my family.', 'They are increasing rapidly in Japan for a couple of years.', "As I go to my friends' house, I play 'Wii' together every time.", 'I have to come home as soon as possible, because my wife had a cough yesterday.', "Of course, my wife and I went to my father's house to eat dinner together.", 'I have much time to do many things because today is free.', 'Therefore, I come to go their company to discuss once a few months.', 'Tonight, I ate yellow noodle soup with red pork and wonton at a roadside vendor cart.']
0-7/2031 | total/used/cuda/res/ram (Gb): 10.00/4.87/3.00/3.37/22.70 | batch/sps: 8/12.51
['As it to me, the place which is important to me is Hangzhou city located in Zhejiang province.', 'Of course, I like Hangzhou city also because there are many great Chinese traditional restaurants.', "So I'm a little bit different from people who want to get English ability as a tool of business or something.", "

Map:   0%|          | 0/126 [00:00<?, ? examples/s]

['Canals are waterways channels, or artificial waterways, for water conveyance, or to service water transport vehicles.', 'Canals are waterways channels, or artificial waterways, for water conveyance, or to service water transport vehicles.', 'They can also help with irrigation. It can be thought of as an artificial version of a river.', 'In most cases, the engineered works will have a series of dams and locks that create reservoirs of low speed current flow.', 'A canal is also known as a navigation when it parallels a river and shares part of its waters and drainage basin, and leverages its resources by building dams and locks to increase and lengthen its stretches of slack water levels while staying in its valley.', 'In contrast, a canal cuts across a drainage divide atop a ridge, generally requiring an external water source above the highest elevation.', 'Many canals have been built at elevations towering over valleys and other water ways crossing far below.', 'Many canals have been

Map:   0%|          | 0/1127 [00:00<?, ? examples/s]

['new moon received poor reviews from critics.', 'rhonda shear (born 1954), american television personality, comedian, and actress', 'delta has many successful minor sports teams in hockey, baseball, and lacrosse.', 'burak zivit (born 1984) turkish actor', 'the bahr el-baqar massacre occurred on 8 april 1970, during the war of attrition.', 'she was mentor to canadian artist christian cardell corbet.', 'length of time between albums and false perception of perfectionism', 'on the same day, the allies finally split the japanese lines.']
0-7/1127 | total/used/cuda/res/ram (Gb): 10.00/9.75/3.00/8.49/22.79 | batch/sps: 8/14.26
['he retained that position until his death in 1921.', 'token forces from australia and poland also took part.', 'the dedicated family man now resides in los angeles.', 'the second account has been related to biblical mythology.', 'saint paul writing his epistles, 16th century (blaffer foundation collection, houston, texas).', "the 'pagri' ceremony was even attended b

Map:   0%|          | 0/1062 [00:00<?, ? examples/s]

['Outside the town, 6 tourists were reported killed. However, official documents indicate that at least 255 local residents were killed, with a further 29 never found.', 'Whereas at some times (and in some places) a Corps of two divisions was sufficient, at other times 5 or 6 divisions were necessary. However, under the Hindenburg regime (from summer 1916), new Corps headquarters were created without organic divisions.', 'An estimate in 2009 gave three million for Latin America and three million for Brazil, mostly illegal. However, legal abortion is difficult to impossible to obtain.', 'Filming used mostly local talent, although filming began on January 28 in Savannah, Georgia.', 'Treating his mission as one of critical importance, Edelman trained non-stop, watching World Cup videos during his rest time to create neuropathways that would develop his driving skill.', 'Infiltrating the club in disguise, Carter learns that the formula has been weaponized.', 'This proved very unpopular, an

Map:   0%|          | 0/1144 [00:00<?, ? examples/s]

['It was launched in October 2006 with 2.4 million viewers.', 'Felony disenfranchisement in Florida began with the 1838 ratification of the state constitution.', 'He was born in Chicago, Illinois.', 'Dementia is becoming more common in the population as a whole as more people live longer.', 'Within a decade his short stories and novels prevailed over his historical non-fiction works, though historical themes continued to imbue his fiction.', 'He was discharged from the Army in July 1967.', 'Kaieda was born and grew up in Suginami, Tokyo.', 'Courbet was accused of a deliberate pursuit of ugliness.']
0-7/1144 | total/used/cuda/res/ram (Gb): 10.00/9.82/3.00/8.55/22.78 | batch/sps: 8/11.55
["His gold medal was Spain's eighth gold medal of the Games.", 'Winter events include skiing, ice fishing, snowmobiles and snowshoeing.', 'This gene causes hemophilia A, a recessive X-linked coagulation disorder.', 'He was born in New York City, New York, and raised in Mount Vernon, New York.', 'On Octob

Map:   0%|          | 0/1590 [00:00<?, ? examples/s]

['As Jack walked across the boardwalk and down to the beach, the thought of Speedy Parker danced in his mind.', 'The proposal, if your Majesty will forgive my saying so, is most generous, Valgon concluded.', 'After the Commission has heard the recommendations of the Monitoring Trustee, it may approve the appointment of such advisors instead.', 'In Annex II to the Directive, there is further information on the method that has to be taken into account in order to remedy environmental damage.', 'Napoleon started out with more than four hundred thousand troops, but by the time he got back to Paris, he had less than ten thousand left.', 'The Ukrainian authorities and the IMF are expected to agree shortly on an economic programme that will be supported by a financing arrangement with the IMF.', 'Where possible, the Commission should provide information on where the requested information is not available to you.', 'For certain products processed from cereals, the insignificance of Community p

### Saving eval results

In [14]:
hardware = "HomeDesktop (RTX3080)"
# hardware = "NC24 (A100)"
print(f"model_name: {model_name}, model_id: {model_id}, model_path: {model_path}")

total_params = sum(p.numel() for p in model.parameters())
total_trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Total/trainable params: {total_params}/{total_trainable_params}")

if list in globals():
    del list

all_flats = []
all_scores = []
if os.path.exists("results/all-scores.csv"):
    all_scores = pd.read_csv("results/all-scores.csv").to_dict("records")

all_fulls = []

rouge_metric = evaluate.load("rouge")
# glue_metric = evaluate.load("glue", "stsb")
sacreblue_metric = evaluate.load("sacrebleu")
sari_metric = evaluate.load("sari")
em_metric = evaluate.load("exact_match")


def calculate_scores(processed_samples):
    rouge_score = rouge_metric.compute(
        predictions=processed_samples["processed"], references=processed_samples["references"]
    )
    # pprint(rouge_score)

    sacreblue_score = sacreblue_metric.compute(
        predictions=processed_samples["processed"], references=processed_samples["references"]
    )
    # pprint(sacreblue_score)

    sari_score = sari_metric.compute(
        sources=processed_samples["input"],
        predictions=processed_samples["processed"],
        references=processed_samples["references"],
    )
    # pprint(sari_score)

    em_score = em_metric.compute(predictions=processed_samples["processed"], references=processed_samples["reference"])
    # pprint(em_score)

    return {
        "rouge": rouge_score,
        "sacreblue": sacreblue_score,
        "sari": sari_score,
        "em": em_score,
    }


for task, obj in processed_samples_map.items():
    print(f"task: {task}, samples: {len(obj['samples'])}")

    batch = obj["samples"]
    total_samples = len(batch)

    all_saved_samples = batch.remove_columns(["references"])
    saved_samples = all_saved_samples[:100] if len(all_saved_samples) > 100 else all_saved_samples
    flats_frame = pd.DataFrame.from_records(saved_samples)
    flats_frame.to_json(f"samples/{model_path}_{task}.json", orient="records")

    scores = calculate_scores(batch)
    # pprint(scores)

    score_paths = [
        "rouge.rouge1",
        # "rouge.rouge2",
        # "rouge.rougeL",
        # "rouge.rougeLsum",
        "sacreblue.score",
        "sari.sari",
        "em.exact_match",
    ]

    normalized_scores = {}
    for s, v in scores.items():
        print(f"s: {s}, v: {v}")
        for ss, vv in v.items():
            if not isinstance(vv, list):
                # normalized_scores[f"score.{k}.{ss}"] = vv
                path = f"{s}.{ss}"
                if path in score_paths:
                    normalized_scores[f"score.{s}.{ss}"] = vv
    # pprint(normalized_scores)

    normalized_utilization = {}
    for s, v in obj["utilization"].items():
        if not isinstance(v, list):
            normalized_utilization[f"utilization.{s}"] = v
    # print(normalized_utilization)

    flat_dict = {
        "model": model_id,
        "hardware": hardware,
        "total_params": total_params,
        "total_samples": total_samples,
        "elapsed_time": obj["elapsed_time"],
        "sps": obj["sps"],
        "batch_size": obj["batch_size"],
        "max_length": obj["max_length"],
        "task": task,
    }
    flat_dict.update(normalized_scores)
    flat_dict.update(normalized_utilization)
    # pprint(frame)

    all_flats.append(flat_dict)
    all_scores.append(flat_dict)

    fulls_frame = {
        "task": task,
        "total_samples": total_samples,
        "elapsed_time": obj["elapsed_time"],
        "sps": obj["sps"],
        "batch_size": obj["batch_size"],
        "max_length": obj["max_length"],
    }
    fulls_frame.update(
        {
            "scores": scores,
            "utilization": obj["utilization"],
        }
    )
    all_fulls.append(fulls_frame)

flats_frame = pd.DataFrame.from_records(all_flats)
flats_frame.to_csv(f"results/{model_path}.csv", index=False)

scores_frame = pd.DataFrame.from_records(all_scores)
scores_frame.to_csv(f"results/all-scores.csv", index=False)

fulls_dict = {
    "model": model_id,
    "hardware": hardware,
    "total_params": total_params,
}
for full in all_fulls:
    fulls_dict[full["task"]] = full

print(fulls_dict)
fulls_frame = pd.DataFrame.from_records([fulls_dict])
fulls_frame.to_json(f"results/{model_path}.json", orient="records", index=False)

model_name: gpt2-large-coedit, model_id: iliazlobin/gpt2-large-coedit, model_path: iliazlobin_gpt2-large-coedit
Total/trainable params: 774030080/774030080
task: gec, samples: 2031
s: rouge, v: {'rouge1': 0.8820027396714665, 'rouge2': 0.7591515635635748, 'rougeL': 0.8747724041297571, 'rougeLsum': 0.8748793895418927}
s: sacreblue, v: {'score': 61.06635704165297, 'counts': [40167, 31527, 25080, 19960], 'totals': [46741, 44710, 42685, 40665], 'precisions': [85.93526026400805, 70.51442630284053, 58.75600327984069, 49.08397885159228], 'bp': 0.9444776114379095, 'sys_len': 46741, 'ref_len': 49411}
s: sari, v: {'sari': 67.02221730671336}
s: em, v: {'exact_match': 0.09699655342195962}
task: clarity, samples: 126
s: rouge, v: {'rouge1': 0.866226370515452, 'rouge2': 0.7969226854278454, 'rougeL': 0.858432276251673, 'rougeLsum': 0.8597132033422445}
s: sacreblue, v: {'score': 78.78335963544968, 'counts': [3551, 3207, 2959, 2748], 'totals': [4129, 4003, 3877, 3751], 'precisions': [86.00145313635262, 