In [1]:
%load_ext autoreload
%load_ext dotenv
%dotenv
!huggingface-cli login --token $HUGGING_FACE_TOKEN

Token has not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /home/izlobin/.cache/huggingface/token
Login successful


In [2]:
# %%capture
# %pip install transformers evaluate
# %pip install nltk absl-py rouge_score
# %pip install bleu sacrebleu
# %pip install sacremoses
# %pip install scipy
# %pip install sentencepiece
# %pip install optimum auto-gptq
# %pip install scikit-learn
# %pip install einops
# %pip install bitsandbytes
# %pip install accelerate
# %pip install pynvml

In [1]:
import os
import sys
from pprint import pprint

import evaluate
import numpy as np
import pandas as pd
import torch
from datasets import load_dataset
from huggingface_hub import HfApi
from transformers import (
    AutoModelForCausalLM,
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    BartForCausalLM,
    BartModel,
    BartTokenizer,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    T5ForConditionalGeneration,
    T5Tokenizer,
    T5TokenizerFast,
)

current_dir = os.getcwd()
parent_dir = os.path.dirname(current_dir)
sys.path.append(parent_dir)

from utils.metric import calculate_scores
from utils.monitoring import calculate_utilization, format_utilization_narrow, print_utilization

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
pprint(f"Device: {device}")

'Device: cuda'


In [9]:
# from trl import SFTTrainer, TrainingArguments
from trl import SFTTrainer

print(SFTTrainer)

<class 'trl.trainer.sft_trainer.SFTTrainer'>


## Loading models

### Training llama-2 on grammarly-coedit dataset
* https://huggingface.co/meta-llama/Llama-2-7b-hf
* https://huggingface.co/docs/transformers/en/model_doc/llama

In [26]:
model_name = "./model-llama-4bits-coedit"
# model_name = "meta-llama/Llama-2-7b-hf"
# model_name = "TheBloke/Llama-2-7B-GPTQ"
# model_name = "TheBloke/Nous-Hermes-Llama-2-7B-GPTQ"
model_alias = model_name.replace("/", "_")

from transformers import LlamaTokenizer, LlamaForCausalLM
from transformers import BitsAndBytesConfig, GPTQConfig

quantization_config = BitsAndBytesConfig(
   load_in_4bit=True,
   bnb_4bit_quant_type="nf4",
   bnb_4bit_use_double_quant=True,
   bnb_4bit_compute_dtype=torch.bfloat16
)
print(type(quantization_config))

tokenizer = LlamaTokenizer.from_pretrained(model_name,
    padding_side="left",
    # model_max_length=512,
    add_eos_token=True,
    # add_bos_token=True,
    # padding='longest',
    # use_fast=False,
    trust_remote_code=True,
)
# tokenizer.pad_token = tokenizer.eos_token
# tokenizer.padding_side = "right"

# input_text = """Fix grammar: In the case of young people the best way is studying as hard as possible to get better grades so in the future they will have better chance to find better jobs
# Response:
# """.strip()
# tokens = tokenizer(input_text, return_tensors="pt").to(device)
# print(tokens.input_ids)

model = LlamaForCausalLM.from_pretrained(
    model_name,
    device_map=0,
    quantization_config=quantization_config,
)
model.config.use_cache = False

print(type(tokenizer))
print(type(model))
print(model.config)

# from auto_gptq import exllama_set_max_input_length
# model = exllama_set_max_input_length(model, max_input_length=2400)

utilization = calculate_utilization()
utilization_str = format_utilization_narrow(utilization)
print(
    f"total/used/cuda/res/ram (Gb): {utilization_str['total_memory']}/{utilization_str['memory_used']}/"
    f"{utilization_str['cuda_allocated']}/{utilization_str['cuda_reserved']}/{utilization_str['ram_usage']}"
)

<class 'transformers.utils.quantization_config.BitsAndBytesConfig'>


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [4]:
# input_text = "Fix grammatical errors in this sentence: I goes to school every day."
# input_text = "Fix grammar: In the case of young people the best way is studying as hard as possible to get better grades so in the future they will have better chance to find better jobs ."
input_text = """Fix grammar: In the case of young people the best way is studying as hard as possible to get better grades so in the future they will have better chance to find better jobs
Response:
""".strip()

# input_ids = tokenizer(input_text, add_special_tokens=False, return_tensors="pt")
tokens = tokenizer(input_text, return_tensors="pt").to(device)

outputs = model.generate(**tokens, max_new_tokens=128)
# outputs = model.generate(
#     **input_ids,
#     do_sample=True,
#     top_k=10,
#     num_return_sequences=1,
#     # eos_token_id=tokenizer.eos_token_id,
#     # return_attention_mask=False,
#     max_length=256,
# )

print(tokenizer.decode(outputs[0], skip_special_tokens=True))

Below is an original English text. Correct English Grammar and spelling mistakes in the text.

### Input:
Fix grammar: In the case of young people the best way is studying as hard as possible to get better grades so in the future they will have better chance to find better jobs

### Response:
Fix grammar: In the case of young people the best way is to study as hard as possible to get better grades so in the future they will have better chances to find better jobs.

### Explanation:
In the case of young people the best way is to study as hard as possible to get better grades so in the future they will have better chances to find better jobs.

### Input:
Fix grammar: In the case of young people the best way is studying as hard as possible to get better grades so in the future they will have better chance to find


In [6]:
from datasets import DatasetDict

train_data = load_dataset("grammarly/coedit", split="train[:100]")
test_data = load_dataset("grammarly/coedit", split="train[:10]")

dataset = DatasetDict({"train": train_data, "test": test_data})
print(dataset)
print(len(dataset))

full_dataset = load_dataset("grammarly/coedit")
print(full_dataset)
print(len(full_dataset))

# data["test"] = test_data
# data = load_dataset("grammarly/coedit")
# data["test"] = data["validation"]
# del data["validation"]

# print(len(train_data))
# print(len(data))

# print(data)
# print(data["train"])
# print(data["train"][0]['src'])
# print(data["test"])
# print(data["test"][0]['src'])

DatasetDict({
    train: Dataset({
        features: ['_id', 'task', 'src', 'tgt'],
        num_rows: 100
    })
    test: Dataset({
        features: ['_id', 'task', 'src', 'tgt'],
        num_rows: 10
    })
})
2
DatasetDict({
    train: Dataset({
        features: ['_id', 'task', 'src', 'tgt'],
        num_rows: 69071
    })
    validation: Dataset({
        features: ['_id', 'task', 'src', 'tgt'],
        num_rows: 1712
    })
})
2


In [7]:
DEFAULT_SYSTEM_PROMPT = """
Below is an original English text. Correct English Grammar and spelling mistakes in the text.
""".strip()


def generate_prompt(input: str, response: str, system_prompt: str = DEFAULT_SYSTEM_PROMPT) -> str:
    return f"""### Instruction: {system_prompt}


### Input:
{input.strip()}


### Response:
{response}
""".strip()


def process_data(item):
    return {
        "text": generate_prompt(item["src"], item["tgt"]),
    }


text_dataset = dataset.map(process_data, remove_columns=["task", "_id"])
print(text_dataset)
print(len(text_dataset))
print(text_dataset["train"][0]["text"])
print(text_dataset["train"][1]["text"])

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['src', 'tgt', 'text'],
        num_rows: 100
    })
    test: Dataset({
        features: ['src', 'tgt', 'text'],
        num_rows: 10
    })
})
2
### Instruction: Below is an original English text. Correct English Grammar and spelling mistakes in the text.


### Input:
Remove all grammatical errors from this text: For example, countries with a lot of deserts can terraform their desert to increase their habitable land and using irrigation to provide clean water to the desert.


### Response:
For example, countries with a lot of deserts can transform their desert to increase their habitable land and use irrigation to provide clean water to the desert.
### Instruction: Below is an original English text. Correct English Grammar and spelling mistakes in the text.


### Input:
Improve the grammaticality: As the number of people grows, the need of habitable environment is unquestionably essential.


### Response:
As the number of people g

In [15]:
print(tokenizer.eos_token_id)
print(tokenizer.pad_token_id)
print(tokenizer.special_tokens_map)


def preprocess_function(examples):
    model_inputs = tokenizer(examples["text"])
    # labels = tokenizer(text_target=examples["tgt"], max_length=1024, padding=True).input_ids
    # model_inputs["labels"] = labels
    return model_inputs


# processed_data = data.map(preprocess_function, batched=True, batch_size=10, remove_columns=data["train"].column_names)
processed_dataset = text_dataset.map(
    preprocess_function, batched=True, batch_size=100, remove_columns=text_dataset["train"].column_names
)
print(processed_dataset)
print(processed_dataset["train"][0]["input_ids"])
print(processed_dataset["train"][1]["input_ids"])

2
2
{'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '</s>'}


Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Map:   0%|          | 0/10 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 100
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 10
    })
})
[1, 835, 2799, 4080, 29901, 13866, 338, 385, 2441, 4223, 1426, 29889, 28518, 4223, 16878, 3034, 322, 805, 7807, 28947, 297, 278, 1426, 29889, 13, 13, 13, 2277, 29937, 10567, 29901, 13, 15941, 599, 14961, 2922, 936, 4436, 515, 445, 1426, 29901, 1152, 1342, 29892, 10916, 411, 263, 3287, 310, 18197, 29879, 508, 15087, 689, 1009, 18197, 304, 7910, 1009, 4760, 519, 2982, 322, 773, 3805, 8966, 362, 304, 3867, 5941, 4094, 304, 278, 18197, 29889, 13, 13, 13, 2277, 29937, 13291, 29901, 13, 2831, 1342, 29892, 10916, 411, 263, 3287, 310, 18197, 29879, 508, 4327, 1009, 18197, 304, 7910, 1009, 4760, 519, 2982, 322, 671, 3805, 8966, 362, 304, 3867, 5941, 4094, 304, 278, 18197, 29889, 2]
[1, 835, 2799, 4080, 29901, 13866, 338, 385, 2441, 4223, 1426, 29889, 28518, 4223, 16878, 3034, 322, 

## Training

In [None]:
%reload_ext autoreload

utilization = calculate_utilization()
utilization_str = format_utilization_narrow(utilization)
print(
    f"total/used/cuda/res/ram (Gb): {utilization_str['total_memory']}/{utilization_str['memory_used']}/"
    f"{utilization_str['cuda_allocated']}/{utilization_str['cuda_reserved']}/{utilization_str['ram_usage']}"
)


# available_memory = utilization["total_memory"] - utilization["memory_used"]
# recommended_fraction = available_memory / utilization["total_memory"]

# actual_fraction = 0.90
# torch.cuda.set_per_process_memory_fraction(actual_fraction, 0)

# print(
#     f"total/used/available memory (Gb): {utilization['total_memory']/1024**3:.2f}/{utilization['memory_used']/1024**3:.2f}/{available_memory/1024**3:.2f} | "
#     f"recommended/actual fraction: {recommended_fraction:.2f}/{actual_fraction:.2f}"
# )

# torch.cuda.empty_cache()
# torch.empty(utilization['total_memory'] // 2, dtype=torch.int8, device='cuda')
# print_utilization()

total/used/cuda/res/ram (Gb): 10.00/3.51/0.00/0.00/23.28


In [26]:
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

peft_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

perf_model = get_peft_model(model, peft_config)
print_trainable_parameters(perf_model)

utilization = calculate_utilization()
utilization_str = format_utilization_narrow(utilization)
print(
    f"total/used/cuda/res/ram (Gb): {utilization_str['total_memory']}/{utilization_str['memory_used']}/"
    f"{utilization_str['cuda_allocated']}/{utilization_str['cuda_reserved']}/{utilization_str['ram_usage']}"
)

trainable params: 8388608 || all params: 3508801536 || trainable%: 0.23907331075678143
total/used/cuda/res/ram (Gb): 10.00/7.62/4.24/5.33/14.40


In [27]:
from transformers import Trainer, DataCollatorForLanguageModeling, TrainingArguments
from trl import SFTTrainer

rouge = evaluate.load("rouge")


# def compute_metrics(eval_pred):
#     predictions, labels = eval_pred
#     decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
#     labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
#     decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

#     result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

#     prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
#     result["gen_len"] = np.mean(prediction_lens)

#     return {k: round(v, 4) for k, v in result.items()}

def compute_metrics(eval_pred):
    return {"k": 2}


# data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model_name)
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)


training_args = TrainingArguments(
    output_dir="model-llama-4bits-train",
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    optim="paged_adamw_32bit",
    logging_steps=1,
    learning_rate=1e-4,
    fp16=True,
    max_grad_norm=0.3,
    num_train_epochs=20,
    evaluation_strategy="epoch",
    eval_steps=0.2,
    warmup_ratio=0.05,
    save_strategy="epoch",
    group_by_length=True,
    save_safetensors=True,
    lr_scheduler_type="cosine",
    seed=42,
    load_best_model_at_end=True,
)
print(type(training_args))


trainer = Trainer(
    model=perf_model,
    args=training_args,
    train_dataset=processed_dataset["train"],
    eval_dataset=processed_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)
print(type(trainer))

# trainer.train()

<class 'transformers.training_args.TrainingArguments'>
<class 'transformers.trainer.Trainer'>


In [28]:
trainer.save_model("model-llama-4bits-coedit")
# trainer.push_to_hub("iliazlobin/bart-large-coedit", token=os.getenv("HUGGING_FACE_TOKEN"))

### Save and Load for Evaluation

In [41]:
from transformers import BartForConditionalGeneration, BartTokenizer

model_name = "iliazlobin/bart-large-coedit"
model_alias = model_name.replace("/", "_")

# tokenizer = BartTokenizer.from_pretrained('iliazlobin/bart-grammarly')
model = BartForConditionalGeneration.from_pretrained(model_name, device_map=0)
print(model.config)

model.save_pretrained("model-bart-large-coedit")

Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}


BartConfig {
  "_name_or_path": "iliazlobin/bart-grammarly",
  "activation_dropout": 0.1,
  "activation_function": "gelu",
  "add_bias_logits": false,
  "add_final_layer_norm": false,
  "architectures": [
    "BartForConditionalGeneration"
  ],
  "attention_dropout": 0.1,
  "bos_token_id": 0,
  "classif_dropout": 0.1,
  "classifier_dropout": 0.0,
  "d_model": 1024,
  "decoder_attention_heads": 16,
  "decoder_ffn_dim": 4096,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 12,
  "decoder_start_token_id": 2,
  "dropout": 0.1,
  "early_stopping": true,
  "encoder_attention_heads": 16,
  "encoder_ffn_dim": 4096,
  "encoder_layerdrop": 0.0,
  "encoder_layers": 12,
  "eos_token_id": 2,
  "forced_bos_token_id": 0,
  "forced_eos_token_id": 2,
  "gradient_checkpointing": false,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2"
  },
  "init_std": 0.02,
  "is_encoder_decoder": true,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2
  },
  "max_position_e

In [42]:
model = BartForConditionalGeneration.from_pretrained('./bart-grammarly-model', device_map=0)
print(type(model))
# print(model.config)

checkpoint = "facebook/bart-large"
tokenizer = BartTokenizer.from_pretrained(checkpoint)
print(type(tokenizer))

<class 'transformers.models.bart.modeling_bart.BartForConditionalGeneration'>
<class 'transformers.models.bart.tokenization_bart.BartTokenizer'>


In [43]:
# input_text = "Remove all grammatical errors from this text: For example, countries with a lot of deserts can terraform their desert to increase their habitable land and using irrigation to provide clean water to the desert."
# input_text = "Fix the grammatical mistakes: One of the reasons for its success was that laptop catered to people's needs greatly."
# input_text = "Fix grammaticality in this sentence: The low technological standard is one of the significant factors which hamper engineering design process."
input_text = "Fix all grammatical errors: They also feels that these cars are less attractive compared to conventional cars too."
tokens = tokenizer(input_text, return_tensors="pt").to(device)
outputs = model.generate(**tokens, max_new_tokens=100)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

They also feel that these cars are less attractive compared to conventional cars too.
