In [1]:
%load_ext autoreload
%load_ext dotenv
%dotenv
!huggingface-cli login --token $HUGGING_FACE_TOKEN

Token has not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /home/izlobin/.cache/huggingface/token
Login successful


In [2]:
# %%capture
# %pip install transformers evaluate
# %pip install nltk absl-py rouge_score
# %pip install bleu sacrebleu
# %pip install sacremoses
# %pip install scipy
# %pip install sentencepiece
# %pip install optimum auto-gptq
# %pip install scikit-learn
# %pip install einops
# %pip install bitsandbytes
# %pip install accelerate
# %pip install pynvml

In [2]:
import os
import sys
from pprint import pprint

import evaluate
import numpy as np
import pandas as pd
import torch
from datasets import load_dataset
from huggingface_hub import HfApi
from transformers import (
    AutoModelForCausalLM,
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    BartForCausalLM,
    BartModel,
    BartTokenizer,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    T5ForConditionalGeneration,
    T5Tokenizer,
    T5TokenizerFast,
)

current_dir = os.getcwd()
parent_dir = os.path.dirname(current_dir)
sys.path.append(parent_dir)

from utils.dataset import get_iterater_samples_simplified
from utils.metric import calculate_scores
from utils.monitoring import calculate_utilization, format_utilization_narrow, print_utilization

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
pprint(f"Device: {device}")

'Device: cuda'


In [12]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

model_id = "openai-community/gpt2"

tokenizer = AutoTokenizer.from_pretrained(
    model_id,
    use_fast=False,
)
print(type(tokenizer))

tokenizer.add_eos_token = True
tokenizer.pad_token_id = tokenizer.eos_token_id
tokenizer.padding_side = "right"

model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto")

print(type(model))

<class 'transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer'>
<class 'transformers.models.gpt2.modeling_gpt2.GPT2LMHeadModel'>


In [6]:
%%script echo skip

# tokenizer = AutoTokenizer.from_pretrained(
#     model_id,
#     # padding_side="right",
#     # model_max_length=512,
#     # pad_token="!",
#     # add_eos_token=True,
#     # add_bos_token=True,
#     # padding='longest',
#     use_fast=True,
#     # trust_remote_code=True,
# )
# print(type(tokenizer))

# tokenizer.pad_token_id = tokenizer.eos_token_id

# print(tokenizer.eos_token_id)
# print(tokenizer.pad_token_id)
# print(tokenizer.special_tokens_map)

# tokenizer.pad_token = tokenizer.eos_token
# tokenizer.add_special_tokens(
#     {
#         "pad_token": "<|endoftext|>",
#         "eos_token": "<|endoftext|>",
#         "bos_token": "<|endoftext|>",
#         "unk_token": "<|endoftext|>",
#     }
# )

input_text = """Fix grammar: In the case of young people the best way is studying as hard as possible to get better grades so in the future they will have better chance to find better jobs
Response:
""".strip()
tokens = tokenizer(input_text, add_special_tokens=True)
print(tokens.input_ids)

skip


In [13]:
input_text = "Fix grammatical errors in this sentence: I goes to school every day."
# input_text = "Fix grammar: In the case of young people the best way is studying as hard as possible to get better grades so in the future they will have better chance to find better jobs ."

# input_ids = tokenizer(input_text, add_special_tokens=False, return_tensors="pt")
input_ids = tokenizer(input_text, return_tensors="pt").to(device)

outputs = model.generate(**input_ids, max_new_tokens=100)
# outputs = model.generate(
#     **input_ids,
#     do_sample=True,
#     top_k=10,
#     num_return_sequences=1,
#     # eos_token_id=tokenizer.eos_token_id,
#     # return_attention_mask=False,
#     max_length=256,
# )

print(tokenizer.decode(outputs[0], skip_special_tokens=True))

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Fix grammatical errors in this sentence: I goes to school every day.

I go to school every day. I go to school every day. I go to school every day. I go to school every day. I go to school every day. I go to school every day. I go to school every day. I go to school every day. I go to school every day. I go to school every day. I go to school every day. I go to school every day. I go to school every day. I go to school every day.


## Loading dataset

In [None]:
from datasets import load_dataset

full_ds = load_dataset("Abirate/english_quotes")
print(full_ds)
train_ds = load_dataset("Abirate/english_quotes", split="train[:30%]")
test_ds = load_dataset("Abirate/english_quotes", split="train[-05%:]")

processed_train_ds = train_ds.map(
    lambda samples: tokenizer(samples["quote"], max_length=512), batched=True
)
processed_test_ds = test_ds.map(
    lambda samples: tokenizer(samples["quote"], max_length=512), batched=True
)
print(processed_train_ds)
print(processed_test_ds)

# test_data = train_test_split(
#     test_size=0.1,
# )
# print(test_data)

### Grammarly dataset

In [14]:
from datasets import DatasetDict

train_data = load_dataset("grammarly/coedit", split="train[:1000]")
test_data = load_dataset("grammarly/coedit", split="train[:100]")

dataset = DatasetDict({"train": train_data, "test": test_data})
print(dataset)
print(len(dataset))

full_dataset = load_dataset("grammarly/coedit")
print(full_dataset)
print(len(full_dataset))

# data["test"] = test_data
# data = load_dataset("grammarly/coedit")
# data["test"] = data["validation"]
# del data["validation"]

# print(len(train_data))
# print(len(data))

# print(data)
# print(data["train"])
# print(data["train"][0]['src'])
# print(data["test"])
# print(data["test"][0]['src'])

DatasetDict({
    train: Dataset({
        features: ['_id', 'task', 'src', 'tgt'],
        num_rows: 1000
    })
    test: Dataset({
        features: ['_id', 'task', 'src', 'tgt'],
        num_rows: 100
    })
})
2
DatasetDict({
    train: Dataset({
        features: ['_id', 'task', 'src', 'tgt'],
        num_rows: 69071
    })
    validation: Dataset({
        features: ['_id', 'task', 'src', 'tgt'],
        num_rows: 1712
    })
})
2


In [15]:
# DEFAULT_SYSTEM_PROMPT = """
# Below is an original English text. Correct English Grammar and spelling mistakes in the text.
# """.strip()


# def generate_instruction_prompt(input: str, response: str, system_prompt: str = DEFAULT_SYSTEM_PROMPT) -> str:
#     return f"""### Instruction: {system_prompt}
# ### Input:
# {input.strip()}
# ### Response:
# {response}
# """.strip()


def generate_grammar_prompt(src: str, tgt: str) -> str:
    return f"""{src}
Response: {tgt}
""".strip()


def process_data(item):
    return {
        "text": generate_grammar_prompt(item["src"], item["tgt"]),
    }


text_dataset = dataset.map(process_data, remove_columns=["task", "_id"])
print(text_dataset)
print(len(text_dataset))
print(text_dataset["train"][0]["text"])
print(text_dataset["train"][1]["text"])

DatasetDict({
    train: Dataset({
        features: ['src', 'tgt', 'text'],
        num_rows: 1000
    })
    test: Dataset({
        features: ['src', 'tgt', 'text'],
        num_rows: 100
    })
})
2
Remove all grammatical errors from this text: For example, countries with a lot of deserts can terraform their desert to increase their habitable land and using irrigation to provide clean water to the desert.
Response: For example, countries with a lot of deserts can transform their desert to increase their habitable land and use irrigation to provide clean water to the desert.
Improve the grammaticality: As the number of people grows, the need of habitable environment is unquestionably essential.
Response: As the number of people grows, the need for a habitable environment is unquestionably increasing.


In [16]:
def preprocess_function(examples):
    model_inputs = tokenizer(examples["text"], max_length=2048)
    new_input_ids = []
    for input_ids in model_inputs.input_ids:
        new_input_ids.append(input_ids.append(tokenizer.eos_token_id))

    # labels = tokenizer(text_target=examples["tgt"], max_length=1024, padding=True).input_ids
    # model_inputs["labels"] = labels

    model_inputs.input_ids = new_input_ids
    return model_inputs


# processed_data = data.map(preprocess_function, batched=True, batch_size=10, remove_columns=data["train"].column_names)
processed_dataset = text_dataset.map(
    # preprocess_function, batched=True, batch_size=100, remove_columns=text_dataset["train"].column_names
    preprocess_function, batched=True, batch_size=100
)
print(processed_dataset)
print(processed_dataset["train"][0]["text"])
print(processed_dataset["train"][0]["input_ids"])
print(processed_dataset["train"][1]["input_ids"])

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Map:   0%|          | 0/100 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['src', 'tgt', 'text', 'input_ids', 'attention_mask'],
        num_rows: 1000
    })
    test: Dataset({
        features: ['src', 'tgt', 'text', 'input_ids', 'attention_mask'],
        num_rows: 100
    })
})
Remove all grammatical errors from this text: For example, countries with a lot of deserts can terraform their desert to increase their habitable land and using irrigation to provide clean water to the desert.
Response: For example, countries with a lot of deserts can transform their desert to increase their habitable land and use irrigation to provide clean water to the desert.
[27914, 477, 14599, 44935, 8563, 422, 428, 2420, 25, 1114, 1672, 11, 2678, 351, 257, 1256, 286, 45288, 460, 1059, 430, 687, 511, 10326, 284, 2620, 511, 49055, 1956, 290, 1262, 35425, 284, 2148, 3424, 1660, 284, 262, 10326, 13, 198, 31077, 25, 1114, 1672, 11, 2678, 351, 257, 1256, 286, 45288, 460, 6121, 511, 10326, 284, 2620, 511, 49055, 1956, 290, 779, 

## Training

In [17]:
%reload_ext autoreload

utilization = calculate_utilization()
utilization_str = format_utilization_narrow(utilization)
print(
    f"total/used/cuda/res/ram (Gb): {utilization_str['total_memory']}/{utilization_str['memory_used']}/"
    f"{utilization_str['cuda_allocated']}/{utilization_str['cuda_reserved']}/{utilization_str['ram_usage']}"
)


available_memory = utilization["total_memory"] - utilization["memory_used"]
recommended_fraction = available_memory / utilization["total_memory"]

actual_fraction = 0.90
torch.cuda.set_per_process_memory_fraction(actual_fraction, 0)

print(
    f"total/used/available memory (Gb): {utilization['total_memory']/1024**3:.2f}/"
    f"{utilization['memory_used']/1024**3:.2f}/{available_memory/1024**3:.2f}"
)
print(f"recommended/actual fraction: {recommended_fraction:.2f}/{actual_fraction:.2f}")

total/used/cuda/res/ram (Gb): 10.00/3.38/0.48/1.04/12.82
total/used/available memory (Gb): 10.00/3.38/6.62
recommended/actual fraction: 0.66/0.90


In [12]:
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training


def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )


# model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=True)


config = LoraConfig(
    r=128,
    lora_alpha=32,
    # target_modules=["query_key_value"],
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM",
)

model = get_peft_model(model, config)
print(type(model))
print_trainable_parameters(model)

<class 'peft.peft_model.PeftModelForCausalLM'>
trainable params: 23592960 || all params: 443728640 || trainable%: 5.316979314204285


In [18]:
%%capture
%load_ext tensorboard
%tensorboard --logdir 'outputs/runs'

In [19]:
from transformers import Trainer, TrainingArguments, DataCollatorForLanguageModeling

# needed for gpt-neo-x tokenizer
tokenizer.pad_token = tokenizer.eos_token

args = TrainingArguments(
    output_dir="train-gpt2-test",
    learning_rate=2e-4,
    warmup_ratio=0.05,
    optim="paged_adamw_8bit",
    fp16=True,
    gradient_checkpointing=True,
    per_device_train_batch_size=10,
    gradient_accumulation_steps=4,
    logging_steps=1,
    warmup_steps=2,
    # save_steps=50,
    # max_steps=100,
    num_train_epochs=2,
    save_strategy="epoch",
    # eval_steps=1,
    # evaluation_strategy="epoch",
    report_to='tensorboard',
)

trainer = Trainer(
    model=model,
    train_dataset=processed_train_ds,
    # eval_dataset=processed_test_ds,
    args=args,
    data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),
)
print(type(trainer))

# model.config.use_cache = False  # silence the warnings. Please re-enable for inference!
trainer.train()

NameError: name 'processed_train_ds' is not defined

In [14]:
# trainer.save_model("model-gpt2-large-bnb4bit-test")

print(type(trainer))
print(type(trainer.model))

trained_model = trainer.model

<class 'transformers.trainer.Trainer'>
<class 'peft.peft_model.PeftModelForCausalLM'>


## Eval

In [20]:
# input_text = "Fix grammatical errors in this sentence: I goes to school every day."
# input_text = "Fix grammar: In the case of young people the best way is studying as hard as possible to get better grades so in the future they will have better chance to find better jobs ."

# input_text = f"""Fix grammar: For example, countries with a lot of deserts can terraform their desert to increase their habitable land and using irrigation to provide clean water to the desert.
# Response:
# """.strip()
# input_text = f"""Fix grammar: Remove all grammatical errors from this text: For example, countries with a lot of deserts can terraform their desert to increase their habitable land and using irrigation to provide clean water to the desert.
# Response: For example, countries with a lot of deserts can transform their desert to increase their habitable land and use irrigation to provide clean water to the desert.
# """.strip()
input_text = f"""Remove all grammatical errors from this text: For example, countries with a lot of deserts can terraform their desert to increase their habitable land and using irrigation to provide clean water to the desert.
Response:
""".strip()

# input_ids = tokenizer(input_text, add_special_tokens=False, return_tensors="pt")
input_ids = tokenizer(input_text, return_tensors="pt").to(device)

trained_model.config.pad_token_id = trained_model.config.eos_token_id
# outputs = trained_model.generate(**input_ids, max_new_tokens=128)
# outputs = trained_model.generate(**input_ids, max_new_tokens=128, num_return_sequences=1)
# outputs = trained_model.generate(**input_ids, max_new_tokens=128, pad_token_id=tokenizer.eos_token_id, num_return_sequences=1)
outputs = model.generate(
    **input_ids,
    do_sample=True,
    top_k=10,
    num_return_sequences=1,
    pad_token_id=tokenizer.eos_token_id,
    # return_attention_mask=True,
    max_length=128,
)

print(tokenizer.decode(outputs[0], skip_special_tokens=True))

Remove all grammatical errors from this text: For example, countries with a lot of deserts can terraform their desert to increase their habitable land and using irrigation to provide clean water to the desert.
Response: Yes.
Response: Yes.
Response: Yes.
Response: Yes.
Response: Yes.
Response: Yes. This question has been asked many times before. It has been answered in the form of a question and response with an emphasis on the word "yes". However, I think that this question needs a clarification. The first question, "yes", is the "yes" of the verb meaning "


In [None]:
rouge_metric = evaluate.load("rouge")
# glue_metric = evaluate.load("glue", "stsb")
sacreblue_metric = evaluate.load("sacrebleu")
sari_metric = evaluate.load("sari")
em_metric = evaluate.load("exact_match")

In [None]:
%load_ext autoreload

batch_size = 10
total_samples = 100
samples_map = {
    "fluency": get_iterater_samples_simplified(label="fluency", num_samples=total_samples),
    "clarity": get_iterater_samples_simplified(label="clarity", num_samples=total_samples),
    "coherence": get_iterater_samples_simplified(label="coherence", num_samples=total_samples),
}
print(samples_map)
print(samples_map["fluency"]["task"][0])
print(samples_map["fluency"]["source"][0])
print(samples_map["fluency"]["reference"][0])
print(samples_map["fluency"]["task"][1])

In [None]:
%%time
%load_ext autoreload

batch_size = 5
total_samples = 10
samples_map = {
    "fluency": get_iterater_samples_simplified(label="fluency", num_samples=total_samples),
    "clarity": get_iterater_samples_simplified(label="clarity", num_samples=total_samples),
    "coherence": get_iterater_samples_simplified(label="coherence", num_samples=total_samples),
}

def model_process(batch, idx, **kwargs):
    num_samples = len(batch["task"])
    start_time = time.time()

    model = kwargs.get("model")
    tokenizer = kwargs.get("tokenizer")
    total_samples = kwargs.get("total_samples")

    input_ids = tokenizer(batch["task"], padding=True, return_tensors="pt").input_ids.to(device)
    # input_ids = tokenizer(batch['task'], return_tensors="pt").input_ids.to(device)
    # input_ids = tokenizer(item['task'], return_tensors="pt").input_ids
    # outputs = model.generate(input_ids, max_length=512)
    outputs = model.generate(input_ids, max_length=312)
    # print(f"outputs: {outputs}")
    processed = tokenizer.batch_decode(outputs, skip_special_tokens=True)

    end_time = time.time()
    elapsed_time = end_time - start_time
    sps = num_samples / elapsed_time
    sps_str = f"{sps:.2f}"

    utilization = calculate_utilization()
    utilization_str = format_utilization_narrow(utilization)
    print(
        f"{idx[0]}-{idx[-1]}/{total_samples} | total/used/cuda/res/ram (Gb): {utilization_str['total_memory']}/{utilization_str['memory_used']}/"
        f"{utilization_str['cuda_allocated']}/{utilization_str['cuda_reserved']}/{utilization_str['ram_usage']} | "
        f"batch/sps: {num_samples}/{sps_str}"
    )

    # return {"processed": processed}
    return {"processed": processed}
    # return {"processed": processed, "utilization": utilization, "tps": tps}
    # return {"processed": processed, "utilization": utilization}


utilization = calculate_utilization()
utilization_str = format_utilization_narrow(utilization)
print(
    f"total/used/cuda/res/ram (Gb): {utilization_str['total_memory']}/{utilization_str['memory_used']}/"
    f"{utilization_str['cuda_allocated']}/{utilization_str['cuda_reserved']}/{utilization_str['ram_usage']}"
)

processed_samples_map = {}

for category, samples in samples_map.items():
    total_samples = len(samples)

    print(f"Processing {total_samples} samples for {category}")

    start_time = time.time()

    processed_samples = samples.map(
        model_process,
        fn_kwargs={
            "model": model,
            "tokenizer": tokenizer,
            "total_samples": total_samples,
        },
        num_proc=1,
        batched=True,
        batch_size=batch_size,
        with_indices=True,
    )

    end_time = time.time()
    elapsed_time = end_time - start_time
    sps = total_samples / elapsed_time
    sps_str = f"{sps:.2f}"

    print(f"Finished processing {total_samples} samples for {category}.")

    utilization = calculate_utilization()
    utilization_str = format_utilization_narrow(utilization)
    print(
        f"{total_samples} | total/used/cuda/res/ram (Gb): {utilization_str['total_memory']}/{utilization_str['memory_used']}/"
        f"{utilization_str['cuda_allocated']}/{utilization_str['cuda_reserved']}/{utilization_str['ram_usage']} | "
        f"sps: {sps_str}"
    )

    processed_samples_map[category] = {
        # "category": category,
        # "samples": samples,
        "samples": processed_samples,
        "sps": sps,
        "utilization": utilization,
    }

# processed_samples = samples.map(model_process, num_proc=torch.cuda.device_count())
# processed_samples = samples.map(
#     model_process,
#     fn_kwargs={
#         "model": model,
#         "tokenizer": tokenizer,
#         "total_samples": total_samples,
#     },
#     num_proc=1,
#     batched=True,
#     batch_size=batch_size,
#     with_indices=True,
# )

processed_fluency_samples = processed_samples_map["fluency"]["samples"]

pprint(processed_fluency_samples)
pprint(processed_fluency_samples["processed"][:2])