In [3]:
%load_ext autoreload
%load_ext dotenv
%dotenv
!huggingface-cli login --token $HUGGING_FACE_TOKEN

Token has not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /home/izlobin/.cache/huggingface/token
Login successful


In [2]:
# %%capture
# %pip install transformers evaluate
# %pip install nltk absl-py rouge_score
# %pip install bleu sacrebleu
# %pip install sacremoses
# %pip install scipy
# %pip install sentencepiece
# %pip install optimum auto-gptq
# %pip install scikit-learn
# %pip install einops
# %pip install bitsandbytes
# %pip install accelerate
# %pip install pynvml

In [4]:
%load_ext autoreload
%autoreload 2

import os
import sys
import time
from pprint import pprint

import evaluate
import numpy as np
import pandas as pd
import torch
from datasets import load_dataset
from huggingface_hub import HfApi
from transformers import (
    AutoModelForCausalLM,
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    BartForCausalLM,
    BartModel,
    BartTokenizer,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    T5ForConditionalGeneration,
    T5Tokenizer,
    T5TokenizerFast,
)

current_dir = os.getcwd()
parent_dir = os.path.dirname(current_dir)
sys.path.append(parent_dir)

from utils.dataset import get_iterater_samples_simplified, get_iterater_samples_with_instruction
from utils.metric import calculate_scores
from utils.monitoring import calculate_utilization, format_utilization_narrow, print_utilization

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
pprint(f"Device: {device}")

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
'Device: cuda'


In [1]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

model_name = "gpt2-large"
model_repo = f"openai-community"
model_id = f"{model_repo}/{model_name}"
model_checkpoint = f"{model_repo}/{model_name}"
model_path = f"{model_repo}_{model_name}"
print(f"model_name: {model_name}," f"model_id: {model_id}," f"model_path: {model_path}")

tokenizer = AutoTokenizer.from_pretrained(
    model_id,
    use_fast=True,
)
tokenizer.add_eos_token = True
tokenizer.pad_token_id = tokenizer.eos_token_id
tokenizer.padding_side = "left"

print(type(tokenizer))


bnb_config = BitsAndBytesConfig(
    load_in_4bit=True, bnb_4bit_use_double_quant=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch.bfloat16
)

# model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=bnb_config, device_map={"":0})
# model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=bnb_config, device_map="auto", max_memory={0: "10GiB"})
model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=bnb_config, device_map=0)

print(type(model))

<class 'transformers.models.gpt2.tokenization_gpt2_fast.GPT2TokenizerFast'>
<class 'transformers.models.gpt2.modeling_gpt2.GPT2LMHeadModel'>


In [5]:
%%script echo skip

tokenizer = AutoTokenizer.from_pretrained(
    model_id,
    # padding_side="right",
    # model_max_length=512,
    # pad_token="!",
    # add_eos_token=True,
    # add_bos_token=True,
    # padding='longest',
    use_fast=True,
    # trust_remote_code=True,
)
print(type(tokenizer))

# tokenizer.pad_token_id = tokenizer.eos_token_id

# print(tokenizer.eos_token_id)
# print(tokenizer.pad_token_id)
# print(tokenizer.special_tokens_map)

# tokenizer.pad_token = tokenizer.eos_token
# tokenizer.add_special_tokens(
#     {
#         "pad_token": "<|endoftext|>",
#         "eos_token": "<|endoftext|>",
#         "bos_token": "<|endoftext|>",
#         "unk_token": "<|endoftext|>",
#     }
# )

tokenizer.add_eos_token = True
# tokenizer.padding_side = "right"

input_text = """Fix grammar: In the case of young people the best way is studying as hard as possible to get better grades so in the future they will have better chance to find better jobs
Response:
""".strip()
tokens = tokenizer(input_text, add_special_tokens=True)
print(tokens.input_ids)

skip


In [6]:
%%script echo skip

input_text = "Fix grammatical errors in this sentence: I goes to school every day."
# input_text = "Fix grammar: In the case of young people the best way is studying as hard as possible to get better grades so in the future they will have better chance to find better jobs ."

# input_ids = tokenizer(input_text, add_special_tokens=False, return_tensors="pt")
inputs = tokenizer(input_text, return_tensors="pt").to(device)

outputs = model.generate(**inputs, max_new_tokens=100)
# outputs = model.generate(
#     **input_ids,
#     do_sample=True,
#     top_k=10,
#     num_return_sequences=1,
#     # eos_token_id=tokenizer.eos_token_id,
#     # return_attention_mask=False,
#     max_length=256,
# )

print(tokenizer.decode(outputs[0], skip_special_tokens=True))

skip


## Loading dataset

In [7]:
%%script echo skip

from datasets import load_dataset

full_ds = load_dataset("Abirate/english_quotes")
print(full_ds)
train_ds = load_dataset("Abirate/english_quotes", split="train[:30%]")
test_ds = load_dataset("Abirate/english_quotes", split="train[-05%:]")

processed_train_ds = train_ds.map(
    lambda samples: tokenizer(samples["quote"], max_length=512), batched=True
)
processed_test_ds = test_ds.map(
    lambda samples: tokenizer(samples["quote"], max_length=512), batched=True
)
print(processed_train_ds)
print(processed_test_ds)

# test_data = train_test_split(
#     test_size=0.1,
# )
# print(test_data)

skip


### Grammarly dataset

In [5]:
from datasets import DatasetDict, load_dataset

# full_dataset = load_dataset("grammarly/coedit")
# print(full_dataset)

# train_data = load_dataset("grammarly/coedit", split="train")
# test_data = load_dataset("grammarly/coedit", split="validation")

train_data = load_dataset("grammarly/coedit", split="train[:1000]")
test_data = load_dataset("grammarly/coedit", split="train[:100]")

dataset = DatasetDict({"train": train_data, "test": test_data})
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['_id', 'task', 'src', 'tgt'],
        num_rows: 1000
    })
    test: Dataset({
        features: ['_id', 'task', 'src', 'tgt'],
        num_rows: 100
    })
})


In [6]:
def generate_grammar_prompt(src: str, tgt: str) -> str:
    return f"""{src}
Response: {tgt}
""".strip()


def prompt_dataset(item):
    return {
        "prompt": generate_grammar_prompt(item["src"], item["tgt"]),
        # "labels": item["tgt"],
    }


prompted_dataset = dataset.map(prompt_dataset, remove_columns=["task", "_id"])
print(prompted_dataset)
print(len(prompted_dataset))
print(prompted_dataset["train"][0]["prompt"])
print(prompted_dataset["train"][1]["prompt"])

DatasetDict({
    train: Dataset({
        features: ['src', 'tgt', 'prompt'],
        num_rows: 1000
    })
    test: Dataset({
        features: ['src', 'tgt', 'prompt'],
        num_rows: 100
    })
})
2
Remove all grammatical errors from this text: For example, countries with a lot of deserts can terraform their desert to increase their habitable land and using irrigation to provide clean water to the desert.
Response: For example, countries with a lot of deserts can transform their desert to increase their habitable land and use irrigation to provide clean water to the desert.
Improve the grammaticality: As the number of people grows, the need of habitable environment is unquestionably essential.
Response: As the number of people grows, the need for a habitable environment is unquestionably increasing.


In [7]:
import numpy as np
from datasets import Dataset
from torch.utils.data import DataLoader
from transformers import DataCollatorForLanguageModeling, DataCollatorForSeq2Seq


def process_dataset(batch):
    # model_input = tokenizer(examples["prompt"], max_length=256, padding="max_length", truncation=True)

    model_inputs = tokenizer(batch["prompt"])
    model_reponses = tokenizer(batch["tgt"])

    new_input_ids = []
    new_labels = []
    for input_ids, response_ids in zip(model_inputs.input_ids, model_reponses.input_ids):
        # debug_labels = input_ids[-len(response_ids) :]
        # print(tokenizer.decode(input_ids, skip_special_tokens=False))
        # print(tokenizer.decode(debug_labels, skip_special_tokens=False))

        input_ids.append(tokenizer.eos_token_id)
        new_input_ids.append(input_ids)

        num_tokens_ignore = len(input_ids) - len(response_ids)
        labels = [-100] * num_tokens_ignore + input_ids[-len(response_ids) :]
        new_labels.append(labels)

    new_attention_mask = []
    for attention_mask in model_inputs.attention_mask:
        attention_mask.append(1)
        new_attention_mask.append(attention_mask)

    # labels = tokenizer(text_target=examples["tgt"], max_length=1024, padding=True).input_ids
    # model_inputs["labels"] = labels

    model_inputs["input_ids"] = new_input_ids
    model_inputs["attention_mask"] = new_attention_mask
    model_inputs["labels"] = new_labels

    # print(
    #     f">> input_ids: {len(model_inputs['input_ids'])},"
    #     "attention_mask: {len(model_inputs['attention_mask'])},"
    #     "labels: {len(model_inputs['labels'])}"
    # )
    # count = 0
    # for input_ids, labels, attention_masks in zip(new_input_ids, new_labels, new_attention_mask):
    #     count += 1
    #     if count > 3:
    #         break
    #     print(f">> input_ids: {len(input_ids)}, attention_mask: {len(attention_masks)}, labels: {len(labels)}")
    #     print(tokenizer.decode(input_ids, skip_special_tokens=False))
    #     print(labels)

    return model_inputs


processed_dataset = prompted_dataset.map(process_dataset, batched=True, remove_columns=["tgt", "src", "prompt"])
print(processed_dataset)
print(processed_dataset["train"]["input_ids"][0])
print(processed_dataset["train"]["attention_mask"][0])
print(processed_dataset["train"]["labels"][0])

data_collator = DataCollatorForSeq2Seq(tokenizer, return_tensors="pt", padding=True)

# dataloader = DataLoader(processed_dataset["train"], batch_size=4, collate_fn=data_collator)
# for batch in dataloader:
# print(batch)
# break

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 1000
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 100
    })
})
[27914, 477, 14599, 44935, 8563, 422, 428, 2420, 25, 1114, 1672, 11, 2678, 351, 257, 1256, 286, 45288, 460, 1059, 430, 687, 511, 10326, 284, 2620, 511, 49055, 1956, 290, 1262, 35425, 284, 2148, 3424, 1660, 284, 262, 10326, 13, 198, 31077, 25, 1114, 1672, 11, 2678, 351, 257, 1256, 286, 45288, 460, 6121, 511, 10326, 284, 2620, 511, 49055, 1956, 290, 779, 35425, 284, 2148, 3424, 1660, 284, 262, 10326, 13, 50256]
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
[-100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -

## Training

In [16]:
%reload_ext autoreload

training_model_repo = f"iliazlobin"
training_model_name: str = f"{model_name}-bnb8-coedit"
training_model_id = f"{training_model_repo}/{training_model_name}"
training_model_checkpoint = f"{training_model_id}"
training_model_path = f"{training_model_repo}_{training_model_name}"
print(
    f"training_model_name: {training_model_name}, "
    f"training_model_id: {training_model_id}, "
    f"training_model_path: {training_model_path}"
)

utilization = calculate_utilization()
utilization_str = format_utilization_narrow(utilization)
print(
    f"total/used/cuda/res/ram (Gb): {utilization_str['total_memory']}/{utilization_str['memory_used']}/"
    f"{utilization_str['cuda_allocated']}/{utilization_str['cuda_reserved']}/{utilization_str['ram_usage']}"
)


available_memory = utilization["total_memory"] - utilization["memory_used"]
recommended_fraction = available_memory / utilization["total_memory"]

actual_fraction = 0.90
torch.cuda.set_per_process_memory_fraction(actual_fraction, 0)

print(
    f"total/used/available memory (Gb): {utilization['total_memory']/1024**3:.2f}/"
    f"{utilization['memory_used']/1024**3:.2f}/{available_memory/1024**3:.2f}"
)
print(f"recommended/actual fraction: {recommended_fraction:.2f}/{actual_fraction:.2f}")

training_model_name: gpt2-large-bnb8-coedit,training_model_id: iliazlobin/gpt2-large-bnb8-coedit,training_model_path: iliazlobin_gpt2-large-bnb8-coedit
total/used/cuda/res/ram (Gb): 10.00/6.63/0.82/1.36/19.64
total/used/available memory (Gb): 10.00/6.63/3.37
recommended/actual fraction: 0.34/0.90


In [17]:
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

perf_model = model
perf_model = prepare_model_for_kbit_training(perf_model, use_gradient_checkpointing=True)
# perf_model.gradient_checkpointing_enable()

config = LoraConfig(
    r=64,
    lora_alpha=32,
    # target_modules=["query_key_value"],
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM",
)

perf_model = get_peft_model(perf_model, config)
print(type(perf_model))
print(perf_model)

<class 'peft.peft_model.PeftModelForCausalLM'>
trainable params: 11796480 || all params: 431932160 || trainable%: 2.7310955498196754


In [None]:
training_model = perf_model
print(type(training_model))
print(training_model.config)

total_params = sum(p.numel() for p in perf_model.parameters())
total_trainable_params = sum(p.numel() for p in perf_model.parameters() if p.requires_grad)
print(f"Total/trainable params: {total_params}/{total_trainable_params}")

In [21]:
%%capture
%load_ext tensorboard
%tensorboard --logdir "model-{training_model_path}-train/runs"

In [23]:
from transformers import DataCollatorForLanguageModeling, Trainer, TrainingArguments


train_path = f"model-{training_model_path}-train"
print(f"train_path: {train_path}")

train_size = len(processed_dataset["train"])
per_device_train_batch_size = 2
gradient_accumulation_steps = 4
per_epoch_steps = train_size / (per_device_train_batch_size * gradient_accumulation_steps)
max_steps = 125
epochs = max_steps / per_epoch_steps
print(f"train_size: {train_size}, per_epoch_steps: {per_epoch_steps}, max_steps: {max_steps}, epochs: {epochs}")

if max_steps < per_epoch_steps:
    raise Exception(f"Training doesn't cover the entire training dataset with {train_size} samples")

# epochs = 2
# epoch_total_steps = epochs * per_epoch_steps
# print(f"train_size: {train_size}, per_epoch_steps: {per_epoch_steps}, epochs: {epochs}, epoch_total_steps: {epoch_total_steps}")

args = TrainingArguments(
    output_dir=train_path,
    learning_rate=2e-4,
    # warmup_ratio=0.05,
    # optim="paged_adamw_8bit",
    fp16=True,
    gradient_checkpointing=True,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    # num_train_epochs=2,
    max_steps=max_steps,
    warmup_steps=2,
    logging_steps=10,
    save_strategy="steps",
    save_steps=50,
    evaluation_strategy="steps",
    eval_steps=50,
    report_to="tensorboard",
)


trainer = Trainer(
    model=training_model,
    train_dataset=processed_dataset["train"],
    eval_dataset=processed_dataset["test"],
    args=args,
    data_collator=data_collator,
)
print(type(trainer))

model.config.use_cache = False
trainer.train()

train_size: 1000, per_epoch_steps: 125.0, max_steps: 125, epochs: 1.0
train_path: model-iliazlobin_gpt2-large-bnb8-coedit-train
<class 'transformers.trainer.Trainer'>


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss,Validation Loss
50,0.2879,0.314122
100,0.3346,0.286165




TrainOutput(global_step=125, training_loss=0.27462344932556154, metrics={'train_runtime': 103.2135, 'train_samples_per_second': 9.689, 'train_steps_per_second': 1.211, 'total_flos': 293214005575680.0, 'train_loss': 0.27462344932556154, 'epoch': 1.0})

In [24]:
print(type(trainer))
print(type(trainer.model))

trainer.save_model(f"model-{training_model_path}")
# trainer.push_to_hub(training_model_id, token=os.getenv("HUGGING_FACE_TOKEN"))
trained_model = trainer.model
model = trained_model

<class 'transformers.trainer.Trainer'>
<class 'peft.peft_model.PeftModelForCausalLM'>


## Test inference

In [None]:
input_texts = [
    f"""Remove all grammatical errors from this text: For example, countries with a lot of deserts can terraform their desert to increase their habitable land and using irrigation to provide clean water to the desert.
Response:
""".strip(),
    f"""Fix grammar: He drive a amazing car.
Response:
""".strip(),
]
# input_texts = [
#     "Fix grammar:  We don't have enough good Open Source games -- it's a waste to pour all the resources we have into one. :) Wesnoth has dwarves with guns, World of Warcraft'' has gnomes and goblins with explosives and flying machines -- where do you, personally, define the limits of the fantasy genre?",
#     "Fix grammar in this sentence:  In 2001, they successfully nominated Bohemian Hall, still a vibrant community center/beer garden started by Czech immigrants in Astoria, Queens, and the Casa Amadeo Music Store, the oldest, continuously occupied Latin music store in New York City,  as census sites to the National Register of Historic Places.",
# ]

# input_ids = tokenizer(input_text, add_special_tokens=False, return_tensors="pt")
inputs = tokenizer(input_texts, return_tensors="pt", padding=True).to(device)
print(inputs)

model.config.use_cache = False
# model.config.pad_token_id = model.config.eos_token_id
# outputs = model.generate(**input_ids, max_new_tokens=128)
# outputs = model.generate(**input_ids, max_new_tokens=128, num_return_sequences=1)
outputs = model.generate(**inputs, max_new_tokens=128, pad_token_id=tokenizer.eos_token_id, num_return_sequences=1)
# outputs = model.generate(
#     **inputs,
#     do_sample=True,
#     top_k=10,
#     num_return_sequences=1,
#     pad_token_id=tokenizer.eos_token_id,
#     # return_attention_mask=True,
#     max_length=256,
# )

trimmed_output = outputs[:, inputs.input_ids.shape[1] :]
trimmed_outputs = tokenizer.batch_decode(trimmed_output, skip_special_tokens=True)
print(trimmed_outputs)

{'input_ids': tensor([[27914,   477, 14599, 44935,  8563,   422,   428,  2420,    25,  1114,
          1672,    11,  2678,   351,   257,  1256,   286, 45288,   460,  1059,
           430,   687,   511, 10326,   284,  2620,   511, 49055,  1956,   290,
          1262, 35425,   284,  2148,  3424,  1660,   284,   262, 10326,    13,
           198, 31077,    25],
        [50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 22743, 23491,    25,   679,  3708,   257,  4998,  1097,    13,
           198, 31077,    25]], device='cuda:0'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0,