In [1]:
%load_ext autoreload
%load_ext dotenv
%dotenv
!huggingface-cli login --token $HUGGING_FACE_TOKEN

Token has not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /home/izlobin/.cache/huggingface/token
Login successful


In [2]:
# %%capture
# %pip install transformers evaluate
# %pip install nltk absl-py rouge_score
# %pip install bleu sacrebleu
# %pip install sacremoses
# %pip install scipy
# %pip install sentencepiece
# %pip install optimum auto-gptq
# %pip install scikit-learn
# %pip install einops
# %pip install bitsandbytes
# %pip install accelerate
# %pip install pynvml
# %pip install tlr

In [3]:
%load_ext autoreload
%autoreload 2

import os
import sys
import time
from pprint import pprint

import evaluate
import numpy as np
import pandas as pd
import torch
from datasets import load_dataset
from huggingface_hub import HfApi
from transformers import (
    AutoModelForCausalLM,
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    BartForCausalLM,
    BartModel,
    BartTokenizer,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    T5ForConditionalGeneration,
    T5Tokenizer,
    T5TokenizerFast,
)

current_dir = os.getcwd()
parent_dir = os.path.dirname(current_dir)
sys.path.append(parent_dir)

from utils.dataset import get_iterater_samples_simplified, get_iterater_samples_with_instruction
from utils.metric import calculate_scores
from utils.monitoring import calculate_utilization, format_utilization_narrow, print_utilization

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
pprint(f"Device: {device}")

from trl import SFTTrainer
print(SFTTrainer)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


'Device: cuda'
<class 'trl.trainer.sft_trainer.SFTTrainer'>


## Loading models

### Training llama-2 on grammarly-coedit dataset
* https://huggingface.co/meta-llama/Llama-2-7b-hf
* https://huggingface.co/docs/transformers/en/model_doc/llama

In [4]:
from transformers import LlamaTokenizer, LlamaForCausalLM
from transformers import BitsAndBytesConfig, GPTQConfig

# model_name = "./model-llama-4bits-coedit"
# model_name = "meta-llama/Llama-2-7b-hf"
# model_name = "TheBloke/Llama-2-7B-GPTQ"
# model_name = "TheBloke/Nous-Hermes-Llama-2-7B-GPTQ"
model_name = "Llama-2-7b-chat-hf"  # fine-tuning for specific chat format
# model_repo = f"NousResearch"
model_repo = f"meta-llama"
model_id = f"{model_repo}/{model_name}"
model_checkpoint = f"{model_repo}/{model_name}"
model_path = f"{model_repo}_{model_name}"
print(f"model_name: {model_name}," f"model_id: {model_id}," f"model_path: {model_path}")

# "NousResearch/Llama-2-7b-chat-hf" has an issue: PyTorch Forums [solved] Assertion `srcIndex < srcSelectDimSize`
# resolved: https://discuss.pytorch.org/t/solved-assertion-srcindex-srcselectdimsize-failed-on-gpu-for-torch-cat/1804/33
# input embedding tensor shape changed?

quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=False,
    bnb_4bit_compute_dtype=torch.bfloat16,
)
print(type(quantization_config))

tokenizer = LlamaTokenizer.from_pretrained(
    model_id,
    # model_max_length=512,
    # add_eos_token=True,
    # add_bos_token=True,
    # padding='longest',
    # use_fast=False,
    trust_remote_code=True,
    # do_sample=True,
    # temperature=0.1,
)
# tokenizer.add_bos_token = False
# tokenizer.add_eos_token = True
tokenizer.pad_token = tokenizer.eos_token
# tokenizer.padding_side = "right" # https://github.com/huggingface/transformers/issues/26072
tokenizer.padding_side = "left" # coherent output without extra symbols on the left
print(type(tokenizer))
print(tokenizer.add_bos_token)
print(tokenizer.add_eos_token)
print(tokenizer.eos_token_id)
print(tokenizer.pad_token_id)
print(tokenizer.padding_side)

# model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf", torch_dtype=torch.bfloat16)
# tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf", torch_dtype=torch.bfloat16)
model = AutoModelForCausalLM.from_pretrained(model_id, device_map=0)
# model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=quantization_config, device_map=0)
print(type(model))

model.generation_config.do_sample = True
model.generation_config.max_new_tokens = 100
print(model.generation_config)
print(model.config)
print(model)

total_params = sum(p.numel() for p in model.parameters())
total_trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Total/trainable params: {total_params}/{total_trainable_params}")

model_name: Llama-2-7b-chat-hf,model_id: meta-llama/Llama-2-7b-chat-hf,model_path: meta-llama_Llama-2-7b-chat-hf
<class 'transformers.utils.quantization_config.BitsAndBytesConfig'>
<class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>
True
False
2
2
left


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

<class 'transformers.models.llama.modeling_llama.LlamaForCausalLM'>
GenerationConfig {
  "bos_token_id": 1,
  "do_sample": true,
  "eos_token_id": 2,
  "max_length": 4096,
  "max_new_tokens": 100,
  "pad_token_id": 0,
  "temperature": 0.6,
  "top_p": 0.9
}

LlamaConfig {
  "_name_or_path": "meta-llama/Llama-2-7b-chat-hf",
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 11008,
  "max_position_embeddings": 4096,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-05,
  "rope_scaling": null,
  "rope_theta": 10000.0,
  "tie_word_embeddings": false,
  "torch_dtype": "float16",
  "transformers_version": "4.39.3",
  "use_cache": true,
  "vocab_size": 32000
}

LlamaForCausalLM(
  (model): LlamaModel(

In [5]:
%%script echo skip

input = """"<s>[INST] <<SYS>>
You're a grammar assistant. You just rewrite incorrect sentences to make them correct.
You do not provide any explanation. You end your response right after the corrected sentence.
<</SYS>>
Write a paraphrase for the sentence: I don't think this is a dream. [/INST]"
"""

batch_input = [
    """<s>[INST] <<SYS>>
You're a grammar assistant. You just rewrite incorrect sentences to make them correct.
You do not provide any explanation. You end your response right after the corrected sentence.
<</SYS>>
Make the sentence simple: Hugo Wolf was born in Windischgra ̈ tz in the Duchy of Styria (now Slovenj Gradec, Slovenia), then a part of the Austrian Empire. [/INST]
""",
    """<s>[INST] <<SYS>>
You're a grammar assistant. You just rewrite incorrect sentences to make them correct.
You do not provide any explanation. You end your response right after the corrected sentence.
<</SYS>>
Make the sentence simple: Handzus ̌ played for the St. Louis Blues, Phoenix Coyotes, Philadelphia Flyers, Los Angeles Kings, San Jose Sharks and the Chicago Blackhawks, with whom he won the Stanley Cup with in 2013. [/INST]
""",
]

# batch_less_input = [
#     """<s>[INST] <<SYS>>
# You're a grammar assistant. You just rewrite incorrect sentences to make them correct.
# You do not provide any explanation. You end your response right after the corrected sentence.
# <</SYS>>
# Make the sentence simple: Hugo Wolf was born. [/INST]
# """,
#     """<s>[INST] <<SYS>>
# You're a grammar assistant. You just rewrite incorrect sentences to make them correct.
# You do not provide any explanation. You end your response right after the corrected sentence.
# <</SYS>>
# Make the sentence simple: Handzus ̌[/INST]
# """,
# ]

# coedit_input = """<s>[INST] <<SYS>>
# You're a grammar assistant. You just rewrite incorrect sentences to make them correct.
# You do not provide any explanation. You end your response right after the corrected sentence.
# <</SYS>>
# Make the text more consistent: Jump up ^ Allen 26 -- 27. Allen is the Port Chicago Mutiny. [/INST]"""

coedit_input = """<s>[INST] <<SYS>>
You're a grammar assistant. You just rewrite incorrect sentences to make them correct.
You do not provide any explanation. You end your response right after the corrected sentence.
<</SYS>>
Fix coherence mistakes in this sentence: Jump up ^ Colgrave & Mynors pp. xxxix -- xl. Colgrave & Mynors is bede's Ecclesiastical History. [/INST]"""

# coedit_input_batch = [
#     """<s>[INST] <<SYS>>
# You're a grammar assistant. You just rewrite incorrect sentences to make them correct.
# You do not provide any explanation. You end your response right after the corrected sentence.
# <</SYS>>
# Make the text more consistent: Jump up ^ Allen 26 -- 27. Allen is the Port Chicago Mutiny. [/INST]""",
#     """<s>[INST] <<SYS>>
# You're a grammar assistant. You just rewrite incorrect sentences to make them correct.
# You do not provide any explanation. You end your response right after the corrected sentence.
# <</SYS>>
# Fix coherence mistakes in this sentence: Jump up ^ Colgrave & Mynors pp. xxxix -- xl. Colgrave & Mynors is bede's Ecclesiastical History. [/INST]""",
# ]
llama2_prompt = ["<s>[INST] <<SYS>>\nYou're a grammar assistant. You just rewrite incorrect sentences to make them correct.\nYou do not provide any explanation. You end your response right after the corrected sentence.\n<</SYS>>\nMake the text more consistent: Jump up ^ Allen 26 -- 27. Allen is the Port Chicago Mutiny. [/INST] Jump up ^ Allen, The Port Chicago Mutiny, 26 -- 27.</s>", "<s>[INST] <<SYS>>\nYou're a grammar assistant. You just rewrite incorrect sentences to make them correct.\nYou do not provide any explanation. You end your response right after the corrected sentence.\n<</SYS>>\nFix coherence mistakes in this sentence: Jump up ^ Colgrave & Mynors pp. xxxix -- xl. Colgrave & Mynors is bede's Ecclesiastical History. [/INST] Jump up ^ Colgrave & Mynors, Bede's Ecclesiastical History, pp. xxxix -- xl.</s>"]
# reference = ['Jump up ^ Allen, The Port Chicago Mutiny, 26 -- 27.', "Jump up ^ Colgrave & Mynors, Bede's Ecclesiastical History, pp. xxxix -- xl."]

# padding - https://huggingface.co/docs/transformers/en/pad_truncation
# inputs = tokenizer(input, return_tensors="pt", return_attention_mask=False).to(device)
# inputs = tokenizer(batch_input, padding=True, return_tensors="pt", return_attention_mask=False).to(device)
# inputs = tokenizer(batch_less_input, padding=True, truncation=True, max_length=100, return_tensors="pt", return_attention_mask=False).to(device)
# inputs = tokenizer(coedit_input_batch, padding=True, return_tensors="pt", return_attention_mask=False).to(device)
inputs = tokenizer(llama2_prompt, padding=True, return_tensors="pt", return_attention_mask=False).to(device)
# inputs = tokenizer(coedit_input, return_tensors="pt", return_attention_mask=False).to(device)
outputs = model.generate(**inputs)
result = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
# print(result)

trimmed_output = outputs[:, inputs.input_ids.shape[1] :]
trimmed_result = tokenizer.batch_decode(trimmed_output, skip_special_tokens=True)
print(f"trimmed_result: {trimmed_result}")

skip


## Loading dataset

### Grammarly dataset

In [6]:
from datasets import DatasetDict, load_dataset

# full_dataset = load_dataset("grammarly/coedit")
# print(full_dataset)

# train_dataset = load_dataset("grammarly/coedit", split="train[:50000]")
# test_dataset = load_dataset("grammarly/coedit", split="train[10000:]")
# # test_dataset = load_dataset("grammarly/coedit", split="validation")

all_dataset = load_dataset("grammarly/coedit", split="train+validation")
# print(all_dataset)

# print()
# print(f"train set {set(all_dataset['task'])}")
# print(f"total len: {len(all_dataset)}")
# print(f"gec len: {len(all_dataset.filter(lambda x: x['task'] == 'gec'))}")
# print(f"simplification len: {len(all_dataset.filter(lambda x: x['task'] == 'simplification'))}")
# print(f"clarity len: {len(all_dataset.filter(lambda x: x['task'] == 'clarity'))}")
# print(f"coherence len: {len(all_dataset.filter(lambda x: x['task'] == 'coherence'))}")
# print(f"paraphrase len: {len(all_dataset.filter(lambda x: x['task'] == 'paraphrase'))}")
# print(f"neutralize len: {len(all_dataset.filter(lambda x: x['task'] == 'neutralize'))}")
# print()

# train_ratio = 0.01
# test_ratio = 0.001
train_ratio = 0.1
test_ratio = 0.01
# train_ratio = 0.9
# test_ratio = 0.1

gec_dataset = all_dataset.filter(lambda x: x["task"] == "gec")
train_gec_dataset = gec_dataset.select(range(0, int(train_ratio * len(gec_dataset))))
test_gec_dataset = gec_dataset.select(range(int((1 - test_ratio) * len(gec_dataset)), len(gec_dataset)))

simplification_dataset = all_dataset.filter(lambda x: x["task"] == "simplification")
train_simplification_dataset = simplification_dataset.select(range(0, int(train_ratio * len(simplification_dataset))))
test_simplification_dataset = simplification_dataset.select(
    range(int((1 - test_ratio) * len(simplification_dataset)), len(simplification_dataset))
)

clarity_dataset = all_dataset.filter(lambda x: x["task"] == "clarity")
train_clarity_dataset = clarity_dataset.select(range(0, int(train_ratio * len(clarity_dataset))))
test_clarity_dataset = clarity_dataset.select(range(int((1 - test_ratio) * len(clarity_dataset)), len(clarity_dataset)))

coherence_dataset = all_dataset.filter(lambda x: x["task"] == "coherence")
train_coherence_dataset = coherence_dataset.select(range(0, int(train_ratio * len(coherence_dataset))))
test_coherence_dataset = coherence_dataset.select(
    range(int((1 - test_ratio) * len(coherence_dataset)), len(coherence_dataset))
)

paraphrase_dataset = all_dataset.filter(lambda x: x["task"] == "paraphrase")
train_paraphrase_dataset = paraphrase_dataset.select(range(0, int(train_ratio * len(paraphrase_dataset))))
test_paraphrase_dataset = paraphrase_dataset.select(
    range(int((1 - test_ratio) * len(paraphrase_dataset)), len(paraphrase_dataset))
)

neutralize_dataset = all_dataset.filter(lambda x: x["task"] == "neutralize")
neutralize_dataset_split = int(train_ratio * len(neutralize_dataset))
train_neutralize_dataset = neutralize_dataset.select(range(0, int(train_ratio * len(neutralize_dataset))))
test_neutralize_dataset = neutralize_dataset.select(
    range(int((1 - test_ratio) * len(neutralize_dataset)), len(neutralize_dataset))
)

from datasets import concatenate_datasets

train_dataset = concatenate_datasets(
    [
        train_gec_dataset,
        train_simplification_dataset,
        train_clarity_dataset,
        train_coherence_dataset,
        train_paraphrase_dataset,
        train_neutralize_dataset,
    ]
)
train_dataset = train_dataset.map(
    lambda item: {
        "input": item["src"],
        "reference": item["tgt"],
        "references": [item["tgt"]],
    },
    remove_columns=["src", "tgt", "_id"],
)
print(f"train set {set(train_dataset['task'])}")
print(train_dataset)

test_dataset = concatenate_datasets(
    [
        test_gec_dataset,
        test_simplification_dataset,
        test_clarity_dataset,
        test_coherence_dataset,
        test_paraphrase_dataset,
        test_neutralize_dataset,
    ]
)
test_dataset = test_dataset.map(
    lambda item: {
        "input": item["src"],
        "reference": item["tgt"],
        "references": [item["tgt"]],
    },
    remove_columns=["src", "tgt", "_id"],
)
print(f"test set {set(test_dataset['task'])}")
print(test_dataset)


def llama2_prompt(item, response=True):
    prompt = f"""<s>[INST] <<SYS>> You are a grammar assistant. You just provide the rewritten corrected sentence in your output, that is. You do not provide any explanation. You do not prefix your sentence with any of your comment. You end your response right after the corrected sentence. <</SYS>> {item['input']} [/INST]
""".strip()
    if response:
        prompt = f"""{prompt} {item['reference']}</s>
""".strip()
    return prompt


def add_prompt(item):
    return {
        "request": f"{item['input']}\nResponse:",
        "prompt": f"{item['input']}\nResponse:{item['reference']}",
        "llama2_request": llama2_prompt(item, response=False),
        "llama2_prompt": llama2_prompt(item),
    }


dataset = DatasetDict({"train": train_dataset, "test": test_dataset})
# dataset = dataset.rename_column("task", "label")
dataset = dataset.map(add_prompt)
print(dataset)
print(dataset["train"][0])

train set {'coherence', 'paraphrase', 'simplification', 'clarity', 'neutralize', 'gec'}
Dataset({
    features: ['task', 'input', 'reference', 'references'],
    num_rows: 7076
})
test set {'coherence', 'paraphrase', 'simplification', 'clarity', 'neutralize', 'gec'}
Dataset({
    features: ['task', 'input', 'reference', 'references'],
    num_rows: 711
})
DatasetDict({
    train: Dataset({
        features: ['task', 'input', 'reference', 'references', 'request', 'prompt', 'llama2_request', 'llama2_prompt'],
        num_rows: 7076
    })
    test: Dataset({
        features: ['task', 'input', 'reference', 'references', 'request', 'prompt', 'llama2_request', 'llama2_prompt'],
        num_rows: 711
    })
})
{'task': 'gec', 'input': 'Remove all grammatical errors from this text: For example, countries with a lot of deserts can terraform their desert to increase their habitable land and using irrigation to provide clean water to the desert.', 'reference': 'For example, countries with a lot

In [7]:
%%script echo skip

# find the longest sequence in the dataset
train_max_input_length = max(len(tokenizer.encode(item["input"])) for item in dataset["train"])
test_max_input_length = max(len(tokenizer.encode(item["input"])) for item in dataset["test"])
print(f"max input train/test: {train_max_input_length}/{test_max_input_length}")
train_max_prompt_length = max(len(tokenizer.encode(item["prompt"])) for item in dataset["train"])
test_max_prompt_length = max(len(tokenizer.encode(item["prompt"])) for item in dataset["test"])
print(f"max prompt train/test: {train_max_prompt_length}/{test_max_prompt_length}")
train_max_llama2_prompt_length = max(len(tokenizer.encode(item["llama2_prompt"])) for item in dataset["train"])
test_max_llama2_prompt_length = max(len(tokenizer.encode(item["llama2_prompt"])) for item in dataset["test"])
print(f"max llama2_prompt train/test: {train_max_llama2_prompt_length}/{test_max_prompt_length}")

skip


In [8]:
from datasets import Dataset

train_lists_map = {}

for task in set(train_dataset['task']):
    train_lists_map[task] = []

for item in dataset["train"]:
    train_lists_map[item["task"]].append(item)

train_dataset_map = {}
for task, list in train_lists_map.items():
    train_dataset_map[task] = Dataset.from_list(list)
# print(train_dataset_map)

train_dataset_dict = DatasetDict(train_dataset_map)
# print(train_dataset_dict)

# for task, ds in train_dataset_dict.items():
#     print(f"{task}: {ds}")

for task in set(train_dataset['task']):
    print(f"train/{task}: {len(train_lists_map[task])}")

train/coherence: 1061
train/paraphrase: 1589
train/simplification: 1144
train/clarity: 125
train/neutralize: 1127
train/gec: 2030


In [9]:
from datasets import Dataset

test_lists_map = {}

for task in set(train_dataset['task']):
    test_lists_map[task] = []

for item in dataset["test"]:
    test_lists_map[item["task"]].append(item)

test_dataset_map = {}
for task, list in test_lists_map.items():
    test_dataset_map[task] = Dataset.from_list(list)
# print(test_dataset_map)

test_dataset_dict = DatasetDict(test_dataset_map)
# print(test_dataset_dict)

# for task, ds in test_dataset_dict.items():
#     print(f"{task}: {ds}")

for task in set(train_dataset['task']):
    print(f"test/{task}: {len(test_lists_map[task])}")

test/coherence: 107
test/paraphrase: 159
test/simplification: 115
test/clarity: 13
test/neutralize: 113
test/gec: 204


In [10]:
%%script echo skip

import numpy as np
from datasets import Dataset
from torch.utils.data import DataLoader
from transformers import DataCollatorForLanguageModeling, DataCollatorForSeq2Seq

tokenizer.padding_side = "left"
print(type(tokenizer))
print(tokenizer.add_eos_token)
print(tokenizer.eos_token_id)
print(tokenizer.pad_token_id)
print(tokenizer.padding_side)

max_length = 350


def process_dataset(batch):
    model_inputs = tokenizer(batch["prompt"], max_length=max_length)
    model_reponses = tokenizer(batch["reference"], max_length=max_length)

    new_input_ids = []
    new_labels = []
    for input_ids, response_ids in zip(model_inputs.input_ids, model_reponses.input_ids):
        # debug_labels = input_ids[-len(response_ids) :]
        # print(tokenizer.decode(input_ids, skip_special_tokens=False))
        # print(tokenizer.decode(debug_labels, skip_special_tokens=False))

        num_tokens_ignore = len(input_ids) - len(response_ids)
        labels = [-100] * num_tokens_ignore + input_ids[-len(response_ids) :]
        labels.append(-100)
        # labels.append(tokenizer.eos_token_id)
        new_labels.append(labels)

        input_ids.append(tokenizer.eos_token_id)
        new_input_ids.append(input_ids)

    new_attention_mask = []
    for attention_mask in model_inputs.attention_mask:
        attention_mask.append(0)
        # attention_mask.append(1)
        new_attention_mask.append(attention_mask)

    # labels = tokenizer(text_target=examples["tgt"], max_length=1024, padding=True).input_ids
    # model_inputs["labels"] = labels

    model_inputs["input_ids"] = new_input_ids
    model_inputs["attention_mask"] = new_attention_mask
    model_inputs["labels"] = new_labels

    # print(
    #     f">> input_ids: {len(model_inputs['input_ids'])},"
    #     "attention_mask: {len(model_inputs['attention_mask'])},"
    #     "labels: {len(model_inputs['labels'])}"
    # )
    # count = 0
    # for input_ids, labels, attention_masks in zip(new_input_ids, new_labels, new_attention_mask):
    #     count += 1
    #     if count > 3:
    #         break
    #     print(f">> input_ids: {len(input_ids)}, attention_mask: {len(attention_masks)}, labels: {len(labels)}")
    #     print(tokenizer.decode(input_ids, skip_special_tokens=False))
    #     print(labels)

    return model_inputs


# processed_dataset = data.map(preprocess_function, batched=True, batch_size=10, remove_columns=data["train"].column_names)
processed_dataset = dataset.map(
    process_dataset, batched=True, batch_size=10, remove_columns=dataset["train"].column_names
)
# processed_dataset = dataset.map(process_dataset, batched=True, batch_size=10)
print(processed_dataset)
print(dataset["train"][0]["prompt"])
# print(dataset["train"][0]["reference"])
print(len(processed_dataset["train"]["input_ids"][0]), processed_dataset["train"]["input_ids"][0])
print(len(processed_dataset["train"]["attention_mask"][0]), processed_dataset["train"]["attention_mask"][0])
print(len(processed_dataset["train"]["labels"][0]), processed_dataset["train"]["labels"][0])
print(dataset["train"][1]["prompt"])
# print(dataset["train"][1]["reference"])
# print(dataset["train"][2]["prompt"])
# print(dataset["train"][2]["reference"])

# print(len(processed_dataset["test"]["input_ids"][0]), processed_dataset["test"]["input_ids"][0])
# print(len(processed_dataset["test"]["attention_mask"][0]), processed_dataset["test"]["attention_mask"][0])
# print(len(processed_dataset["test"]["labels"][0]), processed_dataset["test"]["labels"][0])

data_collator = DataCollatorForSeq2Seq(tokenizer, return_tensors="pt", model=model)
# data_collator = DataCollatorForSeq2Seq(tokenizer, return_tensors="pt", padding=True)

dataloader = DataLoader(processed_dataset["train"], batch_size=2, collate_fn=data_collator)
for batch in dataloader:
    print(batch)
    break

skip


## Pre-test inference

In [11]:
%%script echo skip

max_batch = 2
max_length = 350

print(type(tokenizer))
print(tokenizer.add_bos_token)
print(tokenizer.add_eos_token)
print(tokenizer.bos_token_id)
print(tokenizer.eos_token_id)
print(tokenizer.pad_token_id)
print(tokenizer.padding_side)

# print(type(model))
# print(model.generation_config.max_new_tokens)

for task, batch in test_dataset_dict.items():
    print()
    print(f">> {task}")
    batch_size = len(batch) if len(batch) < max_batch else max_batch
    input_batch = batch.select(range(batch_size))
    # print(f"input: {input_batch['input']}")
    # print(f"prompt: {input_batch['prompt']}")
    # print(f"request: {input_batch['request']}")
    # print(f"prompt: {input_batch['prompt']}")
    print(f"llama2_request: {input_batch['llama2_request']}")
    print(f"llama2_prompt: {input_batch['llama2_prompt']}")
    print(f"reference: {input_batch['reference']}")

    batch_input = input_batch["llama2_request"]

    inputs = tokenizer(batch_input, padding=True, return_tensors="pt", return_attention_mask=False).to(device)
    outputs = model.generate(**inputs)
    result = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
    # print(result)

    trimmed_output = outputs[:, inputs.input_ids.shape[1] :]
    trimmed_result = tokenizer.batch_decode(trimmed_output, skip_special_tokens=True)
    print(f"trimmed_result: {trimmed_result}")

    # inputs = tokenizer(input_batch["llama2_prompt"], padding=True, return_tensors="pt", return_attention_mask=False).to(device)
    # print(inputs)
    # outputs = model.generate(inputs.input_ids, max_length=max_length)
    # print(outputs)

    # trimmed_output = outputs[:, inputs.input_ids.shape[1] :]
    # result = tokenizer.batch_decode(trimmed_output, max_length=max_length, skip_special_tokens=True)
    # print(f"result: {result}")

    # trimmed_output = outputs[:, inputs.input_ids.shape[1] :]
    # result = tokenizer.batch_decode(trimmed_output, max_length=max_length, skip_special_tokens=True)
    # print(f"result: {result}")

    # processed = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    # print(f"result: {processed}")

    break

skip


## Training

In [12]:
%reload_ext autoreload

training_model_repo = f"iliazlobin"
training_model_name: str = f"{model_name}-coedit"
training_model_id = f"{training_model_repo}/{training_model_name}"
training_model_checkpoint = f"{training_model_id}"
training_model_path = f"{training_model_repo}_{training_model_name}"
print(
    f"training_model_name: {training_model_name}, "
    f"training_model_id: {training_model_id}, "
    f"training_model_path: {training_model_path}"
)

utilization = calculate_utilization()
utilization_str = format_utilization_narrow(utilization)
print(
    f"total/used/cuda/res/ram (Gb): {utilization_str['total_memory']}/{utilization_str['memory_used']}/"
    f"{utilization_str['cuda_allocated']}/{utilization_str['cuda_reserved']}/{utilization_str['ram_usage']}"
)


available_memory = utilization["total_memory"] - utilization["memory_used"]
recommended_fraction = available_memory / utilization["total_memory"]

actual_fraction = 0.95
torch.cuda.set_per_process_memory_fraction(actual_fraction, 0)

print(
    f"total/used/available memory (Gb): {utilization['total_memory']/1024**3:.2f}/"
    f"{utilization['memory_used']/1024**3:.2f}/{available_memory/1024**3:.2f}"
)
print(f"recommended/actual fraction: {recommended_fraction:.2f}/{actual_fraction:.2f}")

training_model_name: Llama-2-7b-chat-hf-coedit, training_model_id: iliazlobin/Llama-2-7b-chat-hf-coedit, training_model_path: iliazlobin_Llama-2-7b-chat-hf-coedit
total/used/cuda/res/ram (Gb): 79.15/26.51/25.23/25.24/4.17
total/used/available memory (Gb): 79.15/26.51/52.65
recommended/actual fraction: 0.67/0.95


In [13]:
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

peft_model = model
peft_model = prepare_model_for_kbit_training(peft_model, use_gradient_checkpointing=True)
# perf_model.gradient_checkpointing_enable()

# param = peft_model.transformer.wte.weight
# param.data = param.data.to(torch.float32)
# print(param)

# Currently, only the following modules are supported: `torch.nn.Linear`, `torch.nn.Embedding`, `torch.nn.Conv2d`, `transformers.pytorch_utils.Conv1D`
peft_config = LoraConfig(
    # r=512,
    r=128,
    # r=64,
    # r=8,
    # lora_alpha=16,
    # lora_alpha=256,
    lora_alpha=64,
    lora_dropout=0.1,
    # lora_dropout=0.01,
    bias="none",
    task_type="CAUSAL_LM",
    # target_modules="all-linear",
    # target_modules=["c_attn", "c_proj", "c_fc", "c_proj"],
    # target_modules=["c_attn", "c_proj"],
    # target_modules=["c_attn"],
    # modules_to_save=["wte", "lm_head"],
    # modules_to_save=["wte", "wpe", "lm_head"],
    # llama2
    # target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    # modules_to_save=["embed_tokens", "lm_head"],
)
print(peft_config)

peft_model = get_peft_model(peft_model, peft_config)
print(type(peft_model))
print(peft_model)

total_params = sum(p.numel() for p in peft_model.parameters())
total_trainable_params = sum(p.numel() for p in peft_model.parameters() if p.requires_grad)
print(f"Total/trainable params: {total_params}/{total_trainable_params}")

LoraConfig(peft_type=<PeftType.LORA: 'LORA'>, auto_mapping=None, base_model_name_or_path=None, revision=None, task_type='CAUSAL_LM', inference_mode=False, r=128, target_modules=None, lora_alpha=64, lora_dropout=0.1, fan_in_fan_out=False, bias='none', use_rslora=False, modules_to_save=None, init_lora_weights=True, layers_to_transform=None, layers_pattern=None, rank_pattern={}, alpha_pattern={}, megatron_config=None, megatron_core='megatron.core', loftq_config={}, use_dora=False, layer_replication=None)
<class 'peft.peft_model.PeftModelForCausalLM'>
PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(32000, 4096)
        (layers): ModuleList(
          (0-31): 32 x LlamaDecoderLayer(
            (self_attn): LlamaSdpaAttention(
              (q_proj): lora.Linear(
                (base_layer): Linear(in_features=4096, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
        

In [14]:
training_model = model
# print(type(training_model))
# print(training_model.config)
# print(training_model.generation_config)

total_params = sum(p.numel() for p in training_model.parameters())
total_trainable_params = sum(p.numel() for p in training_model.parameters() if p.requires_grad)
print(f"Total/trainable params: {total_params}/{total_trainable_params}")

rouge_metric = evaluate.load("rouge")
# glue_metric = evaluate.load("glue", "stsb")
sacreblue_metric = evaluate.load("sacrebleu")
sari_metric = evaluate.load("sari")
em_metric = evaluate.load("exact_match")

# batch_size = len(batch) if len(batch) < max_batch else max_batch
# input_batch = batch.select(range(batch_size))
# input = tokenizer(input_batch["input"], padding=True, return_tensors="pt").to(device)
# outputs = model.generate(input.input_ids, max_length=max_length)
# processed = tokenizer.batch_decode(outputs, skip_special_tokens=True)


def compute_metrics(eval_pred):
    # print(eval_pred)
    preds = eval_pred.predictions
    # print(preds)
    labels = eval_pred.label_ids
    # print(labels)

    # preds = np.argmax(preds, axis=-1)
    # print(preds)

    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
    # print(preds)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    # print(labels)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    # print(decoded_preds)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    # print(decoded_labels)

    rouge_score = rouge_metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    sacreblue_score = sacreblue_metric.compute(predictions=decoded_preds, references=decoded_labels)
    # sari_score = sari_metric.compute(
    #     sources=processed_samples["input"],
    #     predictions=processed_samples["processed"],
    #     references=processed_samples["references"],
    # )
    em_score = em_metric.compute(predictions=decoded_preds, references=decoded_labels)

    utilization = calculate_utilization()

    # report = {
    #     "metric": 0,
    # }
    report = {
        "rouge1": rouge_score["rouge1"],
        "rouge2": rouge_score["rouge2"],
        "rougeL": rouge_score["rougeL"],
        "rougeLsum": rouge_score["rougeLsum"],
        "sacreblue": sacreblue_score["score"],
        "memory_used": utilization["memory_used"] / 1024**2,
        "cuda_allocated": utilization["cuda_allocated"] / 1024**2,
        "cuda_reserved": utilization["cuda_reserved"] / 1024**2,
        "ram_usage": utilization["ram_usage"] / 1024**2,
        # "sari.sari": 0,
        "em": em_score["exact_match"],
    }

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    report["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in report.items()}


def logits_argmax(logits: torch.Tensor, labels):
    # if isinstance(logits, tuple):
    #     # Depending on the model and config, logits may contain extra tensors,
    #     # like past_key_values, but logits always come first
    #     logits = logits[0]
    # return logits.argmax(dim=-1), labels
    # logits = logits[0]
    return logits.argmax(dim=-1)


# debug
# trainer = Trainer(
#     model=training_model,
#     train_dataset=processed_dataset["train"],
#     eval_dataset=processed_dataset["test"],
#     # eval_dataset=processed_dataset["test"],
#     tokenizer=tokenizer,
#     data_collator=data_collator,
#     compute_metrics=compute_metrics,
#     preprocess_logits_for_metrics=logits_argmax,
#     args=args,
# )

# predictions = trainer.predict(processed_dataset["train"].select(range(2)))
# print(type(predictions))
# # print(predictions)
# print(predictions.predictions)
# print(predictions.label_ids)
# metrics = compute_metrics((predictions.predictions, predictions.label_ids))
# print(metrics)

Total/trainable params: 6805524480/67108864


In [15]:
%%capture
%load_ext tensorboard
%tensorboard --logdir "model-{training_model_path}-train/runs"

In [16]:
train_path = f"model-{training_model_path}-train"
print(f"train_path: {train_path}")

train_size = len(dataset["train"])
batch_size = 50
gradient_accumulation_steps = 4
eval_batch_size = 50
eval_accumulation_steps = 4

per_epoch_steps = train_size / (batch_size * gradient_accumulation_steps)

max_steps = 100
epochs = per_epoch_steps / max_steps
print(f"train_size: {train_size}, batch_size: {batch_size}, per_epoch_steps: {per_epoch_steps}, max_steps: {max_steps}, epochs: {epochs}")

# if epochs < 1:
#     raise Exception(f"Training doesn't cover the entire training dataset with {train_size} samples")

epochs = 2
epoch_total_steps = epochs * per_epoch_steps
print(f"train_size: {train_size}, batch_size: {batch_size}, per_epoch_steps: {per_epoch_steps}, epochs: {epochs}, epoch_total_steps: {epoch_total_steps}")

train_path: model-iliazlobin_Llama-2-7b-chat-hf-coedit-train
train_size: 7076, batch_size: 50, per_epoch_steps: 35.38, max_steps: 100, epochs: 0.3538
train_size: 7076, batch_size: 50, per_epoch_steps: 35.38, epochs: 2, epoch_total_steps: 70.76


In [17]:
# sft trainer
from trl import SFTTrainer
from transformers import TrainingArguments

# Set training parameters
training_arguments = TrainingArguments(
    output_dir=train_path,
    learning_rate=2e-4,
    weight_decay=0.01,
    # warmup_ratio=0.05,
    # optim="paged_adamw_8bit",
    optim="paged_adamw_32bit",
    fp16=True,
    # bf16=True,
    gradient_checkpointing=True,
    per_device_train_batch_size=batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    # per_device_eval_batch_size=eval_batch_size,
    # eval_accumulation_steps=eval_accumulation_steps,
    num_train_epochs=epochs,
    # max_steps=max_steps,
    warmup_steps=1,
    logging_steps=10,
    # save_strategy="steps",
    # save_steps=5,
    # evaluation_strategy="steps",
    # eval_steps=40,
    # packing=False,
    group_by_length=True,
    lr_scheduler_type="constant",
    report_to="tensorboard",
    push_to_hub_model_id=training_model_name,
    push_to_hub_organization="iliazlobin",
    push_to_hub=False,
    gradient_checkpointing_kwargs={
        "use_reentrant": False
    },  # Should be false for Lora (https://github.com/kohya-ss/sd-scripts/issues/323#issuecomment-1485073421)
)

# Set supervised fine-tuning parameters
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset["train"],
    peft_config=peft_config,
    dataset_text_field="llama2_prompt",
    # max_seq_length=None,
    tokenizer=tokenizer,
    args=training_arguments,
    packing=False,
)

model.config.use_cache = False
# trainer.train(resume_from_checkpoint=True)
trainer.train()



Map:   0%|          | 0/7076 [00:00<?, ? examples/s]

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss
10,2.72
20,1.0722
30,0.8127
40,0.7652
50,0.7048
60,0.6886
70,0.6699


TrainOutput(global_step=70, training_loss=1.0619164671216692, metrics={'train_runtime': 513.9259, 'train_samples_per_second': 27.537, 'train_steps_per_second': 0.136, 'total_flos': 6.964692380983296e+16, 'train_loss': 1.0619164671216692, 'epoch': 1.97})

In [18]:
%%script echo skip

# regular trainer

from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments, Trainer, TrainingArguments

args = TrainingArguments(
    output_dir=train_path,
    learning_rate=2e-5,
    weight_decay=0.01,
    # warmup_ratio=0.05,
    optim="paged_adamw_8bit",
    fp16=True,
    gradient_checkpointing=True,
    per_device_train_batch_size=batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    # per_device_eval_batch_size=eval_batch_size,
    # eval_accumulation_steps=eval_accumulation_steps,
    num_train_epochs=epochs,
    # max_steps=max_steps,
    warmup_steps=1,
    logging_steps=5,
    save_strategy="steps",
    save_steps=5,
    # evaluation_strategy="steps",
    # eval_steps=40,
    report_to="tensorboard",
    push_to_hub_model_id=training_model_name,
    push_to_hub_organization="iliazlobin",
    push_to_hub=False,
    gradient_checkpointing_kwargs={
        "use_reentrant": False
    },  # Should be false for Lora (https://github.com/kohya-ss/sd-scripts/issues/323#issuecomment-1485073421)
)

trainer = Trainer(
    model=training_model,
    train_dataset=processed_dataset["train"],
    # eval_dataset=processed_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    # preprocess_logits_for_metrics=logits_argmax,
    args=args,
)
print(type(trainer))

model.config.use_cache = False
# trainer.train(resume_from_checkpoint=True)
trainer.train()

skip


In [None]:
%load_ext dotenv
%dotenv ../.env

!huggingface-cli login --token os.getenv(key="HUGGING_FACE_TOKEN")

print(type(trainer))
print(type(trainer.model))

# print(f"saving locally: model-{training_model_path}")
# trainer.save_model(f"model-{training_model_path}")

# print(f"pushing to hub: {training_model_id}")
# trainer.push_to_hub(
#     commit_message="complete: train_size: {train_size}, batch_size: {batch_size}, per_epoch_steps: {per_epoch_steps}, epochs: {epochs}, epoch_total_steps: {epoch_total_steps}",
#     model_name=training_model_name,
# )
# trainer.push_to_hub(commit_message="test")

The dotenv extension is already loaded. To reload it, use:
  %reload_ext dotenv
/bin/bash: -c: line 1: syntax error near unexpected token `('
/bin/bash: -c: line 1: `huggingface-cli login --token os.getenv(key="HUGGING_FACE_TOKEN")'
<class 'transformers.trainer.Trainer'>
<class 'peft.peft_model.PeftModelForCausalLM'>


## Test inference

In [18]:
model = trainer.model

print(type(tokenizer))
# print(tokenizer.add_eos_token)
print(tokenizer.eos_token_id)
print(tokenizer.pad_token_id)
print(tokenizer.padding_side)


max_length = 350
max_new_tokens = 100
max_batch = 2

print(dataset["train"])

batch_size = len(dataset["train"]) if len(dataset["train"]) < max_batch else max_batch
input_batch = dataset["train"].select(range(batch_size))
# print(f"task: {input_batch['task']}")
# print(f"input: {input_batch['input']}")
# print(f"request: {input_batch['request']}")
print(f"llama2_request: {input_batch['llama2_request']}")
print(f"reference: {input_batch['reference']}")

inputs = tokenizer(input_batch["llama2_request"], return_tensors="pt", padding=True).to(device)
# print(inputs)

model.config.use_cache = False
# model.config.pad_token_id = model.config.eos_token_id
# outputs = model.generate(**input_ids, max_new_tokens=128)
# outputs = model.generate(**inputs, max_new_tokens=128, num_return_sequences=1)
outputs = model.generate(
    **inputs,
    # max_length=max_length,
    max_new_tokens=max_new_tokens,
    # pad_token_id=tokenizer.eos_token_id,
    # eos_token_id=tokenizer.eos_token_id,
    num_return_sequences=1,
)
# outputs = model.generate(
#     **inputs,
#     do_sample=True,
#     top_k=10,
#     num_return_sequences=1,
#     pad_token_id=tokenizer.eos_token_id,
#     # return_attention_mask=True,
#     # max_length=max_length,
#     max_new_tokens=max_new_tokens
# )
# print(outputs)

trimmed_output = outputs[:, inputs.input_ids.shape[1] :]
# result = tokenizer.batch_decode(trimmed_output, skip_special_tokens=False)
# result = tokenizer.batch_decode(outputs, skip_special_tokens=False)
result = tokenizer.batch_decode(trimmed_output, skip_special_tokens=True)
print(f"result: {result}")

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


<class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>
2
2
left
Dataset({
    features: ['task', 'input', 'reference', 'references', 'request', 'prompt', 'llama2_request', 'llama2_prompt'],
    num_rows: 7076
})
llama2_request: ['<s>[INST] <<SYS>> You are a grammar assistant. You just provide the rewritten corrected sentence in your output, that is. You do not provide any explanation. You do not prefix your sentence with any of your comment. You end your response right after the corrected sentence. <</SYS>> Remove all grammatical errors from this text: For example, countries with a lot of deserts can terraform their desert to increase their habitable land and using irrigation to provide clean water to the desert. [/INST]', '<s>[INST] <<SYS>> You are a grammar assistant. You just provide the rewritten corrected sentence in your output, that is. You do not provide any explanation. You do not prefix your sentence with any of your comment. You end your response right after t

In [19]:
model = trainer.model

max_batch = 2
# max_length = 350
# max_new_tokens = 350

count = 0
# for task, batch in test_dataset_dict.items():
for task, batch in train_dataset_dict.items():
    print()
    print(f">> {task}")
    batch_size = len(batch) if len(batch) < max_batch else max_batch
    input_batch = batch.select(range(batch_size))
    print(f"input: {input_batch['input']}")
    # print(f"llama2_request: {input_batch['llama2_request']}")
    print(f"reference: {input_batch['reference']}")

    inputs = tokenizer(input_batch["llama2_request"], return_tensors="pt", padding=True).to(device)
    # print(inputs)

    model.config.use_cache = False
    # model.config.pad_token_id = model.config.eos_token_id
    # outputs = model.generate(**input_ids, max_new_tokens=128)
    # outputs = model.generate(**input_ids, max_new_tokens=128, num_return_sequences=1)
    # outputs = model.generate(
    #     **inputs, max_new_tokens=max_new_tokens, pad_token_id=tokenizer.eos_token_id, num_return_sequences=1
    # )
    outputs = model.generate(
        **inputs, num_return_sequences=1
    )
    # outputs = model.generate(
    #     **inputs,
    #     do_sample=True,
    #     top_k=10,
    #     num_return_sequences=1,
    #     pad_token_id=tokenizer.eos_token_id,
    #     # return_attention_mask=True,
    #     max_length=256,
    # )

    trimmed_output = outputs[:, inputs.input_ids.shape[1] :]
    result = tokenizer.batch_decode(trimmed_output, skip_special_tokens=True)
    print(f"result: {result}")

    count += 1
    if count > 3:
        break


>> paraphrase
input: ['Reword this sentence: Item 5.1.2 shall be amended to read:', 'Reword this text: She stopped when she saw his expression.']
reference: ['Point 5.1.2 is replaced by the following:', 'Seeing the look on his face, she paused.']
result: [' Sure, here is the reworded sentence:\n\nItem 5.1.2 shall be amended to read:', '0 She stopped when she saw his expression.']

>> clarity
input: ['Clarify: This has been widely demonstrated for English using contextualized word representations such as OpenAI GPT (Radford et al., 2018 ), BERT (Devlin et al., 2019 ), or XLNet (Yang et al., 2019b).', 'Use clearer wording: We apply our French language models to complex NLP tasks (natural language inference, parsing, word sense disambiguation) and show that most of the time they outperform other pre-training approaches.']
reference: ['This has been widely demonstrated for English using contextualized word representations such as OpenAI GPT (Radford et al., 2018 ), BERT (Devlin et al., 20