In [1]:
import os

# GPU selection. These variables are set in the first cell, before torch is
# imported by the next cell. PCI_BUS_ID ordering makes the indices below refer
# to physical bus order; only devices 4 and 5 are exposed to this process.
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "4,5"

# Label value written over prompt positions (see `preprocess`) and over label
# padding (see `_collate_fn`), marking positions that should not be trained on.
IGNORE_INDEX = -100
# Special-token strings later registered on the tokenizer via add_special_tokens.
DEFAULT_PAD_TOKEN = "[PAD]"
DEFAULT_EOS_TOKEN = "</s>"
# NOTE(review): BOS and UNK reuse the EOS string "</s>" — confirm this is
# intentional, since it replaces the checkpoint's own BOS/UNK settings.
DEFAULT_BOS_TOKEN = "</s>"
DEFAULT_UNK_TOKEN = "</s>"

In [2]:
from torch.utils.data import Dataset, DataLoader
import torch
import transformers
import json
import logging
import copy
from dataclasses import dataclass
from tqdm import tqdm

from prompt import make_full_source, make_full_source_from_template, make_full_target, make_full_target_from_template, make_answer_only_source, make_answer_only_target

from util import save_output


def _tokenize_fn(strings: list[str],
                 tokenizer: transformers.PreTrainedTokenizer) -> dict:
    """Tokenize every string on its own and report ids plus non-pad lengths.

    Each string is encoded in a separate tokenizer call (truncated to
    ``tokenizer.model_max_length``), with a tqdm progress bar over the inputs.

    Returns:
        dict with four keys. ``input_ids`` and ``labels`` refer to the *same*
        list of 1-D id tensors; ``input_ids_lens`` and ``labels_lens`` refer to
        the *same* list of per-string counts of tokens that differ from
        ``tokenizer.pad_token_id``.
    """
    encodings = []
    for text in tqdm(strings):
        encodings.append(
            tokenizer(
                text,
                return_tensors="pt",
                padding="longest",
                max_length=tokenizer.model_max_length,
                truncation=True,
            )
        )

    # Row 0 of each encoding: every call encodes exactly one string.
    token_ids = [encoding.input_ids[0] for encoding in encodings]
    non_pad_counts = [
        encoding.input_ids.ne(tokenizer.pad_token_id).sum().item()
        for encoding in encodings
    ]

    # The label entries intentionally alias the input entries (same objects).
    return {
        "input_ids": token_ids,
        "labels": token_ids,
        "input_ids_lens": non_pad_counts,
        "labels_lens": non_pad_counts,
    }


def preprocess(
        sources: list[str],
        targets: list[str],
        tokenizer: transformers.PreTrainedTokenizer,
) -> dict:
    """Build tokenized training examples with the prompt part masked out.

    Every source is concatenated with its target, both the concatenation and
    the bare source are rendered through the tokenizer's chat template, and
    both renderings are tokenized. The labels are a deep copy of the full
    example ids in which the first ``len(source tokens)`` positions are
    overwritten with ``IGNORE_INDEX``.

    Returns:
        dict with ``input_ids`` and ``labels``, two parallel lists of tensors.
    """
    examples = [source + target for source, target in zip(sources, targets)]

    # NOTE(review): this guard tests the container, not its elements, so it
    # holds for any list argument — confirm that was the intent.
    if isinstance(sources, list):
        def render(conversation):
            # NOTE(review): the generation prompt is requested for the full
            # examples as well as for the bare sources — confirm.
            return tokenizer.apply_chat_template(
                conversation, tokenize=False, add_generation_prompt=True)

        examples = [render(conversation) for conversation in examples]
        sources = [render(conversation) for conversation in sources]

    # Full examples are tokenized first, then the prompts alone.
    examples_tokenized = _tokenize_fn(examples, tokenizer)
    sources_tokenized = _tokenize_fn(sources, tokenizer)

    input_ids = examples_tokenized["input_ids"]
    labels = copy.deepcopy(input_ids)
    prompt_lengths = sources_tokenized["input_ids_lens"]
    for label, source_len in tqdm(zip(labels, prompt_lengths)):
        # Hide the prompt tokens from the loss; only the target part keeps ids.
        label[:source_len] = IGNORE_INDEX

    return {"input_ids": input_ids, "labels": labels}


class SupervisedDataset(Dataset):
    """Dataset for supervised fine-tuning.

    Reads a JSON-lines file (one JSON object per line), turns each record into
    a (source, target) pair according to ``training_objective``, and tokenizes
    everything eagerly in ``__init__`` through ``preprocess``.

    Args:
        data_path: Path of the JSON-lines file to read.
        tokenizer: Tokenizer used for the EOS token and for tokenization.
        training_objective: ``"full"`` or ``"answer-only"``; selects which
            helpers from ``prompt`` build the source and target.

    Raises:
        NotImplementedError: If ``training_objective`` is not one of the two
            supported values.
    """

    def __init__(self, data_path: str, tokenizer: transformers.PreTrainedTokenizer, training_objective: str):
        super().__init__()
        logging.warning("Loading data...")
        list_data_dict = []
        # Read explicitly as UTF-8 so non-ASCII text does not depend on the
        # machine's locale encoding.
        with open(data_path, "r", encoding="utf-8") as f:
            for line in f:
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                if not line.strip():
                    continue
                list_data_dict.append(json.loads(line))

        sources = []
        targets = []
        logging.warning("Formatting inputs...")
        if training_objective == "full":
            for example in list_data_dict:
                source = make_full_source_from_template(example)
                target = make_full_target_from_template(example)
                # NOTE(review): if the helper returns a list of chat messages
                # rather than a str, `+=` with a str extends the list one
                # character at a time — confirm the helper's return type.
                target += f"{tokenizer.eos_token}"
                sources.append(source)
                targets.append(target)
        elif training_objective == "answer-only":
            for example in list_data_dict:
                source = make_answer_only_source(example)
                target = make_answer_only_target(example, eos_token=tokenizer.eos_token)
                sources.append(source)
                targets.append(target)
        else:
            raise NotImplementedError(
                f"Unsupported training_objective {training_objective!r}; "
                "expected 'full' or 'answer-only'."
            )

        logging.warning(f"Number of source examples: {len(sources)}")
        logging.warning(f"Number of target examples: {len(targets)}")

        logging.warning("Tokenizing inputs... This may take some time...")
        data_dict = preprocess(sources, targets, tokenizer)

        # Parallel lists: one id tensor and one label tensor per example.
        self.input_ids = data_dict["input_ids"]
        self.labels = data_dict["labels"]

    def __len__(self):
        """Return the number of tokenized examples."""
        return len(self.input_ids)

    def __getitem__(self, i) -> dict[str, torch.Tensor]:
        """Return the ``input_ids``/``labels`` pair stored at index ``i``."""
        return dict(input_ids=self.input_ids[i], labels=self.labels[i])


# @dataclass
# class DataCollatorForSupervisedDataset:
#     """Collate examples for supervised fine-tuning."""
# 
#     tokenizer: transformers.PreTrainedTokenizer
# 
#     def __call__(self, instances: list[dict]) -> dict[str, torch.Tensor]:
#         input_ids, labels = tuple([instance[key] for instance in instances]
#                                   for key in ("input_ids", "labels"))
#         input_ids = torch.nn.utils.rnn.pad_sequence(
#             input_ids,
#             batch_first=True,
#             padding_value=self.tokenizer.pad_token_id)
#         labels = torch.nn.utils.rnn.pad_sequence(labels,
#                                                  batch_first=True,
#                                                  padding_value=IGNORE_INDEX)
#         return dict(
#             input_ids=input_ids,
#             labels=labels,
#             attention_mask=input_ids.ne(self.tokenizer.pad_token_id),
#         )

# random_split is used below to divide the dataset into train and eval subsets.
from torch.utils.data import random_split


@save_output
def make_supervised_data_module(tokenizer: transformers.PreTrainedTokenizer,
                                data_args) -> dict:
    """Build the train/eval datasets for supervised fine-tuning.

    Args:
        tokenizer: Tokenizer forwarded to ``SupervisedDataset``.
        data_args: Mapping with the required keys ``data_path`` and
            ``training_objective``. Two optional keys are honoured:
            ``train_fraction`` (default ``0.9``), the share of examples placed
            in the training split, and ``split_seed`` (default ``None``), which
            when set makes the random split reproducible. With neither key
            present the behaviour matches the previous hard-coded 90/10 split
            drawn from torch's global RNG.

    Returns:
        dict with ``train_dataset`` and ``eval_dataset``. No data collator is
        included.

    Raises:
        ValueError: If ``train_fraction`` is outside ``(0, 1]``.
    """
    full_dataset = SupervisedDataset(tokenizer=tokenizer,
                                     data_path=data_args['data_path'],
                                     training_objective=data_args['training_objective'])

    train_fraction = data_args.get("train_fraction", 0.9)
    if not 0.0 < train_fraction <= 1.0:
        raise ValueError(f"train_fraction must be in (0, 1], got {train_fraction!r}")

    # The eval split takes the remainder so the two sizes always sum to the total.
    num_train = int(len(full_dataset) * train_fraction)
    num_eval = len(full_dataset) - num_train

    # Only pass a generator when a seed was requested; otherwise random_split
    # keeps using the global RNG exactly as before.
    split_kwargs = {}
    split_seed = data_args.get("split_seed")
    if split_seed is not None:
        split_kwargs["generator"] = torch.Generator().manual_seed(split_seed)

    train_dataset, eval_dataset = random_split(full_dataset, [num_train, num_eval], **split_kwargs)

    return dict(
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
    )

In [3]:
from transformers import TrainingArguments, Trainer
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers import get_linear_schedule_with_warmup
from transformers import set_seed
from torch.optim import AdamW


def _make_padding_collator(pad_token_id: int):
    """Return a collator that right-pads ``input_ids``/``labels`` per batch."""

    def collate(instances: list[dict]) -> dict[str, torch.Tensor]:
        input_ids = torch.nn.utils.rnn.pad_sequence(
            [instance["input_ids"] for instance in instances],
            batch_first=True,
            padding_value=pad_token_id)
        labels = torch.nn.utils.rnn.pad_sequence(
            [instance["labels"] for instance in instances],
            batch_first=True,
            padding_value=IGNORE_INDEX)
        return dict(
            input_ids=input_ids,
            labels=labels,
            attention_mask=input_ids.ne(pad_token_id),
        )

    return collate


def make_trainer(model: transformers.PreTrainedModel,
                 tokenizer: transformers.PreTrainedTokenizer,
                 data_module: dict,
                 training_args: TrainingArguments) -> Trainer:
    """Make a trainer for fine-tuning.

    Builds an AdamW optimizer and a linear warmup/decay schedule sized to the
    real number of optimizer steps, then wires them into a ``Trainer``.

    Args:
        model: Model to fine-tune.
        tokenizer: Tokenizer passed to the ``Trainer``; its pad token id is
            used when a fallback collator has to be built.
        data_module: Mapping with ``train_dataset`` and ``eval_dataset``. A
            ``data_collator`` entry is optional; when absent, a padding
            collator is created here.
        training_args: Hugging Face training arguments.

    Returns:
        A configured ``Trainer``.

    Raises:
        ValueError: If no collator is supplied and the tokenizer has no pad
            token to build one with.
    """
    import math  # local import: only needed for the step-count arithmetic

    train_dataset = data_module["train_dataset"]

    # The schedule must span the number of optimizer updates, not the number
    # of epochs. Mirror how the update count is derived from dataset size,
    # effective batch size and gradient accumulation; an explicit max_steps wins.
    if training_args.max_steps > 0:
        num_training_steps = training_args.max_steps
    else:
        examples_per_batch = training_args.train_batch_size * training_args.world_size
        batches_per_epoch = math.ceil(len(train_dataset) / examples_per_batch)
        updates_per_epoch = max(batches_per_epoch // training_args.gradient_accumulation_steps, 1)
        num_training_steps = math.ceil(training_args.num_train_epochs * updates_per_epoch)

    optimizer = AdamW(model.parameters(), lr=training_args.learning_rate)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        # get_warmup_steps honours warmup_steps and falls back to warmup_ratio.
        num_warmup_steps=training_args.get_warmup_steps(num_training_steps),
        num_training_steps=num_training_steps)
    set_seed(training_args.seed)

    # The data module built in this file carries no collator, so indexing the
    # key directly would raise KeyError; fall back to a padding collator.
    data_collator = data_module.get("data_collator")
    if data_collator is None:
        if tokenizer.pad_token_id is None:
            raise ValueError(
                "data_module has no 'data_collator' and the tokenizer defines no pad token; "
                "cannot build a padding collator."
            )
        data_collator = _make_padding_collator(tokenizer.pad_token_id)

    return Trainer(
        model=model,
        tokenizer=tokenizer,
        data_collator=data_collator,
        train_dataset=train_dataset,
        eval_dataset=data_module["eval_dataset"],
        optimizers=(optimizer, scheduler),
        args=training_args,
    )

In [3]:
# Hub identifier of the checkpoint; also reused later as the PEFT
# `tokenizer_name_or_path`.
model_name = "deepseek-ai/deepseek-math-7b-instruct"
from transformers import AutoModelForCausalLM, AutoTokenizer

# The model itself is not loaded in this cell (the load is left commented
# out); only the tokenizer is created here.
# model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.bfloat16)
# # get model max input size
# model_max_length = model.config.max_position_embeddings
# Fast tokenizer with right-side padding. Because the model_max_length override
# is commented out, truncation in `_tokenize_fn` uses whatever
# `model_max_length` the checkpoint's tokenizer config provides.
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True,# model_max_length=model_max_length,
                                          padding_side="right")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [4]:
# Register the special-token strings defined in the first cell. The call
# returns the number of tokens newly added to the vocabulary, which is what
# the cell displays (2 in the recorded run).
# NOTE(review): newly added ids need matching embedding rows in the model —
# confirm the embedding matrix is large enough or resize it after loading.
tokenizer.add_special_tokens(
    {
        "bos_token": DEFAULT_BOS_TOKEN,
        "eos_token": DEFAULT_EOS_TOKEN,
        "pad_token": DEFAULT_PAD_TOKEN,
        "unk_token": DEFAULT_UNK_TOKEN,
    }
)

2

In [5]:
from pathlib import Path

# Root directory for data and outputs. It can be overridden with the
# FINETUNE_ROOT_DIR environment variable so the notebook runs on other
# machines; the default keeps the original location.
root_dir = Path(os.environ.get("FINETUNE_ROOT_DIR", "/lfs/skampere1/0/kaif"))

# Arguments consumed by make_supervised_data_module: the JSON-lines training
# file and the objective that selects how sources/targets are built.
data_args = {
    "data_path": root_dir / "data/selfee-train.json",
    "training_objective": "full",
}

In [7]:
# from datasets import load_dataset
# import json
# 
# dataset = load_dataset('kaist-ai/selfee-train')["train"]
# 
# # Write the merged JSON to a new file
# # dataset.to_json("~/data/selfee-train",force_ascii=False) # ValueError: 'index=True' is only valid when 'orient' is 'split', 'table', 'index', or 'columns'.
# dataset.to_json(root_dir / "data/selfee-train.json", force_ascii=False, index=False)

In [6]:
# Build the train/eval datasets (tokenizes the whole corpus; the recorded run
# took several minutes). `load`, `override` and `output_path` are not
# parameters of make_supervised_data_module itself — presumably they are
# consumed by the @save_output decorator from `util` to control caching of the
# result; confirm against that module.
data_module = make_supervised_data_module(tokenizer, data_args, load=False, override=True,
                                          output_path=root_dir / "data/selfee-train_preprocessed_chat.pkl")

# train_dataloader = DataLoader(train_ds, shuffle=True, collate_fn=default_data_collator, batch_size=batch_size, pin_memory=True)
# eval_dataloader = DataLoader(eval_ds, collate_fn=default_data_collator, batch_size=batch_size, pin_memory=True)

100%|██████████████████████████████████████████████████████████████████████████████| 178331/178331 [06:35<00:00, 451.30it/s]
100%|█████████████████████████████████████████████████████████████████████████████| 178331/178331 [01:42<00:00, 1737.31it/s]
178331it [00:00, 241643.92it/s]


In [21]:
# Sanity check: decode one training example back to text to inspect how the
# chat template and special tokens were rendered.
print(tokenizer.decode(data_module["train_dataset"][400]['input_ids']))

<｜begin▁of▁sentence｜></s>User: Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
Generate a list of 8 social media platforms.

### Answer:


Assistant: 1. Facebook
2. Instagram
3. Twitter
4. LinkedIn
5. Pinterest
6. Snapchat
7. TikTok
8. Reddit</s>User: Feedback 1:

Assistant: Overall, the answer is good and covers the major social media platforms. However, it could be improved by adding platforms that are popular in other countries, such as WeChat in China or Vkontakte in Russia. Additionally, it might be helpful to include a brief description of each platform and its unique features. Score: 8.5. Revision is needed.</s>User: Revision 1:

Assistant: 1. Facebook - a social networking site that allows users to connect with friends, family, and acquaintances, share content, and join groups.
2. Instagram - a photo and video sharing app that allows users to curate their own feeds with images and videos, as well as fol

In [9]:
def _collate_fn(batch: list[dict]) -> dict[str, torch.Tensor]:
    """Pad a list of examples into rectangular batch tensors.

    ``input_ids`` are padded with the global tokenizer's pad token id and
    ``labels`` with ``IGNORE_INDEX``, both batch-first. The attention mask is
    true wherever the padded ``input_ids`` differ from the pad token id.
    """
    id_sequences = [example["input_ids"] for example in batch]
    label_sequences = [example["labels"] for example in batch]

    padded_ids = torch.nn.utils.rnn.pad_sequence(
        id_sequences,
        batch_first=True,
        padding_value=tokenizer.pad_token_id,
    )
    padded_labels = torch.nn.utils.rnn.pad_sequence(
        label_sequences,
        batch_first=True,
        padding_value=IGNORE_INDEX,
    )
    mask = padded_ids.ne(tokenizer.pad_token_id)

    return {
        "input_ids": padded_ids,
        "labels": padded_labels,
        "attention_mask": mask,
    }


# One example per batch for both loaders, so `_collate_fn` never actually has
# to pad; training order is reshuffled each epoch, evaluation order is fixed.
train_dataloader = DataLoader(data_module["train_dataset"], shuffle=True, collate_fn=_collate_fn,
                              batch_size=1, pin_memory=True)
eval_dataloader = DataLoader(data_module["eval_dataset"], collate_fn=_collate_fn, batch_size=1,
                             pin_memory=True)

In [10]:
from peft import PromptTuningConfig, PromptTuningInit, get_peft_model

# Load the base model explicitly in this cell: no other cell defines `model`
# (the earlier load is commented out), so on a fresh kernel the wrap below
# would fail with NameError. Keeping the unwrapped model under its own name
# also makes the cell idempotent — re-running it no longer wraps an
# already-wrapped PEFT model. bfloat16 matches the commented-out load.
base_model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.bfloat16)

# NOTE(review): this init text describes tweet-complaint classification, which
# does not match the instruction/feedback/revision data used in this notebook —
# confirm whether a task-appropriate initialisation is wanted.
prompt_tuning_init_text = "Classify if the tweet is a complaint or no complaint.\n"
peft_config = PromptTuningConfig(
    task_type="CAUSAL_LM",
    prompt_tuning_init=PromptTuningInit.TEXT,
    # One virtual token per token of the init text.
    num_virtual_tokens=len(tokenizer(prompt_tuning_init_text)["input_ids"]),
    prompt_tuning_init_text=prompt_tuning_init_text,
    tokenizer_name_or_path=model_name,
)
model = get_peft_model(base_model, peft_config)
model.print_trainable_parameters()

trainable params: 57,344 || all params: 6,910,423,040 || trainable%: 0.0008298189512866639


In [11]:
# Hyper-parameters for the manual training loop below.
lr = 3e-2
num_epochs = 50

# AdamW over the PEFT-wrapped model's parameters, with a linear decay schedule
# and no warmup. The schedule length is one step per training batch per epoch,
# matching the per-batch `lr_scheduler.step()` call in the training loop.
optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
lr_scheduler = get_linear_schedule_with_warmup(
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=(len(train_dataloader) * num_epochs),
)

In [12]:
# Manual training/evaluation loop: for every epoch, one optimisation pass over
# the training loader followed by a no-grad pass over the eval loader, then the
# mean loss and its exponential (perplexity) are printed for both.
# NOTE(review): the recorded run aborted with CUDA out-of-memory about 550
# steps into the first epoch (see the cell output). Batches hold one unpadded
# example each, so memory presumably peaks on the longest sequences — consider
# capping the tokenizer's model_max_length; confirm.
device = "cuda"
model = model.to(device)
torch.cuda.empty_cache()

for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for step, batch in enumerate(tqdm(train_dataloader)):
        # Move every tensor of the batch onto the GPU.
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        # Accumulate a detached copy so the running sum keeps no autograd graph.
        total_loss += loss.detach().float()
        loss.backward()
        optimizer.step()
        # The scheduler advances once per batch, matching how its length was set.
        lr_scheduler.step()
        optimizer.zero_grad()

    model.eval()
    eval_loss = 0
    eval_preds = []
    for step, batch in enumerate(tqdm(eval_dataloader)):
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
            outputs = model(**batch)
        loss = outputs.loss
        eval_loss += loss.detach().float()
        # Greedy (argmax) token predictions decoded to text; collected for the
        # epoch but not read anywhere else in this cell.
        eval_preds.extend(
            tokenizer.batch_decode(torch.argmax(outputs.logits, -1).detach().cpu().numpy(), skip_special_tokens=True)
        )

    # Mean per-batch loss and the corresponding perplexity for each split.
    eval_epoch_loss = eval_loss / len(eval_dataloader)
    eval_ppl = torch.exp(eval_epoch_loss)
    train_epoch_loss = total_loss / len(train_dataloader)
    train_ppl = torch.exp(train_epoch_loss)
    print(f"{epoch=}: {train_ppl=} {train_epoch_loss=} {eval_ppl=} {eval_epoch_loss=}")

  0%|▎                                                                               | 550/160497 [01:40<8:07:36,  5.47it/s]


OutOfMemoryError: CUDA out of memory. Tried to allocate 1.38 GiB. GPU 0 has a total capacty of 79.15 GiB of which 1.18 GiB is free. Including non-PyTorch memory, this process has 77.96 GiB memory in use. Of the allocated memory 75.92 GiB is allocated by PyTorch, and 1.53 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [None]:
# training_args = TrainingArguments(
#     output_dir=root_dir / f"output/finetune/{model_name}",
#     num_train_epochs=3,
#     per_device_train_batch_size=16,
#     per_device_eval_batch_size=16,
#     gradient_accumulation_steps=2,
#     evaluation_strategy="no",
#     save_strategy="steps",
#     save_steps=5000,
#     save_total_limit=1,
#     learning_rate=2e-5,
#     weight_decay=0.,
#     warmup_ratio=0.03,
#     lr_scheduler_type="cosine",
#     logging_steps=1,
#     bf16=True,
#     tf32=True,
#     gradient_checkpointing=True,
# )
# 
# trainer = make_trainer(model, tokenizer, data_module, training_args)