In [1]:
!pip install -Uq peft transformers datasets

In [2]:
import torch
from datasets import load_dataset
from peft import get_peft_model, PromptTuningInit, PromptTuningConfig, TaskType
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer, default_data_collator, get_linear_schedule_with_warmup

# Config

In [3]:
# Prefer Apple-silicon MPS (the original target), then CUDA, then CPU, so the
# notebook does not fail at `model.to(device)` on machines without MPS.
# `getattr` guards against torch builds that lack the `mps` backend module.
if getattr(torch.backends, "mps", None) is not None and torch.backends.mps.is_available():
	device: str = "mps"
elif torch.cuda.is_available():
	device = "cuda"
else:
	device = "cpu"

# Hugging Face Hub id of the base model that is prompt-tuned below.
model_name: str = "meta-llama/Llama-2-7b-chat-hf"

In [4]:
# Prompt-tuning configuration: 8 virtual tokens whose embeddings are
# initialised from the text below, tokenized with the base model's tokenizer.
# NOTE(review): the init text describes emotion classification, while the
# training prompt built in `preprocess` asks the model to generate a reply;
# confirm this initialisation text is intentional.
model_config = PromptTuningConfig(
	task_type=TaskType.CAUSAL_LM,
	tokenizer_name_or_path=model_name,
	num_virtual_tokens=8,
	prompt_tuning_init=PromptTuningInit.TEXT,
	prompt_tuning_init_text="Classify the emotion in the following sentence:",
)

In [5]:
# Dataset identifier passed to `load_dataset`.
dataset_name: str = "daily_dialog"

# Every sample is left-padded to exactly this many tokens in `preprocess`.
# The previous value (4096) padded each short utterance pair to 4096 tokens,
# which multiplies dataset size and activation memory for no benefit and
# contributes to the out-of-memory failure during training. 512 leaves ample
# room for a prompt + reply pair; longer samples are truncated.
max_sequence_length = 512

# Optimisation settings.
learning_rate = 3e-2
num_epochs = 50
batch_size = 8

# Dataset

In [6]:
# Load the dataset identified by `dataset_name` (all available splits).
dataset = load_dataset(dataset_name)

In [7]:
# Human-readable emotion names, indexed by the integer emotion label.
emotions: list = [*dataset["train"].features["emotion"].feature.names]
# Replace the name of label 0 with the word "neutral" for use in prompts.
emotions[0] = "neutral"

In [8]:
def _add_previous_dialog(samples):
	"""Return, for each dialog in the batch, every utterance except the last."""
	return {
		"previous_dialog": [utterances[:-1] for utterances in samples["dialog"]]
	}


# Adds the "previous_dialog" column: the utterances that have a successor.
dataset = dataset.map(_add_previous_dialog, batched=True, num_proc=8)

In [9]:
def _add_correspond_dialog(samples):
	"""Return, for each dialog in the batch, every utterance except the first."""
	return {
		"correspond_dialog": [utterances[1:] for utterances in samples["dialog"]]
	}


# Adds the "correspond_dialog" column: the reply to each previous utterance.
dataset = dataset.map(_add_correspond_dialog, batched=True, num_proc=8)

In [10]:
def _add_current_emotion(samples):
	"""Return, for each dialog, the emotion names of all utterances but the last."""
	named_per_dialog = []
	for labels in samples["emotion"]:
		# Map every label to its name first, then drop the final entry.
		names = [emotions[label] for label in labels]
		named_per_dialog.append(names[:-1])
	return {"current_emotion": named_per_dialog}


# Adds the "current_emotion" column, aligned with "previous_dialog".
dataset = dataset.map(_add_current_emotion, batched=True, num_proc=8)

In [11]:
def _add_correspond_emotion(samples):
	"""Return, for each dialog, the emotion names of all utterances but the first."""
	named_per_dialog = []
	for labels in samples["emotion"]:
		# Map every label to its name first, then drop the leading entry.
		names = [emotions[label] for label in labels]
		named_per_dialog.append(names[1:])
	return {"correspond_emotion": named_per_dialog}


# Adds the "correspond_emotion" column, aligned with "correspond_dialog".
dataset = dataset.map(_add_correspond_emotion, batched=True, num_proc=8)

In [12]:
# Display the first training example to inspect the columns derived above.
dataset["train"][0]

{'dialog': ['Say , Jim , how about going for a few beers after dinner ? ',
  ' You know that is tempting but is really not good for our fitness . ',
  ' What do you mean ? It will help us to relax . ',
  " Do you really think so ? I don't . It will just make us fat and act silly . Remember last time ? ",
  " I guess you are right.But what shall we do ? I don't feel like sitting at home . ",
  ' I suggest a walk over to the gym where we can play singsong and meet some of our friends . ',
  " That's a good idea . I hear Mary and Sally often go there to play pingpong.Perhaps we can make a foursome with them . ",
  ' Sounds great to me ! If they are willing , we could ask them to go dancing with us.That is excellent exercise and fun , too . ',
  " Good.Let ' s go now . ",
  ' All right . '],
 'act': [3, 4, 2, 2, 2, 3, 4, 1, 3, 4],
 'emotion': [0, 0, 0, 0, 0, 0, 4, 4, 4, 4],
 'previous_dialog': ['Say , Jim , how about going for a few beers after dinner ? ',
  ' You know that is tempting but

In [13]:
# Load the tokenizer published under the same identifier as the base model.
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [14]:
# `preprocess` pads with `tokenizer.pad_token_id`; when the tokenizer does not
# define a padding token, reuse the end-of-sequence id for that purpose.
if tokenizer.pad_token_id is None:
	tokenizer.pad_token_id = tokenizer.eos_token_id

In [15]:
# Number of token ids the tokenizer produces for each emotion name; the
# largest of these is kept and displayed as the cell output.
label_token_counts = [len(tokenizer(label)["input_ids"]) for label in emotions]
emotion_label_max_length: int = max(label_token_counts)
emotion_label_max_length

4

In [16]:
def preprocess(samples):
	"""Turn a batch of dialogs into fixed-length causal-LM training rows.

	Each (previous utterance, emotion of the reply, reply) triple in the batch
	becomes one row, so the number of returned rows is the total number of
	utterance pairs, not the number of dialogs.

	Args:
		samples: batched mapping with the columns "previous_dialog",
			"correspond_emotion" and "correspond_dialog"; each is a list
			(one entry per dialog) of equally long lists.

	Returns:
		The tokenizer output for the prompts, updated in place so that every
		row of "input_ids" and "attention_mask" is a tensor of length
		`max_sequence_length` (left-padded, or cut at the end when too long),
		plus a "real_outputs" column holding the training targets: -100 on
		padding and prompt positions, the reply token ids followed by EOS
		elsewhere.
	"""
	prompts = [
		f"previous_dialog: {previous[i]}, correspond_emotion: {reply_emotions[i]} => correspond_dialog: "
		for previous, reply_emotions in zip(samples["previous_dialog"], samples["correspond_emotion"])
		for i in range(len(previous))
	]
	replies = [str(reply) for dialog in samples["correspond_dialog"] for reply in dialog]

	model_inputs = tokenizer(prompts)
	# The reply is appended to an already tokenized prompt, so it must not get
	# its own special tokens (e.g. a second BOS in the middle of the sequence
	# and in the labels).
	real_outputs = tokenizer(replies, add_special_tokens=False)

	# Count rows via the id list: len() of the tokenizer output itself is the
	# number of keys ("input_ids", "attention_mask"), not the number of rows.
	sample_count = len(model_inputs["input_ids"])
	targets = []
	for i in range(sample_count):
		prompt_ids = model_inputs["input_ids"][i]
		reply_ids = real_outputs["input_ids"][i] + [tokenizer.eos_token_id]

		input_ids = prompt_ids + reply_ids
		attention_mask = [1] * len(input_ids)
		# Only the reply contributes to the loss; prompt positions are ignored.
		target_ids = [-100] * len(prompt_ids) + reply_ids

		# Left-pad up to the fixed length. A negative pad length (row longer
		# than the limit) yields no padding, and the slice below then cuts the
		# row at the end.
		pad_length = max_sequence_length - len(input_ids)
		input_ids = [tokenizer.pad_token_id] * pad_length + input_ids
		attention_mask = [0] * pad_length + attention_mask
		target_ids = [-100] * pad_length + target_ids

		model_inputs["input_ids"][i] = torch.tensor(input_ids[:max_sequence_length])
		model_inputs["attention_mask"][i] = torch.tensor(attention_mask[:max_sequence_length])
		targets.append(torch.tensor(target_ids[:max_sequence_length]))

	model_inputs["real_outputs"] = targets

	return model_inputs

In [17]:
# Tokenize every split with `preprocess`. The original columns are dropped
# because `preprocess` returns one row per utterance pair rather than one row
# per dialog, and the on-disk cache is bypassed so edits to `preprocess`
# always take effect.
tokenize_options = dict(
	batched=True,
	num_proc=8,
	remove_columns=dataset["train"].column_names,
	load_from_cache_file=False,
	desc="Running tokenizer on dataset",
)
processed_datasets = dataset.map(preprocess, **tokenize_options)

Running tokenizer on dataset (num_proc=8):   0%|          | 0/11118 [00:00<?, ? examples/s]

Running tokenizer on dataset (num_proc=8):   0%|          | 0/1000 [00:00<?, ? examples/s]

Running tokenizer on dataset (num_proc=8):   0%|          | 0/1000 [00:00<?, ? examples/s]

In [18]:
train_dataset = processed_datasets["train"]
# NOTE(review): per-epoch evaluation uses the "test" split; confirm this is
# intended rather than the "validation" split.
eval_dataset = processed_datasets["test"]

# Pass the configured `batch_size`; without it DataLoader silently falls back
# to its default of one sample per batch.
train_dataloader = DataLoader(
	train_dataset,
	batch_size=batch_size,
	shuffle=True,
	collate_fn=default_data_collator,
	pin_memory=True,
)
eval_dataloader = DataLoader(
	eval_dataset,
	batch_size=batch_size,
	collate_fn=default_data_collator,
	pin_memory=True,
)

In [19]:
# Load the base causal LM and wrap it with the prompt-tuning adapter.
# NOTE(review): the weights are loaded in the checkpoint loader's default
# precision; with ~6.7B parameters this may not fit in accelerator memory -
# consider a reduced-precision load if training runs out of memory.
model = AutoModelForCausalLM.from_pretrained(model_name)
model = get_peft_model(model, model_config)
# `print_trainable_parameters` prints the summary itself and returns None, so
# wrapping it in print() only emitted an extra "None" line.
model.print_trainable_parameters()

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

trainable params: 32,768 || all params: 6,738,448,384 || trainable%: 0.0004862840543203603
None


In [20]:
# One scheduler step is taken per optimizer step, so the schedule spans every
# batch of every epoch.
total_training_steps = len(train_dataloader) * num_epochs

optimizer = torch.optim.AdamW(params=model.parameters(), lr=learning_rate)
# Linear decay of the learning rate over the whole run, with no warm-up phase.
lr_scheduler = get_linear_schedule_with_warmup(
	optimizer,
	num_warmup_steps=0,
	num_training_steps=total_training_steps,
)

In [21]:
def _as_model_batch(batch):
	"""Move a collated batch to `device` and expose the targets as `labels`.

	The preprocessing step stores the training targets under "real_outputs",
	but the model only computes a loss when they are passed as `labels`
	(and does not accept a `real_outputs` argument), so the key is renamed.
	"""
	model_batch = {key: value.to(device) for key, value in batch.items()}
	targets = model_batch.pop("real_outputs", None)
	if targets is not None:
		model_batch.setdefault("labels", targets)
	return model_batch


model = model.to(device)

for epoch in range(num_epochs):
	# ---- training pass ----
	model.train()
	total_loss = 0
	for step, batch in enumerate(tqdm(train_dataloader)):
		batch = _as_model_batch(batch)
		outputs = model(**batch)
		loss = outputs.loss
		# Accumulate a detached copy so the autograd graph is not kept alive.
		total_loss += loss.detach().float()
		loss.backward()
		optimizer.step()
		lr_scheduler.step()
		optimizer.zero_grad()

	# ---- evaluation pass ----
	model.eval()
	eval_loss = 0
	eval_preds = []
	for step, batch in enumerate(tqdm(eval_dataloader)):
		batch = _as_model_batch(batch)
		with torch.no_grad():
			outputs = model(**batch)
		loss = outputs.loss
		eval_loss += loss.detach().float()
		# Greedy (argmax) token predictions at every position, decoded to text.
		eval_preds.extend(
			tokenizer.batch_decode(torch.argmax(outputs.logits, -1).detach().cpu().numpy(), skip_special_tokens=True)
		)

	# Mean per-batch loss and the corresponding perplexity for both passes.
	eval_epoch_loss = eval_loss / len(eval_dataloader)
	eval_ppl = torch.exp(eval_epoch_loss)
	train_epoch_loss = total_loss / len(train_dataloader)
	train_ppl = torch.exp(train_epoch_loss)
	print(f"{epoch=}: {train_ppl=} {train_epoch_loss=} {eval_ppl=} {eval_epoch_loss=}")

RuntimeError: MPS backend out of memory (MPS allocated: 17.75 GB, other allocations: 323.68 MB, max allowed: 18.13 GB). Tried to allocate 172.00 MB on private pool. Use PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0 to disable upper limit for memory allocations (may cause system failure).