In [None]:
!pip install transformers peft datasets huggingface_hub

In [None]:
from huggingface_hub import login
access_token = ""
login(access_token)

In [2]:
from transformers import AutoModelForCausalLM, AutoConfig, AutoTokenizer, default_data_collator, get_linear_schedule_with_warmup
from peft import get_peft_config, get_peft_model, PromptTuningInit, PromptTuningConfig, TaskType, PeftType, LoftQConfig, LoraConfig
import torch
from datasets import load_dataset
import os
from torch.utils.data import DataLoader
from tqdm import tqdm

model_name_or_path = "bigscience/bloomz-560m"
tokenizer_name_or_path = "bigscience/bloomz-560m"
config_path = 'data/config.json'
peft_config = PromptTuningConfig(
    task_type=TaskType.CAUSAL_LM,
    prompt_tuning_init=PromptTuningInit.TEXT,
    num_virtual_tokens=8,
    prompt_tuning_init_text="Classify if the tweet is a complaint or not:",
    tokenizer_name_or_path=model_name_or_path,
)

dataset_name = "twitter_complaints"
checkpoint_name = f"{dataset_name}_{model_name_or_path}_{peft_config.peft_type}_{peft_config.task_type}_v1.pt".replace(
    "/", "_"
)
text_column = "text_input"
label_column = "output"
max_length = 1024
lr = 3e-2
num_epochs = 50
batch_size = 8

In [3]:
from data.dataset import PromptuningDataset
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)

train_dataset = PromptuningDataset(csv_file="final_training_data.csv", tokenizer=tokenizer, input_max_length=max_length, text_column=text_column, label_column=label_column)
eval_dataset = PromptuningDataset(csv_file="final_training_data.csv", tokenizer=tokenizer, input_max_length=max_length, text_column=text_column, label_column=label_column)


train_dataloader = DataLoader(
    train_dataset, shuffle=True, batch_size=batch_size, pin_memory=True
)
eval_dataloader = DataLoader(eval_dataset, batch_size=batch_size, pin_memory=True)

In [4]:
# config = AutoConfig.from_pretrained(config_path)
model = AutoModelForCausalLM.from_pretrained(model_name_or_path)
model = get_peft_model(model, peft_config)
print(model.print_trainable_parameters())

trainable params: 8,192 || all params: 559,222,784 || trainable%: 0.0015
None


In [5]:
optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
lr_scheduler = get_linear_schedule_with_warmup(
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=(len(train_dataloader) * num_epochs),
)

In [19]:
from train.trainer import Trainer
trainer = Trainer(model=model, train_dataloader=train_dataloader, val_dataloader=eval_dataloader, optimizer=optimizer, lr_scheduler=lr_scheduler, batch_size=batch_size, num_epochs=num_epochs)

In [20]:
trainer.train()

  0%|          | 0/174 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
  0%|          | 0/174 [00:15<?, ?it/s]


KeyboardInterrupt: 