In [None]:
!pip install transformers datasets accelerate -U

In [None]:
import transformers
import datasets
from datasets import concatenate_datasets, load_dataset

# Identifier of the pretrained checkpoint; reused below for both the
# tokenizer and the classification model.
model_checkpoint = "distilbert/distilbert-base-uncased"

# Fetch all splits of the SetFit/sst5 dataset.
dataset = load_dataset("SetFit/sst5")

# The train and validation splits are merged into one training set;
# the test split is kept separate.
dataset_train = concatenate_datasets([dataset["train"], dataset["validation"]])
dataset_test = dataset["test"]

In [None]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [4]:
import re

# Maximum number of tokens per input; passed to the tokenizer as `max_length`
# (with truncation enabled) in preprocess_data.
max_input_length = 512
# NOTE(review): only referenced by commented-out target-tokenization code in
# this notebook; appears to be unused by the active cells.
max_target_length = 200


def clean_text(text):
  """Normalise the whitespace of a single text string.

  Every run of whitespace characters is replaced by one space and any
  leading/trailing whitespace is stripped. All other characters are kept
  as they are, since the data needs very little cleaning.

  Args:
    text: The raw input string.

  Returns:
    The whitespace-normalised string.
  """
  collapsed = re.sub(r'\s+', ' ', text)
  return collapsed.strip()

def preprocess_data(examples):
  """Clean and tokenize a batch of examples for sequence classification.

  Args:
    examples: A batch as supplied by ``Dataset.map(..., batched=True)``;
      its ``"text"`` entry is read as a list of strings.

  Returns:
    The tokenizer output for the cleaned texts, truncated to
    ``max_input_length`` tokens and left unpadded.
  """
  texts_cleaned = [clean_text(text) for text in examples["text"]]
  # Deliberately no padding here: padding inside a batched map pads every
  # example to the longest text of its (large) map batch. Padding is applied
  # per training batch by the DataCollatorWithPadding given to the Trainer.
  return tokenizer(texts_cleaned, max_length=max_input_length, truncation=True)

In [None]:
# Run the cleaning + tokenization function over both splits, one batch of
# examples per call.
tokenized_dataset = dataset_train.map(function=preprocess_data, batched=True)
tokenized_test_dataset = dataset_test.map(function=preprocess_data, batched=True)

In [6]:
def _drop_present_columns(ds, names):
  """Return `ds` without those columns of `names` that it still contains.

  Columns that are already absent are skipped, so re-running this cell
  after the columns were removed does not raise.
  """
  present = [name for name in names if name in ds.column_names]
  return ds.remove_columns(present) if present else ds


# The raw string columns are dropped from both splits after tokenization.
_raw_text_columns = ["label_text", "text"]
tokenized_dataset = _drop_present_columns(tokenized_dataset, _raw_text_columns)
tokenized_test_dataset = _drop_present_columns(tokenized_test_dataset, _raw_text_columns)

In [None]:
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer, TrainingArguments,  AutoModelForSequenceClassification, DataCollatorWithPadding, Trainer


SEED = 42
batch_size = 8

# Seed the RNGs before the model is built, so that any weights created at
# load time (presumably the new classification head) are reproducible.
transformers.set_seed(SEED)

# Pretrained checkpoint with a sequence-classification head for 5 labels.
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=5)

args = TrainingArguments(
    output_dir="distilbert-ft-sst5",  # checkpoints and run artefacts are written here
    # `eval_strategy` replaces the deprecated `evaluation_strategy` name
    # (requires transformers >= 4.41; the install cell upgrades to latest).
    eval_strategy="epoch",  # evaluate at the end of each epoch
    # Logging is per epoch, so no `logging_steps` is set (it only applies to
    # step-based logging).
    logging_strategy="epoch",
    save_strategy="epoch",  # save a checkpoint at the end of each epoch
    learning_rate=4e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,  # keep at most 3 checkpoints on disk
    num_train_epochs=3,
    seed=SEED,
    load_best_model_at_end=True,  # reload the best checkpoint once training ends
)

In [8]:
# Pads each batch to the length of its longest example at collation time.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# NOTE(review): the test split is used as eval_dataset while
# load_best_model_at_end=True, so the checkpoint is selected on test data;
# consider holding out a separate validation split for model selection.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_dataset,
    eval_dataset=tokenized_test_dataset,
    data_collator=data_collator,
    # `processing_class` replaces the deprecated `tokenizer` argument
    # (requires transformers >= 4.46; the install cell upgrades to latest).
    processing_class=tokenizer,
)

In [None]:
# Run fine-tuning with the Trainer configured in the previous cell.
trainer.train()

In [None]:
# Authenticate with the Hugging Face Hub from inside the notebook;
# presumably required for the push_to_hub() call in the next cell.
from huggingface_hub import notebook_login

notebook_login()

In [None]:
# Upload the trained model to the Hugging Face Hub.
# NOTE(review): the target repository is presumably derived from
# output_dir ("distilbert-ft-sst5") since no hub_model_id is set; confirm.
trainer.push_to_hub()