## Setup Development Environment

Our first step is to install the Hugging Face libraries this notebook needs. Running the following cell installs `datasets` and `evaluate`; `transformers` (imported below) is not installed by this cell and must already be available in the environment.

In [None]:
!pip install datasets evaluate

## Import libraries

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import os, re, glob, datasets
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import evaluate
import nltk
import numpy as np
from nltk.tokenize import sent_tokenize
from transformers import DataCollatorForSeq2Seq
from tqdm.auto import tqdm
from sklearn.metrics import classification_report
# from huggingface_hub import HfFolder
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments


# Download the NLTK "punkt" and "punkt_tab" resources.
# NOTE(review): presumably these back `sent_tokenize` (imported above); nothing in
# the visible cells calls it — confirm whether the downloads are still needed.
nltk.download("punkt")
nltk.download('punkt_tab')

## Load and prepare dataset

In [None]:
# Numeric intensity score -> coarse sentiment label used as the target text.
INTENSITY_TO_SENTIMENT = {
    -3: "negative",
    -2: "negative",
    -1: "neutral",
    0: "neutral",
    1: "neutral",
    2: "positive",
    3: "positive",
}


def _load_split(path):
    """Read one tab-separated split and normalise its text and label columns.

    The "Tweet" column is lower-cased. The "Intensity Class" column is reduced
    to the integer that precedes the first ':' in each value, and that integer
    is then replaced by its sentiment label from ``INTENSITY_TO_SENTIMENT``.

    Args:
        path: Location of the tab-delimited file to read.

    Returns:
        The loaded ``pandas.DataFrame`` with both columns rewritten.
    """
    frame = pd.read_csv(path, delimiter='\t')
    frame['Tweet'] = frame['Tweet'].str.lower()
    scores = frame["Intensity Class"].apply(lambda raw: int(raw.split(':')[0]))
    frame["Intensity Class"] = scores.replace(INTENSITY_TO_SENTIMENT)
    return frame


df_train = _load_split("train.txt")
df_test = _load_split("test.txt")

In [None]:
# Convert both pandas frames into `datasets.Dataset` objects in one pass,
# then display them as the cell output.
train, test = map(datasets.Dataset.from_pandas, (df_train, df_test))
train, test

## Section 1: Load the pretrained model and tokenize input text

In [None]:
# Identifier of the pre-trained FLAN-T5 checkpoint; the tokenizer here and the
# model further down are both loaded from this same identifier.
model_id="google/flan-t5-base"

# Load the tokenizer that belongs to the checkpoint named by `model_id`.
tokenizer = AutoTokenizer.from_pretrained(model_id)

In [None]:
def preprocess_function(sample):
    """Tokenize tweets and their sentiment labels for seq2seq training.

    Args:
        sample: Mapping with a "Tweet" entry (input text) and an
            "Intensity Class" entry (target text).

    Returns:
        The tokenizer output for the tweets, extended with a "labels" entry
        that holds the token ids of the tokenized targets.
    """
    # Encode the source text; overly long inputs are truncated.
    encoded = tokenizer(sample["Tweet"], truncation=True)

    # Passing the targets via `text_target` tokenizes them as labels.
    target_ids = tokenizer(text_target=sample["Intensity Class"], truncation=True)["input_ids"]

    encoded["labels"] = target_ids
    return encoded

# Apply the preprocessing to both splits, batch-wise, and show which columns
# the tokenized training split ends up with.
tokenized_dataset_train, tokenized_dataset_test = (
    split.map(preprocess_function, batched=True) for split in (train, test)
)
print(f"Keys of tokenized dataset: {list(tokenized_dataset_train.features)}")

## Section 2: Prepare data for FLAN-T5

Now that the dataset is tokenized, we can prepare for training. We first load our [FLAN-T5](https://huggingface.co/models?search=flan-t5) model and then define the data collator for it.

In [None]:
# Load the pre-trained seq2seq model for the checkpoint named by `model_id`
# (the same identifier the tokenizer was loaded from).
model = AutoModelForSeq2SeqLM.from_pretrained(model_id)

We use the `DataCollatorForSeq2Seq` from the Transformers library, which takes care of padding our inputs and labels.

In [None]:
# Padded label positions are filled with this id so that the padding is
# ignored in the loss.
label_pad_token_id = -100

# Collator that pads inputs and labels batch by batch.
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    padding=True,
    pad_to_multiple_of=8,
    label_pad_token_id=label_pad_token_id,
)

## Section 3: Fine-tune FLAN-T5

The last step is to define the hyperparameters (`Seq2SeqTrainingArguments`) we want to use for our training. Note that this configuration does not use the [Hugging Face Hub](https://huggingface.co/models) integration of the `Trainer`: no checkpoints are saved (`save_strategy="no"`), and logs and metrics are reported to TensorBoard once per epoch.

In [None]:
# Hyperparameters for the fine-tuning run, grouped by concern.
training_args = Seq2SeqTrainingArguments(
    output_dir="Results",
    # --- optimisation ---
    learning_rate=3e-4,
    num_train_epochs=5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    fp16=False,  # fp16 overflows, so stay in full precision
    # --- prediction ---
    predict_with_generate=True,
    # --- logging / checkpointing ---
    logging_strategy="epoch",
    save_strategy="no",
    report_to="tensorboard",
)

# Trainer wired to the model, the collator and the tokenized training split.
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    train_dataset=tokenized_dataset_train,
    data_collator=data_collator,
)

We can start our training by using the `train` method of the `Trainer`.

In [None]:
# Run fine-tuning with the trainer configured above. As the last expression in
# the cell, the value returned by `train()` is shown as the cell output.
trainer.train()

## Section 3: Run Inference and Classification Report

In [None]:
# Generate predictions for the whole test split.
prediction_output = trainer.predict(tokenized_dataset_test)

# Some configurations return a tuple; the generated token ids come first.
generated_ids = prediction_output.predictions
if isinstance(generated_ids, tuple):
    generated_ids = generated_ids[0]

# -100 is not a valid token id, so it must be replaced before decoding.
# The mask has to be computed on the *generated* ids themselves: masking by
# `label_ids` would blank out predicted tokens based on where the gold label
# happens to be padded, and would leave -100 values in the predictions untouched.
generated_ids = np.where(generated_ids != -100, generated_ids, tokenizer.pad_token_id)

# Final result: one decoded string per test sample.
predictions = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)

In [None]:
# Score the decoded generations against the gold sentiment labels.
report = classification_report(
    y_true=tokenized_dataset_test['Intensity Class'],
    y_pred=predictions,
    zero_division=0,
)
print(report)

## Section 4: Per-sample inference with beam search and example predictions

In [None]:
# Read both columns once up front; indexing `dataset['col'][i]` inside the loop
# would look the whole column up again on every iteration.
test_tweets = list(tokenized_dataset_test['Tweet'])
labels_list = list(tokenized_dataset_test['Intensity Class'])
samples_number = len(tokenized_dataset_test)

predictions_list = []
model.eval()

# Iterating over the tqdm wrapper advances the bar and closes it when the loop ends.
progress_bar = tqdm(test_tweets, total=samples_number)
for text in progress_bar:
  # Put the inputs on whatever device the model lives on instead of assuming CUDA.
  inputs = tokenizer(text, return_tensors='pt').to(model.device)
  outputs = model.generate(
    inputs['input_ids'],
    attention_mask=inputs['attention_mask'],
    max_length=150,
    num_beams=4,
    early_stopping=True,
  )
  predictions_list.append(tokenizer.decode(outputs[0], skip_special_tokens=True))

# classification_report compares labels and predictions as strings.
str_labels_list = [str(label) for label in labels_list]
report = classification_report(str_labels_list, predictions_list, zero_division=0)
print(report)

In [None]:
# Define predict function
def predict(input_text):
  """Print a text and the label the fine-tuned model generates for it.

  Args:
    input_text: Raw text to classify.

  Returns:
    None. The input and the decoded prediction are printed.
  """
  print(input_text)

  # Tokenize the input and place it on the model's device, so the function
  # also works when no CUDA device is available.
  inputs = tokenizer(input_text, return_tensors='pt').to(model.device)
  outputs = model.generate(inputs['input_ids'], attention_mask=inputs['attention_mask'])
  prediction = tokenizer.decode(outputs[0], skip_special_tokens=True)
  print("Prediction: ", prediction)

# Try the model on a few hand-picked tweets.
example_tweets = (
  "CSGO matchmaking is so full of closet hacking, it's a truly awful game.",
  "The things I would do for a @nvidia 3090... unspeakable! üßê",
  "@mcmalveiro Hi Miguel, that's awesome! Thank you very much for updating us! üòÑ -Claire",
)
for example_tweet in example_tweets:
  predict(example_tweet)
