# Restrict this process to GPU 0. The variable is set before torch is imported
# so that it is guaranteed to be in place before any CUDA initialisation.
import os

os.environ['CUDA_VISIBLE_DEVICES'] = '0'

import torch

# Sanity check: prints whether PyTorch can use a CUDA device.
print(torch.cuda.is_available())

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import re, string

In [None]:
# trainer require accelerate installed

!pip install -U accelerate
!pip install -U transformers
!pip install datasets

# Read training dataset

In [None]:
# Colab alternative (disabled):
# import os
# from google.colab import drive
# drive.mount('/content/gdrive')

# Kaggle dataset root shared by the train and dev CSV files.
DATA_ROOT = '/kaggle/input/homomex24-development'
path_train = f'{DATA_ROOT}/public_data_train_phase/track_3_train.csv'
path_dev = f'{DATA_ROOT}/track_3_dev.csv'

In [None]:
# Load the dev split and rename its 'lyric' column to 'lyrics', the column
# name the prompt builder reads.
dev_data = pd.read_csv(path_dev).rename(columns={'lyric': 'lyrics'})
dev_data.head()

In [None]:
# Load the training split; the trailing expression previews its first rows.
train_data = pd.read_csv(path_train)
train_data.head()

# Create format for input

## Prompt Sample
1. Please classify sexism in the tweet "tweet" as an annotator with the following information: gender: female, age: 46+,...

2. Tweet: "tweet". Annotator's information: "Age: 18  Education: High School". Is the tweet sexist?

3. 
   

In [None]:
# Prompt fragments for prompt variant 3.
task_instructions = "Classify: "
annotator_infomation = " Context: "
add_instructions = (
    " (If Yes, what is the intention of the person who wrote it"
    " and what type of sexism is it?)"
)

In [None]:
# first prompt
import re
def create_instruction_input_output(df):
    """Build parallel prompt and label lists from a dataframe.

    Every value of the ``lyrics`` column is prefixed with ``"classify: "`` and
    each run of whitespace in the resulting prompt is collapsed into a single
    space. Labels are taken unchanged from the ``label`` column.

    Args:
        df: DataFrame providing ``lyrics`` and ``label`` columns.

    Returns:
        Tuple ``(prompts, labels)`` of two lists. Both lengths are also
        printed as a quick sanity check.
    """
    labels = df["label"].tolist()
    whitespace_run = re.compile("\\s+")
    prompts = [
        whitespace_run.sub(" ", f"classify: {lyric}")
        for lyric in df["lyrics"].tolist()
    ]
    print(len(prompts), len(labels))
    return prompts, labels

# Preprocessing Data

In [None]:
# Create the instruction inputs and outputs for the train and dev splits
input_train,output_train = create_instruction_input_output(train_data)
input_dev,output_dev = create_instruction_input_output(dev_data)

In [None]:
# Spot-check one training example: its prompt, then its label on the next line.
print(input_train[2], output_train[2], sep="\n")

In [None]:
def _pairs_to_frame(inputs, outputs):
    """Pair prompts with labels row by row in a two-column DataFrame."""
    rows = list(zip(inputs, outputs))
    return pd.DataFrame(rows, columns =['x_input', 'y_output'])


train_df = _pairs_to_frame(input_train, output_train)
dev_df = _pairs_to_frame(input_dev, output_dev)
train_df.head(6)

In [None]:
# Bundle the train and validation splits into a single DatasetDict
from datasets import Dataset, DatasetDict

tds = Dataset.from_pandas(train_df)
vds = Dataset.from_pandas(dev_df)

dataset_absa = DatasetDict({'train': tds, 'validation': vds})

print(dataset_absa)

# Defining model: mT5(small, base, large, XL)

In [None]:
# Multilingual seq2seq checkpoints: mT5 (small, base, large, XL), mT0 (small, base, large, XL)
from torch.utils.data import DataLoader
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from transformers import default_data_collator, get_linear_schedule_with_warmup

# The small variant is suggested for parameter tuning.
# NOTE: despite its name, `small` currently holds the *base* checkpoint id.
small = 'google/mt5-base'
model_id = small

tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast = False)
model = AutoModelForSeq2SeqLM.from_pretrained(model_id, device_map="auto")

In [None]:
# Maximum token lengths for the model input and the generated output
max_input_length, max_output_length = 256, 16
# Names of the dataset columns holding the prompt text and the target label
text_column, label_column = "x_input", "y_output"

In [None]:
def preprocess_function(sample,padding="max_length"):
    """Tokenize a batch of prompts and labels for seq2seq training.

    Args:
        sample: Batch mapping that provides the ``text_column`` and
            ``label_column`` entries.
        padding: Padding strategy forwarded to the tokenizer for both the
            prompts and the labels.

    Returns:
        The tokenizer output for the prompts with an added ``labels`` entry
        holding the label token ids. When ``padding == "max_length"`` the pad
        token ids in the labels are replaced by -100.
    """
    # Tokenize the prompts as *source* text (positional `text` argument);
    # `text_target` is reserved for the labels below.
    model_inputs = tokenizer(sample[text_column], max_length=max_input_length, padding=padding, truncation=True)

    # Tokenize targets with the `text_target` keyword argument
    labels = tokenizer(text_target=sample[label_column], max_length=max_output_length, padding=padding, truncation=True)

    # If we are padding here, replace all tokenizer.pad_token_id in the labels
    # by -100 so that padding is ignored in the loss.
    if padding == "max_length":
        labels["input_ids"] = [
            [(token_id if token_id != tokenizer.pad_token_id else -100) for token_id in label_ids]
            for label_ids in labels["input_ids"]
        ]

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Tokenize every split in batches and drop the raw text columns afterwards.
raw_columns = ["x_input", "y_output"]
tokenized_dataset = dataset_absa.map(preprocess_function, batched=True, remove_columns=raw_columns)
feature_names = list(tokenized_dataset['train'].features)
print(f"Keys of tokenized dataset: {feature_names}")

In [None]:
from transformers import DataCollatorForSeq2Seq

# Label id used for padding positions so that they are ignored in the loss
# (the same -100 value preprocess_function writes over pad tokens).
label_pad_token_id = -100
# Data collator handed to the trainer to assemble batches of tokenized examples
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    label_pad_token_id=label_pad_token_id,
    pad_to_multiple_of=16
)

In [None]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

# Directory used for the trainer output and, below, for the log sub-directory
output_dir="mt5"

# Define training args
training_args = Seq2SeqTrainingArguments(
    output_dir=output_dir,
    # Batch size is left to the trainer instead of being fixed by hand
    auto_find_batch_size= True,
    #per_device_train_batch_size = 16,
    # Candidate learning rates noted by the author: 1e-4 / 3e-4 (T5 base),
    # 0.001 (mT5 paper), 2e-3 (other code), 1e-5 to 5e-5
    learning_rate= 3e-4,
    num_train_epochs= 15 , # author's note: use at least 5 to 10 epochs
    logging_dir=f"{output_dir}/logs",
    logging_strategy="steps",
    logging_steps=500,
    # No checkpoints are written during training
    save_strategy="no",
    report_to="tensorboard",
)

# Create Trainer instance (training split only; no evaluation dataset is passed)
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_dataset["train"]
)
# Disabled to silence warnings during training. Re-enable it for inference!
model.config.use_cache = False

In [None]:
# Train the model with the trainer configured above
trainer.train()

# Evaluation

In [None]:
def evaluate_model(sample,max_target_length=max_output_length):
    """Generate the predicted label text for a single prompt.

    Args:
        sample: Prompt text passed to the tokenizer.
        max_target_length: Maximum number of new tokens to generate.

    Returns:
        The first decoded sequence, with special tokens removed.
    """
    encoded = tokenizer(sample, max_length=max_input_length, return_tensors="pt", padding="max_length", truncation=True)
    # Follow the model's device instead of assuming CUDA is available.
    input_ids = encoded.input_ids.to(model.device)
    # The prompt is padded to max_input_length, so pass the mask explicitly.
    attention_mask = encoded.attention_mask.to(model.device)
    outputs = model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        max_new_tokens=max_target_length,
        # Greedy decoding: predictions for a classification task must be
        # deterministic so that the reported metrics are reproducible.
        do_sample=False,
        # model.config.use_cache is switched off for training; turn the cache
        # back on for generation.
        use_cache=True,
    )
    output = tokenizer.batch_decode(outputs.detach().cpu().numpy(), skip_special_tokens=True)[0]
    return output

In [None]:
from tqdm import tqdm

y_true = output_dev

# Predict the dev set one sample at a time, with a progress bar
y_pred = [evaluate_model(sample) for sample in tqdm(input_dev)]

# Both lengths should match: one prediction per gold label
print(len(y_true), len(y_pred), sep="\n")

F1-SCORE

In [None]:
# NOTE(review): the wildcard import hides where names come from; only
# classification_report is used in this cell. Consider importing it by name.
from sklearn.metrics import *
# Print the classification report comparing the gold labels (y_true) with the
# generated predictions (y_pred).
print(classification_report(y_true, y_pred))

---------------------------------------------------------------------------