import torch,os
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
print(torch.cuda.is_available())

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import re, string

In [2]:
# trainer require accelerate installed

!pip install -U accelerate
!pip install -U transformers
!pip install datasets

Collecting accelerate
  Downloading accelerate-0.30.1-py3-none-any.whl.metadata (18 kB)
Downloading accelerate-0.30.1-py3-none-any.whl (302 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.6/302.6 kB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m:00:01[0m
[?25hInstalling collected packages: accelerate
  Attempting uninstall: accelerate
    Found existing installation: accelerate 0.27.2
    Uninstalling accelerate-0.27.2:
      Successfully uninstalled accelerate-0.27.2
Successfully installed accelerate-0.30.1
Collecting transformers
  Downloading transformers-4.41.0-py3-none-any.whl.metadata (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.8/43.8 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.23.0 (from transformers)
  Downloading huggingface_hub-0.23.1-py3-none-any.whl.metadata (12 kB)
Collecting tokenizers<0.20,>=0.19 (from transformers)
  Downloading tokenizers-0.19.1-cp310-cp310-manylinux_2_17_x86_64.man

# Preprocessing

In [3]:
import string,re
import emoji

def preprocessing_text(text):
    text = re.sub(r"(?:\@|https?\://)\S+", "", text, flags=re.MULTILINE)
    text = text.strip()
    text = text.translate(text.maketrans('', '', string.punctuation.replace("_","")))
    text = re.sub('\\s+',' ',text).strip() #remove white space
    return text

def emoji_preprocess(data, column='content'):
    for index, row in data.iterrows():
        data.loc[index, column] = emoji.demojize(row['content'], language='es')

def convert_label(label):
    if label == 'NP':
        return 0
    elif label == 'P':
        return 1
    elif label == 'NR':
        return 2
    else:
        print('error:', label)
        return None
    
def preprocessing_data(df):
    df["content"] = df["content"].apply(preprocessing_text)
    #df["label"] = df["label"].apply(convert_label)
    emoji_preprocess(df)

# Read training dataset

In [4]:
# import os
# from google.colab import drive
# drive.mount('/content/gdrive')

path_train = '/kaggle/input/homomex24-development/public_data_train_phase/track_1_train.csv'
path_dev = '/kaggle/input/homomex24-development/track_1_dev.csv'

In [5]:
dev_data = pd.read_csv(path_dev)
preprocessing_data(dev_data)
dev_data.head()

Unnamed: 0,content,label
0,Me quise ligar a una chava ayer y no me pelo l...,P
1,eres un puñal papayita,P
2,Magnate ofrece 130 mdd al hombre que conquiste...,P
3,Los trolebuses del desgobierno de son idiotas ...,P
4,En época de Hitler no se decía eres gay y sí e...,P


In [6]:
train_data = pd.read_csv(path_train)
preprocessing_data(train_data)
train_data.head()

Unnamed: 0.1,Unnamed: 0,content,label
0,0,Golden Gay hombres que nunca se han acostado c...,NP
1,1,CuandoMiMamaDice Quien es ese gay que este al ...,NP
2,2,¡Felicidades Ganaste un pase doble para Rupaul...,NP
3,3,Ricardo del Real hombre trans de paso Gran pro...,NP
4,4,Los conceptos no pueden ser transgénero porque...,NP


# Create format for input

## Prompt Sample
1. Please classify sexism in the tweet "tweet" as an annotator with following information: gender: female, age: 46+,...

2. Tweet: "tweet". Anotator's infomation: "Age: 18  Education: High School". Is the tweet sexist?

3. 
   

In [7]:
# Task Instructions prompt 3
task_instructions = """Classify: """
annotator_infomation = """ Context: """
add_instructions = """ (If Yes, what is the intention of the person who wrote it and what type of sexism is it?)"""

In [8]:
# fist prompt
import re
def create_instruction_input_output(df):
    input_text = []
    output_text = []
    output_text = df["label"].tolist()
    inputs = df["content"].tolist()
    for index, item in enumerate(inputs):
        prompt11 = f"classify: {item}"
        prompt11 = re.sub("\\s+", " ", prompt11)
        input_text.append(prompt11)
    print(len(input_text),len(output_text))
    return input_text,output_text

# Preprocessing Data

In [9]:
# Create instruction input ouput for each task
input_train,output_train = create_instruction_input_output(train_data)
input_dev,output_dev = create_instruction_input_output(dev_data)

8800 8800
7000 7000


In [10]:
print(input_train[2])
print(output_train[2])

classify: ¡Felicidades Ganaste un pase doble para Rupauls Drag Race en SomosIgualesFM envía nombre completo y mail por DM
NP


In [11]:
train_df = pd.DataFrame(list(zip(input_train, output_train)), columns =['x_input', 'y_output'])
dev_df = pd.DataFrame(list(zip(input_dev, output_dev)), columns =['x_input', 'y_output'])
train_df.head(6)

Unnamed: 0,x_input,y_output
0,classify: Golden Gay hombres que nunca se han ...,NP
1,classify: CuandoMiMamaDice Quien es ese gay qu...,NP
2,classify: ¡Felicidades Ganaste un pase doble p...,NP
3,classify: Ricardo del Real hombre trans de pas...,NP
4,classify: Los conceptos no pueden ser transgén...,NP
5,classify: si el premio hace que la gente hable...,NP


In [12]:
# create a dictionary datatype contain train, val, test set
from datasets import Dataset, DatasetDict

tds = Dataset.from_pandas(train_df)
vds = Dataset.from_pandas(dev_df)

dataset_absa = DatasetDict()
dataset_absa['train'] = tds
dataset_absa['validation'] = vds

print(dataset_absa)

DatasetDict({
    train: Dataset({
        features: ['x_input', 'y_output'],
        num_rows: 8800
    })
    validation: Dataset({
        features: ['x_input', 'y_output'],
        num_rows: 7000
    })
})


# Defining model: mT5(small, base, large, XL)

In [13]:
# Use multilingual model: mT5(small, base, large, XL), mT0(small, base, large, XL)
# Use small for parameter tunning
small = 'google/mt5-base'
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from torch.utils.data import DataLoader
from transformers import default_data_collator, get_linear_schedule_with_warmup
from transformers import AutoModelForSeq2SeqLM
model_id = small
tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast = False)
model = AutoModelForSeq2SeqLM.from_pretrained(model_id, device_map="auto")

2024-05-22 11:54:21.009742: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-05-22 11:54:21.009853: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-05-22 11:54:21.121751: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


tokenizer_config.json:   0%|          | 0.00/376 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/702 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/4.31M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


pytorch_model.bin:   0%|          | 0.00/2.33G [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()


generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [14]:
# Set Max input token length and max output token length
max_input_length = 256
max_output_length = 16
text_column = "x_input"
label_column = "y_output"

In [15]:
def preprocess_function(sample,padding="max_length"):
    # tokenize inputs
    model_inputs = tokenizer(text_target=sample[text_column], max_length=max_input_length, padding=padding, truncation=True)

    # Tokenize targets with the `text_target` keyword argument
    labels = tokenizer(text_target=sample[label_column], max_length=max_output_length, padding=padding, truncation=True)

    # If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore
    # padding in the loss.
    if padding == "max_length":
        labels["input_ids"] = [
            [(l if l != tokenizer.pad_token_id else -100) for l in label] for label in labels["input_ids"]
        ]

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_dataset = dataset_absa.map(preprocess_function, batched=True, remove_columns=["x_input", "y_output"])
print(f"Keys of tokenized dataset: {list(tokenized_dataset['train'].features)}")

  0%|          | 0/9 [00:00<?, ?ba/s]

  0%|          | 0/7 [00:00<?, ?ba/s]

Keys of tokenized dataset: ['input_ids', 'attention_mask', 'labels']


In [16]:
from transformers import DataCollatorForSeq2Seq

# we want to ignore tokenizer pad token in the loss
label_pad_token_id = -100
# Data collator
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    label_pad_token_id=label_pad_token_id,
    pad_to_multiple_of=16
)

In [17]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

output_dir="mt5"

# Define training args
training_args = Seq2SeqTrainingArguments(
    output_dir=output_dir,
    auto_find_batch_size= True,
    #per_device_train_batch_size = 16,
    learning_rate= 3e-4, # Typically, 1e-4 and 3e-4 work well for T5 base / 0.001 as in mT5 paper and 2e-3 as other code / or 1e-5 to 5e-5
    num_train_epochs= 15 , #at least 5 to 10 epochs.
    logging_dir=f"{output_dir}/logs",
    logging_strategy="steps",
    logging_steps=500,
    save_strategy="no",
    report_to="tensorboard",
)

# Create Trainer instance
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_dataset["train"]
)
model.config.use_cache = False  # silence the warnings. Please re-enable for inference!

In [18]:
# train model
trainer.train()

Step,Training Loss
500,1.2756
1000,0.522
1500,0.4345
2000,0.442
2500,0.3345
3000,0.2989
3500,0.2736
4000,0.2584
4500,0.2496
5000,0.2311


TrainOutput(global_step=16500, training_loss=0.23829040134314336, metrics={'train_runtime': 7577.0535, 'train_samples_per_second': 17.421, 'train_steps_per_second': 2.178, 'total_flos': 7.9137044692992e+16, 'train_loss': 0.23829040134314336, 'epoch': 15.0})

# Evaluation

In [19]:
def evaluate_model(sample,max_target_length=max_output_length):
    input_ids = tokenizer(sample, max_length=max_input_length, return_tensors="pt", padding="max_length", truncation=True).input_ids.cuda()
    outputs = model.generate(input_ids=input_ids, max_new_tokens=max_output_length,do_sample=True, top_p=0.9)
    output = tokenizer.batch_decode(outputs.detach().cpu().numpy(), skip_special_tokens=True)[0]
    return output

In [20]:
from tqdm import tqdm

y_true = output_dev
y_pred = []

#predict one batch of datadev at a time
for index, sample in enumerate(tqdm(input_dev)):
    pred = evaluate_model(sample)
    y_pred.append(pred)

print(len(y_true))
print(len(y_pred))

100%|██████████| 7000/7000 [08:26<00:00, 13.81it/s]

7000
7000





F1-SCORE

In [21]:
from sklearn.metrics import *
print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

          NP       0.96      0.97      0.96      4360
          NR       0.96      0.98      0.97      1778
           P       0.89      0.83      0.86       862

    accuracy                           0.95      7000
   macro avg       0.94      0.92      0.93      7000
weighted avg       0.95      0.95      0.95      7000



---------------------------------------------------------------------------