
Adaptado de [Gabriel Assis et al.,](https://github.com/MeLLL-UFF/hate_speech_in_context_pt)


Gabriel Assis, Annie Amorim, Jonnatahn Carvalho, Daniel de Oliveira, Daniela Vianna, and Aline Paes. 2024. Exploring Portuguese Hate Speech Detection in Low-Resource Settings: Lightly Tuning Encoder Models or In-Context Learning of Large Models?. In Proceedings of the 16th International Conference on Computational Processing of Portuguese, pages 301–311, Santiago de Compostela, Galicia/Spain. Association for Computational Lingustics.]



In [None]:
!pip install datasets transformers scikit-learn evaluate emoji



In [None]:
import torch
from datasets import load_dataset
from transformers import BERTTokenizer, BertModel
import sklearn
import numpy as np
import evaluate

In [None]:
from tqdm import tqdm

In [None]:
from google.colab import drive
drive.mount('/content/drive/', force_remount=True)

Mounted at /content/drive/


In [None]:
!ls '/content/drive/MyDrive/LMs-gmail'

5-feature-extraction.ipynb  tweetsentbr.csv  tweetsentbr_test.csv  tweetsentbr_train.csv


# Lendo dataset

In [None]:

DATA_PATH = '/content/drive/MyDrive/LMs-gmail/'
DATA_FILE = 'tweetsentbr.csv'
MODEL_NAME = 'melll-uff/bertweetbr'


In [None]:
# prompt: break a csv into random train and test

import pandas as pd
from sklearn.model_selection import train_test_split

# Load the CSV file into a pandas DataFrame
df = pd.read_csv(DATA_PATH+DATA_FILE)

# Split the DataFrame into training and testing sets
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Save the training and testing sets to new CSV files
train_df.to_csv(DATA_PATH+'tweetsentbr_'+'train.csv', index=False)
test_df.to_csv(DATA_PATH+'tweetsentbr_'+'test.csv', index=False)


In [None]:
train_file = DATA_PATH + 'tweetsentbr_' + 'train.csv'
test_file = DATA_PATH +  'tweetsentbr_' + 'test.csv'

dataset = load_dataset('csv', data_files={'train': train_file, 'test': test_file})

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

In [None]:
dataset['train'].shape, dataset['test'].shape

((6215, 2), (1554, 2))

In [None]:
dataset['train'] = dataset['train'][:int(dataset['train'].shape[0]/4)]


In [None]:
dataset['test'] = dataset['test'][:int(dataset['test'].shape[0]/4)]



# Carregando os modelos

In [None]:
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)
model = BertModel.from_pretrained(MODEL_NAME)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'BertweetTokenizer'. 
The class this function is called from is 'BertTokenizer'.
You are using a model of type roberta to instantiate a model of type bert. This is not supported for all configurations of models and can yield errors.
Some weights of BertModel were not initialized from the model checkpoint at melll-uff/bertweetbr and are newly initialized: ['embeddings.LayerNor

# Recuperando embeddings: last_hidden_state é um tensor de (batch_size, sequence_length, hidden_size)

In [None]:
def get_embeddings(sentences):
    embeddings = []
    for sentence in tqdm(sentences):
        inputs = tokenizer(sentence, return_tensors="pt", truncation=True, padding="max_length", max_length=130) #change according to model
        with torch.no_grad():
            outputs = model(**inputs)
        embeddings.append(outputs.last_hidden_state[:,0,:].squeeze().numpy()) #token CLS
    return embeddings

train_embeddings = get_embeddings(dataset['train']['tweet'])
test_embeddings = get_embeddings(dataset['test']['tweet'])

100%|██████████| 1553/1553 [11:17<00:00,  2.29it/s]
100%|██████████| 388/388 [02:50<00:00,  2.28it/s]


In [None]:
len(train_embeddings[0])

768

In [None]:
#inputs = tokenizer.batch_encode_plus(dataset['train']['tweet'], return_tensors="pt", truncation=True, add_special_tokens=True, max_length=130, padding="max_length")

In [None]:
# other layers
# hidden_states = outputs[2][1:]
# o modelo BERT base tem 12 elementos em hidden_states correspondendo a todas as camadas da primeira até a última.
# Cada uma é um array de shape (batch_size, sequence_length, hidden_size).
# Para acessar o estado escondido da 3a camada para o quinto token de todas as amostras no batch seria hidden_states[2][:,4].
# Pode iterar sobre todos os tokens



# Treinando um MLP classifier

In [None]:
from sklearn.neural_network import MLPClassifier

mlp = MLPClassifier(solver='lbfgs', alpha=1e-5,\
                    hidden_layer_sizes=(50, 2), random_state=1)
mlp.fit(train_embeddings, dataset['train']['class'])

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


In [None]:
mlp.predict_proba(test_embeddings)

array([[0.41457894, 0.58542106],
       [0.43540519, 0.56459481],
       [0.62383501, 0.37616499],
       [0.5388044 , 0.4611956 ],
       [0.23808835, 0.76191165],
       [0.34820494, 0.65179506],
       [0.43540519, 0.56459481],
       [0.29486101, 0.70513899],
       [0.23808835, 0.76191165],
       [0.20216657, 0.79783343],
       [0.40242571, 0.59757429],
       [0.34796429, 0.65203571],
       [0.34796429, 0.65203571],
       [0.50370952, 0.49629048],
       [0.37034885, 0.62965115],
       [0.43214869, 0.56785131],
       [0.29486101, 0.70513899],
       [0.29486101, 0.70513899],
       [0.39910697, 0.60089303],
       [0.29923494, 0.70076506],
       [0.27236494, 0.72763506],
       [0.52338625, 0.47661375],
       [0.20216657, 0.79783343],
       [0.29923494, 0.70076506],
       [0.23808835, 0.76191165],
       [0.23808835, 0.76191165],
       [0.43214869, 0.56785131],
       [0.27236494, 0.72763506],
       [0.40242571, 0.59757429],
       [0.34820494, 0.65179506],
       [0.

In [None]:
mlp_predictions = mlp.predict(test_embeddings)
mlp_predictions

array([1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1,
       0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1,
       1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1,
       1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1,

In [None]:
sklearn.metrics.confusion_matrix(dataset['test']['class'], mlp_predictions)

array([[ 24, 127],
       [ 22, 215]])

In [None]:
# Carregar métricas
accuracy = evaluate.load("accuracy")
f1 = evaluate.load("f1")
recall = evaluate.load("recall")
precision = evaluate.load("precision")

def compute_metrics(predictions, labels):
    return {
        "accuracy": accuracy.compute(predictions=predictions, references=labels),
        "f1": f1.compute(predictions=predictions, references=labels, average='macro'),
        "recall": recall.compute(predictions=predictions, references=labels, average='macro'),
        "precision": precision.compute(predictions=predictions, references=labels, average='macro')
    }

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.77k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.36k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.55k [00:00<?, ?B/s]

In [None]:
metrics = compute_metrics(mlp_predictions, dataset['test']['class'])

print(metrics)

{'accuracy': {'accuracy': 0.615979381443299}, 'f1': {'f1': 0.4931572902694125}, 'recall': {'recall': 0.533056696565792}, 'precision': {'precision': 0.5751970505975083}}
