In [None]:
python -c "from huggingface_hub.hf_api import HfFolder; HfFolder.save_token('{{ token }}')" # inserir token huggingface
pip install transformers datasets numpy evaluate



In [None]:
from datasets import load_dataset
import evaluate
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import numpy as np

dataset = load_dataset("JAugusto97/told-br")
print(dataset)

model_name = "google-bert/bert-base-multilingual-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

def preprocess_data(batch):
    return tokenizer(batch["text"], truncation=True, padding="max_length", max_length=128)

tokenized_datasets = dataset.map(preprocess_data, batched=True)

precision_metric = evaluate.load("precision")
recall_metric = evaluate.load("recall")
accuracy_metric = evaluate.load("accuracy")
f1_metric = evaluate.load("f1")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    precision = precision_metric.compute(predictions=predictions, references=labels)
    recall = recall_metric.compute(predictions=predictions, references=labels)
    accuracy = accuracy_metric.compute(predictions=predictions, references=labels)
    f1 = f1_metric.compute(predictions=predictions, references=labels, average="weighted")

    return {"precision": precision["precision"], "recall": recall["recall"], "accuracy": accuracy["accuracy"], "f1": f1["f1"]}

training_args = TrainingArguments(
    f"{model_name}-finetuned-hate-speech-ptbr",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    push_to_hub=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 16800
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 2100
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 2100
    })
})


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/2100 [00:00<?, ? examples/s]

  trainer = Trainer(


In [None]:
# Step 7: Fine-Tune the Model
trainer.train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mguitapajos[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Precision,Recall,Accuracy,F1
1,0.5127,0.497758,0.650041,0.875551,0.742381,0.74181
2,0.4415,0.52057,0.714286,0.710352,0.751905,0.751822
3,0.3623,0.62043,0.674731,0.829295,0.753333,0.754219
4,0.283,0.622424,0.70202,0.765419,0.758095,0.759012
5,0.2196,0.757232,0.695351,0.774229,0.755714,0.756755


TrainOutput(global_step=5250, training_loss=0.3703602360316685, metrics={'train_runtime': 1157.1895, 'train_samples_per_second': 72.59, 'train_steps_per_second': 4.537, 'total_flos': 5525332162560000.0, 'train_loss': 0.3703602360316685, 'epoch': 5.0})

In [None]:
# Step 8: Evaluate the Model
results = trainer.evaluate()
print(f"Evaluation Results: {results}")

Evaluation Results: {'eval_loss': 0.6224240660667419, 'eval_precision': 0.702020202020202, 'eval_recall': 0.7654185022026432, 'eval_accuracy': 0.758095238095238, 'eval_f1': 0.7590123199739615, 'eval_runtime': 6.8674, 'eval_samples_per_second': 305.791, 'eval_steps_per_second': 19.221, 'epoch': 5.0}


In [None]:
trainer.push_to_hub()

events.out.tfevents.1733798572.1644957f1651.72393.0:   0%|          | 0.00/10.2k [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

events.out.tfevents.1733799759.1644957f1651.72393.1:   0%|          | 0.00/560 [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/GuiTap/bert-base-multilingual-cased-finetuned-hate-speech-ptbr/commit/09ce2b152c1bccc815c0305a22fdbfe35e565b7a', commit_message='End of training', commit_description='', oid='09ce2b152c1bccc815c0305a22fdbfe35e565b7a', pr_url=None, repo_url=RepoUrl('https://huggingface.co/GuiTap/bert-base-multilingual-cased-finetuned-hate-speech-ptbr', endpoint='https://huggingface.co', repo_type='model', repo_id='GuiTap/bert-base-multilingual-cased-finetuned-hate-speech-ptbr'), pr_revision=None, pr_num=None)

In [1]:
from transformers import pipeline

# Load the pipeline with your fine-tuned model
classifier = pipeline(
    "text-classification",
    model="GuiTap/bert-base-multilingual-cased-finetuned-hate-speech-ptbr",  # Your model on Hugging Face Hub
    tokenizer="GuiTap/bert-base-multilingual-cased-finetuned-hate-speech-ptbr"  # Use the same tokenizer
)

# Example tweets to classify
tweets = [
    "Cala a boca, seu lixo.",
    "Não gosto de pessoas daquele bairro, são todos fedidos",
    "Se acha muito pra alguém que não sabe nem o básico.",
    "Vai se catar, ninguém quer saber sua opinião.",
    "O mundo seria melhor sem pessoas como você.",
    "Bora espalhar amor hoje, o mundo já tem problema demais.",
    "Tá tudo bem errar, faz parte do aprendizado. Bora melhorar!",
    "Acordei inspirado, hoje vai ser um dia top! ✨",
    "Impressionante como um sorriso muda o dia de alguém, tenta aí. 😊",
    "Todo mundo tá na sua luta, seja mais gentil!"
]

# Classify the tweets
predictions = classifier(tweets)

# Print results
for tweet, prediction in zip(tweets, predictions):
    print(f"Tweet: {tweet}")
    print(f"Prediction: {prediction['label']} (Confidence: {prediction['score']:.2f})")
    print()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/907 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/711M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.19k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.92M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


Tweet: Cala a boca, seu lixo.
Prediction: LABEL_1 (Confidence: 0.99)

Tweet: Não gosto de pessoas daquele bairro, são todos fedidos
Prediction: LABEL_1 (Confidence: 0.98)

Tweet: Se acha muito pra alguém que não sabe nem o básico.
Prediction: LABEL_0 (Confidence: 0.99)

Tweet: Vai se catar, ninguém quer saber sua opinião.
Prediction: LABEL_0 (Confidence: 0.96)

Tweet: O mundo seria melhor sem pessoas como você.
Prediction: LABEL_0 (Confidence: 1.00)

Tweet: Bora espalhar amor hoje, o mundo já tem problema demais.
Prediction: LABEL_0 (Confidence: 0.98)

Tweet: Tá tudo bem errar, faz parte do aprendizado. Bora melhorar!
Prediction: LABEL_0 (Confidence: 1.00)

Tweet: Acordei inspirado, hoje vai ser um dia top! ✨
Prediction: LABEL_0 (Confidence: 0.99)

Tweet: Impressionante como um sorriso muda o dia de alguém, tenta aí. 😊
Prediction: LABEL_0 (Confidence: 0.98)

Tweet: Todo mundo tá na sua luta, seja mais gentil!
Prediction: LABEL_0 (Confidence: 0.99)

