# Setup

In [1]:
%pip install pandas transformers datasets torch scikit-learn evaluate seaborn imblearn accelerate>=0.26.0

# Preparação de dados
Carrega o dataset a ser utilizado para fine-tuning e seleciona os atributos mais relevantes.

In [2]:
import pandas as pd
import seaborn as sns
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import train_test_split

Faz o download do dataset anotado no diretório ./data

In [3]:
import os

if not os.path.exists('./data/covidbr_labeled.csv'):
  !mkdir data
  !curl -L -o ./data/covidbr_labeled.csv https://zenodo.org/records/5193932/files/covidbr_labeled.csv
else:
    print("File already exists. Skipping download.")

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 2243k  100 2243k    0     0  1269k      0  0:00:01  0:00:01 --:--:-- 1269k


In [4]:
original_dataset_df = pd.read_csv('./data/covidbr_labeled.csv')
original_dataset_df

Unnamed: 0,shares,text,misinformation,source,revision
0,27,"O ministro da Ciência, Tecnologia, Inovações e...",0,https://www.gov.br/pt-br/noticias/educacao-e-p...,
1,26,Pesquisa com mais de 6.000 médicos em 30 paíse...,1,https://www.aosfatos.org/noticias/e-falso-que-...,
2,25,É com muita alegria que comunico que mais um p...,0,http://portal.mec.gov.br/component/content/art...,
3,25,Renda Brasil unificará vários programas sociai...,0,https://agenciabrasil.ebc.com.br/politica/noti...,
4,24,O Secretário-Geral da OTAN Jens Stoltenberg ta...,0,,1.0
...,...,...,...,...,...
2894,1,A torcida do corona deve estar arrancando os c...,0,,
2895,1,“OS EUA E O CORONAVÍRUS :\n\nAcabei de assisti...,0,https://www.reuters.com/article/us-health-coro...,1.0
2896,1,Estatísticas falsas conforme depoimentos colhi...,1,,1.0
2897,1,"Atenção => 🇧🇷💓💓💓 *MUITO IMPORTANTE! ""Como é qu...",0,,


In [5]:
dataset_df = original_dataset_df[["text", "misinformation"]]
dataset_df

Unnamed: 0,text,misinformation
0,"O ministro da Ciência, Tecnologia, Inovações e...",0
1,Pesquisa com mais de 6.000 médicos em 30 paíse...,1
2,É com muita alegria que comunico que mais um p...,0
3,Renda Brasil unificará vários programas sociai...,0
4,O Secretário-Geral da OTAN Jens Stoltenberg ta...,0
...,...,...
2894,A torcida do corona deve estar arrancando os c...,0
2895,“OS EUA E O CORONAVÍRUS :\n\nAcabei de assisti...,0
2896,Estatísticas falsas conforme depoimentos colhi...,1
2897,"Atenção => 🇧🇷💓💓💓 *MUITO IMPORTANTE! ""Como é qu...",0


# Análise exploratória de dados

O objetivo é entender melhor e sumarizar as características dos dados, analisando quantidade e tipos de atributos, verificando distribuição do atributo alvo, identificando padrões e anomalias, removendo atributos que pareçam irrelevantes ou problemáticos, etc. Utilize gráficos e sumarizações estatísticas para a EDA. Verifique potenciais problemas nos dados, como por exemplo, a necessidade de normalizar os atributos, balancear classes, ou remover instâncias ou atributos por inconsistências nos dados.

- P1. Qual a quantidade e tipos de atributos? Existem inconsistências?
  - Quais são os atributos disponíveis?
  - Existem inconsistências nos atributos? (Atributos vazios, potenciais erros, etc)
  - Existem atributos que necessitam ser removidos ou transformados?
- P2. Qual a distribuição do atributo alvo?
  - Quais são as classes alvo? Qual a distribuição entre as classes? Está balanceada ou desbalanceada?
- P3. Quais os padrões e anomalias dos atributos?



## P1. Qual a quantidade e tipos de atributos? Existem inconsistências?

In [6]:
dataset_df.info(verbose = False, memory_usage = False, show_counts = True) # mostra o tipo e a quantidade de itens não nulos de cada coluna

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2899 entries, 0 to 2898
Columns: 2 entries, text to misinformation
dtypes: int64(1), object(1)

In [7]:
dataset_df.dtypes

Unnamed: 0,0
text,object
misinformation,int64


## P2. Qual a distribuição do atributo alvo?

In [8]:
dataset_df['misinformation'].describe(include='all')

Unnamed: 0,misinformation
count,2899.0
mean,0.314591
std,0.464433
min,0.0
25%,0.0
50%,0.0
75%,1.0
max,1.0


# Pré-processamento

## Tokenização

Carrega o tokenizador para `bert-base-portuguese-cased` (BERTimbau)

In [9]:
from transformers import AutoTokenizer  # Or BertTokenizer

tokenizer = AutoTokenizer.from_pretrained('neuralmind/bert-base-portuguese-cased', do_lower_case=False)

tokenizer_config.json:   0%|          | 0.00/43.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/647 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/210k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Aplica a tokenização para todas as instâncias de `text`

In [10]:
def tokenize_function(examples):
    return tokenizer(str(examples), padding="max_length", truncation=True, max_length=128)

# Apply the tokenizer to the dataset
tokenized_datasets = dataset_df.apply(lambda row: tokenize_function(row["text"]), axis=1)

# Inspect tokenized samples
tokenized_df = pd.DataFrame(tokenized_datasets, columns=["tk_text"])
tokenized_df

Unnamed: 0,tk_text
0,"[input_ids, token_type_ids, attention_mask]"
1,"[input_ids, token_type_ids, attention_mask]"
2,"[input_ids, token_type_ids, attention_mask]"
3,"[input_ids, token_type_ids, attention_mask]"
4,"[input_ids, token_type_ids, attention_mask]"
...,...
2894,"[input_ids, token_type_ids, attention_mask]"
2895,"[input_ids, token_type_ids, attention_mask]"
2896,"[input_ids, token_type_ids, attention_mask]"
2897,"[input_ids, token_type_ids, attention_mask]"


In [11]:
data = pd.concat([tokenized_df, dataset_df["misinformation"]], axis=1, join="inner")
data

Unnamed: 0,tk_text,misinformation
0,"[input_ids, token_type_ids, attention_mask]",0
1,"[input_ids, token_type_ids, attention_mask]",1
2,"[input_ids, token_type_ids, attention_mask]",0
3,"[input_ids, token_type_ids, attention_mask]",0
4,"[input_ids, token_type_ids, attention_mask]",0
...,...,...
2894,"[input_ids, token_type_ids, attention_mask]",0
2895,"[input_ids, token_type_ids, attention_mask]",0
2896,"[input_ids, token_type_ids, attention_mask]",1
2897,"[input_ids, token_type_ids, attention_mask]",0


## Balanceamento de classes

Utilizando o cálculo de class_weights.

Fonte: https://medium.com/@heyamit10/fine-tuning-bert-for-classification-a-practical-guide-b8c1c56f252c

In [12]:
from sklearn.utils.class_weight import compute_class_weight
import numpy as np

labels = data["misinformation"]
class_weights = compute_class_weight("balanced", classes=np.unique(labels), y=labels)
print(class_weights)

[0.7294917  1.58936404]


# Fine-tuning

## Configuração do modelo

In [13]:
import torch

# Configurações iniciais
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Usando o dispositivo: {device}")

Usando o dispositivo: cuda


In [31]:
from transformers import BertForSequenceClassification

model_name = 'neuralmind/bert-base-portuguese-cased'
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2)

print(model.config)

model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at neuralmind/bert-base-portuguese-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "directionality": "bidi",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.52.2",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 29794
}



BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(29794, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [32]:
# Freeze all layers except the classifier
for param in model.bert.parameters():
    param.requires_grad = False

# Keep only the classification head trainable
for param in model.classifier.parameters():
    param.requires_grad = True

print(f"Trainable parameters: {sum(p.numel() for p in model.parameters() if p.requires_grad)}")

Trainable parameters: 1538


## Configuração do treinamento

In [121]:
from transformers import TrainingArguments

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",           # Directory for saving model checkpoints
    #evaluation_strategy="epoch",     # Evaluate at the end of each epoch
    learning_rate=5e-5,              # Start with a small learning rate
    per_device_train_batch_size=16,  # Batch size per GPU
    per_device_eval_batch_size=16,
    num_train_epochs=3,              # Number of epochs
    weight_decay=0.01,               # Regularization
    save_total_limit=2,              # Limit checkpoints to save space
    #load_best_model_at_end=True,     # Automatically load the best checkpoint
    logging_dir="./logs",            # Directory for logs
    logging_steps=100,               # Log every 100 steps
    fp16=True,                        # Enable mixed precision for faster training
    metric_for_best_model="accuracy"
)

print(training_args)

TrainingArguments(
_n_gpu=1,
accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False},
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
average_tokens_across_devices=False,
batch_eval_metrics=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_persistent_workers=False,
dataloader_pin_memory=True,
dataloader_prefetch_factor=None,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
do_eval=False,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_delay=0,
eval_do_concat_batches=True,
eval_on_start=False,
eval_steps=None,
eval_strategy=IntervalStrategy.NO,
eval_use_gather_object=False,


## Treinamento com holdout

In [122]:


def stratified_train_val_split(df, target_column):
    train, val = train_test_split(
        df,
        test_size=0.2,
        stratify=df[target_column],
        random_state=42
    )
    return train, val

df_train, df_test = stratified_train_val_split(dataset_df, target_column='misinformation')
df_train, df_eval = stratified_train_val_split(df_train, target_column='misinformation')

print("Tamanhos dos conjuntos:")
print(f"Treino: {len(df_train)}")
print(f"Teste: {len(df_test)}")
print(f"Validação: {len(df_eval)}")

Tamanhos dos conjuntos:
Treino: 1855
Teste: 580
Validação: 464


In [123]:
df_train.groupby(['misinformation']).count()

Unnamed: 0_level_0,text
misinformation,Unnamed: 1_level_1
0,1270
1,584


In [124]:
df_test.groupby(['misinformation']).count()

Unnamed: 0_level_0,text
misinformation,Unnamed: 1_level_1
0,398
1,182


In [125]:
df_train.reset_index(drop=True, inplace=True)
df_eval.reset_index(drop=True, inplace=True)
df_test.reset_index(drop=True, inplace=True)

In [126]:
import transformers
from sklearn.metrics import accuracy_score, f1_score

In [127]:
# Configurações iniciais
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Usando o dispositivo: {device}")

Usando o dispositivo: cuda


In [128]:
from datasets import Dataset, DatasetDict
from transformers import BertTokenizer, BertForSequenceClassification
from datasets import ClassLabel

df_train["text"] = df_train["text"].astype(str)
df_eval["text"] = df_eval["text"].astype(str)
df_test["text"] = df_test["text"].astype(str)

df_train = df_train.rename(columns={"misinformation": "label"})
df_eval = df_eval.rename(columns={"misinformation": "label"})
df_test = df_test.rename(columns={"misinformation": "label"})

train_dataset = Dataset.from_pandas(df_train)
eval_dataset = Dataset.from_pandas(df_eval)
test_dataset = Dataset.from_pandas(df_test)

# Crie o objeto ClassLabel
class_label = ClassLabel(num_classes=2)

# Aplique o cast no dataset Hugging Face
train_dataset = train_dataset.cast_column("label", class_label)
eval_dataset = eval_dataset.cast_column("label", class_label)
test_dataset = test_dataset.cast_column("label", class_label)

def tokenize_function(examples):
    return tokenizer(examples["text"], max_length=128, truncation=True, padding='max_length')

train_dataset_tokenized = train_dataset.map(tokenize_function, batched=True, remove_columns=['text'])
test_dataset_tokenized = test_dataset.map(tokenize_function, batched=True, remove_columns=['text'])
eval_dataset_tokenized = eval_dataset.map(tokenize_function, batched=True, remove_columns=['text'])

Casting the dataset:   0%|          | 0/1855 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/464 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/580 [00:00<?, ? examples/s]

Map:   0%|          | 0/1855 [00:00<?, ? examples/s]

Map:   0%|          | 0/580 [00:00<?, ? examples/s]

Map:   0%|          | 0/464 [00:00<?, ? examples/s]

In [129]:
from transformers import BertForSequenceClassification, BertConfig

model_name = 'neuralmind/bert-base-portuguese-cased'
config = BertConfig.from_pretrained(model_name, num_labels=2)
model = BertForSequenceClassification.from_pretrained(model_name, config=config)

print(model.config)

model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at neuralmind/bert-base-portuguese-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "directionality": "bidi",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.52.2",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 29794
}



BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(29794, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [154]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer

# 4. Configurar o Trainer e os parâmetros de treino
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    # print(preds)
    f1_mi = f1_score(labels, preds, average='micro')
    f1_ma = f1_score(labels, preds, average='macro')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc
        # 'f1-macro': f1_ma,
        # 'f1-micro': f1_mi
    }


#setando os hiperparâmetros POR EPOCA
batch_size = 8
epochs = 5
learning_rate = 3e-05
steps_per_epoch = round(len(train_dataset_tokenized) / batch_size)
print(len(train_dataset_tokenized), len(train_dataset_tokenized), steps_per_epoch)

training_args = TrainingArguments(
    output_dir='test_trainer',
    overwrite_output_dir=True,
    eval_strategy='epoch',
    save_strategy ='epoch',
    per_device_train_batch_size = batch_size,
    per_device_eval_batch_size = batch_size,
    logging_steps=20,
    report_to="none",
    learning_rate=learning_rate,
    num_train_epochs = epochs,
    load_best_model_at_end = True,
    metric_for_best_model='accuracy',
    #   report_to='tensorboard'
        )

1855 1855 232


TypeError: TrainingArguments.__init__() got an unexpected keyword argument 'evaluation_strategy'

In [151]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset_tokenized,
    eval_dataset=eval_dataset_tokenized,
    compute_metrics=compute_metrics,
    # callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]  # early_stopping_threshold=0.001
)
trainer.label_names = ["label"]


In [152]:
import os
!mkdir -p models/covidbr_pt

trainer.train()

output_dir = 'models/covidbr_pt'

# 6. Salvar modelo e tokenizer
trainer.save_model(output_dir)
tokenizer.save_pretrained(output_dir)

# Recarregue o modelo do trainer e salve no formato .bin
model.save_pretrained(output_dir, safe_serialization=False)

# 7. Verificar arquivos
print("Arquivos salvos:")
print(os.listdir(output_dir))

Epoch,Training Loss,Validation Loss
1,0.031,No log


KeyError: "The `metric_for_best_model` training argument is set to 'eval_accuracy', which is not found in the evaluation metrics. The available evaluation metrics are: []. Consider changing the `metric_for_best_model` via the TrainingArguments."

## 5-Fold Cross Validation

In [None]:
from transformers import Trainer
from sklearn.metrics import f1_score
from sklearn.model_selection import KFold

fold_results = []

# prepare cross validation
k=5
kf = KFold(n_splits=k, random_state=1, shuffle=True)

for fold, (train_index, val_index) in enumerate(kf.split(data)):
    print(f"Fold {fold + 1}")

    # Split the data into training and validation sets
    train_df = data.iloc[train_index]
    val_df = data.iloc[val_index]

    # Convert to Hugging Face Dataset format
    train_dataset = train_df.to_dict(orient="list")
    val_dataset = val_df.to_dict(orient="list")

    # Create a Trainer instance
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        tokenizer=tokenizer,
        compute_metrics=lambda p: {"f1": f1_score.compute(predictions=p.predictions.argmax(-1), references=p.label_ids)["f1"]}
    )

    # Train the model
    trainer.train()

    # Evaluate the model
    eval_result = trainer.evaluate()
    print(f"Fold {fold + 1} F1 Score: {eval_result['eval_f1']}")
    fold_results.append(eval_result['eval_f1'])

# Calculate and print the mean F1 score
mean_f1 = sum(fold_results) / len(fold_results)
print(f"Mean F1 Score: {mean_f1}")

ref https://www.philschmid.de/k-fold-as-cross-validation-with-a-bert-text-classification-example