In [1]:
!pip install transformers
!sudo apt-get install git-lfs
!git lfs install
!git clone https://huggingface.co/neuralmind/bert-large-portuguese-cased




git-lfs is already the newest version (3.0.2-1ubuntu0.2).
0 upgraded, 0 newly installed, 0 to remove and 74 not upgraded.
Git LFS initialized.
Cloning into 'bert-large-portuguese-cased'...
remote: Enumerating objects: 36, done.[K
remote: Total 36 (delta 0), reused 0 (delta 0), pack-reused 36 (from 1)[K
Unpacking objects: 100% (36/36), 102.71 KiB | 5.41 MiB/s, done.
Filtering content: 100% (2/2), 2.49 GiB | 15.57 MiB/s, done.


In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
import numpy as np

import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoModelForPreTraining
from transformers import get_linear_schedule_with_warmup, AdamW
from torch.utils.data import TensorDataset, random_split, DataLoader, RandomSampler, SequentialSampler
from torch.cuda.amp import autocast, GradScaler
import time, datetime
import torch.nn.functional as F



In [3]:
df = pd.read_csv("../data/hatebr_and_rationales.csv")

In [4]:
## split to train and val
TRAIN_SIZE = 0.8
TEST_SIZE = 0.1
VAL_SIZE = 0.1


x_train, x_test_val, y_train, y_test_val = train_test_split(df['normalized_text'], df['label final'], test_size=TEST_SIZE + VAL_SIZE, random_state=0)
x_test, x_val, y_test, y_val = train_test_split(x_test_val, y_test_val, test_size=VAL_SIZE/(TEST_SIZE + VAL_SIZE), random_state=0)

In [5]:
# sera necessario utilizar GPU para rodar o BERT
use_cuda = torch.cuda.is_available()
device = torch.device("cuda:0" if use_cuda else "cpu")
device

device(type='cuda', index=0)

In [6]:
# define as funcoes para preparar o dataset para dar entrada para o modelo

def tokenize_corpus(df, tokenizer, max_len):
    
    input_ids = []
    attention_masks = []
    
    for doc in df:
        encoded_dict = tokenizer.encode_plus(
                            doc,  # document to encode.
                            add_special_tokens=True,  # adiciona '[CLS] token para início' e '[SEP] token para fim'
                            max_length=max_len,  # define max length
                            truncation=True,  # trunca mensagens longas
                            padding='max_length',  # adiciona padding (adiciona 0 em sequências menores que o tamanho maximo)
                            return_attention_mask=True,  # cria mascaras de atenção
                            return_tensors='pt'  # retorna tensores pytorch
                       )

        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])  # o attention mask diferencia o que é padding do que não é

    return torch.cat(input_ids, dim=0), torch.cat(attention_masks, dim=0)
     
def prepare_dataset(features, labels):
    
    padded_tokens, attention_masks = tokenize_corpus(features.values, tokenizer, 512) # tokeniza as mensagens
    target = np.array(labels.values, dtype=np.int64).reshape(-1, 1) # transforma target em np array
    tensor_df = TensorDataset(padded_tokens, attention_masks, torch.from_numpy(target))

    return tensor_df

In [7]:
tokenizer = AutoTokenizer.from_pretrained('neuralmind/bert-base-portuguese-cased')

# separa o dataset entre teste e treino e prepara para dar entrada ao modelo
train_dataset = prepare_dataset(x_train, y_train)
val_dataset = prepare_dataset(x_val, y_val)
test_dataset = prepare_dataset(x_test, y_test)

train_dataloader = DataLoader(train_dataset,
                              batch_size=8,
                              shuffle=True)

val_dataloader = DataLoader(val_dataset,
                              batch_size=8,
                              shuffle=True)

test_dataloader = DataLoader(test_dataset,
                              batch_size=8,
                              shuffle=True)
     

Downloading tokenizer_config.json:   0%|          | 0.00/43.0 [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/647 [00:00<?, ?B/s]

Downloading vocab.txt:   0%|          | 0.00/210k [00:00<?, ?B/s]

Downloading added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [8]:
model = AutoModelForSequenceClassification.from_pretrained('neuralmind/bert-base-portuguese-cased',
                                            num_labels=2, return_dict=False)

Downloading pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at neuralmind/bert-base-portuguese-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
model.cuda()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(29794, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [10]:
from sklearn import metrics
def compute_metrics(y_true, y_pred):
    f1 = metrics.f1_score(y_true, y_pred, zero_division = 0, average='macro')       
    recall = metrics.recall_score(y_true, y_pred, zero_division = 0, average='macro')
    precision = metrics.precision_score(y_true, y_pred, zero_division = 0, average='macro')
    acc = metrics.accuracy_score(y_true, y_pred)
    
    return f1, recall, precision, acc

In [11]:
def train(model, dataloader, optimizer):
    t0 = time.time()

    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch + 1, epochs))

    total_loss = 0
    total_f1 = 0
    total_recall = 0
    total_precision = 0
    total_acc = 0

    model.train() # coloca o modelo no modo de treino

    for i, batch in enumerate(dataloader): # itera nos batches de treino

        if i % 50 == 0 and not i == 0: # reportar o progresso a cada 50 batches
            print('  Batch {:>5,}  of  {:>5,}.'.format(i, len(dataloader)))
        
        # copia os tensores para GPU, pois devem estar no mesmo dispositivo que o modelo
        input_ids = batch[0].to(device) 
        input_mask = batch[1].to(device)
        labels = batch[2].to(device).long()
        
        # limapa os gradientes previamente calculados
        optimizer.zero_grad()

        # forward
        with autocast():
            loss, logits = model(input_ids=input_ids,
                                 attention_mask=input_mask,
                                 labels=labels.view(-1))
            total_loss += loss.item()

        # Scales loss.  Calls backward() on scaled loss to create scaled gradients.
        # Backward passes under autocast are not recommended.
        # Backward ops run in the same dtype autocast chose for corresponding forward ops.
        scaler.scale(loss).backward()

        # scaler.step() first unscales the gradients of the optimizer's assigned params.
        # If these gradients do not contain infs or NaNs, optimizer.step() is then called,
        # otherwise, optimizer.step() is skipped.
        scaler.step(optimizer)

        scaler.update() # Updates the scale for next iteration.
        scheduler.step() # update the learning rate

        logits = logits.detach().cpu().numpy() # passa o logits para CPU para executar os proximos calculos
        rounded_preds = np.argmax(logits, axis=1).flatten() # calcula a predição, pegando a label com maior probabilidade para cada instância

        f1, recall, precision, acc = compute_metrics(labels.detach().cpu().numpy(), rounded_preds)
        total_f1 += f1
        total_recall += recall
        total_precision += precision
        total_acc += acc

    # calcula a media das metricas sobre todos os batches
    avg_train_loss = total_loss / len(dataloader)
    avg_train_f1 = total_f1 / len(dataloader)
    avg_train_recall = total_recall / len(dataloader)
    avg_train_precision = total_precision / len(dataloader)
    avg_train_acc = total_acc / len(dataloader)

    training_time = str(datetime.timedelta(seconds=int(round(time.time() - t0))))

    print("")
    print("Summary Train Resuts")
    print("epoch | loss | acc | recall | f1 | precision | training time ")
    print(f"{epoch+1:5d} | {avg_train_loss:.5f} | {avg_train_acc:.5f} | {avg_train_recall:.5f} | {avg_train_f1:.5f} | {avg_train_precision:.5f} | {training_time:}")

    torch.cuda.empty_cache()
    

def validating(model, dataloader):
    t0 = time.time()

    # put the model in evaluation mode
    model.eval()

    total_loss = 0
    total_f1 = 0
    total_recall = 0
    total_precision = 0
    total_acc = 0
    save_logits = []
    rounded_preds = []
    labels_true = []
    probabilities = []
    
    for batch in dataloader: 

        input_ids = batch[0].cuda()
        input_mask = batch[1].cuda()
        labels = batch[2].cuda().long()

        with torch.no_grad():
            loss, logits = model(input_ids=input_ids,
                                 attention_mask=input_mask,
                                 labels=labels.view(-1))
    
        total_loss += loss.item()
        save_logits.extend(logits.detach().cpu().numpy())
        
        logits = logits.detach().cpu().numpy() 
        rounded_preds.extend(list(np.argmax(logits, axis=1).flatten())) 
        labels_true.extend(list(labels.detach().cpu().numpy()))

    f1, recall, precision, acc = compute_metrics(labels_true, rounded_preds)
    avg_test_loss = total_loss / len(dataloader)
    
    testing_time = str(datetime.timedelta(seconds=int(round(time.time() - t0))))

    print("")
    print("Summary Resuts")
    print("epoch | loss | acc | recall | f1 | precision | testing time ")
    print(f"{epoch+1:5d} | {avg_test_loss:.5f} | {acc:.5f} | {recall:.5f} | {f1:.5f} | {precision:.5f} | {testing_time:}")


In [12]:
def testing(model, dataloader):
    t0 = time.time()

    # put the model in evaluation mode
    model.eval()

    total_loss = 0
    total_f1 = 0
    total_recall = 0
    total_precision = 0
    total_acc = 0
    save_logits = []
    rounded_preds = []
    labels_true = []
    probabilities = []
    
    for batch in dataloader: 

        input_ids = batch[0].cuda()
        input_mask = batch[1].cuda()
        labels = batch[2].cuda().long()

        with torch.no_grad():
            logits = model(input_ids=input_ids,
                                 attention_mask=input_mask)
    
        save_logits.extend(logits[0].detach().cpu().numpy())
        
        probabilities.extend(F.softmax(logits[0].detach().cpu(), dim=-1).numpy())
        logits = logits[0].detach().cpu().numpy() 
        rounded_preds.extend(list(np.argmax(logits, axis=1).flatten()))
        labels_true.extend(list(labels.detach().cpu().numpy()))

    f1, recall, precision, acc = compute_metrics(labels_true, rounded_preds)
    probabilities = np.array(probabilities)
    roc = metrics.roc_auc_score(labels_true, probabilities[:, 1], average='macro')
    
    testing_time = str(datetime.timedelta(seconds=int(round(time.time() - t0))))

    print("")
    print("Summary Resuts")
    print("acc | recall | f1 | precision | roc | testing time ")
    print(f"{acc:.5f} | {recall:.5f} | {f1:.5f} | {precision:.5f} | {roc:.5f} | {testing_time:}")


In [13]:
optimizer = AdamW(model.parameters(), lr=0.00001)

epochs = 5

total_steps = len(train_dataloader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=0,
                                            num_training_steps=total_steps)

scaler = GradScaler()


for epoch in range(epochs):
    print('Training...')
    train(model, train_dataloader, optimizer)
    print("")
    print("Validating...")
    validating(model, val_dataloader)



Training...

  Batch    50  of    700.
  Batch   100  of    700.
  Batch   150  of    700.
  Batch   200  of    700.
  Batch   250  of    700.
  Batch   300  of    700.
  Batch   350  of    700.
  Batch   400  of    700.
  Batch   450  of    700.
  Batch   500  of    700.
  Batch   550  of    700.
  Batch   600  of    700.
  Batch   650  of    700.

Summary Train Resuts
epoch | loss | acc | recall | f1 | precision | training time 
    1 | 0.30157 | 0.87232 | 0.86767 | 0.84985 | 0.86805 | 0:05:10

Validating...

Summary Resuts
epoch | loss | acc | recall | f1 | precision | testing time 
    1 | 0.21700 | 0.91857 | 0.91848 | 0.91851 | 0.91855 | 0:00:12
Training...

  Batch    50  of    700.
  Batch   100  of    700.
  Batch   150  of    700.
  Batch   200  of    700.
  Batch   250  of    700.
  Batch   300  of    700.
  Batch   350  of    700.
  Batch   400  of    700.
  Batch   450  of    700.
  Batch   500  of    700.
  Batch   550  of    700.
  Batch   600  of    700.
  Batch   650  o

In [14]:
testing(model, test_dataloader)


Summary Resuts
acc | recall | f1 | precision | roc | testing time 
0.91571 | 0.91475 | 0.91526 | 0.91597 | 0.97098 | 0:00:12


In [15]:
torch.save(model.state_dict(), "../models/bertimbau-base/my_model")