In [1]:
import os
from typing import Iterable, Dict, Callable

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from tqdm.notebook import tqdm
from transformers import BertTokenizerFast, BertForSequenceClassification
from transformers import AdamW

In [2]:
# Hugging Face model identifier; passed to `from_pretrained` for both the
# tokenizer and the sequence classifier further down.
MODEL = "neuralmind/bert-base-portuguese-cased"

# BERT Dataset

In [3]:
class BertDataset(Dataset):
    """Map-style dataset that tokenizes a single text on every lookup.

    Each item is an ``(input_ids, attention_mask, label)`` triple of tensors.
    Ids and mask are padded or truncated to ``max_seq_len`` tokens; the label
    is a scalar ``torch.long`` tensor.
    """

    def __init__(self, texts: Iterable[str],
                 labels: Iterable[int],
                 tokenizer: "BertTokenizerFast",
                 max_seq_len: int):
        """Store the raw data; nothing is tokenized until an item is requested.

        Args:
            texts: Texts to classify. Must support integer indexing.
            labels: Integer class labels aligned with ``texts``. Must support
                ``len()`` and integer indexing.
            tokenizer: Tokenizer exposing ``encode_plus``.
            max_seq_len: Length every encoded sequence is padded/truncated to.
        """
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_seq_len = max_seq_len

    @classmethod
    def from_df(cls, df: pd.DataFrame, tokenizer: "BertTokenizerFast", max_seq_len: int = 512, text_column: str = "text", target_column: str = "label"):
        """Build a dataset from the text and target columns of ``df``."""
        return cls(df[text_column].values, df[target_column].values, tokenizer, max_seq_len)

    def __len__(self):
        """Number of examples, taken from the labels."""
        return len(self.labels)

    def __getitem__(self, i):
        """Tokenize example ``i`` and return ``(input_ids, attention_mask, label)``."""
        tokens = self.tokenizer.encode_plus(
            self.texts[i],
            max_length=self.max_seq_len,
            # `padding="max_length"` is the supported spelling of the
            # deprecated `pad_to_max_length=True`.
            padding="max_length",
            truncation=True,
            return_token_type_ids=False,
            return_tensors="pt",
        )

        label = torch.tensor(self.labels[i], dtype=torch.long)

        # squeeze(0) removes only the leading batch axis added by
        # return_tensors="pt"; a bare squeeze() would also drop the sequence
        # axis when max_seq_len == 1.
        return tokens["input_ids"].squeeze(0), tokens["attention_mask"].squeeze(0), label

In [4]:
class BertDataset2(Dataset):
    """Lazily-tokenizing dataset; functionally a duplicate of ``BertDataset``.

    NOTE(review): this class repeats ``BertDataset`` line for line. It is kept
    so existing references keep working; prefer ``BertDataset`` in new code
    and consider removing this copy.

    Each item is an ``(input_ids, attention_mask, label)`` triple of tensors.
    Ids and mask are padded or truncated to ``max_seq_len`` tokens.
    """

    def __init__(self, texts: Iterable[str],
                 labels: Iterable[int],
                 tokenizer: "BertTokenizerFast",
                 max_seq_len: int):
        """Store the raw data; tokenization happens in ``__getitem__``.

        Args:
            texts: Texts to classify. Must support integer indexing.
            labels: Integer class labels aligned with ``texts``. Must support
                ``len()`` and integer indexing.
            tokenizer: Tokenizer exposing ``encode_plus``.
            max_seq_len: Length every encoded sequence is padded/truncated to.
        """
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_seq_len = max_seq_len

    @classmethod
    def from_df(cls, df: pd.DataFrame, tokenizer: "BertTokenizerFast", max_seq_len: int = 512, text_column: str = "text", target_column: str = "label"):
        """Build a dataset from the text and target columns of ``df``."""
        return cls(df[text_column].values, df[target_column].values, tokenizer, max_seq_len)

    def __len__(self):
        """Number of examples, taken from the labels."""
        return len(self.labels)

    def __getitem__(self, i):
        """Tokenize example ``i`` and return ``(input_ids, attention_mask, label)``."""
        encoded = self.tokenizer.encode_plus(
            self.texts[i],
            max_length=self.max_seq_len,
            # Supported replacement for the deprecated `pad_to_max_length=True`.
            padding="max_length",
            truncation=True,
            return_token_type_ids=False,
            return_tensors="pt",
        )

        label = torch.tensor(self.labels[i], dtype=torch.long)

        # Drop only the batch axis so a length-1 sequence keeps its shape.
        return encoded["input_ids"].squeeze(0), encoded["attention_mask"].squeeze(0), label

In [5]:
class FastBertDataset(Dataset):
    """Dataset that tokenizes the whole corpus once, at construction time.

    All texts are encoded in a single ``batch_encode_plus`` call and the
    resulting ``input_ids`` / ``attention_mask`` tensors are kept on the
    instance, so ``__getitem__`` only indexes into them.
    """

    def __init__(self, texts: Iterable[str],
                 labels: Iterable[int],
                 tokenizer: "BertTokenizerFast",
                 max_seq_len: int):
        """Tokenize every text up front.

        Args:
            texts: Texts to classify; a NumPy array or any other iterable of
                strings.
            labels: Integer class labels aligned with ``texts``. Must support
                ``len()`` and integer indexing.
            tokenizer: Tokenizer exposing ``batch_encode_plus``.
            max_seq_len: Length every encoded sequence is padded/truncated to.
        """
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_seq_len = max_seq_len

        # The tokenizer is handed a plain list. NumPy arrays are converted
        # with tolist(); any other iterable (list, tuple, Series, ...) goes
        # through list() instead of failing on a missing .tolist().
        text_list = texts.tolist() if hasattr(texts, "tolist") else list(texts)

        examples = tokenizer.batch_encode_plus(
            text_list,
            # Target sequence length.
            max_length=max_seq_len,
            # Pad every sequence up to max_length (supported replacement for
            # the deprecated `pad_to_max_length=True`).
            padding="max_length",
            # Cut sequences longer than max_length.
            truncation=True,
            # Token type ids are not needed here.
            # Link: https://huggingface.co/transformers/glossary.html#token-type-ids
            return_token_type_ids=False,
            return_tensors="pt",
        )

        self.input_ids = examples["input_ids"]
        self.attention_masks = examples["attention_mask"]

    @classmethod
    def from_df(cls, df: pd.DataFrame, tokenizer: "BertTokenizerFast", max_seq_len: int = 512, text_column: str = "text", target_column: str = "label"):
        """Build a dataset from the text and target columns of ``df``."""
        return cls(df[text_column].values, df[target_column].values, tokenizer, max_seq_len)

    def __len__(self):
        """Number of examples, taken from the labels."""
        return len(self.labels)

    def __getitem__(self, i):
        """Return the pre-computed ``(input_ids, attention_mask, label)`` for row ``i``."""
        label = torch.tensor(self.labels[i], dtype=torch.long)
        return self.input_ids[i], self.attention_masks[i], label


# BERT Model

In [6]:
from torch.optim.lr_scheduler import OneCycleLR

class BertForClassification(nn.Module):
    """Training and evaluation wrapper around a sequence-classification model.

    The wrapped model is called as ``bert(input_ids, attention_mask=...)`` and
    the first element of its output is used as the class logits, expected to
    be shaped ``(batch, num_classes)``.
    """

    def __init__(self, bert_model: "BertForSequenceClassification",
                 classes: int,
                 metrics: Dict[str, Callable],
                 freeze_layers: bool = False):
        """
        Args:
            bert_model: Model to wrap.
            classes: Number of target classes (stored, not otherwise used).
            metrics: Mapping from display name to a callable invoked as
                ``metric(true_labels, predicted_labels)`` with two lists.
            freeze_layers: When true, the pretrained backbone is frozen so
                only the remaining parameters are trained.
        """
        super().__init__()

        self.bert = bert_model
        # TODO: remove the `classes` attribute
        self.classes = classes
        self.metrics = metrics
        # Mean training loss of the latest epoch. Defined here so evaluate()
        # can be called on its own, before fit() has ever run.
        self.last_train_loss = None

        if freeze_layers:
            self._freeze_layers()

    def _freeze_layers(self) -> None:
        """Disable gradients for the backbone of the wrapped model.

        The backbone is ``self.bert.base_model`` when that attribute exists
        (presumably the encoder without the classification head for Hugging
        Face models; confirm for other model types). Otherwise every parameter
        of ``self.bert`` is frozen.
        """
        backbone = getattr(self.bert, "base_model", self.bert)
        for param in backbone.parameters():
            param.requires_grad = False

    @staticmethod
    def _move_batch(batch, cuda: bool, device: torch.device):
        """Return ``batch`` with every tensor moved to ``device`` when ``cuda`` is set."""
        if not cuda:
            return batch
        # .to() accepts any device, unlike .cuda() which rejects CPU devices.
        return [tensor.to(device) for tensor in batch]

    def forward(self, input_ids, att_masks):
        """Return only the logits of the wrapped model."""
        return self.bert(input_ids, attention_mask=att_masks)[0]

    def evaluate(self, eval_dl: DataLoader,
                 criterion: nn.Module,
                 cuda: bool = False,
                 device: torch.device = torch.device("cpu:0")) -> tuple:
        """Run the model over ``eval_dl`` without gradients and report metrics.

        Args:
            eval_dl: Iterable of ``(input_ids, attention_mask, labels)`` batches.
            criterion: Loss called as ``criterion(logits, labels)``.
            cuda: Move each batch to ``device`` before the forward pass.
            device: Target device used when ``cuda`` is true.

        Returns:
            ``(preds, mean_loss)``: the predicted class indices in iteration
            order, and the mean of the per-batch losses.
        """
        self.eval()

        preds = []
        real = []
        batch_losses = []

        with torch.no_grad():
            for input_batch in eval_dl:
                input_ids, att_masks, labels = self._move_batch(input_batch, cuda, device)

                logits = self(input_ids, att_masks)
                # Logits are passed unsqueezed: squeezing a size-1 batch would
                # drop the batch axis and no longer match `labels`.
                loss = criterion(logits, labels)

                # Softmax is monotonic, so the argmax of the raw logits is the
                # predicted class; no softmax (or `F` import) is needed.
                preds.extend(logits.argmax(dim=1).tolist())
                real.extend(labels.tolist())
                batch_losses.append(loss.item())

        results = {name: metric(real, preds) for name, metric in self.metrics.items()}
        mean_loss = np.mean(batch_losses)

        print(f"\ttrain_loss: {self.last_train_loss} // test_loss: {mean_loss}// metrics: {str(results)}")

        return preds, mean_loss

    def fit(self, epochs: int,
            train_dl: DataLoader,
            test_dl: DataLoader,
            criterion: nn.Module,
            optimizer: torch.optim.Optimizer,
            cuda: bool = False,
            device: torch.device = torch.device("cpu:0"),
            max_lr: float = 1e-5):
        """Train for ``epochs`` epochs, evaluating on ``test_dl`` after each one.

        Args:
            epochs: Number of passes over ``train_dl``.
            train_dl: Loader of ``(input_ids, attention_mask, labels)`` batches.
            test_dl: Loader evaluated at the end of every epoch.
            criterion: Loss called as ``criterion(logits, labels)``.
            optimizer: Optimizer over this module's parameters.
            cuda: Move each batch to ``device`` before the forward pass.
            device: Target device used when ``cuda`` is true.
            max_lr: Peak learning rate handed to the ``OneCycleLR`` schedule,
                which is stepped once per batch.

        Returns:
            ``(train_losses, eval_losses)``: per-epoch mean losses.
        """
        train_losses = []
        eval_losses = []

        batches = len(train_dl)

        scheduler = OneCycleLR(optimizer, max_lr=max_lr, steps_per_epoch=batches, epochs=epochs)

        for epoch in range(epochs):
            print(f"Epoch: {epoch+1}")
            self.train()
            batch_losses = []

            for batch_input in tqdm(train_dl, total=batches, desc="- Remaining batches"):
                input_ids, att_masks, labels = self._move_batch(batch_input, cuda, device)

                optimizer.zero_grad()

                # forward + backward + optimize
                logits = self(input_ids, att_masks)
                loss = criterion(logits, labels)
                loss.backward()

                optimizer.step()
                scheduler.step()

                batch_losses.append(loss.item())

            train_loss = np.mean(batch_losses)
            # Stored on the instance so evaluate() can print it next to the
            # evaluation loss.
            self.last_train_loss = train_loss

            _, eval_loss = self.evaluate(test_dl, criterion, cuda, device)

            train_losses.append(train_loss)
            eval_losses.append(eval_loss)

        return train_losses, eval_losses

# Loading tweet dataset

In [7]:
# Location and column layout of the TSV splits. The files are read with
# explicit column names, and the first column (`id`) becomes the index.
# Point DATA_DIR at your own copy of the dataset.
DATA_DIR = "/home/kenzo/datasets/cleaned_tweetsentbr"
TWEET_COLUMNS = ["id", "label", "alfa", "text"]

train_df = pd.read_csv(f"{DATA_DIR}/train.tsv", sep="\t", names=TWEET_COLUMNS, index_col=0)
test_df = pd.read_csv(f"{DATA_DIR}/test.tsv", sep="\t", names=TWEET_COLUMNS, index_col=0)

In [8]:
# TODO: Add an exploratory analysis of the data

In [9]:
# Load the fast tokenizer for the checkpoint named by MODEL.
tokenizer = BertTokenizerFast.from_pretrained(MODEL)

In [10]:
# Sequence length shared by both splits.
MAX_SEQ_LEN = 128

# FastBertDataset tokenizes each split once, up front. BertDataset.from_df
# takes the same arguments and tokenizes per item instead.
train_ds = FastBertDataset.from_df(train_df, tokenizer, max_seq_len=MAX_SEQ_LEN)
test_ds = FastBertDataset.from_df(test_df, tokenizer, max_seq_len=MAX_SEQ_LEN)

# Preparing model

In [11]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from functools import partial

# Scoring callables keyed by display name. Accuracy is used as-is; the
# per-class scores are bound to macro averaging.
metrics = {
    "accuracy": accuracy_score,
    **{
        name: partial(scorer, average="macro")
        for name, scorer in (
            ("precision", precision_score),
            ("recall", recall_score),
            ("f1", f1_score),
        )
    },
}

In [12]:
# Device used for the model and for every batch during training/evaluation.
# NOTE(review): the device index is hard-coded to 2, so this presumably fails
# on hosts with fewer than three GPUs; confirm and consider making it
# configurable.
gpu = torch.device("cuda:2")

In [13]:
# Load the pretrained checkpoint with a 3-label classification head and move
# it to `gpu`.
# NOTE(review): 3 presumably matches the number of distinct labels in the
# tweet dataset; confirm.
bert_model = BertForSequenceClassification.from_pretrained(MODEL, num_labels=3).cuda(gpu)

Some weights of the model checkpoint at neuralmind/bert-base-portuguese-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the

In [14]:
# Wrap the Hugging Face model in the training/evaluation helper defined above.
model = BertForClassification(bert_model, 3, metrics)

In [15]:
BATCH_SIZE = 32

train_dl = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True)
# The evaluation loader is not shuffled, so the predictions returned by
# `evaluate` stay in the same order as the rows of `test_df`.
test_dl = DataLoader(test_ds, batch_size=BATCH_SIZE, shuffle=False)

In [16]:
# torch.optim.AdamW replaces the deprecated `transformers.AdamW`.
# weight_decay and eps are set explicitly to mirror the defaults of the
# deprecated implementation, since PyTorch's own defaults differ.
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5, weight_decay=0.0, eps=1e-6)
criterion = nn.CrossEntropyLoss()

In [17]:
# Fine-tune for 5 epochs on `gpu`. `train` and `test` receive the per-epoch
# mean training and evaluation losses returned by `fit`.
train, test = model.fit(5, train_dl, test_dl, criterion, optimizer, cuda=True, device=gpu)

Epoch: 1


HBox(children=(FloatProgress(value=0.0, description='- Remaining batches', max=308.0, style=ProgressStyle(desc…


	train_loss: 0.9994746659483228 // test_loss: 0.7552033024174827// metrics: {'accuracy': 0.665718927701056, 'precision': 0.6458132502397061, 'recall': 0.6266536360723477, 'f1': 0.6156258262765357}
Epoch: 2


HBox(children=(FloatProgress(value=0.0, description='- Remaining batches', max=308.0, style=ProgressStyle(desc…


	train_loss: 0.6879196470821058 // test_loss: 0.6672780587301625// metrics: {'accuracy': 0.7160844841592201, 'precision': 0.7006827924578855, 'recall': 0.6985323162000691, 'f1': 0.6982384555485498}
Epoch: 3


HBox(children=(FloatProgress(value=0.0, description='- Remaining batches', max=308.0, style=ProgressStyle(desc…


	train_loss: 0.5527749746650844 // test_loss: 0.6753748008957157// metrics: {'accuracy': 0.7197400487408611, 'precision': 0.6990380439264302, 'recall': 0.6981414352668086, 'f1': 0.6985703565787017}
Epoch: 4


HBox(children=(FloatProgress(value=0.0, description='- Remaining batches', max=308.0, style=ProgressStyle(desc…


	train_loss: 0.4438867103066537 // test_loss: 0.7057026469862306// metrics: {'accuracy': 0.7242079610073111, 'precision': 0.7033789063206987, 'recall': 0.7056915314065079, 'f1': 0.7040117473218747}
Epoch: 5


HBox(children=(FloatProgress(value=0.0, description='- Remaining batches', max=308.0, style=ProgressStyle(desc…


	train_loss: 0.38610948762529856 // test_loss: 0.7225480977590982// metrics: {'accuracy': 0.7258326563769293, 'precision': 0.7059645295098281, 'recall': 0.7030160678353831, 'f1': 0.704284732224609}


In [21]:
# Persist the weights of the wrapped model (`model.bert`) only.
CHECKPOINT_PATH = "data/checkpoints/bert_tweetsent_br.ckpt"

# Create the target directory first so a missing folder does not throw away
# a finished training run.
os.makedirs(os.path.dirname(CHECKPOINT_PATH), exist_ok=True)
torch.save(model.bert.state_dict(), CHECKPOINT_PATH)