In [1]:
import os

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import tqdm.notebook as tqdm
from torch.utils.data import DataLoader
from torch.optim.lr_scheduler import OneCycleLR
from transformers import BertTokenizerFast, BertModel
# NOTE: transformers.AdamW is deprecated upstream; torch.optim.AdamW is the
# supported replacement.
from transformers import AdamW

from utils.datasets import TextDataset
from models.bert import BertForClassification

In [2]:
# Pretrained checkpoint identifier; the same value is passed to both
# BertTokenizerFast.from_pretrained and BertModel.from_pretrained below so the
# tokenizer and the weights always match.
MODEL = "neuralmind/bert-base-portuguese-cased"

# Loading tweet dataset

In [3]:
# Directory holding the cleaned tweet splits. It can be overridden with the
# TWEETSENTBR_DIR environment variable so the notebook runs on other machines;
# the default is the location originally hard-coded here.
DATA_DIR = os.environ.get("TWEETSENTBR_DIR", "/home/kenzo/datasets/cleaned_tweetsentbr")

# Column names are supplied explicitly (pandas then treats the files as having
# no header row); the first column, "id", becomes the index.
TSV_COLUMNS = ["id", "label", "alfa", "text"]

train_df = pd.read_csv(os.path.join(DATA_DIR, "train.tsv"), sep="\t", names=TSV_COLUMNS, index_col=0)
test_df = pd.read_csv(os.path.join(DATA_DIR, "test.tsv"), sep="\t", names=TSV_COLUMNS, index_col=0)

In [4]:
# TODO: Data exploration

In [5]:
# Load the fast tokenizer that belongs to the checkpoint named by MODEL.
tokenizer = BertTokenizerFast.from_pretrained(MODEL)

In [6]:
# Wrap each split in a TextDataset built with the shared tokenizer and a
# maximum sequence length of 128 (train first, then test).
train_ds, test_ds = (
    TextDataset.from_df(split_df, tokenizer, max_seq_len=128)
    for split_df in (train_df, test_df)
)

# Preparing model

In [7]:
from functools import partial
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

# Scoring callables handed to BertForClassification, keyed by display name.
# Accuracy is used as-is; precision, recall and F1 are macro-averaged.
metrics = {
    "accuracy": accuracy_score,
    **{
        label: partial(score_fn, average="macro")
        for label, score_fn in (
            ("precision", precision_score),
            ("recall", recall_score),
            ("f1", f1_score),
        )
    },
}

In [8]:
# Training device. The second GPU ("cuda:1") is preferred, as originally
# configured; when fewer than two CUDA devices are visible we fall back to
# "cuda:0" instead of failing on a non-existent device index. A CUDA device is
# still required because the model is moved with .cuda(gpu) further down.
gpu = torch.device("cuda:1" if torch.cuda.device_count() > 1 else "cuda:0")

In [9]:
# Load the pretrained BertModel weights for the checkpoint named by MODEL;
# this instance is passed to BertForClassification when the classifier is built.
bert_model = BertModel.from_pretrained(MODEL)

In [10]:
# Second positional argument of BertForClassification.
# NOTE(review): presumably the number of target classes — confirm against models.bert.
num_labels = 3

# Build the classifier around the pretrained encoder, then move it to the
# selected CUDA device.
model = BertForClassification(bert_model, num_labels, metrics)
model = model.cuda(gpu)

In [11]:
# Mini-batch size shared by both loaders.
BATCH_SIZE = 64

# Training batches are reshuffled every epoch. The evaluation loader keeps a
# fixed order: shuffling adds nothing at evaluation time and only makes the
# per-batch results non-reproducible between runs.
train_dl = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True)
test_dl = DataLoader(test_ds, batch_size=BATCH_SIZE, shuffle=False)

In [12]:
# Number of batches in one pass over the training loader; OneCycleLR uses
# steps_per_epoch * epochs as its total step count.
# NOTE(review): assumes model.fit steps the scheduler once per batch — confirm.
batches = len(train_dl)
epochs = 3

# Single source for the learning rate used by both optimizer and scheduler.
peak_lr = 1e-5

# torch.optim.AdamW replaces the deprecated transformers.AdamW. eps and
# weight_decay are pinned to the defaults of the transformers implementation
# (1e-6 and 0.0) so the hyper-parameters stay what they were.
optimizer = torch.optim.AdamW(model.parameters(), lr=peak_lr, eps=1e-6, weight_decay=0.0)
scheduler = OneCycleLR(optimizer, max_lr=peak_lr, steps_per_epoch=batches, epochs=epochs)
criterion = nn.CrossEntropyLoss()

In [13]:
# Keyword options for the training run: per-step LR schedule and target device.
fit_options = dict(scheduler=scheduler, cuda=True, device=gpu)

# Run training; fit returns two values, unpacked here as `train` and `test`.
# NOTE(review): presumably per-epoch train/test results — confirm against models.bert.
train, test = model.fit(
    epochs,
    train_dl,
    test_dl,
    criterion,
    optimizer,
    **fit_options,
)

- Remaining batches: 100%|██████████| 154/154 [07:27<00:00,  2.91s/it]
Epoch: 1
- Remaining batches:   0%|          | 0/154 [00:00<?, ?it/s]	train_loss: 1.0045083145816605 // test_loss: 0.735122099900857// metrics: {'accuracy': 0.6803411860276198, 'precision': 0.6588421202371125, 'recall': 0.6498992602109009, 'f1': 0.6442061765449211}

- Remaining batches: 100%|██████████| 154/154 [07:27<00:00,  2.91s/it]
Epoch: 2
- Remaining batches:   0%|          | 0/154 [00:00<?, ?it/s]	train_loss: 0.6838446801359003 // test_loss: 0.6714265140203329// metrics: {'accuracy': 0.7164906580016247, 'precision': 0.6947038219588575, 'recall': 0.6917201475371231, 'f1': 0.6923268928100267}

- Remaining batches: 100%|██████████| 154/154 [07:27<00:00,  2.91s/it]
Epoch: 3
	train_loss: 0.5844345448853133 // test_loss: 0.6782754636727847// metrics: {'accuracy': 0.7160844841592201, 'precision': 0.6948619253541238, 'recall': 0.6947144293499066, 'f1': 0.6930486393018985}



In [16]:
# Where the trained weights are written.
CHECKPOINT_PATH = "data/checkpoints/bert_tweetsent_br.ckpt"

# torch.save does not create missing parent directories; create them up front
# so a long training run is not lost to a missing folder.
os.makedirs(os.path.dirname(CHECKPOINT_PATH), exist_ok=True)

# Only the state_dict (parameters and buffers) is saved, not the model object.
torch.save(model.state_dict(), CHECKPOINT_PATH)