In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim.lr_scheduler import OneCycleLR
from torch.utils.data import DataLoader
from transformers import AdamW
from transformers import BertTokenizerFast, BertModel

from models.bert import BertForClassification
from utils.datasets import TextDataset

In [2]:
import tqdm.notebook as tqdm

In [3]:
# Name of the pretrained checkpoint handed to `from_pretrained` when loading
# the tokenizer and the BERT encoder.
MODEL = "neuralmind/bert-base-portuguese-cased"

# Loading tweet dataset

In [4]:
# Both splits are tab-separated files with the same four columns; because
# `names` is supplied, pandas treats every row (including the first) as data.
tsv_options = dict(sep="\t", names=["id", "label", "alfa", "text"], index_col=0)

train_df = pd.read_csv(
    "/home/kenzo/datasets/cleaned_tweetsentbr/train.tsv", **tsv_options
)
test_df = pd.read_csv(
    "/home/kenzo/datasets/cleaned_tweetsentbr/test.tsv", **tsv_options
)

In [5]:
# TODO: Data exploration

In [6]:
# Load the fast BERT tokenizer for the pretrained checkpoint named by MODEL.
tokenizer = BertTokenizerFast.from_pretrained(MODEL)

In [7]:
# Build one tokenized dataset per split with a shared maximum sequence length.
MAX_SEQ_LEN = 128
train_ds, test_ds = (
    TextDataset.from_df(split_df, tokenizer, max_seq_len=MAX_SEQ_LEN)
    for split_df in (train_df, test_df)
)

# Preparing model

In [8]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from functools import partial

# Scorers handed to the model for evaluation. Accuracy is used as-is; the
# per-class scores are bound to macro averaging.
macro_scorers = {
    "precision": precision_score,
    "recall": recall_score,
    "f1": f1_score,
}
metrics = {
    "accuracy": accuracy_score,
    **{
        metric_name: partial(scorer, average="macro")
        for metric_name, scorer in macro_scorers.items()
    },
}

In [9]:
# Target device for training: the first CUDA device (equivalent to "cuda:0").
gpu = torch.device("cuda", 0)

In [10]:
# Load the pretrained BERT encoder weights for the checkpoint named by MODEL.
bert_model = BertModel.from_pretrained(MODEL)

In [11]:
# Wrap the pretrained encoder in the project's classification model, then
# move the result onto the GPU.
# NOTE(review): the positional 3 is presumably the number of target classes —
# confirm against BertForClassification's signature.
classifier = BertForClassification(bert_model, 3, metrics, device=gpu)
model = classifier.cuda(gpu)

RuntimeError: CUDA error: out of memory

In [None]:
# Mini-batch loaders over the tokenized splits.
BATCH_SIZE = 64

# Training batches are reshuffled at every epoch.
train_dl = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True)

# Evaluation iterates in a fixed order: shuffling brings no benefit there and
# makes batch-level results differ from run to run.
test_dl = DataLoader(test_ds, batch_size=BATCH_SIZE, shuffle=False)

In [12]:
# Training schedule: the scheduler is stepped once per batch, so its total
# number of steps is epochs * batches-per-epoch.
epochs = 3
batches = len(train_dl)

criterion = nn.CrossEntropyLoss()

# NOTE(review): OneCycleLR derives the per-step learning rate from max_lr, so
# the lr=1e-7 given to AdamW appears to be overridden — confirm.
optimizer = AdamW(model.parameters(), lr=1e-7)
scheduler = OneCycleLR(optimizer, max_lr=1e-5, total_steps=epochs * batches)

In [13]:
# Train for `epochs` epochs on train_dl, evaluating on test_dl.
# NOTE(review): `train` and `test` hold whatever fit() returns (presumably
# per-split histories) — confirm in models.bert.
train, test = model.fit(epochs, train_dl, test_dl, criterion, optimizer, scheduler=scheduler)

HBox(children=(FloatProgress(value=0.0, description='Epochs', max=3.0, style=ProgressStyle(description_width='…

HBox(children=(FloatProgress(value=0.0, description='- Remaining batches', max=154.0, style=ProgressStyle(desc…


Epoch: 1
	train_loss: 1.0138633332469247 // test_loss: 0.7628194460502038// metrics: {'accuracy': 0.673842404549147, 'precision': 0.6539588815702859, 'recall': 0.6505038922243764, 'f1': 0.641654395616624}



HBox(children=(FloatProgress(value=0.0, description='- Remaining batches', max=154.0, style=ProgressStyle(desc…


Epoch: 2
	train_loss: 0.6923673816702582 // test_loss: 0.6837754325988965// metrics: {'accuracy': 0.7071486596263201, 'precision': 0.6836091196517629, 'recall': 0.6817984400850148, 'f1': 0.6810232999610674}



HBox(children=(FloatProgress(value=0.0, description='- Remaining batches', max=154.0, style=ProgressStyle(desc…


Epoch: 3
	train_loss: 0.5912742034181372 // test_loss: 0.6849900934940729// metrics: {'accuracy': 0.7112103980503656, 'precision': 0.6888164523297844, 'recall': 0.6872869911133153, 'f1': 0.6866286116057129}




In [14]:
# Persist the fine-tuned weights. The checkpoint directory is created first so
# that a missing folder cannot make the save fail after training has finished.
checkpoint_path = Path("data/checkpoints/bert_tweetsent_br.ckpt")
checkpoint_path.parent.mkdir(parents=True, exist_ok=True)
torch.save(model.state_dict(), checkpoint_path)