In [2]:
import pandas as pd
import numpy as np
import torch

from torchtext.vocab import build_vocab_from_iterator
from transformers import AutoTokenizer, AutoModelForPreTraining

import config as cfg


In [3]:
emb_model = AutoModelForPreTraining.from_pretrained('neuralmind/bert-base-portuguese-cased')
tokenizer = AutoTokenizer.from_pretrained('neuralmind/bert-base-portuguese-cased')


### Import da base

In [4]:
LABELS = {'Historia': 0, 'Administracao': 1, 'Geografia': 2, 'Biologia': 3, 
          'Literatura': 4, 'Artes': 5, 'Matematica': 6}

In [5]:
x_train = pd.read_csv('../data/processed/train_final.csv')
x_test = pd.read_csv('../data/processed/test.csv')

x_test.drop(['titulo', 'genero'], axis=1, inplace=True)

# x_train['alt_title'] = x_train['alt_title'].map(lambda title: \
#                                                 tokenizer.encode_plus(title, 
#                                                                       add_special_tokens=True,
#                                                                       padding='longest'))
# x_test['alt_title'] = x_test['alt_title'].map(lambda title: \
#                                               tokenizer.encode_plus(title, 
#                                                                     add_special_tokens=True,
#                                                                     padding='longest'))

In [6]:
class MyDataset(torch.utils.data.Dataset):
    def __init__(self, dataframe: pd.DataFrame, tokenizer, max_len: int = None) -> object:
        
        self.data = dataframe
        self.tokenizer = tokenizer 
        self.max_len = max_len 

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        textos = self.data.loc[index, 'alt_title'], 
        labels = self.data.loc[index, 'label']

        encoding = tokenizer(textos, 
                             max_length=self.max_len,
                             padding='max_length',
                             truncation=True,
                             return_tensors='pt')
        
        inputs = encoding['input_ids']
        tkn_type = encoding['token_type_ids']
        att_mask = encoding['attention_mask']

        return inputs, tkn_type, att_mask, labels


In [7]:
train_dataset = MyDataset(x_train, tokenizer=tokenizer, max_len=32)
test_dataset = MyDataset(x_test, tokenizer=tokenizer, max_len=32)

del x_train, x_test

In [8]:
BATCH_SIZE = 128

train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [9]:
from transformers import BertModel, BertTokenizer

In [10]:
import torch
from torch import nn
from transformers import BertModel

class CustomBERTModel(nn.Module):
    def __init__(self, num_classes):
        super(CustomBERTModel, self).__init__()

        # Load the pre-trained BERT model and tokenizer
        self.bert = BertModel.from_pretrained('neuralmind/bert-base-portuguese-cased')

        # Optionally, you can choose to freeze the embeddings
        for p in self.bert.embeddings.parameters():
            p.requires_grad = False

        self.fc1 = nn.Linear(self.bert.config.hidden_size, 256)  # Additional linear layer
        self.gelu = nn.GELU()
        self.fc2 = nn.Linear(256, 128)  # Additional linear layer
        self.fc3 = nn.Linear(128, 64)
        self.fc4 = nn.Linear(64, 32)
        self.fc5 = nn.Linear(32, num_classes)
        

    def forward(self, input_ids, attention_mask, token_tp_ids):
        # Forward pass through the BERT model
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_tp_ids)

        # Extract the last hidden state of the token [CLS] for classification task
        pooled_output = outputs.pooler_output

        # Apply the additional linear layer with ReLU activation
        x = self.fc1(pooled_output)
        x = self.gelu(x)
        x = self.fc2(x)
        x = self.gelu(x)
        x = self.fc3(x)
        x = self.gelu(x)
        x = self.fc4(x)
        x = self.gelu(x)

        # Pass the transformed output through the classification layer
        logits = self.fc5(x)
        print(logits)
        return logits  # Softmax is applied outside the model during training

model = CustomBERTModel(num_classes=7)
# model = CustomBERTModel(num_classes=7)


In [12]:
from tqdm import tqdm

# Define the loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.1)

# Initialize variables to keep track of training statistics
total_loss = 0
correct = 0
total = 0

# Train the model
for epoch in range(10):
    model.train()  # Set the model to training mode
    total_loss = 0
    correct = 0
    total = 0

    for batch_idx, (inputs, tkn_type, att_mask, labels) in enumerate(train_loader):
        
        optimizer.zero_grad()

        # Forward pass
    
        logits = model(inputs.squeeze(1), att_mask.squeeze(1), tkn_type.squeeze(1))
        # print(outputs)
        # Calculate the loss
        
        loss = criterion(logits, labels)
        
        total_loss += loss.item()

        # torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1)

        # Backward pass and update the model parameters
        loss.backward()
        optimizer.step()

        # Calculate accuracy for the current batch
        predicted = torch.softmax(logits, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
        # Print training statistics for the epoch
        accuracy = 100 * correct / total
        print(f'batch_idx [{batch_idx + 1}/{len(train_loader)}], Loss: {total_loss / len(train_loader):.4f}, Accuracy: {accuracy:.2f}%', end='\r')

    # Print training statistics for the epoch
    accuracy = 100 * correct / total
    print(f'Epoch [{epoch + 1}/10], Loss: {total_loss / len(train_loader):.4f}, Accuracy: {accuracy:.2f}%')

# Final accuracy after training
print('Final Accuracy:', accuracy)

tensor([[-1.0623e+00, -6.3325e-01, -1.2283e+00,  7.9033e-01, -6.5021e-01,
         -7.9138e-01, -3.5793e-01],
        [-8.6612e-01, -5.9101e-01, -1.1515e+00,  9.1877e-01, -6.2645e-01,
         -5.5637e-01, -1.6336e-01],
        [-2.6077e+00, -1.9602e+00, -1.6902e+00,  1.8753e+00, -6.8836e-02,
         -1.2428e+00, -9.9353e-01],
        [-1.3198e+00, -1.0981e+00, -1.5834e+00,  1.2386e+00, -6.2984e-01,
         -8.7544e-01, -4.8791e-01],
        [-7.8601e-01, -4.6145e-01, -1.0827e+00,  8.7943e-01, -6.0326e-01,
         -5.1806e-01, -2.4364e-01],
        [-6.2665e-01, -2.7670e-01, -8.5965e-01,  6.6843e-01, -5.9180e-01,
         -3.9709e-01, -2.1887e-01],
        [-1.5117e+00, -1.3779e+00, -1.8075e+00,  1.4690e+00, -6.5238e-01,
         -1.0756e+00, -7.7838e-01],
        [-6.5022e-01, -1.4172e-01, -7.7179e-01,  5.2761e-01, -6.0880e-01,
         -4.1640e-01, -6.8984e-02],
        [-1.8651e+00, -1.4301e+00, -1.6874e+00,  1.4307e+00, -4.5705e-01,
         -1.0413e+00, -7.6289e-01],
        [-

ValueError: too many values to unpack (expected 2)