In [2]:
import pandas as pd
import numpy as np
import torch

### Import da base

In [3]:
LABELS = {'Historia': 0, 'Administracao': 1, 'Geografia': 2, 'Biologia': 3, 
          'Literatura': 4, 'Artes': 5, 'Matematica': 6}

x_train_og = pd.read_csv('../data/processed/train_final.csv')

In [4]:
x_test = pd.read_csv('../data/processed/test.csv')
x_test

Unnamed: 0,titulo,genero,alt_title,label,label_inf,sf_historia,sf_administracao,sf_geografia,sf_biologia,sf_literatura,sf_artes,sf_matematica
0,Introdução à Teoria da Literatura,Literatura,introducao teoria literatura,4,4,0,0,0,1,0,0,0
1,Viagem pitoresca e Histórica ao Brasil,Historia,viagem pitoresca historica brasil,0,0,0,0,0,0,0,0,0
2,Cem dias entre o céu e mar,Geografia,cem dias ceu mar,2,-1,0,0,0,1,0,0,1
3,Elementos de teoria literária,Literatura,elementos teoria literaria,4,4,0,0,0,0,0,0,0
4,Geografia da América Latina,Geografia,geografia america latina,2,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...
286,Elementos de comportamento organizacional,Administracao,elementos comportamento organizacional,1,1,0,0,0,0,0,0,0
287,A História do pensamento econômico,Historia,historia pensamento economico,0,1,0,0,0,0,0,0,0
288,Matemática fundamental:uma nova abordagem,Matematica,matematica fundamental nova abordagem,6,6,0,0,0,0,0,1,0
289,A Guerra Civil Espanhola,Historia,guerra civil espanhola,0,0,0,0,0,0,0,0,0


In [5]:
def pad(l, content, width):
    """
    Pads a list with a given content to a specified width.

    Args:
        l: The list to pad.
        content: The content to use for padding.
        width: The desired width of the padded list.

    Returns:
        The padded list.
    """
    
    l.extend([content] * (width - len(l)))
    return l

In [6]:
from torchtext.vocab import build_vocab_from_iterator

from transformers import AutoTokenizer, AutoModelForPreTraining

emb_model = AutoModelForPreTraining.from_pretrained('neuralmind/bert-base-portuguese-cased')
tokenizer = AutoTokenizer.from_pretrained('neuralmind/bert-base-portuguese-cased')

def yield_tokens(data_iter):
    for i in data_iter: 
        yield tuple(tokenizer.tokenize(i))

vocab = build_vocab_from_iterator(yield_tokens(list(x_train_og['alt_title'])), specials=["[UNK]"])
vocab.set_default_index(vocab["[UNK]"])

def get_embeddings(word): 
    tokens = tokenizer.tokenize(word)
    tokens_id = tokenizer.convert_tokens_to_ids(tokens)
    input_embeddings = emb_model.get_input_embeddings()
    return torch.tensor(tokens_id)
    # return input_embeddings(torch.tensor(tokens_id))


text_pipeline = lambda word: get_embeddings(word)
label_pipeline = lambda idx: int(idx)


In [7]:
import torch
import pandas as pd

# Load the CSV file

# Create a Dataset object
class MyDataset(torch.utils.data.Dataset):
    
    def __init__(self, data_path, get_embeddings=True):

        data = pd.read_csv(data_path)

        max_len = max([len(text_pipeline(x)) for x in data['alt_title']])

        if get_embeddings:
            # self.alt_title = [text_pipeline(x).to(torch.long) for x in data.alt_title]
            self.alt_title = torch.tensor([pad(text_pipeline(x).tolist(), 0, max_len) for x in data['alt_title']])
        else: 
            self.alt_title = data['alt_title'].tolist()
        # self.label = torch.tensor(data['label'].tolist(), dtype=torch.long)
        self.label = torch.tensor([x for x in data['label']])

    def __len__(self):
        return len(self.alt_title)

    def __getitem__(self, idx):
        return self.alt_title[idx], self.label[idx]

# Create the Dataset object
train_dataset_text = MyDataset('../data/processed/train_final.csv')
test_dataset_text = MyDataset('../data/processed/test.csv')


In [8]:
BATH_SIZE = 124

def collate_fn(batch):
    """
    Collate function that pads all tensors to the same size.

    Args:
        batch: A list of tuples, where each tuple contains a text embedding tensor and a label tensor.

    Returns:
        A tuple of two tensors, where the first tensor is a padded text embedding tensor and the second tensor is a label tensor.
    """

    # Get the maximum length of the text embeddings in the batch.
    max_length = max([len(x[0]) for x in batch])

    # Pad all of the text embeddings to the same length.
    padded_text_embeddings = []
    for x in batch:
        padded_text_embeddings.append(torch.nn.functional.pad(x[0], (0, max_length - len(x[0]))))

    # Get the labels from the batch.
    labels = [x[1] for x in batch]

    # Return the padded text embeddings and the labels.
    return padded_text_embeddings, labels

# Create a DataLoader object
train_dl_text = torch.utils.data.DataLoader(train_dataset_text, batch_size=BATH_SIZE, shuffle=False, ) # collate_fn=collate_fn
test_dl_text = torch.utils.data.DataLoader(test_dataset_text, batch_size=BATH_SIZE, shuffle=False) # collate_fn=collate_fn

In [9]:
import torch
import torch.nn as nn

class GRUModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_classes, num_layers=10):
        super(GRUModel, self).__init__()

        # Embedding layer
        # self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.embedding = nn.Embedding.from_pretrained(torch.FloatTensor(emb_model.get_input_embeddings().weight.data), freeze=False)

        # GRU layer
        # self.gru = nn.GRU(embedding_dim, hidden_dim)
        self.gru = nn.GRU(emb_model.get_input_embeddings().weight.data.size(1), hidden_dim, num_layers)

        # Fully connected layer
        # self.fc = nn.Linear(hidden_dim, num_classes)
        self.fc = nn.Linear(hidden_dim, num_classes)

        self.dropout = nn.Dropout(0.2)

        self.init_weights()

    def init_weights(self):
        initrange = 0.5
        self.embedding.weight.data.uniform_(-initrange, initrange)
        self.fc.weight.data.uniform_(-initrange, initrange)
        self.fc.bias.data.zero_()

    def forward(self, x):
        # Embed the input text
        x = self.embedding(x)

        # Pass the embedded input to the GRU layer
        x, _ = self.gru(x)

        x = self.dropout(x)

        # Take the last hidden state vector and pass it to the fully connected layer
        x = self.fc(x[:, -1])

        return x

In [10]:
import torch
import torch.nn as nn

class GRUModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_classes, num_layers=10):
        super(GRUModel, self).__init__()

        # Embedding layer
        # self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.embedding = nn.Embedding.from_pretrained(torch.FloatTensor(emb_model.get_input_embeddings().weight.data), freeze=True)

        self.tokenizer = tokenizer.from_pretrained('neuralmind/bert-base-portuguese-cased')

        # GRU layer
        # self.gru = nn.GRU(embedding_dim, hidden_dim)
        self.gru = nn.GRU(emb_model.get_input_embeddings().weight.data.size(1), hidden_dim, num_layers)

        self.dropout = nn.Dropout(0.2)

        # Fully connected layer
        # self.fc = nn.Linear(hidden_dim, num_classes)
        self.fc1 = nn.Linear(hidden_dim, num_classes)

        self.init_weights()

    def init_weights(self):
        initrange = 0.5
        self.embedding.weight.data.uniform_(-initrange, initrange)
        self.fc1.weight.data.uniform_(-initrange, initrange)
        self.fc1.bias.data.zero_()

    def forward(self, x):
        # x = self.tokenizer.encode_plus(x, return_tensors='pt', padding=True, truncation=True, max_length=768)
        # Embed the input text
        x = self.embedding(x)

        # Pass the embedded input to the GRU layer
        x, _ = self.gru(x)

        x = self.dropout(x)

        # Take the last hidden state vector and pass it to the fully connected layer
        x = self.fc1(x[:, -1])

        return x

In [11]:
import torch
from transformers import BertModel, BertTokenizer

class CustomBERTModel(torch.nn.Module):
    def __init__(self, pretrained_model_name):
        super(CustomBERTModel, self).__init__()
        
        # Load the pre-trained BERT model and tokenizer
        self.bert = BertModel.from_pretrained(pretrained_model_name)
        self.tokenizer = BertTokenizer.from_pretrained(pretrained_model_name)

    def forward(self, input_ids, attention_mask):
        # Forward pass through the BERT model
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        return outputs



In [12]:
# Access the alt_title data and convert it into a list
alt_title_list = [item[0] for item in train_dataset_text]

# Find the unique values in the alt_title data
unique_alt_titles = list(set(alt_title_list))

# Get the number of unique titles
num_unique_titles = len(unique_alt_titles)

In [13]:
model = GRUModel(tokenizer.vocab_size, BATH_SIZE, BATH_SIZE, 7, num_layers=10)

# Define the loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters())

# Initialize variables to keep track of training statistics
total_loss = 0
correct = 0
total = 0

# Train the model
for epoch in range(10):
    model.train()  # Set the model to training mode
    total_loss = 0
    correct = 0
    total = 0

    for text, label in train_dl_text:
        
        optimizer.zero_grad()

        # Forward pass
    
        outputs = model(text)
        # print(outputs)
        # Calculate the loss
        loss = criterion(outputs, label)
        
        total_loss += loss.item()

        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1)

        # Backward pass and update the model parameters
        loss.backward()
        optimizer.step()

        # Calculate accuracy for the current batch
        outputs = torch.softmax(outputs, dim=0)
        _, predicted = torch.max(outputs, 0)
        print(f'Predicted: {predicted}')
        print(f'Label: {label}')
        total += label.size(0)
        correct += (predicted == label).sum().item()
        # Print training statistics for the epoch
        accuracy = 100 * correct / total

    # Print training statistics for the epoch
    accuracy = 100 * correct / total
    print(f'Epoch [{epoch + 1}/10], Loss: {total_loss / len(train_dl_text):.4f}, Accuracy: {accuracy:.2f}%')

# Final accuracy after training
print('Final Accuracy:', accuracy)


Predicted: tensor([0, 4, 6, 0, 3, 2, 5, 0, 3, 1, 0, 2, 0, 6, 3, 3, 1, 6, 1, 3, 6, 0, 1, 0,
        0, 1, 4, 6, 5, 2, 0, 3, 0, 0, 4, 1, 2, 3, 2, 6, 4, 1, 5, 5, 6, 5, 6, 5,
        1, 4, 5, 4, 1, 1, 4, 6, 6, 2, 5, 3, 3, 5, 2, 6, 4, 4, 3, 1, 4, 5, 2, 6,
        6, 2, 5, 0, 5, 4, 1, 1, 5, 4, 3, 3, 5, 1, 5, 1, 1, 6, 3, 4, 2, 6, 4, 4,
        3, 1, 6, 0, 4, 5, 2, 6, 1, 2, 2, 2, 6, 6, 4, 1, 1, 3, 1, 0, 4, 3, 6, 1,
        6, 4, 0, 3])
Label: tensor([3, 3, 3, 3, 6, 6, 6, 6, 3, 3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 6, 6, 6, 5, 5, 5, 5, 6, 6, 6, 6,
        0, 0, 0, 0, 0, 0, 0, 0, 5, 5, 5, 5, 3, 3, 3, 3, 6, 6, 6, 6, 0, 0, 0, 0,
        2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 6, 6, 6, 6, 0, 0, 0, 0,
        0, 0, 0, 0, 6, 6, 6, 6, 3, 3, 3, 3, 0, 0, 0, 0, 6, 6, 6, 6, 3, 3, 3, 3,
        0, 0, 0, 0])
Predicted: tensor([4, 4, 4, 4, 4, 1, 1, 5, 2, 4, 4, 4, 5, 4, 2, 2, 1, 4, 6, 0, 4, 4, 0, 3,
        5, 3, 0, 2, 0, 3, 5, 6, 2, 0, 0, 1, 3, 0,