In [1]:
import pandas as pd
import numpy as np
import torch

In [2]:
# Training split loaded from train_og.csv (presumably the original, non-augmented data — confirm).
# Its 'alt_title' column is what the vocabulary is built from further down.
x_train_og = pd.read_csv('../data/processed/train_og.csv')

# Class name -> integer class id (seven classes, ids 0 through 6).
LABELS = {
    'Historia': 0,
    'Administracao': 1,
    'Geografia': 2,
    'Biologia': 3,
    'Literatura': 4,
    'Artes': 5,
    'Matematica': 6,
}


In [3]:
# Test split loaded as a DataFrame. NOTE(review): not referenced by the other cells visible here.
x_test = pd.read_csv('../data/processed/test.csv')

In [4]:
from torchtext.vocab import build_vocab_from_iterator

from transformers import AutoTokenizer, AutoModelForPreTraining

# Load the pretrained 'neuralmind/bert-base-portuguese-cased' checkpoint and its matching tokenizer.
# The other cells only read the model's input-embedding layer (get_input_embeddings());
# the tokenizer supplies both the sub-word tokens and their ids.
emb_model = AutoModelForPreTraining.from_pretrained('neuralmind/bert-base-portuguese-cased')
tokenizer = AutoTokenizer.from_pretrained('neuralmind/bert-base-portuguese-cased')

def yield_tokens(data_iter):
    """Lazily tokenize every text in ``data_iter``.

    Args:
        data_iter: Iterable of strings.

    Yields:
        One tuple of tokens per input text, produced by the module-level
        ``tokenizer``.
    """
    yield from (tuple(tokenizer.tokenize(text)) for text in data_iter)

# Build a torchtext vocabulary from the tokenized training titles; "[UNK]" is the
# only special token and doubles as the fallback index for unseen tokens.
vocab = build_vocab_from_iterator(
    yield_tokens(x_train_og['alt_title'].tolist()),
    specials=["[UNK]"],
)
vocab.set_default_index(vocab["[UNK]"])

def get_embeddings(word):
    """Tokenize ``word`` and return its token ids as a 1-D ``torch.long`` tensor.

    Despite the name, this returns token *ids*, not embedding vectors; the ids
    are meant to be looked up by an embedding layer later on.

    Args:
        word: Text to tokenize with the module-level ``tokenizer``.

    Returns:
        torch.Tensor: 1-D tensor of token ids with dtype ``torch.long``. The
        dtype is set explicitly so that text producing no tokens yields an
        empty integer tensor instead of an empty float tensor.
    """
    tokens = tokenizer.tokenize(word)
    tokens_id = tokenizer.convert_tokens_to_ids(tokens)
    return torch.tensor(tokens_id, dtype=torch.long)


def text_pipeline(word):
    """Turn raw text into model input by delegating to ``get_embeddings``."""
    return get_embeddings(word)


def label_pipeline(idx):
    """Coerce a raw label value to ``int``."""
    return int(idx)


In [5]:
# Input-embedding layer of the pretrained model and its weight matrix.
input_embeddings = emb_model.get_input_embeddings()
embedding_matrix = input_embeddings.weight

# The matrix is (vocabulary size, embedding dimension); unpack both at once.
vocab_size, embedding_dim = embedding_matrix.shape

In [6]:
def pad(l, content, width):
    """Right-pad ``l`` in place with ``content`` until it has ``width`` items.

    The list is modified in place and also returned. A list that already has
    ``width`` or more items is returned unchanged (it is never truncated).
    """
    missing = width - len(l)
    l.extend(content for _ in range(missing))
    return l

In [7]:
import torch
import pandas as pd

# Load the CSV file

# Create a Dataset object
class MyDataset(torch.utils.data.Dataset):
    """Dataset of (title, label) pairs read from a CSV file.

    The CSV must provide an ``alt_title`` column and a ``label`` column.

    Args:
        data_path: Path of the CSV file to read.
        get_embeddings: When true (default), every title is passed through the
            module-level ``text_pipeline`` and stored as a fixed-width row of
            token ids; when false, the raw title strings are kept. Note that
            this parameter shadows the module-level ``get_embeddings`` function
            inside ``__init__``.
        max_len: Fixed row width for the token-id tensor. Shorter titles are
            right-padded with 0; longer titles are truncated, since rows of
            unequal length cannot be turned into a single tensor.

    Attributes:
        alt_title: ``torch.long`` tensor of shape (number of rows, max_len),
            or a list of strings when ``get_embeddings`` is false.
        label: ``torch.long`` tensor holding one label per row.
    """

    def __init__(self, data_path, get_embeddings=True, max_len=18):
        data = pd.read_csv(data_path)
        titles = data['alt_title'].tolist()

        if get_embeddings:
            # Truncate first, then pad, so every row is exactly max_len wide.
            rows = [pad(text_pipeline(title).tolist()[:max_len], 0, max_len) for title in titles]
            self.alt_title = torch.tensor(rows, dtype=torch.long)
        else:
            self.alt_title = titles

        self.label = torch.tensor(data['label'].tolist(), dtype=torch.long)

    def __len__(self):
        return len(self.alt_title)

    def __getitem__(self, idx):
        return self.alt_title[idx], self.label[idx]

# Create the Dataset object.
# Uses the default get_embeddings=True, so titles are stored as padded token-id rows.
# NOTE(review): 'train_aug.csv' is presumably the augmented training split — confirm.
dataset = MyDataset('../data/processed/train_aug.csv')


In [9]:
# Number of samples per mini-batch.
BATCH_SIZE = 32
# Original (misspelled) name, kept as an alias because other cells still reference it.
BATH_SIZE = BATCH_SIZE

def collate_fn(batch):
    """Collate (text, label) samples into two batched tensors.

    Every text tensor is right-padded with zeros along its first (sequence)
    dimension to the longest sequence in the batch, and the results are
    stacked, so the output can be fed to a model directly.

    Args:
        batch: A non-empty list of tuples, where each tuple contains a text
            tensor (token ids or embeddings, sequence dimension first) and a
            label (tensor or plain number).

    Returns:
        A tuple of two tensors: the padded texts with shape
        (batch, max_length, ...) and the labels with shape (batch,).

    Raises:
        ValueError: If ``batch`` is empty.
    """
    if not batch:
        raise ValueError("collate_fn received an empty batch")

    sequences = [torch.as_tensor(sample[0]) for sample in batch]

    # pad_sequence pads with 0 along the sequence dimension and stacks the results.
    padded_text_embeddings = torch.nn.utils.rnn.pad_sequence(sequences, batch_first=True)

    # as_tensor lets both tensor labels and plain Python numbers be stacked.
    labels = torch.stack([torch.as_tensor(sample[1]) for sample in batch])

    return padded_text_embeddings, labels

# Create a DataLoader object.
# The default collation is used (the custom collate_fn is left commented out), and
# shuffle=False keeps the batches in dataset order.
dataloader = torch.utils.data.DataLoader(dataset, batch_size=BATH_SIZE, shuffle=False, ) # collate_fn=collate_fn

In [10]:
import torch
import torch.nn as nn

class GRUModel(nn.Module):
    """GRU classifier on top of frozen, pretrained token embeddings.

    Args:
        vocab_size: Accepted for interface compatibility; not used, because
            the vocabulary size comes from the pretrained embedding matrix.
        embedding_dim: Accepted for interface compatibility; not used, because
            the embedding width comes from the pretrained embedding matrix.
        hidden_dim: Size of the GRU hidden state.
        num_classes: Number of output logits.
        num_layers: Number of stacked GRU layers.
        pretrained_embeddings: Optional 2-D tensor (vocabulary, dimension) of
            embedding weights. When omitted, the input-embedding weights of
            the module-level ``emb_model`` are used.

    Input to ``forward`` is a (batch, sequence) tensor of token ids; the
    output is a (batch, num_classes) tensor of logits.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_classes, num_layers=10,
                 pretrained_embeddings=None):
        super().__init__()

        if pretrained_embeddings is None:
            pretrained_embeddings = emb_model.get_input_embeddings().weight

        # Embedding layer: pretrained weights, excluded from training.
        self.embedding = nn.Embedding.from_pretrained(pretrained_embeddings, freeze=True)

        # GRU layer. batch_first=True is required because forward() receives
        # (batch, sequence, dim); without it the GRU would treat the batch axis
        # as time and mix information between unrelated samples.
        self.gru = nn.GRU(pretrained_embeddings.size(1), hidden_dim, num_layers, batch_first=True)

        # Fully connected layer
        self.fc = nn.Linear(hidden_dim, num_classes)

    def forward(self, x):
        # Embed the input token ids: (batch, seq) -> (batch, seq, dim)
        x = self.embedding(x)

        # Run the GRU over the sequence: (batch, seq, hidden_dim)
        x, _ = self.gru(x)

        # Classify from the output at the last sequence position.
        # NOTE(review): with right-padded inputs this position is a padding token.
        x = self.fc(x[:, -1])

        return x

In [11]:
# Collect the title entry of every dataset item.
alt_title_list = [item[0] for item in dataset]

# Find the unique titles. Tensors hash by object identity, so set() would never
# merge two rows holding the same ids; compare tensor rows by value instead.
if alt_title_list and torch.is_tensor(alt_title_list[0]):
    unique_alt_titles = list(torch.unique(torch.stack(alt_title_list), dim=0))
else:
    unique_alt_titles = list(set(alt_title_list))

# Get the number of unique titles
num_unique_titles = len(unique_alt_titles)

In [12]:
# Display the value of num_unique_titles computed in the previous cell.
num_unique_titles

3488

In [14]:

# Create the model.
# GRUModel, as defined in this notebook, ignores its first two arguments and sizes
# itself from the pretrained embedding matrix; 128 is the GRU hidden size and 7 the
# number of classes.
model = GRUModel(num_unique_titles, 128, 128, 7)

# Define the loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters())

# Train the model
model.train()
for epoch in range(10):
    for text, label in dataloader:
        # Forward pass
        outputs = model(text)

        # Calculate the loss
        loss = criterion(outputs, label)

        # Backward pass and update the model parameters
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

# Evaluate the model.
# NOTE: this reuses the training dataloader, so the figure is training accuracy.
model.eval()
correct = 0
total = 0
with torch.no_grad():
    # Unpack each batch here; reusing the loop variables left over from training
    # would score the last training batch over and over.
    for text, label in dataloader:
        outputs = model(text)
        predicted = outputs.argmax(dim=1)
        total += label.size(0)
        correct += (predicted == label).sum().item()

# Guard against an empty dataloader.
accuracy = 100 * correct / total if total else 0.0
print('Accuracy:', accuracy)
