In [1]:
import pandas as pd
import numpy as np
import torch

In [2]:
x_train_og = pd.read_csv('../data/processed/train_og.csv')

LABELS = {'Historia': 0, 'Administracao': 1, 'Geografia': 2, 'Biologia': 3, 
          'Literatura': 4, 'Artes': 5, 'Matematica': 6}


In [3]:
x_test = pd.read_csv('../data/processed/test.csv')

In [29]:
def pad(l, content, width):
    l.extend([content] * (width - len(l)))
    return l

In [33]:
import torch
import pandas as pd

# Load the CSV file

# Create a Dataset object
class MyDataset(torch.utils.data.Dataset):
    
    def __init__(self, data_path, get_embeddings=True):

        data = pd.read_csv(data_path)

        max_len = max([len(text_pipeline(x)) for x in datas['alt_title']])

        if get_embeddings:
            # self.alt_title = [text_pipeline(x).to(torch.long) for x in data.alt_title]
            self.alt_title = torch.tensor([pad(text_pipeline(x), 0, max_len) for x in data['alt_title'].tolist()])
        else: 
            self.alt_title = data['alt_title'].tolist()
        # self.label = torch.tensor(data['label'].tolist(), dtype=torch.long)
        self.label = torch.tensor([x for x in data['label']])

    def __len__(self):
        return len(self.alt_title)

    def __getitem__(self, idx):
        return self.alt_title[idx], self.label[idx]

# Create the Dataset object
dataset = MyDataset('../data/processed/train_aug.csv')


In [34]:
BATH_SIZE = 32

def collate_fn(batch):
    """
    Collate function that pads all tensors to the same size.

    Args:
        batch: A list of tuples, where each tuple contains a text embedding tensor and a label tensor.

    Returns:
        A tuple of two tensors, where the first tensor is a padded text embedding tensor and the second tensor is a label tensor.
    """

    # Get the maximum length of the text embeddings in the batch.
    max_length = max([len(x[0]) for x in batch])

    # Pad all of the text embeddings to the same length.
    padded_text_embeddings = []
    for x in batch:
        padded_text_embeddings.append(torch.nn.functional.pad(x[0], (0, max_length - len(x[0]))))

    # Get the labels from the batch.
    labels = [x[1] for x in batch]

    # Return the padded text embeddings and the labels.
    return padded_text_embeddings, labels

# Create a DataLoader object
dataloader = torch.utils.data.DataLoader(dataset, batch_size=BATH_SIZE, shuffle=False, ) # collate_fn=collate_fn

In [54]:
import torch
import torch.nn as nn

class GRUModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_classes, num_layers=10):
        super(GRUModel, self).__init__()

        # Embedding layer
        # self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.embedding = nn.Embedding(vocab_size, embedding_dim)

        # GRU layer
        # self.gru = nn.GRU(embedding_dim, hidden_dim)
        self.gru = nn.GRU(embedding_dim, hidden_dim, num_layers)

        # Fully connected layer
        # self.fc = nn.Linear(hidden_dim, num_classes)
        self.fc = nn.Linear(hidden_dim, num_classes)

        self.init_weights()

    def init_weights(self):
        initrange = 0.5
        self.embedding.weight.data.uniform_(-initrange, initrange)
        self.fc.weight.data.uniform_(-initrange, initrange)
        self.fc.bias.data.zero_()

    def forward(self, x):
        # Embed the input text
        x = self.embedding(x)

        # Pass the embedded input to the GRU layer
        x, _ = self.gru(x)

        # Take the last hidden state vector and pass it to the fully connected layer
        x = self.fc(x[:, -1])

        return x

In [61]:
import torch
from transformers import BertModel, BertTokenizer

class CustomBERTModel(torch.nn.Module):
    def __init__(self, pretrained_model_name):
        super(CustomBERTModel, self).__init__()
        
        # Load the pre-trained BERT model and tokenizer
        self.bert = BertModel.from_pretrained(pretrained_model_name)
        self.tokenizer = BertTokenizer.from_pretrained(pretrained_model_name)

    def forward(self, input_ids, attention_mask):
        # Forward pass through the BERT model
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        return outputs



In [55]:
# Access the alt_title data and convert it into a list
alt_title_list = [item[0] for item in dataset]

# Find the unique values in the alt_title data
unique_alt_titles = list(set(alt_title_list))

# Get the number of unique titles
num_unique_titles = len(unique_alt_titles)

In [56]:
num_unique_titles

3488

In [57]:
vocab_size = len(vocab)

In [60]:
model = GRUModel(vocab_size, BATH_SIZE, BATH_SIZE, 7, num_layers=20)

# Define the loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters())

# Initialize variables to keep track of training statistics
total_loss = 0
correct = 0
total = 0

# Train the model
for epoch in range(10):
    model.train()  # Set the model to training mode
    total_loss = 0
    correct = 0
    total = 0

    for text, label in dataloader:
        optimizer.zero_grad()

        # Forward pass
        outputs = model(text)

        # Calculate the loss
        loss = criterion(outputs, label)
        
        total_loss += loss.item()

        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1)

        # Backward pass and update the model parameters
        loss.backward()
        optimizer.step()

        # Calculate accuracy for the current batch
        _, predicted = torch.max(outputs, 1)
        total += label.size(0)
        correct += (predicted == label).sum().item()

    # Print training statistics for the epoch
    accuracy = 100 * correct / total
    print(f'Epoch [{epoch + 1}/10], Loss: {total_loss / len(dataloader):.4f}, Accuracy: {accuracy:.2f}%')

# Final accuracy after training
print('Final Accuracy:', accuracy)


Epoch [1/10], Loss: 1.7786, Accuracy: 37.99%
Epoch [2/10], Loss: 1.7399, Accuracy: 39.91%
Epoch [3/10], Loss: 1.7354, Accuracy: 39.91%
Epoch [4/10], Loss: 1.7325, Accuracy: 39.91%
Epoch [5/10], Loss: 1.7304, Accuracy: 39.91%
Epoch [6/10], Loss: 1.7287, Accuracy: 39.91%
Epoch [7/10], Loss: 1.7273, Accuracy: 39.91%
Epoch [8/10], Loss: 1.7261, Accuracy: 39.91%
Epoch [9/10], Loss: 1.7250, Accuracy: 39.91%
Epoch [10/10], Loss: 1.7242, Accuracy: 39.91%
Final Accuracy: 39.908256880733944


In [64]:
model = CustomBERTModel('neuralmind/bert-base-portuguese-cased')

# Define the loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters())

# Initialize variables to keep track of training statistics
total_loss = 0
correct = 0
total = 0

# Inside your training loop
for epoch in range(10):
    model.train()
    for batch in dataloader:
        input_ids, attention_mask, labels = batch
        optimizer.zero_grad()
        outputs = custom_bert(input_ids, attention_mask)
        logits = outputs.logits
        loss = criterion(logits, labels)
        loss.backward()
        optimizer.step()

    # Validation and evaluation code here

# Final accuracy after training
print('Final Accuracy:', accuracy)


ValueError: not enough values to unpack (expected 3, got 2)