<a href="https://colab.research.google.com/github/jpmedras/programming-problem-classifier/blob/main/inferencia.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Classificação de Dificuldade em Problemas de Programação

In [None]:
#@title Encoder
#@markdown Aqui está a definição das funções de encoder. Não se preocupe com essa célula, você não precisa alterá-la, **apenas executá-la**.

from torch import tensor, long
from torch import unsqueeze, cat
from transformers import BertTokenizer

def define_encoders(max_len):
    """Build the text and label encoders used to feed the BERT classifier.

    Args:
        max_len: Number of tokens each text is padded or truncated to.

    Returns:
        A ``(inputs_encoder, labels_encoder)`` tuple of functions.
    """
    def inputs_encoder(inputs):
        """Tokenize a sequence of texts into a single stacked tensor.

        Args:
            inputs: Indexable sequence of strings to encode.

        Returns:
            A long tensor with one entry per text along dim 0; dim 1 holds,
            in order, the input ids, the attention mask and the token type
            ids. Returns ``None`` (after printing diagnostics) when there is
            nothing to stack or the per-text tensors cannot be concatenated.
        """
        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased').encode_plus
        fields = ('input_ids', 'attention_mask', 'token_type_ids')

        encoded_inputs = []
        for text in inputs:
            encoding = tokenizer(
                text=text,
                add_special_tokens=True,
                padding='max_length',
                truncation='longest_first',
                max_length=max_len
            )

            # One (1, seq_len) row per field, stacked to (3, seq_len) and
            # then given a leading batch dimension.
            rows = [tensor(encoding[field], dtype=long).unsqueeze(0) for field in fields]
            encoded_inputs.append(cat(rows, dim=0).unsqueeze(0))

        if not encoded_inputs:
            # Nothing to stack: report it the same way as a failed stack
            # instead of crashing inside the diagnostics below.
            print('Number of inputs:', len(encoded_inputs))
            return None

        try:
            return cat(encoded_inputs)
        except RuntimeError:
            # torch.cat rejects tensors of different lengths; show the first
            # text whose encoded length differs from max_len.
            print('Number of inputs:', len(encoded_inputs))
            max_dif = max(e.shape[2] for e in encoded_inputs)
            print('Max shape:', max_dif)
            for idx, e in enumerate(encoded_inputs):
                if e.shape[2] != max_len:
                    print(idx)
                    print(e.shape[2])
                    print(inputs[idx])
                    break
            return None

    def labels_encoder(labels):
        """Encode class indices as a long tensor of shape ``(len(labels), 1)``.

        Args:
            labels: Non-empty iterable of integer class indices.

        Returns:
            A long tensor with one row per label.
        """
        return cat([tensor([label], dtype=long).unsqueeze(0) for label in labels])

    return inputs_encoder, labels_encoder

In [None]:
#@title Modelo
#@markdown Aqui está a definição do modelo. Não se preocupe com essa célula, você não precisa alterá-la, **apenas executá-la**.

import torch
from torch import nn, long, argmax, optim, save
from transformers import BertModel
from torch import cuda

import matplotlib.pyplot as plt
from sklearn.metrics import f1_score, confusion_matrix, ConfusionMatrixDisplay

class BERTModule(nn.Module):
    """BERT encoder followed by dropout and a linear classification head.

    Batches of inputs are expected as a single tensor whose dim 1 holds, in
    order, the input ids, the attention mask and the token type ids (the
    layout produced by ``define_encoders``).
    """

    # Display names for class indices 0, 1 and 2.
    TARGET_NAMES = ['Easy', 'Medium', 'Hard']

    def __init__(self, n_classes, dropout_p = 0.3):
        """Load the pretrained encoder and build the classification head.

        Args:
            n_classes: Number of output classes of the linear head.
            dropout_p: Dropout probability applied to the pooled output.
        """
        super().__init__()
        # The encoder parameters are left trainable (fine-tuned, not frozen).
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.dropout = nn.Dropout(p = dropout_p)
        self.fc = nn.Linear(768, n_classes)

        self.device = 'cuda' if cuda.is_available() else 'cpu'
        self.to(self.device)

    def forward(self, ids, masks, ttis):
        """Return the class logits for a batch.

        Args:
            ids: Token ids.
            masks: Attention mask.
            ttis: Token type ids.

        Returns:
            The output of the linear head applied to the (dropped-out)
            pooled encoder output.
        """
        _, pooled_output = self.bert(ids, attention_mask = masks, token_type_ids = ttis, return_dict = False)
        return self.fc(self.dropout(pooled_output))

    def _unpack_inputs(self, inputs):
        """Split a batch tensor into ids, masks and token type ids on ``self.device``."""
        ids = inputs[:, 0].to(self.device, dtype=long)
        masks = inputs[:, 1].to(self.device, dtype=long)
        tti = inputs[:, 2].to(self.device, dtype=long)

        assert ids.shape == masks.shape, 'Ids != Masks'
        assert masks.shape == tti.shape, 'Masks != Ttis'

        return ids, masks, tti

    def _unpack_labels(self, labels, batch_size):
        """Flatten labels to one class index per sample on ``self.device``."""
        # reshape(-1) rather than squeeze(): squeeze() turns a batch of one
        # into a 0-d tensor, which breaks both the size check and the loss.
        labels = labels.reshape(-1).to(self.device, dtype=long)
        assert batch_size == labels.shape[0], 'inputs and labels are incompatible'
        return labels

    def _average_loss(self, dataloader, criterion):
        """Return the mean per-batch loss over ``dataloader`` without training.

        Runs in eval mode with gradients disabled and restores the previous
        train/eval mode before returning. Returns NaN for an empty loader.
        """
        was_training = self.training
        self.eval()

        total_loss = 0.
        n_batches = 0
        with torch.no_grad():
            for inputs, labels in dataloader:
                ids, masks, tti = self._unpack_inputs(inputs)
                labels = self._unpack_labels(labels, ids.shape[0])
                total_loss += criterion(self(ids, masks, tti), labels).item()
                n_batches += 1

        self.train(was_training)
        return total_loss / n_batches if n_batches else float('nan')

    def fit(self, train_loader, test_loader, epochs = 10, learning_rate = 1e-05):
        """Train the model and save its weights to disk.

        Args:
            train_loader: Iterable of ``(inputs, labels)`` training batches.
            test_loader: Iterable of ``(inputs, labels)`` batches used only
                to report a loss after each epoch.
            epochs: Number of passes over ``train_loader``.
            learning_rate: Learning rate of the Adam optimizer.

        Returns:
            ``(train_losses, test_losses)``: per-epoch mean batch losses.

        Side effects:
            Writes the state dict to ``model_ep<epochs>_lr<learning_rate>.pth``
            in the current directory.
        """
        self.epochs = epochs
        self.learning_rate = learning_rate

        criterion = nn.CrossEntropyLoss()
        optimizer = optim.Adam(params = self.parameters(), lr = self.learning_rate)

        self.to(self.device)

        train_losses = []
        test_losses = []

        print('Begin training...')

        for epoch in range(self.epochs):
            # Re-assert training mode every epoch so dropout stays active
            # even after an evaluation pass.
            self.train()

            train_loss = 0.
            n_batches = 0

            for inputs, labels in train_loader:
                optimizer.zero_grad()

                ids, masks, tti = self._unpack_inputs(inputs)
                labels = self._unpack_labels(labels, ids.shape[0])

                loss = criterion(self(ids, masks, tti), labels)
                loss.backward()
                optimizer.step()

                train_loss += loss.item()
                n_batches += 1

            avg_train_loss = train_loss / n_batches if n_batches else float('nan')
            avg_test_loss = self._average_loss(test_loader, criterion)

            train_losses.append(avg_train_loss)
            test_losses.append(avg_test_loss)

            print(f'Epoch {epoch + 1}/{self.epochs} Train Loss: {avg_train_loss} Test Loss: {avg_test_loss}')

        print('Ending training...')

        model_name = f'model_ep{self.epochs}_lr{self.learning_rate}.pth'
        save(self.state_dict(), model_name)

        return train_losses, test_losses

    def evaluate(self, dataloader):
        """Print the macro F1 score and plot the confusion matrix.

        Args:
            dataloader: Iterable of ``(inputs, labels)`` batches.
        """
        self.eval()

        data_labels = []
        data_outputs = []

        with torch.no_grad():
            for inputs, labels in dataloader:
                ids, masks, tti = self._unpack_inputs(inputs)
                labels = self._unpack_labels(labels, ids.shape[0])

                # argmax over the logits picks the predicted class directly.
                outputs = argmax(self(ids, masks, tti), dim=1)

                data_labels.extend(labels.cpu().tolist())
                data_outputs.extend(outputs.cpu().tolist())

        target_names = self.TARGET_NAMES
        macro_f1 = f1_score(data_labels, data_outputs, average='macro')
        # Fix the label set so the matrix always has one row/column per
        # display name, even when a class is absent from this data.
        cm = confusion_matrix(data_labels, data_outputs, labels=list(range(len(target_names))))
        disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=target_names)
        print(f'Macro F1: {macro_f1}')
        disp.plot()
        plt.show()

    def predict(self, text, max_len = 300):
        """Print the predicted difficulty of a single problem statement.

        Args:
            text: Problem statement to classify.
            max_len: Token length the text is padded or truncated to; should
                match the length used when the weights were trained.
        """
        self.eval()

        input_encoder, _ = define_encoders(max_len=max_len)

        with torch.no_grad():
            encoded = input_encoder([text])
            ids, masks, tti = self._unpack_inputs(encoded)

            predicted = argmax(self(ids, masks, tti), dim=1).cpu().tolist()

            print(f'Text: {text}')
            print(f'Difficulty: {self.TARGET_NAMES[predicted[0]]}')

## Inferência

Aqui está a inferência. Siga os seguintes passos:

1. Baixe os pesos do modelo treinado pelo link no repositório (acesse [aqui](https://github.com/jpmedras/programming-problem-classifier/)).
2. Execute a célula abaixo para fazer o upload do arquivo baixado para este Colab.
3. Altere a variável `model_path` de acordo com o arquivo baixado.
4. Altere a variável `text` para o texto de um problema que deseja inferir a dificuldade.
5. Execute a última célula.


In [None]:
# Colab-only helper module for moving files between the browser and the runtime.
from google.colab import files

# Ask the user to pick the downloaded weights file and copy it into the
# runtime; the call's return value is kept in `uploaded`.
uploaded = files.upload()

In [None]:
from torch import load

# File name of the uploaded weights; change it to match the file you uploaded.
model_path = 'model_ep5_lr1e-05.pth'

model = BERTModule(n_classes = 3)
# Remap the stored tensors onto the device the model lives on, so that a
# checkpoint written on a GPU runtime still loads on a CPU-only runtime.
model.load_state_dict(load(model_path, map_location=model.device))

# Problem statement whose difficulty will be predicted.
text = """Given an integer x, return true if x is a palindrome, and false otherwise."""
# text = """You are given an array of k linked-lists lists, each linked-list is sorted in ascending order. Merge all the linked-lists into one sorted linked-list and return it."""

model.predict(text=text)