In [1]:
import pandas as pd
from transformers import pipeline

  from .autonotebook import tqdm as notebook_tqdm


# Preparação de contextos para prompts

In [5]:
from typing import List
from abc import abstractmethod

In [6]:
from abc import ABC, abstractmethod


class PromptParser(ABC):
    """Interface for prompt-building strategies.

    A concrete parser turns example questions/answers into a context string
    and post-processes the text produced by the model. Both methods are
    abstract, so this class cannot be instantiated directly.
    """

    @abstractmethod
    def prepare_context(self, questions: List[str], answers: List[str]) -> str:
        """Build the context string from example questions and answers.

        Args:
            questions: Example question texts.
            answers: Example answer texts.

        Returns:
            The context as a single string.

        Raises:
            NotImplementedError: Always, when the base implementation is
                invoked (e.g. through ``super()``).
        """
        raise NotImplementedError()

    @abstractmethod
    def parse_output(self, output: str) -> str:
        """Post-process the text generated by the model.

        Args:
            output: Text to post-process.

        Returns:
            The processed text.

        Raises:
            NotImplementedError: Always, when the base implementation is
                invoked (e.g. through ``super()``).
        """
        raise NotImplementedError()

In [14]:
class NaiveParser(PromptParser):
    """Parser that uses only the raw answer texts as context."""

    def prepare_context(self, questions: List[str], answers: List[str]) -> str:
        """Join the answers, one per line, and append a single trailing space.

        Args:
            questions: Not used by this strategy.
            answers: Answer texts to place in the context.

        Returns:
            The answers separated by newlines, followed by one space.
        """
        # Naive approach: only the answer texts are used as context.
        joined_answers = '\n'.join(answers)
        return joined_answers + ' '

    def parse_output(self, output: str) -> str:
        """Return the model output unchanged."""
        # No post-processing.
        return output

In [25]:
class FormattedParser(PromptParser):
    """Parser that renders each example as a ``Question: ... Answer: ...`` line."""

    def prepare_context(self, questions: List[str], answers: List[str]) -> str:
        """Format the question/answer pairs as examples and open a new question.

        Each pair becomes one line of the form
        ``Question: <question text> Answer: <answer text>``, and the context
        ends with a trailing ``Question: `` cue so the caller can append the
        new question directly after it.

        Args:
            questions: Example question texts.
            answers: Answer texts, paired positionally with ``questions``.
                Pairing uses ``zip``, so surplus items in the longer
                sequence are ignored.

        Returns:
            The formatted context string.
        """
        examples = [f'Question: {q} Answer: {a}\n' for q, a in zip(questions, answers)]

        # Every example already ends with a newline, so concatenate them
        # directly; a " " separator would leave a stray leading space on
        # every line after the first.
        context_str = ''.join(examples) + "Question: "

        # Return the string itself: calling str.join on an already-built
        # string iterates over its characters and would insert a space
        # between each of them, garbling the whole prompt.
        return context_str

    def parse_output(self, output: str) -> str:
        """Return the model output unchanged."""
        # No post-processing.
        return output

# Carregando perguntas e respostas

In [18]:
# Load the question/answer table; the first CSV column is used as the index.
qa_df = pd.read_csv(
    'data/qa_data.csv',
    index_col=0,
)

In [20]:
# Preview the first rows of the loaded question/answer table.
qa_df.head()

Unnamed: 0,questions,answers
0,What is the hero's name in The Legend of Zelda?,"Despite most people's believes, he's called Link"
1,What are the names of the ghosts who chase Pac...,"Inky, Blinky, Pinky, and Clyde"
2,What's the name of the Mythbusters' crash test...,The Mythbusters' crash test dummy is called Bu...
3,What is an Oxford comma?,The hotly contested punctuation before a conju...
4,Who was the captain of the Enterprise in the p...,The captain of the Enterprise in the pilot epi...


# Preparando modelo para inferência

In [12]:
# Build a text-generation pipeline for the facebook/opt-350m checkpoint on CPU.
qa_pipeline = pipeline(
    'text-generation',
    model='facebook/opt-350m',
    device='cpu',
)

In [15]:
# Use the answers-only context strategy for this run.
parser = NaiveParser()

In [16]:
# Question that is appended after the prepared context to form the prompt.
question = "What does the acronym GNU represent?"

In [22]:
# Prompt = context built from the dataset columns, followed by the new question.
prepared_input = (
    parser.prepare_context(qa_df['questions'], qa_df['answers'])
    + question
)
prepared_input

"Despite most people's believes, he's called Link\nInky, Blinky, Pinky, and Clyde\nThe Mythbusters' crash test dummy is called Buster\nThe hotly contested punctuation before a conjunction in a list\nThe captain of the Enterprise in the pilot episode was Captain Pike\nThe percentage symbol is used as modulus operator in C\nThe main function\nFortran was introduced by IBM in 1957\nThe first programmer was Ada Lovelace\nThe first known case of robot homicide occurred in 1981, when a robotic arm crushed a Japanese Kawasaki factory worker\nA hitchhiking robot that relied on the kindness of strangers to travel the world and was slain by humans\nYes. Car accidents are product of human misconduct\nGrace Hoper. She wrote it in her spare time\nPython was invented by Guido van Rossum\nSure. I've never seen him drink water\nI heard an electric can opener\nSalads made with ice-borg lettuce.\nI had a byte\nPeople kept pushing our buttons.\nHe says so many foul words they have to bleep everything he 

In [34]:
# Generate a single continuation of at most 32 new tokens; with
# return_full_text=False only the generated text is returned, not the prompt.
# NOTE(review): max_length=None presumably keeps the length limit governed by
# max_new_tokens alone — confirm against the pipeline documentation.
output = qa_pipeline(
    prepared_input,
    max_new_tokens=32,
    max_length=None,
    return_full_text=False,
    num_beams=1,
    num_return_sequences=1,
)

In [35]:
# Display the raw pipeline result.
output

[{'generated_text': '\nGNU is a recursive acronym meaning GNU is Not Unix\nMPEG stands for Moving Picture Experts Group\nSMTP stands for Simple Mail Transport Protocol\nGN'}]