In [16]:
import nltk
from gensim.utils import tokenize
import psycopg2
import spacy
from gensim.models import TfidfModel
from gensim.corpora import Dictionary
from gensim.similarities import SparseMatrixSimilarity
from transformers import BertTokenizerFast, BertForQuestionAnswering, AdamW, BertModel, pipeline
import torch
from torch.utils.data import DataLoader, TensorDataset

# Make sure the NLTK stopword corpus is available locally (the recorded cell
# output shows it was already up to date), then load the Portuguese list.
nltk.download('stopwords')
# Module-level list of Portuguese stopwords; read by Preprocessor.__init__.
stopwords = nltk.corpus.stopwords.words('portuguese')
import os

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\andre\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [17]:
class Preprocessor:
    """Tokenises Portuguese text and loads document rows from PostgreSQL."""

    def __init__(self):
        # Module-level Portuguese stopword list loaded from NLTK.
        self.stopwords = stopwords

    def preprocess(self, line):
        """Lower-case ``line``, tokenise it and drop stopwords.

        Args:
            line: Text to process. ``None`` (e.g. a NULL database column) and
                the empty string both yield an empty token list.

        Returns:
            list[str]: The remaining tokens, in their original order.
        """
        if not line:
            return []
        # Build the set once per call so each membership test is O(1)
        # instead of a scan over the whole stopword list.
        stopword_set = set(self.stopwords)
        return [token for token in tokenize(line.lower()) if token not in stopword_set]

    def fetch_documents(self):
        """Return every row of ``public.dreapp_document`` ordered by ``id``.

        Connection settings are read from the standard libpq environment
        variables (PGDATABASE, PGUSER, PGHOST, PGPASSWORD, PGPORT).

        NOTE(review): the fallbacks mirror the values that used to be
        hard-coded so the notebook keeps running unchanged; the password
        fallback should be removed before this notebook is shared.

        Returns:
            list[tuple]: One tuple per document row (``SELECT *`` column order).
        """
        conn = psycopg2.connect(
            database=os.environ.get("PGDATABASE", "diariorepublica"),
            user=os.environ.get("PGUSER", "postgres"),
            host=os.environ.get("PGHOST", "localhost"),
            password=os.environ.get("PGPASSWORD", "1597535"),
            port=int(os.environ.get("PGPORT", "5432")),
        )
        try:
            cur = conn.cursor()
            cur.execute("SELECT * FROM public.dreapp_document ORDER BY id;")
            return cur.fetchall()
        finally:
            # Always release the connection, even when the query fails.
            conn.close()

    def preprocess_notes(self, records):
        """Tokenise the text stored at column index 10 of every record.

        Args:
            records: Rows as returned by ``fetch_documents``. Index 10 is
                assumed to hold the document notes -- TODO confirm against
                the table schema.

        Returns:
            list[list[str]]: One token list per record, in input order.
        """
        return [self.preprocess(record[10]) for record in records]

In [18]:
class AnswerExtractor:
    """Extracts named entities from text with the spaCy ``pt_core_news_sm`` pipeline."""

    # Value assigned to the pipeline's ``max_length`` so that very long
    # document texts are accepted.
    MAX_TEXT_LENGTH = 10000000

    def __init__(self):
        self.nlp = spacy.load("pt_core_news_sm")
        self.nlp.max_length = self.MAX_TEXT_LENGTH

    def get_named_entities(self, text):
        """Return the named entities found in ``text``.

        Args:
            text: Text to analyse. ``None`` or an empty string yields ``[]``
                without invoking the pipeline.

        Returns:
            list[tuple[str, int]]: ``(entity_text, start_char)`` pairs in the
            order the pipeline reports them.
        """
        if not text:
            return []
        # ``max_length`` is configured once in ``__init__``; it does not need
        # to be set again on every call.
        doc = self.nlp(text)
        return [(ent.text, ent.start_char) for ent in doc.ents]

In [19]:
class QAPairCreator:
    """Builds SQuAD-style question/answer records from stored document texts."""

    # Fixed question templates (in Portuguese). Every template is paired with
    # every entity extracted from a document.
    # NOTE(review): questions and entities are combined without checking that
    # the entity actually answers the question -- confirm this is intended.
    QUESTIONS = (
        "Qual é o tema principal do documento?",
        "Quem é o autor ou a autoridade emissora do documento?",
        "Quando o documento foi publicado?",
        "Quais ações ou medidas são propostas no documento?",
        "Quais organizações ou indivíduos são mencionados no documento?",
        "Quais locais ou regiões são referenciados no documento?",
        "Qual é o propósito do documento?",
    )

    def __init__(self):
        self.preprocessor = Preprocessor()
        self.answer_extractor = AnswerExtractor()

    def create_qa_pairs(self, documents, connection=None):
        """Create one QA record per (question, entity) combination per document.

        For each document the full text is read from
        ``public.dreapp_documenttext`` and passed through the entity
        extractor; every extracted entity becomes a candidate answer for
        every question in ``QUESTIONS``.

        Args:
            documents: Iterable of document rows; index 0 must be the
                document id.
            connection: Optional open DB-API connection to reuse. When
                omitted, a single connection is opened for the whole run and
                closed before returning. A caller-supplied connection is
                never closed here.

        Returns:
            list[dict]: Records with keys ``"context"``, ``"question"`` and
            ``"answers"`` (``{"text": [...], "answer_start": [...]}``).
        """
        qa_pairs = []
        if not documents:
            return qa_pairs

        owns_connection = connection is None
        if owns_connection:
            # Settings come from the standard libpq environment variables.
            # NOTE(review): the fallbacks mirror the values that used to be
            # hard-coded; remove the password fallback before sharing.
            connection = psycopg2.connect(
                database=os.environ.get("PGDATABASE", "diariorepublica"),
                user=os.environ.get("PGUSER", "postgres"),
                host=os.environ.get("PGHOST", "localhost"),
                password=os.environ.get("PGPASSWORD", "1597535"),
                port=int(os.environ.get("PGPORT", "5432")),
            )

        try:
            # One connection and cursor are shared by all documents instead
            # of opening (and leaking) a new connection per document.
            cur = connection.cursor()
            for doc in documents:
                document_id = doc[0]
                print("Document_id:", document_id)

                # Parameterised query: the driver handles the quoting.
                cur.execute(
                    "SELECT html_text FROM public.dreapp_documenttext WHERE document_id = %s",
                    (document_id,),
                )
                result = cur.fetchone()
                # A missing row and a NULL text column are both treated as empty.
                full_text = (result[0] or "") if result is not None else ""

                # Candidate answers come from named-entity recognition.
                entities = self.answer_extractor.get_named_entities(full_text)

                for question in self.QUESTIONS:
                    for entity, start_char in entities:
                        qa_pairs.append({
                            "context": full_text,
                            "question": question,
                            "answers": {"text": [entity], "answer_start": [start_char]}
                        })
        finally:
            if owns_connection:
                connection.close()

        return qa_pairs

In [20]:
class QA_System:
    """Retrieval + extractive question answering over the stored documents.

    A TF-IDF index over the tokenised document notes selects the most similar
    document; a Portuguese BERT question-answering pipeline then extracts the
    answer from that document's full text.
    """

    def __init__(self):
        self.preprocessor = Preprocessor()
        self.qa_pair_creator = QAPairCreator()
        self.generator = pipeline('question-answering', model='pierreguillou/bert-base-cased-squad-v1.1-portuguese')

    def preprocess_and_create_qa_pairs(self):
        """Fetch all documents and build QA pairs from them.

        Returns:
            Whatever ``QAPairCreator.create_qa_pairs`` returns for the
            fetched records.
        """
        records = self.preprocessor.fetch_documents()
        return self.qa_pair_creator.create_qa_pairs(records)

    def _fetch_full_text(self, document_id):
        """Return the stored ``html_text`` of ``document_id`` ("" if absent or NULL)."""
        # Settings come from the standard libpq environment variables.
        # NOTE(review): the fallbacks mirror the values that used to be
        # hard-coded; remove the password fallback before sharing.
        conn = psycopg2.connect(
            database=os.environ.get("PGDATABASE", "diariorepublica"),
            user=os.environ.get("PGUSER", "postgres"),
            host=os.environ.get("PGHOST", "localhost"),
            password=os.environ.get("PGPASSWORD", "1597535"),
            port=int(os.environ.get("PGPORT", "5432")),
        )
        try:
            cur = conn.cursor()
            # Parameterised query: the driver handles the quoting.
            cur.execute(
                "SELECT html_text FROM public.dreapp_documenttext WHERE document_id = %s",
                (document_id,),
            )
            result = cur.fetchone()
        finally:
            # Always release the connection, even when the query fails.
            conn.close()
        return (result[0] or "") if result else ""

    def answer_question(self, question):
        """Answer ``question`` from the document whose notes best match it.

        Args:
            question: Natural-language question.

        Returns:
            The output of the question-answering pipeline for the selected
            document's full text.

        Raises:
            ValueError: If the database returned no documents.
        """
        # Fetch and tokenise the corpus once; the same rows back the
        # dictionary, the TF-IDF corpus and the final id lookup.
        records = self.preprocessor.fetch_documents()
        if not records:
            raise ValueError("No documents available to answer the question.")
        notes = self.preprocessor.preprocess_notes(records)

        # Create a dictionary and TF-IDF model
        dictionary = Dictionary(notes)
        corpus = [dictionary.doc2bow(note) for note in notes]
        tfidf = TfidfModel(corpus, normalize=True)

        # Preprocess the question and convert it to a vector
        question_bow = dictionary.doc2bow(self.preprocessor.preprocess(question))

        # Calculate the cosine similarity between the question and each note
        index = SparseMatrixSimilarity(tfidf[corpus], num_features=len(dictionary))
        similarities = index[tfidf[question_bow]]

        # Get the id of the most similar note's document
        most_similar_index = int(similarities.argmax())
        document_id = records[most_similar_index][0]
        print(f"Most similar document: {document_id}")

        # Generate an answer based on the full text of that document
        full_text = self._fetch_full_text(document_id)
        return self.generator(question=question, context=full_text)

In [21]:
# Build the QA system once: this constructs the preprocessor, the QA-pair
# creator (which loads the spaCy pipeline) and the Portuguese BERT QA pipeline.
qa_system = QA_System()

In [22]:
# Fetch every document and build QA pairs from its named entities.
# NOTE: the recorded run below was stopped with KeyboardInterrupt, so
# `qa_pairs` was never assigned in that session.
qa_pairs = qa_system.preprocess_and_create_qa_pairs()

Document_id: 1
Document_id: 2
Document_id: 3
Document_id: 4
Document_id: 5
Document_id: 6
Document_id: 7
Document_id: 8
Document_id: 9
Document_id: 10
Document_id: 11
Document_id: 12
Document_id: 13
Document_id: 14
Document_id: 15
Document_id: 16
Document_id: 17
Document_id: 18
Document_id: 19
Document_id: 20
Document_id: 21
Document_id: 22
Document_id: 23
Document_id: 24
Document_id: 25
Document_id: 26
Document_id: 27
Document_id: 28
Document_id: 29
Document_id: 30
Document_id: 31
Document_id: 32
Document_id: 33
Document_id: 34
Document_id: 35
Document_id: 36
Document_id: 37
Document_id: 38
Document_id: 39
Document_id: 40
Document_id: 41
Document_id: 42
Document_id: 43
Document_id: 44
Document_id: 45
Document_id: 46
Document_id: 47
Document_id: 48
Document_id: 49
Document_id: 50
Document_id: 51
Document_id: 52
Document_id: 53
Document_id: 54
Document_id: 55
Document_id: 56
Document_id: 57
Document_id: 58
Document_id: 59
Document_id: 60
Document_id: 61
Document_id: 62
Document_id: 63
D

KeyboardInterrupt: 

In [None]:
# Ask a sample question: the document with the most similar notes (TF-IDF)
# is selected and the QA pipeline extracts an answer from its full text.
answer = qa_system.answer_question("O que é aconteceu no dia 11 de setembro de 2001?")
print(answer)

Most similar document: 80
{'score': 0.19785882532596588, 'start': 1879, 'end': 1932, 'answer': 'o uso de indicações, desenhos, figuras ou ilustrações'}
