In [9]:
import nltk
from gensim.utils import tokenize
import psycopg2
import spacy
from gensim.models import TfidfModel
from gensim.corpora import Dictionary
from gensim.similarities import SparseMatrixSimilarity
from transformers import BertTokenizerFast, BertForQuestionAnswering, AdamW, pipeline
import torch
from torch.utils.data import DataLoader, TensorDataset

# Make sure the NLTK stopword corpus is available locally, then keep the
# Portuguese stopword list at module level for the preprocessing code.
nltk.download('stopwords')
stopwords = nltk.corpus.stopwords.words('portuguese')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\andre\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [10]:
class Preprocessor:
    """Tokenises text and loads document rows from the ``diariorepublica`` database."""

    def __init__(self):
        # A set gives constant-time membership tests; scanning the module-level
        # ``stopwords`` list for every token is needlessly slow on a full corpus.
        self.stopwords = set(stopwords)

    def preprocess(self, line):
        """Lower-case ``line``, tokenise it and drop stopwords.

        Args:
            line: Text to process. ``None`` (e.g. a NULL database column) and
                the empty string both yield an empty token list.

        Returns:
            list[str]: The remaining tokens, in their original order.
        """
        if not line:
            return []
        return [token for token in tokenize(line.lower())
                if token not in self.stopwords]

    def fetch_documents(self):
        """Return every row of ``public.dreapp_document``.

        Returns:
            list[tuple]: All rows, as returned by ``cursor.fetchall()``.
        """
        # NOTE(review): the connection settings, including the password, are
        # hard-coded here; consider reading them from the environment instead.
        conn = psycopg2.connect(database="diariorepublica",
                                user="postgres",
                                host='localhost',
                                password="1597535",
                                port=5432)
        try:
            cur = conn.cursor()
            cur.execute("SELECT * FROM public.dreapp_document;")
            return cur.fetchall()
        finally:
            # Release the connection even when the query raises.
            conn.close()

    def preprocess_notes(self, records):
        """Preprocess the text stored at column index 10 of each record.

        Exactly one token list is produced per record (an empty list when the
        column is ``None``), so positions in the result line up with positions
        in ``records``.

        Args:
            records: Iterable of row tuples with at least 11 columns.

        Returns:
            list[list[str]]: One token list per record.
        """
        return [self.preprocess(record[10]) for record in records]

In [11]:
class AnswerExtractor:
    """Finds named entities in text with spaCy's ``pt_core_news_sm`` pipeline."""

    def __init__(self):
        spacy_model_name = "pt_core_news_sm"
        self.nlp = spacy.load(spacy_model_name)

    def get_named_entities(self, text):
        """Return a ``(entity_text, start_char)`` tuple for each entity in ``text``.

        Args:
            text: The string handed to the spaCy pipeline.

        Returns:
            list[tuple]: Entity surface form and its character offset, in the
            order spaCy reports them.
        """
        found = []
        for span in self.nlp(text).ents:
            found.append((span.text, span.start_char))
        return found

In [12]:
class QAPairCreator:
    """Builds extractive-QA training examples from stored documents."""

    # Fixed question templates (in Portuguese). Every template is paired with
    # every named entity found in a document.
    QUESTIONS = (
        "Qual é o tema principal do documento?",
        "Quem é o autor ou a autoridade emissora do documento?",
        "Quando o documento foi publicado?",
        "Quais ações ou medidas são propostas no documento?",
        "Quais organizações ou indivíduos são mencionados no documento?",
        "Quais locais ou regiões são referenciados no documento?",
        "Qual é o propósito do documento?",
    )

    def __init__(self):
        self.preprocessor = Preprocessor()
        self.answer_extractor = AnswerExtractor()

    def create_qa_pairs(self, documents):
        """Create one QA example per (question, named entity) for each document.

        For every row in ``documents`` the document's ``html_text`` is read
        from ``public.dreapp_documenttext``, named entities are extracted from
        it, and each entity is emitted as the answer to each question template.

        Args:
            documents: Row tuples whose first column is the document id.

        Returns:
            list[dict]: Items shaped like
            ``{"context": str, "question": str,
            "answers": {"text": [str], "answer_start": [int]}}``.
            Documents with no stored text contribute no items.
        """
        qa_pairs = []
        if not documents:
            return qa_pairs

        # One connection is shared by all look-ups and always closed afterwards;
        # opening a fresh, never-closed connection per document exhausts the
        # server's connection slots on a large corpus.
        # NOTE(review): connection settings are hard-coded; consider reading
        # them from the environment instead.
        conn = psycopg2.connect(database="diariorepublica",
                                user="postgres",
                                host='localhost',
                                password="1597535",
                                port=5432)
        try:
            cur = conn.cursor()
            for doc in documents:
                document_id = doc[0]

                # Bound parameter instead of string interpolation.
                cur.execute(
                    "SELECT html_text FROM public.dreapp_documenttext WHERE document_id = %s",
                    (document_id,),
                )
                result = cur.fetchone()
                full_text = result[0] if result and result[0] else ""
                if not full_text:
                    # Missing row or NULL text: nothing to extract answers from.
                    continue

                # Candidate answers come from NER over the full document text.
                entities = self.answer_extractor.get_named_entities(full_text)

                for question in self.QUESTIONS:
                    for entity, start_char in entities:
                        qa_pairs.append({
                            "context": full_text,
                            "question": question,
                            "answers": {"text": [entity], "answer_start": [start_char]}
                        })
        finally:
            conn.close()

        return qa_pairs

In [13]:
class ModelTrainer:
    """Tokenises QA pairs and fine-tunes a Portuguese BERT model on them."""

    def __init__(self):
        self.tokenizer = BertTokenizerFast.from_pretrained('neuralmind/bert-base-portuguese-cased')

    def create_dataset(self, qa_pairs):
        """Tokenise ``qa_pairs`` and wrap them in a shuffled ``DataLoader``.

        Each pair is encoded as (context, question), truncated/padded to at
        most 512 tokens. The answer's character span is mapped to token
        positions; pairs whose answer does not map to tokens (for example
        because it lies in the truncated part of the context) are dropped,
        since they have no valid training target.

        Args:
            qa_pairs: Sequence of dicts with ``"context"``, ``"question"`` and
                ``"answers"`` (``{"text": [str], "answer_start": [int]}``).

        Returns:
            DataLoader: Batches of size 8 yielding
            ``(input_ids, attention_mask, start_positions, end_positions)``.

        Raises:
            ValueError: If ``qa_pairs`` is empty or no pair has an answer that
                survives tokenisation.
        """
        if not qa_pairs:
            raise ValueError("qa_pairs is empty; nothing to build a dataset from")

        encodings = self.tokenizer(
            [pair['context'] for pair in qa_pairs],
            [pair['question'] for pair in qa_pairs],
            truncation=True,
            padding=True,
            max_length=512,
            return_tensors='pt'
        )

        kept_rows = []
        start_positions = []
        end_positions = []
        for i, pair in enumerate(qa_pairs):
            answer_start = pair['answers']['answer_start'][0]
            # Index of the answer's last character (inclusive).
            answer_end = answer_start + len(pair['answers']['text'][0]) - 1

            start_token = encodings.char_to_token(i, answer_start)
            end_token = encodings.char_to_token(i, answer_end)
            # char_to_token yields None when the character has no token in the
            # encoded sequence; a None here would make torch.tensor() fail.
            if start_token is None or end_token is None:
                continue

            kept_rows.append(i)
            start_positions.append(start_token)
            end_positions.append(end_token)

        if not kept_rows:
            raise ValueError("no QA pair has an answer inside the tokenised context window")

        rows = torch.tensor(kept_rows, dtype=torch.long)
        # NOTE(review): token_type_ids are not carried into the dataset, so the
        # model never sees segment ids during training — confirm this is intended.
        dataset = TensorDataset(
            encodings['input_ids'][rows],
            encodings['attention_mask'][rows],
            torch.tensor(start_positions),
            torch.tensor(end_positions),
        )
        return DataLoader(dataset, batch_size=8, shuffle=True)

    def pretrain_model(self, dataloader):
        """Fine-tune BERT for question answering and save it to ``trained_qa_model``.

        Args:
            dataloader: Iterable of batches whose first four items are
                ``input_ids``, ``attention_mask``, ``start_positions`` and
                ``end_positions``.
        """
        model = BertForQuestionAnswering.from_pretrained('neuralmind/bert-base-portuguese-cased')
        optimizer = AdamW(model.parameters(), lr=1e-5)

        num_epochs = 3
        model.train()
        for _ in range(num_epochs):
            for batch in dataloader:
                optimizer.zero_grad()
                input_ids, attention_mask, start_positions, end_positions = batch[:4]

                outputs = model(input_ids, attention_mask=attention_mask,
                                start_positions=start_positions, end_positions=end_positions)

                outputs.loss.backward()
                optimizer.step()

        # Save the tokenizer next to the weights so the directory can be loaded
        # on its own (e.g. by pipeline(..., model='trained_qa_model')).
        model.save_pretrained("trained_qa_model")
        self.tokenizer.save_pretrained("trained_qa_model")

In [14]:
class QA_System:
    """Ties together preprocessing, QA-pair creation, training and answering."""

    def __init__(self):
        self.preprocessor = Preprocessor()
        self.qa_pair_creator = QAPairCreator()
        self.model_trainer = ModelTrainer()
        # The pipeline is created on first use rather than here: the
        # 'trained_qa_model' directory is only written by training, so loading
        # it eagerly made it impossible to build the object that does the training.
        self._generator = None

    @property
    def generator(self):
        """The question-answering pipeline, loaded from ``trained_qa_model`` on first access."""
        if self._generator is None:
            self._generator = pipeline('question-answering', model='trained_qa_model')
        return self._generator

    @generator.setter
    def generator(self, value):
        self._generator = value

    def preprocess_and_create_qa_pairs(self):
        """Fetch all documents and turn them into QA training pairs.

        Returns:
            list[dict]: The output of ``QAPairCreator.create_qa_pairs``.
        """
        records = self.preprocessor.fetch_documents()
        return self.qa_pair_creator.create_qa_pairs(records)

    def train_model(self, qa_pairs):
        """Build a dataset from ``qa_pairs`` and fine-tune the QA model on it."""
        dataloader = self.model_trainer.create_dataset(qa_pairs)
        self.model_trainer.pretrain_model(dataloader)
        # Forget any cached pipeline so the next answer uses the new weights.
        self._generator = None

    def answer_question(self, question):
        """Answer ``question`` from the stored document most similar to it.

        The document is chosen by TF-IDF cosine similarity between the
        preprocessed question and each document's preprocessed note; the QA
        pipeline is then run over that document's full text.

        Args:
            question: The natural-language question.

        Returns:
            Whatever the question-answering pipeline returns for the question
            and the selected document's text.

        Raises:
            ValueError: If the database holds no documents.
        """
        # Fetch and preprocess the corpus once; every later step reuses it, so
        # indices stay aligned with ``records``.
        records = self.preprocessor.fetch_documents()
        if not records:
            raise ValueError("no documents available to answer from")
        notes = self.preprocessor.preprocess_notes(records)

        # Dictionary, bag-of-words corpus and TF-IDF model over the notes.
        dictionary = Dictionary(notes)
        corpus = [dictionary.doc2bow(note) for note in notes]
        tfidf = TfidfModel(corpus, normalize=True)

        # Cosine similarity between the question and each note.
        question_bow = dictionary.doc2bow(self.preprocessor.preprocess(question))
        index = SparseMatrixSimilarity(tfidf[corpus], num_features=len(dictionary))
        similarities = index[tfidf[question_bow]]

        most_similar_index = similarities.argmax()
        document_id = records[most_similar_index][0]
        print(f"Most similar document: {document_id}")

        full_text = self._fetch_full_text(document_id)
        return self.generator(question=question, context=full_text)

    def _fetch_full_text(self, document_id):
        """Return ``html_text`` for ``document_id``, or ``""`` when no row exists."""
        # NOTE(review): connection settings are hard-coded; consider reading
        # them from the environment instead.
        conn = psycopg2.connect(database="diariorepublica",
                                user="postgres",
                                host='localhost',
                                password="1597535",
                                port=5432)
        try:
            cur = conn.cursor()
            # Bound parameter instead of string interpolation.
            cur.execute(
                "SELECT html_text FROM public.dreapp_documenttext WHERE document_id = %s",
                (document_id,),
            )
            result = cur.fetchone()
        finally:
            conn.close()

        return result[0] if result else ""

In [15]:
# Instantiate the end-to-end QA system.
qa_system = QA_System()

ValueError: GPU is not accessible. Was the library installed correctly?

In [8]:
# Fetch the documents from the database and derive QA training pairs from them.
qa_pairs = qa_system.preprocess_and_create_qa_pairs()

KeyboardInterrupt: 