In [18]:
import os
import PyPDF2
import random

def extract_text_from_pdfs(pdf_folder):
    """Concatenate the text of every page of every PDF found in a folder.

    Files are visited in sorted name order so the resulting corpus is the
    same on every run (``os.listdir`` gives no ordering guarantee). The
    ``.pdf`` extension is matched case-insensitively. The scan is not
    recursive.

    Args:
        pdf_folder: Path of the directory that holds the PDF files.

    Returns:
        One string in which each page's text is followed by a newline.
        A page that yields no text contributes just the newline.
    """
    page_texts = []
    for filename in sorted(os.listdir(pdf_folder)):
        if not filename.lower().endswith('.pdf'):
            continue
        path = os.path.join(pdf_folder, filename)
        with open(path, 'rb') as file:
            reader = PyPDF2.PdfReader(file)
            for page in reader.pages:
                # Guard against extract_text() giving back None/empty so a
                # single text-less page cannot abort the whole extraction.
                page_texts.append((page.extract_text() or '') + '\n')
    # Join once at the end instead of growing a string page by page.
    return ''.join(page_texts)

pdf_folder = './pdfs'  # change this to the real path of the PDF folder
# Raw text of every PDF in pdf_folder, concatenated into a single string.
corpus_text = extract_text_from_pdfs(pdf_folder)


In [13]:
import re
import nltk
# Download the NLTK 'punkt' resource (the recorded output shows this is a
# no-op when the package is already up to date).
# NOTE(review): sent_tokenize is only referenced in commented-out code in this
# cell — confirm this download is still needed.
nltk.download('punkt')
from nltk.tokenize import sent_tokenize

def preprocess_text(text):
    """Lowercase the corpus and normalize its whitespace, keeping line breaks.

    Steps, in order:
      1. lowercase everything;
      2. turn non-breaking spaces (``\\xa0``) into regular spaces;
      3. collapse runs of spaces/tabs into one space;
      4. drop spaces that sit right before a line break;
      5. collapse runs of line breaks into a single one;
      6. strip leading/trailing whitespace.

    Line breaks are collapsed *after* trailing spaces are removed so that
    whitespace-only lines (e.g. ``"a\\n \\nb"``) do not survive as empty lines.

    Args:
        text: Raw text, e.g. as extracted from the PDFs.

    Returns:
        The normalized text with at most one consecutive line break.
    """
    text = text.lower()
    text = text.replace('\xa0', ' ')
    text = re.sub(r'[ \t]+', ' ', text)
    text = re.sub(r' +\n', '\n', text)
    # Must run last: the step above can turn "\n \n" into "\n\n".
    text = re.sub(r'\n+', '\n', text)  # keeps paragraph breaks (as single newlines)
    return text.strip()


# corpus_text = "Belém é a capital do Pará. É conhecida pelo Círio de Nazaré! Você já visitou?"
# Normalize case and whitespace of the extracted corpus (line breaks are kept).
processed_text = preprocess_text(corpus_text)
# clean_text = sent_tokenize(processed_text, language='portuguese')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\carlo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [20]:
def split_paragraphs_into_sentences(text, min_chars=30):
    """Split text into sentences, paragraph by paragraph.

    Each line of ``text`` is treated as a paragraph and is split wherever a
    ``.``, ``!`` or ``?`` is followed by whitespace. The punctuation mark
    stays attached to the sentence it ends.

    Args:
        text: Text whose paragraphs are separated by ``\\n``.
        min_chars: Fragments must be strictly longer than this many
            characters to be kept. The default (30) drops headings, page
            numbers and similar short fragments.

    Returns:
        A list of stripped sentences in their original order.
    """
    sentences = []
    for paragraph in text.split('\n'):
        # Lookbehind keeps the terminating punctuation with its sentence.
        for part in re.split(r'(?<=[.!?])\s+', paragraph.strip()):
            part = part.strip()
            if len(part) > min_chars:
                sentences.append(part)
    return sentences

# === 4. Generate questions/answers ===
def generate_qa_pairs(sentences):
    """Pair every sentence with the one that follows it.

    The question quotes the first 60 characters of the earlier sentence;
    the answer is the later sentence, unchanged.

    Returns:
        A list of ``(question, answer)`` tuples with ``len(sentences) - 1``
        entries (empty when there are fewer than two sentences).
    """
    return [
        (f"O que você pode me dizer sobre: '{earlier[:60]}...'", later)
        for earlier, later in zip(sentences, sentences[1:])
    ]

def _truncate_context(text, max_chars, min_cut=30):
    """Shorten ``text`` so that it never exceeds ``max_chars`` characters.

    Preference order:
      1. cut right after the last ``.``, ``!`` or ``?`` inside the first
         ``max_chars`` characters, as long as at least ``min_cut``
         characters precede that mark;
      2. otherwise cut at a word boundary and append ``...``.
    """
    if len(text) <= max_chars:
        return text

    window = text[:max(max_chars, 0)]
    cut = max(window.rfind(mark) for mark in '.!?')
    if cut >= min_cut:
        return window[:cut + 1]

    ellipsis = '...'
    room = max_chars - len(ellipsis)
    if room <= 0:
        # No space for an ellipsis: a plain hard cut is all we can do.
        return window
    head = text[:room]
    if not text[room].isspace():
        # The cut would land inside a word; back up to the previous space.
        last_space = head.rfind(' ')
        if last_space > 0:
            head = head[:last_space]
    return head.rstrip() + ellipsis


def generate_qa_pairs_improved(sentences, min_words=6, max_chars=180, rng=None):
    """Build (question, answer) pairs from consecutive sentences.

    Each sentence is used as the quoted context of a question (built from a
    randomly chosen template) and the sentence that follows it is the answer.
    A pair is produced only when both sentences have at least ``min_words``
    whitespace-separated words.

    Args:
        sentences: Sentences in reading order.
        min_words: Minimum word count required of both context and answer.
        max_chars: Maximum length of the quoted context. Longer contexts are
            cut at the last sentence-ending mark within the limit or, when
            there is none, at a word boundary followed by ``...``. Values
            below 30 are accepted (they used to raise ``re.error``).
        rng: Optional object with a ``choice(seq)`` method (for example
            ``random.Random(seed)``) used to pick the template. Defaults to
            the global ``random`` module, as before.

    Returns:
        A list of ``(question, answer)`` tuples.
    """
    question_templates = [
        "Você pode me explicar sobre \"{}\"?",
        "O que é \"{}\"?",
        "Fale sobre \"{}\".",
        "O que você sabe sobre \"{}\"?",
        "Qual a importância de \"{}\"?",
        "Conte-me algo sobre \"{}\"."
    ]
    chooser = random if rng is None else rng

    qa_pairs = []
    for context, response in zip(sentences, sentences[1:]):
        context = context.strip()
        response = response.strip()

        # Filter: both sides need enough words to be meaningful.
        if len(context.split()) < min_words or len(response.split()) < min_words:
            continue

        # Collapse internal whitespace so the quoted context is one clean line.
        context_clean = re.sub(r'\s+', ' ', context).strip()
        context_clean = _truncate_context(context_clean, max_chars)

        question_text = chooser.choice(question_templates).format(context_clean)
        qa_pairs.append((question_text, response))

    return qa_pairs



In [29]:
# Split the normalized corpus into sentences, then pair each sentence with
# the one that follows it to form (question, answer) training examples.
sentences = split_paragraphs_into_sentences(processed_text)
qa_pairs = generate_qa_pairs_improved(sentences)
# qa_pairs

In [23]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

# Split the pairs into questions and answers
perguntas = [q for q, _ in qa_pairs]
respostas = [a for _, a in qa_pairs]

# Tokenizer: one vocabulary fitted on questions and answers together
tokenizer = Tokenizer(oov_token="<OOV>")
tokenizer.fit_on_texts(perguntas + respostas)

word_index = tokenizer.word_index
# +1 presumably leaves room for id 0, which responder() treats as padding and
# which no word is expected to use — confirm against the Tokenizer docs.
vocab_size = len(word_index) + 1

# Convert texts to id sequences and pad them to a fixed length
max_len = 40  # can be adjusted
X = tokenizer.texts_to_sequences(perguntas)
y = tokenizer.texts_to_sequences(respostas)

# NOTE(review): only padding='post' is set; pad_sequences truncates from the
# front by default (truncating='pre'), so texts longer than max_len lose their
# beginning — confirm this is intended.
X = pad_sequences(X, maxlen=max_len, padding='post')
y = pad_sequences(y, maxlen=max_len, padding='post')

# Add a trailing axis to y. The model is compiled with
# sparse_categorical_crossentropy, so targets stay integer ids (not one-hot).
y = np.expand_dims(y, -1)


In [24]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense

# Size of each embedding vector.
embedding_dim = 128

# Per-position model: every one of the max_len input positions produces its
# own softmax over the vocabulary (there is no separate encoder/decoder).
# NOTE(review): the Embedding layer is created without mask_zero, so padded
# positions appear to count toward loss and accuracy — confirm whether
# masking is wanted.
input_seq = Input(shape=(max_len,))
embedding = Embedding(input_dim=vocab_size, output_dim=embedding_dim)(input_seq)
lstm = LSTM(128, return_sequences=True)(embedding)  # one output per time step
output = Dense(vocab_size, activation='softmax')(lstm)

model = Model(input_seq, output)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.summary()


In [30]:
# Train for 100 epochs, holding out 10% of the samples for validation.
# NOTE(review): in the recorded run val_loss rises from the first epoch
# (2.9999 -> 3.0782) while the training loss falls, which suggests
# overfitting — consider fewer epochs or an early-stopping callback.
model.fit(X, y, epochs=100, batch_size=16, validation_split=0.1)


Epoch 1/100
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 33ms/step - accuracy: 0.7485 - loss: 1.3467 - val_accuracy: 0.7135 - val_loss: 2.9999
Epoch 2/100
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 33ms/step - accuracy: 0.7405 - loss: 1.3740 - val_accuracy: 0.7115 - val_loss: 3.0325
Epoch 3/100
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 32ms/step - accuracy: 0.7438 - loss: 1.3515 - val_accuracy: 0.7067 - val_loss: 3.0446
Epoch 4/100
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 32ms/step - accuracy: 0.7475 - loss: 1.3283 - val_accuracy: 0.7067 - val_loss: 3.0562
Epoch 5/100
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 35ms/step - accuracy: 0.7519 - loss: 1.3006 - val_accuracy: 0.7118 - val_loss: 3.0609
Epoch 6/100
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 32ms/step - accuracy: 0.7547 - loss: 1.2852 - val_accuracy: 0.7045 - val_loss: 3.0782
Epoch 7/100
[1m50/50[0m [

<keras.src.callbacks.history.History at 0x211b7328e30>

In [31]:
def responder(pergunta, max_len=40):
    """Generate the model's answer to a question.

    The question is encoded with the notebook-level ``tokenizer``, padded at
    the end to ``max_len`` ids and fed to the notebook-level ``model``. For
    every output position the most probable token id is taken; id 0 is
    skipped and the remaining ids are mapped back to words.

    Args:
        pergunta: The question text.
        max_len: Length the encoded question is padded to.

    Returns:
        The predicted words joined by single spaces, stripped at both ends.
    """
    encoded = pad_sequences(
        tokenizer.texts_to_sequences([pergunta]),
        maxlen=max_len,
        padding='post',
    )
    probabilities = model.predict(encoded)[0]

    words = []
    for token_id in np.argmax(probabilities, axis=-1):
        if token_id == 0:
            continue
        # Ids missing from the vocabulary become empty strings.
        words.append(tokenizer.index_word.get(token_id, ''))
    return " ".join(words).strip()

# Example:
print(responder("O que é o Ver-o-Peso?"))


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44ms/step
o de do 136 faculdades com com com
