In [1]:
import os
import PyPDF2
import random

def extract_text_from_pdfs(pdf_folder):
    all_text = ''
    for filename in os.listdir(pdf_folder):
        if filename.endswith('.pdf'):
            path = os.path.join(pdf_folder, filename)
            with open(path, 'rb') as file:
                reader = PyPDF2.PdfReader(file)
                for page in reader.pages:
                    all_text += page.extract_text() + '\n'
    return all_text

pdf_folder = './pdfs'  # altere para o caminho real
corpus_text = extract_text_from_pdfs(pdf_folder)


In [3]:
import re
import nltk
# from nltk.tokenize import sent_tokenize

# nltk.download('punkt',download_dir='./nltk_data', quiet=True)

def preprocess_text(text):
    text = text.lower()
    # text = re.sub(r'\n+', ' ', text)
    # # MANTÉM . ! ? para detecção de fim de frase
    # text = re.sub(r'[^a-záéíóúàãõç\.\!\?\s]', '', text)
    # return text
    text = text.replace('\xa0', ' ')
    text = re.sub(r'\n+', '\n', text)  # mantém quebras de parágrafo
    text = re.sub(r'[ \t]+', ' ', text)
    text = re.sub(r' +\n', '\n', text)
    return text.strip()

# corpus_text = "Belém é a capital do Pará. É conhecida pelo Círio de Nazaré! Você já visitou?"
processed_text = preprocess_text(corpus_text)
# clean_text = sent_tokenize(processed_text, language='portuguese')


In [5]:
def split_paragraphs_into_sentences(text):
    paragraphs = text.split('\n')
    sentences = []
    for paragraph in paragraphs:
        parts = re.split(r'(?<=[.!?])\s+', paragraph.strip())
        for part in parts:
            part = part.strip()
            if len(part) > 30:
                sentences.append(part)
    return sentences

# === 4. Gerar perguntas/respostas ===
def generate_qa_pairs(sentences):
    qa_pairs = []
    for i in range(1, len(sentences)):
        prev = sentences[i-1]
        curr = sentences[i]
        question = f"O que você pode me dizer sobre: '{prev[:60]}...'"
        answer = curr
        qa_pairs.append((question, answer))
    return qa_pairs

def generate_qa_pairs_improved(sentences, min_words=6, max_chars=180):
    question_templates = [
        "Você pode me explicar sobre \"{}\"?",
        "O que é \"{}\"?",
        "Fale sobre \"{}\".",
        "O que você sabe sobre \"{}\"?",
        "Qual a importância de \"{}\"?",
        "Conte-me algo sobre \"{}\"."
    ]

    qa_pairs = []
    for i in range(len(sentences) - 1):
        context = sentences[i].strip()
        response = sentences[i+1].strip()

        # Filtro: só usar como pergunta se tiver palavras suficientes
        if len(context.split()) >= min_words and len(response.split()) >= min_words:
            # Garante que termina com pontuação
            context_clean = re.sub(r'\s+', ' ', context)
            context_clean = context_clean.strip()

            # Se for muito longo, corta no último ponto final, interrogação ou exclamação antes do limite
            if len(context_clean) > max_chars:
                match = re.search(r'^(.{30,' + str(max_chars) + r'}[.!?])', context_clean)
                if match:
                    context_clean = match.group(1)

            question_text = random.choice(question_templates).format(context_clean)
            qa_pairs.append((question_text, response))

    return qa_pairs



In [6]:
sentences = split_paragraphs_into_sentences(processed_text)
qa_pairs = generate_qa_pairs_improved(sentences)
# qa_pairs

In [8]:
print(sentences)

['cultura e turismo de belém (pará)', 'cultura e turismo de belém são diversificadas as opções culturais e turísticas da capital do', 'estado do pará , influenciadas por indígenas e imigrantes estrangeiros, que manifestam-se através', 'das manifestações religiosas, da gastronomia, do folclore, danças, músicas, teatros, museus, etc.', 'capital paraense desponta como grande roteiro turístico do brasil, gerando uma excelente', 'oportunidade para investimentos turísticos.', 'bioparque amazônia (1989 ): parque particular', 'bondinho de belém: tecnologia implantada na', 'época da belle époque (1871-1914).', 'casa das onze janelas (século xviii ,', 'aproximadamente 1750 ): vizinho do forte do', 'castelo, feito pelo dono de engenho de açúcar domingos da costa bacelar .[6][7][8]', 'complexo feliz lusitânia (1616 ): um povoado colonial português criado pelo capitão francisco', 'caldeira castelo branco às margens da baía do guajará.', 'complexo do v er-o-peso (1625) – inciou com a "casa de haver-

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

# Separar perguntas e respostas
# perguntas = [q for q, _ in qa_pairs]
# respostas = [a for _, a in qa_pairs]

# Tokenizer
# tokenizer = Tokenizer(num_words=5000,oov_token="<OOV>")
tokenizer = Tokenizer()
tokenizer.fit_on_texts(sentences)

word_index = tokenizer.word_index
vocab_size = len(word_index) + 1

input_seq=[]

for line in sentences:
    token_list = tokenizer.texts_to_sequences([line])[0]
    for i in range(1, len(token_list)):
        n_gram_sequence = token_list[:i+1]
        input_seq.append(n_gram_sequence)

max_sequence_len = max([len(x) for x in input_seq])

input_seq = np.array(pad_sequences(input_seq, maxlen=max_sequence_len, padding='pre'))

xs = input_seq[:,:-1]
label = input_seq[:,-1]

ys = to_categorical(label, num_classes=vocab_size)

# Sequenciar e padronizar
# max_len = 40  # pode ajustar
# X = tokenizer.texts_to_sequences(perguntas)
# y = tokenizer.texts_to_sequences(respostas)

# X = pad_sequences(X, maxlen=max_len, padding='post')
# y = pad_sequences(y, maxlen=max_len, padding='post')

# # Ajustar formato do y para categorical_crossentropy
# y = np.expand_dims(y, -1)


In [None]:
# from tensorflow.keras.preprocessing.text import Tokenizer
# from tensorflow.keras.preprocessing.sequence import pad_sequences
# import numpy as np

# # Separar perguntas e respostas
# perguntas = [q for q, _ in qa_pairs]
# respostas = [a for _, a in qa_pairs]

# # Tokenizer
# # tokenizer = Tokenizer(num_words=5000,oov_token="<OOV>")
# tokenizer = Tokenizer()
# tokenizer.fit_on_texts(perguntas + respostas)

# word_index = tokenizer.word_index
# vocab_size = len(word_index) + 1

# # Sequenciar e padronizar
# max_len = 40  # pode ajustar
# X = tokenizer.texts_to_sequences(perguntas)
# y = tokenizer.texts_to_sequences(respostas)
# # 
# X = pad_sequences(X, maxlen=max_len, padding='post')
# y = pad_sequences(y, maxlen=max_len, padding='post')

# # Ajustar formato do y para categorical_crossentropy
# y = np.expand_dims(y, -1)


2025-04-01 09:04:09.602869: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1743509049.617201  697415 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1743509049.620966  697415 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1743509049.633540  697415 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1743509049.633563  697415 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1743509049.633565  697415 computation_placer.cc:177] computation placer alr

In [None]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense
from tensorflow.keras.optimizers import Adam

embedding_dim = 128

model = Sequential()
model.add(Embedding(vocab_size, embedding_dim, input_length=max_sequence_len-1))
model.add(LSTM(128, return_sequences=True))
model.add(Dense(vocab_size, activation='softmax'))
adam =  Adam(learning_rate=0.01)
model.compile(optimizer=adam, loss='categorical_crossentropy', metrics=['accuracy'])

# input_seq = Input(shape=(max_len,))
# embedding = Embedding(input_dim=vocab_size, output_dim=embedding_dim)(input_seq)
# lstm = LSTM(128, return_sequences=True)(embedding)
# output = Dense(vocab_size, activation='softmax')(lstm)

# model = Model(input_seq, output)
# model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.summary()




In [24]:
model.fit(xs, ys, epochs=100, batch_size=16, validation_split=0.1)


2025-04-01 11:25:24.754260: W external/local_xla/xla/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 439876980 exceeds 10% of free system memory.
2025-04-01 11:25:25.189028: W external/local_xla/xla/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 439876980 exceeds 10% of free system memory.


Epoch 1/100


ValueError: Arguments `target` and `output` must have the same rank (ndim). Received: target.shape=(None, 5301), output.shape=(None, 28, 5301)

In [1]:
def responder(pergunta, max_len=400):
    seq = tokenizer.texts_to_sequences([pergunta])
    padded = pad_sequences(seq, maxlen=max_len, padding='post')
    pred = model.predict(padded)[0]
    print(pred.shape)
    pred_ids = np.argmax(pred, axis=-1)

    resposta = " ".join([tokenizer.index_word.get(i, '') for i in pred_ids if i != 0])
    return resposta.strip()

# Exemplo:
print(responder("O que é o Ver-o-Peso?"))


NameError: name 'tokenizer' is not defined