In [15]:
import PyPDF2
import re
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document
from langchain_community.vectorstores import FAISS
from transformers import AutoTokenizer, pipeline
from sentence_transformers import SentenceTransformer
from collections import Counter

In [16]:
embedding_model = SentenceTransformer('bert-base-nli-mean-tokens')
summarization_model = pipeline("summarization", model="facebook/bart-large-cnn")
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

Device set to use cpu


In [17]:
def get_text(pdf_file):
    """Extract text from PDF using PyPDF2."""
    try:
        with open(pdf_file, 'rb') as f:
            pdfReader = PyPDF2.PdfReader(f, strict=False)
            pdf_text = []
            for page in pdfReader.pages:
                text = page.extract_text()
                if text:
                    text = re.sub(r'[^\x20-\x7E]', ' ', text)  # Remove non-ASCII
                    text = re.sub(r'[\d]', ' ', text)
                    pdf_text.append(text.strip())
            return " ".join(pdf_text)
    except Exception as e:
        print(f"Error reading PDF file: {e}")
        return ""

In [18]:
def extract_keywords(text, top_n=5):
    """Extracts top N keywords based on word frequency."""
    words = re.findall(r'\b\w+\b', text.lower())
    common_words = Counter(words).most_common(top_n)
    return [word for word, _ in common_words]

In [19]:
def truncate_text(text, max_tokens=500):
    tokens = tokenizer.encode(text, truncation=True, max_length=max_tokens)
    return tokenizer.decode(tokens, skip_special_tokens=True)

In [20]:
def preprocess(pdf_file):
    """Preprocesses PDF text and builds FAISS in memory (no saving)."""
    pdf_text = get_text(pdf_file)
    pdf_text = truncate_text(pdf_text)
    
    keywords = extract_keywords(pdf_text, top_n=10)
    
    splitter = RecursiveCharacterTextSplitter(chunk_size=100, chunk_overlap=10)
    docs = splitter.split_text(pdf_text)
    
    documents = [Document(page_content=text) for text in docs if any(kw in text.lower() for kw in keywords)]
    
    texts = [doc.page_content for doc in documents]
    embeddings = embedding_model.encode(texts)
    
    text_embeddings_zip = list(zip(texts, embeddings))
    
    vector_db = FAISS.from_embeddings(text_embeddings_zip, embedding_model)
    return vector_db

In [25]:
def ask_llm(question, pdf_file):
    vector_db = preprocess(pdf_file)
    
    print("Question: ", question)
    
    # Encode the question into an embedding
    question_embedding = embedding_model.encode([question])[0]
    
    # Use the correct method to query the FAISS index
    relevant_docs = vector_db.similarity_search_by_vector(question_embedding, k=5)
    
    context = " ".join([doc.page_content for doc in relevant_docs])
    keywords = extract_keywords(context, top_n=7)
    filtered_sentences = [sent for sent in context.split(". ") if any(kw in sent.lower() for kw in keywords)]
    refined_context = " ".join(filtered_sentences)
    
    if not refined_context.strip():
        refined_context = context  
    
    prompt = f"Summarize the following text: {refined_context}"
    output = summarization_model(prompt, max_length=200, min_length=40, do_sample=False)[0]['summary_text']
    
    return re.sub(r'\s+', ' ', output).strip()

In [26]:
questions = [
    "What is the significance of using a unified representation in the 'One Model for All' framework?",
    "How does the 'One Model for All' framework handle tasks with limited data?",
    "What role do modality-specific sub-networks play in the 'One Model for All' framework?"
]

for question in questions:
    print("Answer:", ask_llm(question, "test-1.pdf"))

`embedding_function` is expected to be an Embeddings object, support for passing in a function will soon be removed.
Your max_length is set to 200, but your input_length is only 103. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=51)


Question:  What is the significance of using a unified representation in the 'One Model for All' framework?
Answer: A single model yields good results on a number of problems span - ning multiple multiple domains in particular. This single model is trained concurrently on imagenet, multiple we train on. Even if a block is not crucial for a task, we observe that adding it - gated layers each of these computational blocks is crucial.


`embedding_function` is expected to be an Embeddings object, support for passing in a function will soon be removed.
Your max_length is set to 200, but your input_length is only 101. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=50)


Question:  How does the 'One Model for All' framework handle tasks with limited data?
Answer: A single model is trained concurrently on imagenet, multiple domains. It contains convolutional layers, an attention mechanism, and sparsely - gated present a single model that yields good results on a number of problems.


`embedding_function` is expected to be an Embeddings object, support for passing in a function will soon be removed.
Your max_length is set to 200, but your input_length is only 101. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=50)


Question:  What role do modality-specific sub-networks play in the 'One Model for All' framework?
Answer: This model is trained concurrently on imagenet, multiple multiple domains. It contains convolutional layers, an attention mechanism, and sparsely - gated -gated layers. Each of these computational blocks is crucial for a subset of the tasks we train.
