In [7]:
import PyPDF2
import re
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document
from langchain_community.vectorstores import FAISS
from transformers import AutoTokenizer
from langchain.embeddings import HuggingFaceEmbeddings
from transformers import pipeline

In [8]:
def get_text(pdf_file):
    try:
        with open(pdf_file, 'rb') as f:
            pdfReader = PyPDF2.PdfReader(f, strict=False)
            pdf_text = []

            for page in pdfReader.pages:
                text = page.extract_text()

                for line in text.split('\n'):
                    if len(line) > 5:
                        line = re.sub(r'[^\x20-\x7E]', ' ', line)
                        line = re.sub(r'\s+', ' ', line).strip()
                        pdf_text.append(line)

            return " ".join(pdf_text)
    except Exception as e:
        print(f"Error reading PDF file: {e}")
        return ""

In [9]:
tokenizer = AutoTokenizer.from_pretrained('google/flan-t5-base')
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

def truncate_text(text, max_tokens=1000):
    tokens = tokenizer.encode(text, truncation=True, max_length=max_tokens)
    return tokenizer.decode(tokens, skip_special_tokens=True)

In [10]:
def preprocess(pdf_file):
    pdf_text = get_text(pdf_file)
    pdf_text = truncate_text(pdf_text)
    
    print("Text: ", pdf_text, "\n\n\n")
    
    splitter = RecursiveCharacterTextSplitter(chunk_size=200, chunk_overlap=50)
    docs = splitter.split_text(pdf_text)
    
    print("Docs: ", docs, "\n\n\n")
    
    documents = [Document(page_content=text) for text in docs]
    vector_db = FAISS.from_documents(documents, embedding_model)
    
    retriever = vector_db.as_retriever()
    return retriever


In [11]:
def ask_llm(question, pdf_file):
    ret = preprocess(pdf_file)
    llm = pipeline("summarization", model="google/flan-t5-base") 

    doc = ret.invoke(question)  

    context = "\n".join([d.page_content for d in doc]) 
    prompt = f'Summarize the given information: {context}'
    output = llm(prompt, max_length=101)[0]['summary_text']
    
    return output

In [12]:
print(ask_llm("What is this ?", "test-1.pdf"))

Text:  One Model To Learn Them All ukasz Kaiser Google Brain lukaszkaiser@google.comAidan N. Gomez University of Toronto aidan@cs.toronto.eduNoam Shazeer Google Brain noam@google.com Ashish Vaswani Google Brain avaswani@google.comNiki Parmar Google Research nikip@google.comLlion Jones Google Research llion@google.comJakob Uszkoreit Google Research usz@google.com Abstract Deep learning yields great results across many elds, from speech recognition, image classi cation, to translation. But for each problem, getting a deep model to work well involves research into the architecture and a long period of tuning. We present a single model that yields good results on a number of problems span- ning multiple domains. In particular, this single model is trained concurrently on ImageNet, multiple translation tasks, image captioning (COCO dataset), a speech recognition corpus, and an English parsing task. Our model architecture incor- porates building blocks from multiple domains. It contains conv

Device set to use cpu


Google Brain learned representations in the unsupervised Work performed while at Google Brain. Code available at https://github.com/tensorflow/tenson2tensonarXiv:1706.05137v1 [cs.LG]
