In [None]:
# pdf_file = "./doc/1706.03762.pdf"
pdf_file = "./doc/2005.11401.pdf"
text_file = "./doc/textfile.txt"

In [4]:
import fitz  # PyMuPDF

def extract_text_from_pdf(pdf_path):
    doc = fitz.open(pdf_path)
    text = ""
    for page in doc:
        text += page.get_text("text") + "\n"
    return text

In [26]:
unclean_text = extract_text_from_pdf(pdf_file)
print(unclean_text)

Retrieval-Augmented Generation for
Knowledge-Intensive NLP Tasks
Patrick Lewis†‡, Ethan Perez⋆,
Aleksandra Piktus†, Fabio Petroni†, Vladimir Karpukhin†, Naman Goyal†, Heinrich Küttler†,
Mike Lewis†, Wen-tau Yih†, Tim Rocktäschel†‡, Sebastian Riedel†‡, Douwe Kiela†
†Facebook AI Research; ‡University College London; ⋆New York University;
plewis@fb.com
Abstract
Large pre-trained language models have been shown to store factual knowledge
in their parameters, and achieve state-of-the-art results when ﬁne-tuned on down-
stream NLP tasks. However, their ability to access and precisely manipulate knowl-
edge is still limited, and hence on knowledge-intensive tasks, their performance
lags behind task-speciﬁc architectures. Additionally, providing provenance for their
decisions and updating their world knowledge remain open research problems. Pre-
trained models with a differentiable access mechanism to explicit non-parametric
memory have so far been only investigated for extractive downstream t

In [32]:
import re
import unicodedata

def clean_text(text):
    text = text.replace("\n", " ")  # Replace newlines with spaces
    text = re.sub(r'\s+', ' ', text)  # Remove extra spaces
    return text.strip()  # Trim leading and trailing spaces

def remove_special_chars(text):
    text = re.sub(r'[^a-zA-Z0-9.,!?\'" ]', '', text)  # Keep letters, numbers, and common punctuation
    return text

def fix_hyphenation(text):
    return re.sub(r'(\w+)-\s+(\w+)', r'\1\2', text)  # Removes hyphenation across lines

def normalize_unicode(text):
    return unicodedata.normalize("NFKD", text)

def remove_headers_footers(text):
    lines = text.split("\n")
    cleaned_lines = [line for line in lines if not re.match(r'(Page \d+|Confidential|Company Name)', line)]
    return " ".join(cleaned_lines)

def normalize_text(text):
    return " ".join(text.lower().split())  # Lowercase and remove extra spaces

def full_text_cleanup(text):
    """"
    Takes in unclean text and return cleaned text by applying a series of cleaning functions.
    
    """
    text = clean_text(str(text))
    text = fix_hyphenation(text)
    text = remove_special_chars(text)
    text = normalize_unicode(text)
    text = remove_headers_footers(text)
    text = normalize_text(text)
    return text

In [33]:
clean_text = full_text_cleanup(unclean_text)
print(clean_text)

retrievalaugmented generation for knowledgeintensive nlp tasks patrick lewis, ethan perez, aleksandra piktus, fabio petroni, vladimir karpukhin, naman goyal, heinrich kttler, mike lewis, wentau yih, tim rocktschel, sebastian riedel, douwe kiela facebook ai research university college london new york university plewisfb.com abstract large pretrained language models have been shown to store factual knowledge in their parameters, and achieve stateoftheart results when netuned on downstream nlp tasks. however, their ability to access and precisely manipulate knowledge is still limited, and hence on knowledgeintensive tasks, their performance lags behind taskspecic architectures. additionally, providing provenance for their decisions and updating their world knowledge remain open research problems. pretrained models with a differentiable access mechanism to explicit nonparametric memory have so far been only investigated for extractive downstream tasks. we explore a generalpurpose netuning 

In [23]:
# Using Langchain to extract text 
from langchain.document_loaders import PyMuPDFLoader

def extract_text_langchain(pdf_path):
    loader = PyMuPDFLoader(pdf_path)
    documents = loader.load()
    return "\n".join([doc.page_content for doc in documents])

text = extract_text_langchain(pdf_file)
print(text)


Retrieval-Augmented Generation for
Knowledge-Intensive NLP Tasks
Patrick Lewis†‡, Ethan Perez⋆,
Aleksandra Piktus†, Fabio Petroni†, Vladimir Karpukhin†, Naman Goyal†, Heinrich Küttler†,
Mike Lewis†, Wen-tau Yih†, Tim Rocktäschel†‡, Sebastian Riedel†‡, Douwe Kiela†
†Facebook AI Research; ‡University College London; ⋆New York University;
plewis@fb.com
Abstract
Large pre-trained language models have been shown to store factual knowledge
in their parameters, and achieve state-of-the-art results when ﬁne-tuned on down-
stream NLP tasks. However, their ability to access and precisely manipulate knowl-
edge is still limited, and hence on knowledge-intensive tasks, their performance
lags behind task-speciﬁc architectures. Additionally, providing provenance for their
decisions and updating their world knowledge remain open research problems. Pre-
trained models with a differentiable access mechanism to explicit non-parametric
memory have so far been only investigated for extractive downstream t