In [1]:
%%capture
!pip install llama-index==0.10.37 llama-index-readers-smart-pdf-loader pymupdf llamasherpa 

In [2]:
import os
import sys
import getpass
import nest_asyncio
import fitz
from dotenv import load_dotenv

# Apply nest_asyncio's patch (presumably so async code can run inside the
# notebook's already-running event loop - confirm).
nest_asyncio.apply()

# Load settings from a .env file via python-dotenv, if one is present.
load_dotenv()

# Make the sibling `helpers` directory importable. The path is relative, so it
# is resolved against the kernel's working directory, and it must be appended
# before the import below.
sys.path.append('../helpers')

# Project-local text-cleaning helper used by the extraction functions below.
from text_cleaning_helpers import clean

In [3]:
# PDF used by the reader-comparison and raw page-extraction cells below.
PDF_PATH = "../data/gaa_rules_simple.pdf"

In [4]:
from llama_index.core import SimpleDirectoryReader
from llama_index.readers.file import PDFReader

# Load the same PDF through two llama-index readers. Both results are kept as
# globals; none of the later cells shown in this notebook read them.
simple_directory_reader_docs = SimpleDirectoryReader(input_files=[PDF_PATH]).load_data()

pdf_reader_docs = PDFReader().load_data(PDF_PATH)

In [10]:
document = fitz.open(PDF_PATH)

def extract_page_lines(page, opt="text"):
    '''Extract the text of a single page and return it as a list of lines.

    This helper deliberately does not share a name with the metadata-aware
    `extract_text` defined later in the notebook: two functions with the same
    name silently replace one another depending on which cell ran last.

    Parameters:
        page: A page object yielded by iterating the opened document.
        opt (str, optional): Extraction mode passed to `get_text`.
            Defaults to "text".

    Returns:
        list of str: The page text split on newline characters.
    '''
    return page.get_text(opt, sort=True).split("\n")

# One list of lines per page, in document order.
pages = [extract_page_lines(page) for page in document]

In [11]:
def get_document(file_path, pages=None):
    """
    Opens a PDF file and optionally restricts it to a subset of its pages.

    The file at `file_path` is opened with `fitz.open`. When `pages` is given,
    `Document.select` is called so that the returned document contains only
    those pages. The caller owns the returned document and is responsible for
    closing it.

    Parameters:
        file_path (str): The path to the PDF file to be opened.
        pages (list of int, optional): Page numbers to keep. Defaults to
            `None`, in which case the whole document is returned.
            NOTE(review): presumably 0-based indices, as used by PyMuPDF -
            confirm against the page ranges passed by callers.

    Returns:
        The opened `fitz` document, narrowed to `pages` when provided.
    """
    document = fitz.open(file_path)
    if pages is not None:
        # Keep only the requested pages; everything else is dropped from this
        # in-memory document object (the file on disk is not modified here).
        document.select(pages)
    return document

def handle_chapter_headers_footers(strings, flag):
    """
    Drop blank entries, optionally trim leading/trailing entries, and join with spaces.

    Trimming is applied only when more than three non-empty entries remain
    after filtering; shorter lists are joined exactly as they are.

    Parameters:
        strings (list of str): Text fragments to process.
        flag (str): Which entries to trim:
            - 'remove_first': drop the first entry.
            - 'remove_last': drop the last entry.
            - 'remove_first_last': drop the first and the last entry.
            - 'remove_first_two': drop the first two entries.
            - anything else (including None): trim nothing.

    Returns:
        str: The remaining entries joined by single spaces, with leading and
            trailing whitespace stripped from the result.
    """
    # Each recognised flag paired with the slice that keeps the wanted entries.
    trims = (
        ('remove_first', slice(1, None)),
        ('remove_last', slice(None, -1)),
        ('remove_first_last', slice(1, -1)),
        ('remove_first_two', slice(2, None)),
    )

    # Empty strings never survive, regardless of the flag.
    kept = list(filter(None, strings))

    if len(kept) > 3:
        for name, window in trims:
            if flag == name:
                kept = kept[window]
                break

    return ' '.join(kept).strip()

def extract_text(page, file_name, title, author, flag, opt="text"):
    """
    Build a cleaned text record, with provenance metadata, for a single page.

    The page text is read with `page.get_text(opt, sort=True)`, split into
    lines, passed through `handle_chapter_headers_footers` (which uses `flag`
    to decide which leading/trailing lines to drop and joins the rest), and
    finally handed to the project's `clean` helper.

    Parameters:
        page (fitz.Page): The page object from which to extract text.
        file_name (str): File name recorded in the metadata.
        title (str): Document title recorded in the metadata.
        author (str): Document author recorded in the metadata.
        flag (str): Header/footer handling mode forwarded to
            `handle_chapter_headers_footers`.
        opt (str, optional): Extraction mode passed to `get_text`.
            Defaults to "text".

    Returns:
        dict: A dictionary with two keys:
            - 'text': The processed page text.
            - 'metadata': A dictionary with 'page_number', 'file_name',
                          'title' and 'author'.

    NOTE(review): 'page_number' is whatever `page.number` reports. If the
    document was narrowed to a subset of pages beforehand, this is presumably
    the position within that subset rather than the page of the original
    PDF - confirm.
    """
    # Work line by line so individual header/footer lines can be dropped.
    raw_lines = page.get_text(opt, sort=True).split("\n")
    page_text = handle_chapter_headers_footers(raw_lines, flag)

    # Options forwarded to the `clean` helper from text_cleaning_helpers.
    cleaning_options = dict(
        extra_whitespace=True,
        broken_paragraphs=True,
        bullets=True,
        ascii=True,
        lowercase=False,
        citations=True,
        merge_split_words=True,
    )
    cleaned_text = clean(page_text, **cleaning_options)

    metadata = {
        "page_number": page.number,
        "file_name": file_name,
        "title": title,
        "author": author,
    }
    return {"text": cleaned_text, "metadata": metadata}

def extract_texts_from_pdf(file_path, title, author, pages=None, flag=None):
    """
    Extract one cleaned text record per page from a PDF file.

    Parameters:
        file_path (str): Path to the PDF file.
        title (str): Document title stored in every record's metadata.
        author (str): Document author stored in every record's metadata.
        pages (list of int, optional): Page numbers to extract, forwarded to
            `get_document`. Defaults to `None` (all pages).
        flag (str, optional): Header/footer handling mode forwarded to
            `extract_text`. Defaults to `None` (no trimming).

    Returns:
        list of dict: The records produced by `extract_text`, in page order.
    """
    # Record the base name in the metadata, not the full relative path.
    file_name = os.path.basename(file_path)
    document = get_document(file_path, pages)
    try:
        # NOTE(review): when `pages` is given the document has been narrowed
        # with select(), so the page numbers recorded by extract_text are
        # presumably positions within the selection - confirm.
        return [extract_text(page, file_name, title, author, flag) for page in document]
    finally:
        # The records are plain dicts, so the PDF handle can be released.
        document.close()

In [13]:
# Source PDFs to ingest. The keys of each entry mirror the parameter names of
# extract_texts_from_pdf, so an entry can be passed straight through.
pdf_files = [
    dict(
        file_path="../data/gaa_rules_simple.pdf",
        title="OFFICIAL GUIDE - PART 2",
        author="Gaelic Athletic Association",
        pages=list(range(4, 101)),
        flag=None,
    ),
    dict(
        file_path="../data/Playing-Rules-2021-1.pdf",
        title="LGFA GUIDE",
        author="LGFA",
        pages=list(range(2, 32)),
        flag=None,
    ),
]

# Per-page records from every PDF, in the order the files are listed above.
all_texts = []

for pdf in pdf_files:
    print(f"Extracting texts from {pdf['title']} by {pdf['author']}...")
    texts = extract_texts_from_pdf(**pdf)
    print(f"Finished extracting texts from {pdf['title']}.")
    all_texts += texts

Extracting texts from OFFICIAL GUIDE - PART 2 by Gaelic Athletic Association...
Finished extracting texts from OFFICIAL GUIDE - PART 2.
Extracting texts from LGFA GUIDE by LGFA...
Finished extracting texts from LGFA GUIDE.


In [14]:
# Total number of per-page records collected across both PDFs.
len(all_texts)

127

In [16]:
from llama_index.core import Document

# Wrap every extracted page record in a llama-index Document, keeping its metadata.
llama_index_docs = [
    Document(text=record["text"], metadata=record["metadata"])
    for record in all_texts
]

In [19]:
from llama_index.core.storage.docstore import SimpleDocumentStore
from llama_index.core.storage import StorageContext

# Start from an empty document store and load the page-level documents into it.
docstore = SimpleDocumentStore()
docstore.add_documents(llama_index_docs)

# Wrap the store in a storage context; only the docstore is supplied here.
storage_context = StorageContext.from_defaults(docstore=docstore)

# Write the storage context to disk under the project's data directory.
storage_context.persist(persist_dir="../data/rules-of-the-gaa")