In [17]:
from solution.utilities.memory import chat_to_memory
from solution.utilities.model import model_embedding, ChatGPT
from solution.utilities.preprocess import get_pdf_text, store_in_vector_database
from langchain.vectorstores import FAISS

from langgraph.checkpoint.memory import MemorySaver
from langgraph.graph import MessagesState, START, StateGraph

from langchain_core.messages import (AIMessage, BaseMessage, HumanMessage, 
                                     SystemMessage, trim_messages)

from langchain.prompts import (ChatPromptTemplate, HumanMessagePromptTemplate, 
                               MessagesPlaceholder, PromptTemplate)

from langchain.chains import (StuffDocumentsChain, create_history_aware_retriever, 
                              create_retrieval_chain)

from langchain.chains.combine_documents import create_stuff_documents_chain
from unstructured.partition.pdf import partition_pdf
from langchain_core.documents import Document
from langchain_community.document_loaders import PyPDFLoader

In [14]:
# Relative path of the PDF that the extraction cells in this notebook read.
pdf_path = "./pdf/Football_Laws_of_the_Game_2425.pdf"

In [15]:
def get_pdf_text(pdf_path):
    """
    Partition a PDF with `partition_pdf` and wrap every returned element in a `Document`.

    Args:
        pdf_path (str): Path to the PDF file to process.

    Returns:
        list: One `Document` per element returned by `partition_pdf`, in the same order.
            `page_content` is the element's `text`; `metadata` is the element's
            metadata converted with `to_dict()`.

    Notes:
        - NOTE(review): because `chunking_strategy="by_title"` is used, each Document
          presumably holds a title-based chunk rather than exactly one PDF page —
          confirm against the `partition_pdf` documentation.
        - NOTE(review): the name `get_pdf_text` is also imported from
          `solution.utilities.preprocess` in the import cell; whichever cell ran
          last decides which implementation a later call uses.
    """
    # Partitioning / chunking settings, kept together so they are easy to review.
    partition_options = {
        "extract_images_in_pdf": True,
        "infer_table_structure": False,
        "chunking_strategy": "by_title",
        "max_characters": 4000,
        "new_after_n_chars": 3800,
        "combine_text_under_n_chars": 2000,
        "strategy": "fast",
        "include_page_break": True,
    }
    elements = partition_pdf(filename=pdf_path, **partition_options)

    # Convert each element into a Document carrying its text and metadata dict.
    return [
        Document(metadata=element.metadata.to_dict(), page_content=element.text)
        for element in elements
    ]


In [16]:
# Run the partition_pdf-based extractor on the sample PDF; the cell output is the returned list of Documents.
get_pdf_text(pdf_path)

[Document(metadata={'file_directory': './pdf', 'filename': 'Football_Laws_of_the_Game_2425.pdf', 'languages': ['eng'], 'last_modified': '2024-11-19T10:10:34', 'page_number': 1, 'orig_elements': 'eJzdWltv3DYW/ivEPG2LocK7xOxTHjZFgGy3QAP0IRsYFC8eIRppVqLsGO3+9z2kNN65OKmLYgx0kCD2EQ8pkt93rsrHX1e+9VvfxZvGrV6jlVOKGlUSLFxwWFhVYhOcxNZabYPSuqrDao1WWx+NM9HAnF9Xtu8H13Qm+jHLrXnop3iz8c3tJsITqWXBSgXTlpH7xsUNDAiqC8kqGNj1TRfT7I8fZVmINVK8kJ/WaJG0miVGSVGdiVkX5NX4MEa/Tef4qfni2593xvrVf2EgNK2/cc3gbeyHh6RQvNq5fJI01JmtTw/f9n2sTdvevDf3400fbuLG3/wAgzdMMFksM1rT3U7mNh/248p3t6tP+ekYb7a9a0Lj81UywgSmFFP9gZLX8JeLNHsHM2+6aVv7AbRo2l30X9I1rT5sPPoJrsgPY9KMD7u8qw9NbPMxTsGSlhLtiMQ1kQwLLw2uvPXYeOpKH1xFubkwWKIq2BpRVRUkAbKIZTmLlFeFfELO6lcAGC3Qj3kA9QHtzpF734zxXTrgE+BpHbQtLRhZbQMWipdYU6WxkgxGZMVCuLSlKZZsixFaiITOInI1i7wkhXpCzupXAN4btDXRblAzztA5VD+geN+j6M12XCNvYPC+iRtkQPNLs522CWZA8c53e7T/jvrOo+00RlR7BAdAt71pP3u/80OB9q/YmgfU9RGN0QwR9QOyfRebbvKogQXhFcCg9Fa0MSMK/j6JG9Oh8fBVxSGzfjTDYGJz5z+k0zxBryArUpeSYk84ePMAV1P7yuNSkorL0hnC9cvQS6hCH

In [18]:
def get_pdf_text_pypdf(pdf_path):
    """
    Load a PDF through `PyPDFLoader` and return what it loads as a list of Documents.

    Args:
        pdf_path (str): Path to the PDF file to process.

    Returns:
        list: A new list holding every item yielded by `PyPDFLoader(pdf_path).load()`,
            in the loader's order (one `Document` with page content and metadata per
            PDF page, per the loader's output).
    """
    loaded_pages = PyPDFLoader(pdf_path).load()

    # Copy into a fresh list so the caller always receives a plain list.
    return list(loaded_pages)

In [19]:
# Run the PyPDFLoader-based extractor on the same PDF; the cell output is the returned list of Documents.
get_pdf_text_pypdf(pdf_path)

[Document(metadata={'source': './pdf/Football_Laws_of_the_Game_2425.pdf', 'page': 0}, page_content='51\nThe Players\n1. Number of players\n A match is played by two teams, each with a maximum of eleven players;  \none must be the goalkeeper. A match may not start or continue  if either team \nhas fewer than seven players.\n If a team has fewer than seven players because one or more players has \ndeliberately left the field of play, the referee is not obliged to stop play and  \nthe advantage may be played, but the match must not resume after the ball has \ngone out of play if a team does not have the minimum number of seven players.\n If the competition rules state that all players and substitutes must be named \nbefore kick-off and a team starts a match with fewer than eleven players,  \nonly the players and substitutes named on the team list may take part in the \nmatch upon their arrival.\n2. Number of substitutions\nOfficial competitions\n The number of substitutes, up to a maximum