In [1]:
from langchain.output_parsers import ResponseSchema, StructuredOutputParser
from langchain.prompts import PromptTemplate
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyMuPDFLoader, TextLoader
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings

In [None]:
# Step 1: LOAD DOCUMENT --> SPLIT INTO CHUNKS

# Step 2: EMBEDDING MODEL --> EMBED CHUNKS --> VECTORS

# Step 3: CHUNK VECTORS --> SAVE TO FAISS DB

# Step 4: "query" --> similarity search against the FAISS DB

In [None]:
# Configure the splitter up front; it does not depend on the loaded file.
text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", ".", " "],
    chunk_size=500,
    chunk_overlap=100,
)

# Read the sample text file, then cut it into chunks for embedding.
loader = TextLoader("../../test.txt", encoding="UTF-8")
documents = loader.load()

docs = text_splitter.split_documents(documents)

In [None]:
# docs

In [3]:
# Embedding model shared by the cells that build and reload the FAISS index.
# NOTE(review): presumably reads the OpenAI API key from the environment —
# confirm it is set before running this cell.
embedding_model = OpenAIEmbeddings()

In [None]:
# Build a FAISS vector store from the split chunks using the embedding model.
vector_db = FAISS.from_documents(documents=docs, embedding=embedding_model)

In [None]:
# Persist the vector store to the local "faiss_index" folder.
# Reference: https://python.langchain.com/docs/integrations/vectorstores/faiss/#saving-and-loading
vector_db.save_local(folder_path="faiss_index")

# Let's pick up where we left off

In [4]:
# Reload the vector store from the "faiss_index" folder written by save_local,
# passing the same embedding model object used to build it.
# NOTE(review): allow_dangerous_deserialization=True opts in to loading
# pickled data from disk. Keep it only for index folders this notebook wrote
# itself; never point it at files from an untrusted source.
new_vector_store = FAISS.load_local(
    "faiss_index", embedding_model, allow_dangerous_deserialization=True
)

In [6]:
# Expose the reloaded store as a similarity retriever with k=3.
retriever = new_vector_store.as_retriever(
    search_kwargs={"k": 3},
    search_type="similarity",
)

In [7]:
# Sample questions for the text-file index; only the first one is used here.
queries = ["Can you give some decorative styles in ancient Greek life?"]
query = queries[0]

# Run the query through the retriever.
retrievals = retriever.invoke(query)

In [8]:
# Display the documents returned by the retriever for `query`.
retrievals

[Document(id='c45950a0-2caa-4eca-9e50-d4391030aba8', metadata={'source': '../../test.txt'}, page_content='Decorative Styles:\nGeometric Style (900-700 BCE): Features abstract patterns and motifs.\nBlack-Figure Technique (700-500 BCE): Figures are painted in black silhouette against the natural red clay.\nRed-Figure Technique (530-300 BCE): The reverse of black-figure, allowing for greater detail and expression.\nPainting\nWhile few examples survive, Greek painting was highly esteemed, with influences seen in vase paintings and frescoes.\nTechniques: Included fresco, encaustic, and tempera.'),
 Document(id='0a9312c4-a098-4ebc-a162-95da52760517', metadata={'source': '../../test.txt'}, page_content='Sculpture and Pottery\nGreek sculptors excelled in creating lifelike statues that captured the human form with remarkable realism and beauty. Works such as the Venus de Milo and the Discobolus exemplify the Greek pursuit of idealized proportions and expressive detail. Pottery was another signi

In [None]:
# Structured-output setup. ResponseSchema, StructuredOutputParser and
# PromptTemplate must be imported in the notebook's import cell; without them
# this cell raises NameError on a fresh kernel.

# Fields the model is asked to return.
response_schema = [
    ResponseSchema(name="summary", description="A concise summary of the wikipedia page"),
    ResponseSchema(name="key_points", description="The key points relevant to the query"),
    ResponseSchema(name="wikipedia_reference", description="Relevant information retrieved from Wikipedia"),
]

# The parser produces the formatting instructions that fill the
# {format_instructions} slot of the prompt.
parser = StructuredOutputParser.from_response_schemas(response_schema)
format_instructions = parser.get_format_instructions()

# Concatenate the retrieved chunks, one per line, for the {context} slot.
context = "\n".join(doc.page_content for doc in retrievals)

# Template variables: context, wiki_data, format_instructions.
# NOTE(review): the user's query is not part of this template even though the
# "key_points" field refers to it — consider adding it when the chain is wired up.
prompt = PromptTemplate.from_template("""
You are an expert assistant. Based on the following context, generate a structured response:

Context: {context}
Wikipedia: {wiki_data}
{format_instructions}
""")

In [None]:
def load_and_chunk(file_path, chunk_size=500, chunk_overlap=100):
    """Load a PDF with PyMuPDFLoader and split it into chunks.

    Args:
        file_path: Path of the PDF, passed straight to ``PyMuPDFLoader``.
        chunk_size: Forwarded to ``RecursiveCharacterTextSplitter``.
        chunk_overlap: Forwarded to ``RecursiveCharacterTextSplitter``.

    Returns:
        The chunks produced by ``split_documents`` on the loaded documents.

    Prints a progress message before loading and the chunk count afterwards.
    """
    print("Loading and Splitting the PDF Document...")

    pages = PyMuPDFLoader(file_path).load()

    splitter = RecursiveCharacterTextSplitter(
        separators=["\n\n", "\n", ".", " "],
        chunk_overlap=chunk_overlap,
        chunk_size=chunk_size,
    )
    pieces = splitter.split_documents(pages)

    print(f"Number of chunks: {len(pieces)}")
    return pieces

In [None]:
def create_vector_database(chunks, embeddings=None):
    """Build a FAISS vector store from document chunks.

    Args:
        chunks: Document chunks to index, e.g. the output of ``load_and_chunk``.
        embeddings: Embedding model to use. When omitted, the notebook-level
            ``embedding_model`` is used, as before.

    Returns:
        The vector store returned by ``FAISS.from_documents``.

    Raises:
        ValueError: If ``chunks`` is empty; there is nothing to index.
    """
    chunks = list(chunks)
    if not chunks:
        # Fail with a clear message instead of an obscure error from inside
        # the index construction.
        raise ValueError("create_vector_database() requires at least one chunk")

    if embeddings is None:
        # Backward-compatible default: the global defined earlier in the notebook.
        embeddings = embedding_model

    print("Creating FAISS Index...")
    vector_db = FAISS.from_documents(chunks, embeddings)
    print("FAISS Index Created")
    return vector_db

In [None]:
# End-to-end demo on the PDF report: load -> chunk -> index -> retrieve.
# NOTE: this cell rebinds `vector_db`, `queries`, `query`, `retriever` and
# `retrievals`, replacing the values produced by the text-file cells.
print(" Starting Complete RAG Demo ")

file_path = "../../ai-report.pdf"
chunks = load_and_chunk(file_path)
vector_db = create_vector_database(chunks)

queries = [
    "What examples of AI-driven solutions in tutoring are given?"
]

query = queries[0]

retriever = vector_db.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 3},
)

# Use `invoke`, matching the earlier retrieval cell; `get_relevant_documents`
# is deprecated in favour of it.
retrievals = retriever.invoke(query)