In [None]:
#run 1-1 line in each cell if all dont work together.
#pip install -qU pypdf
#pip install sentence-transformers
#pip install hf_xet

In [1]:
#Importing necessary models
from langchain.llms import Ollama
from langchain.chains import RetrievalQA
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [2]:
#setting up imp paths
marketingdbfaiss_localstore__vectordbpath = r"/Users/hitesh.modi/Desktop/Kinda Personal/LLM Marketing Bot/FAISS_marketingbot"

In [9]:
#loading our mistral model
llm = Ollama(model="mistral")
debug_llm = Ollama(model="qwen2.5-coder:7b")
#print(llm("Test! are you working?"))

In [27]:
print(debug_llm("""
                Can you debug this piece of python code for me?:
                my original function was this
                #Reading our pdf file via langchain
loader = PyPDFLoader(filepaths[0])
pages = []
async for page in loader.alazy_load():
    pages.append(page)
    it returned a list object.mro
    
    
    The function that you helped me with
    async def read_pdfs_into_pages(filepaths):
    pages = []
    for filepath in filepaths:
        loader = PyPDFLoader(filepath)
        async for page in loader.alazy_load():
            pages.append(page)
    return pages
    
    returns a coroutine object
                """))

The reason why your function is returning a coroutine object instead of the list of pages is because you're using an `async for` loop inside another `async for` loop. This means that each iteration of the outer loop will start a new coroutine, but it won't wait for it to finish before moving on to the next one.

To fix this, you should use a regular `for` loop instead of an `async for` loop to iterate over the filepaths. You can then call `await loader.alazy_load()` inside the loop, which will wait for each PDF file to be fully loaded and appended to the `pages` list before moving on to the next one.

Here's the corrected code:

```python
import asyncio

async def read_pdfs_into_pages(filepaths):
    pages = []
    for filepath in filepaths:
        loader = PyPDFLoader(filepath)
        async for page in loader.alazy_load():
            pages.append(page)
    return pages
```

You can then call this function using `await`:

```python
filepaths = ['path/to/file1.pdf', 'path/to/file2.pd

In [3]:
filepaths = ["/Users/hitesh.modi/Desktop/Kinda Personal/LLM Marketing Bot/pdf_files/Alex Hormozi 100 million leads.pdf",
             "/Users/hitesh.modi/Desktop/Kinda Personal/LLM Marketing Bot/pdf_files/Alex Hormozi 100m Offers.pdf",
             "/Users/hitesh.modi/Desktop/Kinda Personal/LLM Marketing Bot/pdf_files/DotCom Secrets Russel Brunson.pdf",
             "/Users/hitesh.modi/Desktop/Kinda Personal/LLM Marketing Bot/pdf_files/Expert-Secrets-Russel Brunson.pdf",
             "/Users/hitesh.modi/Desktop/Kinda Personal/LLM Marketing Bot/pdf_files/Russel Brunson Lead Funnels.pdf",
             "/Users/hitesh.modi/Desktop/Kinda Personal/LLM Marketing Bot/pdf_files/Sabri Suby Sell like crazy.pdf",
             "/Users/hitesh.modi/Desktop/Kinda Personal/LLM Marketing Bot/pdf_files/Traffic Secrets Russell Brunson.pdf"]

In [4]:
async def read_pdfs_into_pages(filepaths):
    pages = []
    for filepath in filepaths:
        loader = PyPDFLoader(filepath)
        async for page in loader.alazy_load():
            pages.append(page)
    return pages

In [5]:
def split_pages(pages):
    text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=100,
    separators=["\n\n", "\n", " ", ""]
    )

    docs = text_splitter.split_documents(pages)
    return docs

In [6]:
def create_faiss_embeddings(docs):
    #All have been splitted correctly, now time to load embeddings creator
    embedding = HuggingFaceEmbeddings(
    model_name="all-MiniLM-L6-v2"
    )   
    embeddings = FAISS.from_documents(docs, embedding)
    return embeddings

In [7]:
def get_answers_from_FAISS_usingllm(FAISS_embeddings, query):
    qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=FAISS_embeddings.as_retriever(search_kwargs={"k": 3}),
    return_source_documents=True
)

    return(qa_chain.invoke(query))

In [8]:
all_marketing_pdf_pages = await read_pdfs_into_pages(filepaths)

In [None]:
print("Metadata is : ")
print(f"{all_marketing_pdf_pages[1].metadata}\n")
print(" ")
print("content is : ")
print(all_marketing_pdf_pages[1].page_content)

In [10]:
all_marketing_pdf_split = split_pages(all_marketing_pdf_pages)

In [17]:
all_marketing_pdf_split[0].id

In [12]:
#Requires internet
all_marketing_embeddings = create_faiss_embeddings(all_marketing_pdf_pages)

  embedding = HuggingFaceEmbeddings(
  from .autonotebook import tqdm as notebook_tqdm


In [None]:
#DB related operations

In [None]:
# Saving FAISS vector DB
all_marketing_embeddings.save_local(marketingdbfaiss_localstore__vectordbpath)

In [None]:
# Loading FAISS db from local

"""
allow_dangerous_deserialization=True
this is just a warning, if the warning comes you can enable this as we ahve created our model locally!
"""

embedding = HuggingFaceEmbeddings(
    model_name="all-MiniLM-L6-v2"
    ) 
all_marketing_embeddings = FAISS.load_local(folder_path=marketingdbfaiss_localstore__vectordbpath, embeddings=embedding,allow_dangerous_deserialization=True)

In [None]:
#Starting to query our LLM

In [47]:
question = """
You have information on what all books from the below mentioned authors?
1) Alex Hormozi
2) Russel Brunson
3) Sabri Suby
"""
response = get_answers_from_FAISS_usingllm(all_marketing_embeddings, question)
print(response)

{'query': '\nYou have information on what all books from the below mentioned authors?\n1) Alex Hormozi\n2) Russel Brunson\n3) Sabri Suby\n', 'result': "1) Alex Hormozi - Traffic Secrets (as mentioned in the context)\n  2) Russell Brunson is not explicitly mentioned in the provided context, but he has written books such as DotCom Secrets and Expert Secrets.\n  3) Sabri Suby is also not explicitly mentioned in the provided context, so it's unclear if there are any books from this author within the given context.", 'source_documents': [Document(page_content='ALEX HORMOZI', metadata={'source': '/Users/hitesh.modi/Desktop/Kinda Personal/LLM Marketing Bot/pdf_files/Alex Hormozi 100m Offers.pdf', 'page': 3}), Document(page_content='Figure\t0.3:\t\nTraffic\tSecrets\n\thelps\tmarketers\tlearn\tthe\tstrategies\tto\tdrive\tconsistent\ttraffic\tto\ttheir\nfunnels.\nEach\tbook\tin\tthe\tSecrets\ttrilogy\twas\twritten\tas\ta\tstand-alone\tplaybook,\tbut\tmastering\tthe\nskills\tfrom\tall\tthree\tboo

In [48]:
response

{'query': '\nYou have information on what all books from the below mentioned authors?\n1) Alex Hormozi\n2) Russel Brunson\n3) Sabri Suby\n',
 'result': "1) Alex Hormozi - Traffic Secrets (as mentioned in the context)\n  2) Russell Brunson is not explicitly mentioned in the provided context, but he has written books such as DotCom Secrets and Expert Secrets.\n  3) Sabri Suby is also not explicitly mentioned in the provided context, so it's unclear if there are any books from this author within the given context.",
 'source_documents': [Document(page_content='ALEX HORMOZI', metadata={'source': '/Users/hitesh.modi/Desktop/Kinda Personal/LLM Marketing Bot/pdf_files/Alex Hormozi 100m Offers.pdf', 'page': 3}),
  Document(page_content='Figure\t0.3:\t\nTraffic\tSecrets\n\thelps\tmarketers\tlearn\tthe\tstrategies\tto\tdrive\tconsistent\ttraffic\tto\ttheir\nfunnels.\nEach\tbook\tin\tthe\tSecrets\ttrilogy\twas\twritten\tas\ta\tstand-alone\tplaybook,\tbut\tmastering\tthe\nskills\tfrom\tall\tthree\