# LLAMA3 RAG

### Index

In [5]:
import os
from dotenv import load_dotenv
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings
from langchain_community.document_loaders import FireCrawlLoader
from langchain_community.vectorstores.utils import filter_complex_metadata
from langchain.docstore.document import Document

# Load environment variables from .env file
load_dotenv()

local_llm = "llama3"

urls = [
    "https://mormonr.org/qnas/a9l1T/the_kinderhook_plates",
]

docs = [
    FireCrawlLoader(
        api_key=os.getenv("FIRECRAWL_API_KEY"), url=url, mode="scrape"
    ).load()
    for url in urls
]

# Split documents
docs_list = []
for sublist in docs:
    for item in sublist:
        docs_list.append(item)

text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=1024,
    chunk_overlap=30,
)

docs_splits = text_splitter.split_documents(docs_list)

# Filter out complex metadata and ensure proper document formatting
filtered_docs = []
for doc in docs_splits:
    # Ensure the doc is an instance of Document and has a 'metadata' attribute
    if isinstance(doc, Document) and hasattr(doc, "metadata"):
        clean_metadata = {
            k: v
            for k, v in doc.metadata.items()
            if isinstance(v, (str, int, float, bool))
        }
        filtered_docs.append(Document(doc.page_content, metadata=clean_metadata))

# Add to vectorDB. For deployment, we want to use pinecone
vectorstore = Chroma.from_documents(
    documents=filtered_docs,
    collection_name="rag-chroma",
    embedding=OpenAIEmbeddings(),
)

retriever = vectorstore.as_retriever()

### Retrieval Grader

In [6]:
from langchain.prompts import PromptTemplate
from langchain_community.chat_models import ChatOllama
from langchain_core.output_parsers import JsonOutputParser

llm = ChatOllama(temperature=0, format="json", model=local_llm)

prompt = PromptTemplate(
    template="""
    <|begin_of_text|><|start_header_id|>system<|end_header_id|> 
    You are a grader assessing relevance of a retrieved doucment to a user question. If the document contains keywords related to the user queston, grade it as relevant. It does not need to be a stringent test. The goal is to filter out erroneous retrievals. \n
    Give the binary score, 'yes' or 'no' score, to indicate whether the retrieved document is relevant to question. \n 
    Provide the binary score as a JSON with a single key 'score' and no premable or explanation. <|eot_id|><|start_header_id|>user<|end_header_id|>
    Here is the retrieved document: {document} \n
    Here is the user question: {question} \n <|eot_id|><|start_header_id|>assistant<|end_header_id|>
    """,
    input_variables=["document", "question"],  # do I really need this?
)

retrieval_grader = prompt | llm | JsonOutputParser()
question = (
    "Does kinderhook plates prove that Joseph Smith as a false prophet or a liar?"
)
docs = retriever.invoke(question)

doc_text = docs[1].page_content
print(retrieval_grader.invoke({"question": question, "document": doc_text}))

{'score': 'yes'}


### Generate Answer

In [7]:
# Generate
from langchain import hub
from langchain_openai import ChatOpenAI
from langchain_core.output_parsers import StrOutputParser

# prompt
prompt = PromptTemplate(
    template="""<|begin_of_text|><|start_header_id|>system<|end_header_id|> You are an assistant for question-answering tasks.
    Use the following pieces of retrived context to answer the question. When you answer the question, it's MOST important that you back it up with specific citations or source links. If the documents use footnotes, be sure to track it down and use the information provided by the footnotes. Provide the URL if you are using a source link. Give as many realiable sources you find in the documents. If there are conflicts or inconsistency between multiple sources you found from the retrived context, choose one based on sound logic (i.e., firsthand accounts are preferred over second hands account, verified resarch with newer dates are preferred) and explain why you made the choce. See the examples below. 
    
    Example 1
    Input: 
    Did Emma Smith push Fanny Alger down the stairs and cause her to miscarry?

    Output:
    Answer: No, based on this article, [source 1] This has been confused with a story circulated about Eliza R. Snow,[source 2, 3] but that story is unlikely to be true. [source 4]
    
    Sources:
    [1] https://mormonr.org/qnas/qp3yc/joseph_smith_and_fanny_alger
    [2] https://mormonr.org/qnas/qp3yc/joseph_smith_and_fanny_alger#bio-0MvZJi
    [3] https://mormonr.org/qnas/qp3yc/joseph_smith_and_fanny_alger#footnote-marker-55  
    [4] https://mormonr.org/qnas/qp3yc/joseph_smith_and_fanny_alger/research#re-psWfCb-sF7Akb 
             
    
    Example 2
    Input:
    Were Joseph and Fanny caught having sex in a barn?

    Answer: Most unlikely. The only historical record with that detail comes from an 1872 account from William McLellin [source 1], who claimed Joseph and Fanny were "caught in the act" of being "sealed" in a barn by Emma Smith.[source 2, 3, 4] In addition to the account being thirdhand and a recollection from many decades later, McLellin had been excommunicated for apostasy, had a personal vendetta against Joseph, and was an active participant in the Missouri mobs.[source 5, 6]
    
    Sources:
    [1] https://mormonr.org/qnas/qp3yc/joseph_smith_and_fanny_alger#bio-0lnabw 
    [2] https://mormonr.org/qnas/qp3yc/joseph_smith_and_fanny_alger#bio-mgbYrb 
    [3] https://mormonr.org/qnas/qp3yc/joseph_smith_and_fanny_alger#footnote-50 
    [4] https://mormonr.org/qnas/qp3yc/joseph_smith_and_fanny_alger/research#re-jZTiDc-eUuNic 
    [5] https://mormonr.org/qnas/qp3yc/joseph_smith_and_fanny_alger#footnote-51 
    [6] https://mormonr.org/qnas/qp3yc/joseph_smith_and_fanny_alger/research#re-psWfCb-OmYWic 
    

    If you don't know the answer, just say that you don't know. <|eot_id|><|start_header_id|>user<|end_header_id|>
    Question: {question}
    Context: {context}
    Answer: <|eot_id|><|start_header_id|>assistant<|end_header_id|>
    """,
    input_variables=["question", "context"],
)

llm = ChatOpenAI(temperature=0.5, model="gpt-4o")

rag_chain = prompt | llm | StrOutputParser()

question = (
    "Does kinderhook plates prove that Joseph Smith as a false prophet or a liar?"
)
docs = retriever.invoke(question)
generation = rag_chain.invoke({"context": docs, "question": "question"})
print(generation)

Based on the provided context, it is clear that Joseph Smith did not attempt to use revelation, the seer stone, or the Urim and Thummim when examining the Kinderhook plates. Instead, Joseph seemed to have used secular methods to analyze the plates. The historical record indicates that Joseph made a cursory translation of one of the symbols on the plates, likely comparing it to his Egyptian studies, and then asked for an expert to authenticate them. He subsequently lost interest and never followed up on the matter ([source](https://mormonr.org/qnas/a9l1T/the_kinderhook_plates)).

In summary, Joseph Smith did not use revelation, the seer stone, or the Urim and Thummim in relation to the Kinderhook plates.


In [None]:
# Just playing with Jina AI for readable crawl, it's free! But it didn't parse footnotes, perhaps llamaparse or llamaindex would do a better job.
import requests


def scrape_jina_ai(url: str) -> str:
    response = requests.get("https://r.jina.ai/" + url)
    return response.text


jina_response = scrape_jina_ai("https://mormonr.org/qnas/a9l1T/the_kinderhook_plates")
print(jina_response)

### Web Search via Tavily

In [None]:
travily_api_key = os.getevn("TAVILY_API_KEY")

from langchain_community.tools.tavily_search import TavilySearchResults

web_search_tool = TavilySearchResults(k=3, api_key=travily_api_key)