In [None]:
%pip install langchain_community pymupdf openai faiss-cpu tiktoken

In [None]:
# Path of the PDF to index; passed to parse_pdf() further down.
my_file = "./S&DS 230 Final Report (1).pdf"
# Question used both for the similarity search and for the LLM prompt.
query = "What is this report about?"

In [None]:
from langchain_community.document_loaders import PyMuPDFLoader

def parse_pdf(file_path):
    """Load a PDF through ``PyMuPDFLoader`` and return what the loader yields.

    Args:
        file_path: Location of the PDF file, handed unchanged to
            ``PyMuPDFLoader``.

    Returns:
        The result of ``PyMuPDFLoader(file_path).load()``. Later cells
        iterate over it and read ``.page_content`` from each item.
    """
    return PyMuPDFLoader(file_path).load()

# Parse the PDF named by `my_file`. To analyse a different document, change
# `my_file` where it is defined rather than overriding it here.

docs = parse_pdf(file_path=my_file)

In [32]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Split each page into overlapping pieces: at most 1000 characters per chunk,
# with 100 characters shared between neighbouring chunks.
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)

# Flat list of chunk strings, in document order.
chunks = [
    piece
    for doc in docs
    for piece in splitter.split_text(doc.page_content)
]


In [33]:
from dotenv import load_dotenv
# Load variables from a local .env file into the process environment.
# NOTE(review): presumably this supplies OPENAI_API_KEY for the OpenAI
# clients created in later cells — confirm.
load_dotenv()

True

In [34]:
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.docstore.document import Document

# Fail early with a clear message: with no chunks (for example a PDF that
# yielded no extractable text) there is nothing to embed, and the index
# construction would otherwise fail inside the library with an opaque error.
if not chunks:
    raise ValueError(
        "No text chunks to index; check that the PDF contains extractable text."
    )

# Wrap every chunk string in a Document.
# NOTE: this rebinds `docs`, replacing the page-level documents loaded from
# the PDF with chunk-level ones.
docs = [Document(page_content=chunk) for chunk in chunks]
embedding_model = OpenAIEmbeddings()
# Embed all chunks and build a FAISS index over the resulting vectors.
faiss_index = FAISS.from_documents(docs, embedding_model)

# Persist the index under "faiss_index" so it can be reloaded later.
faiss_index.save_local("faiss_index")

In [39]:
from langchain.chat_models import ChatOpenAI
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain

# Prompt text with two placeholders: the retrieved context and the question.
rag_prompt_template = """
You are a helpful assistant. Use the following context to answer the question. 
If the context does not contain the answer, say you don't know — do not make anything up.

Context:
{context}

Question: {question}
Answer:
"""

# Template object that fills in {context} and {question}.
prompt = PromptTemplate(
    template=rag_prompt_template,
    input_variables=["context", "question"],
)

# Chat model that writes the final answer.
llm = ChatOpenAI(model_name="gpt-4", temperature=0.5)

# Chain that formats the prompt and sends it to the model.
chain = LLMChain(prompt=prompt, llm=llm)

In [None]:
# Reload the index saved under "faiss_index".
# SECURITY: allow_dangerous_deserialization=True opts in to unpickling the
# stored data. That is acceptable only because this notebook wrote the index
# itself; never enable it for index files from an untrusted source.
loaded_faiss = FAISS.load_local(
    "faiss_index",
    embedding_model,
    allow_dangerous_deserialization=True,
)

# Retrieve the four chunks most similar to the question and join them into a
# single context string, separated by blank lines.
results = loaded_faiss.similarity_search(query=query, k=4)
retrieved_chunks = "\n\n".join(res.page_content for res in results)

# Ask the model to answer the question using only the retrieved context.
output = chain.run(context=retrieved_chunks, question=query)
print(output)

The report discusses the use of statistical tools and knowledge to examine the patterns and predictors of life expectancy across the world. It covers the process of data cleaning, exploratory graphics, basic statistical tests, correlation exploration, multiple regression, ANOVA, ANCOVA, and logistic regression. It also talks about the selection of variables for the statistical analysis and the handling of missing values in the data.
