In [1]:
# Standard library: gives access to the process environment variables.
import os

# Third-party: the OpenAI client library and the .env file loader.
import openai
from dotenv import load_dotenv

# Load the variables declared in file.env (this is where the API key lives),
# then pass the key to the OpenAI client.
# os.environ[...] raises KeyError if OPENAI_API_KEY is still unset at this point.
load_dotenv("file.env")
openai.api_key = os.environ["OPENAI_API_KEY"]

In [2]:
# Install required dependencies for embeddings, retrieval, and PDF loading
    pip install langchain langchain-openai pypdf faiss-cpu


Note: you may need to restart the kernel to use updated packages.


In [3]:
# Loader for reading PDF documents
from langchain.document_loaders import PyPDFLoader
# Split text into manageable chunks for embeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter 

There was a problem when trying to write in your cache folder (C:\Users\ayodelefalajiki/.cache\huggingface\hub). You should set the environment variable TRANSFORMERS_CACHE to a writable directory.
None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


In [5]:
# Point a PyPDFLoader at the source PDF and read it in.
# (raw string so the backslashes in the Windows path are kept literally)
loader = PyPDFLoader(
    file_path=r"C:\Users\ayodelefalajiki\Desktop\Grad_school_ Doc\SOP GEORGIA STATE UNIVERSITY.pdf"
)
document = loader.load()

In [9]:
# Break the loaded PDF into overlapping chunks with a recursive splitter
# (chunk_size=500, with 150 of overlap between neighbouring chunks).
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=150,
    chunk_size=500,
)
docs = text_splitter.split_documents(document)


In [14]:
# Import OpenAI embeddings model (used to convert text into numerical vectors)
from langchain_openai import OpenAIEmbeddings 

# Import FAISS (Facebook AI Similarity Search) as the vector store for storing and retrieving embeddings
from langchain.vectorstores import FAISS

In [20]:
# Initialize the embedding model that turns each chunk into a vector.
embeddings = OpenAIEmbeddings()

# Build a FAISS vector store from the chunks using that embedding model.
vectordb = FAISS.from_documents(documents=docs, embedding=embeddings)

In [18]:
# Folder on disk that will hold the saved FAISS index.
# (a raw string r"" keeps the backslashes in the Windows path literal)
persist_directory = r"C:\Users\ayodelefalajiki\Desktop\FAISS"

# Write the FAISS vector store to that folder so it can be reloaded later.
vectordb.save_local(folder_path=persist_directory)

In [25]:
# Reload the FAISS vector store from local storage.
#   folder_path  - the directory the index was saved to
#   embeddings   - the same embeddings object that was used to build the index
#   index_name   - name of the saved index; "index" is passed explicitly here
#   allow_dangerous_deserialization - enable only for an index whose source you trust
vectordb = FAISS.load_local(
    folder_path=persist_directory,
    embeddings=embeddings,
    index_name="index",
    allow_dangerous_deserialization=True,
)

In [44]:
# Question to run against the FAISS vector store.
query = "what is the document about"

# Max Marginal Relevance (MMR) search; k=3 asks for three chunks.
results = vectordb.max_marginal_relevance_search(query, k=3)

# Show a numbered header and the first 200 characters of each returned chunk.
for i, doc in enumerate(results, start=1):
    print(f"Result {i}:\n{doc.page_content[:200]} \n")

Result 1:
Additionally, I am interested in Professor Lipeng Wan research study on Adios 2: The adaptable 
input output system. a framework for high -performance data management. His work focuses 
on a tool that 

Result 2:
One key finding was a consistent range of two to three million vehicles traversing Minnesota’s 
interstate annually until 2018, when we observed a sudden shift and noticeable decline in traffic 
volum 

Result 3:
COMPUTER SCIENCE  MSA STATEMENT OF PURPOSE, GEORGIA STATE 
UNIVERSITY  
My research focuses on  an intersection between Data Science and Analytics , an area I find 
increasingly essential as data tran 



In [36]:
# Import OpenAI chat model (used as the LLM for answering questions)
from langchain_openai import ChatOpenAI
# Import RetrievalQA chain (connects retriever with LLM for end-to-end Q&A)
from langchain.chains import RetrievalQA

In [37]:
# Chat model used to generate the answers.
# temperature=0 keeps the responses as deterministic and focused as possible.
llm = ChatOpenAI(
    temperature=0,
    model="gpt-4o-mini",
)

In [43]:
# Convert the FAISS vector store into a retriever
# - search_type="similarity": retrieves the most relevant chunks based on similarity
# - search_kwargs={"k": 4}: fetches the top 4 matching chunks
# (the keyword was previously misspelled "seach_type", so the intended setting
#  was never actually passed to the retriever)
retriever = vectordb.as_retriever(search_type="similarity", search_kwargs={"k": 4})

# Build a RetrievalQA chain
# - llm: the language model that generates the answer
# - retriever: the retriever that fetches the relevant chunks
# - chain_type="stuff": the simplest chain (concatenates retrieved docs and passes them to the LLM)
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    chain_type="stuff",
)

# Ask a question and get an answer from the document.
# The response is a dict holding the original 'query' and the generated 'result'.
response = qa_chain.invoke("What is the document about")
print("Answer:", response)

# You can also switch to a maximal marginal relevance search (search_type="mmr")
# to compare with the similarity search.



Answer: {'query': 'What is the document about', 'result': "The document appears to be a statement of purpose for a graduate program in computer science, specifically focusing on data science and analytics. It discusses the author's research interests, including work on high-performance data management systems, privacy challenges in data analysis, and a comparative analysis of resource allocation algorithms in device-to-device communication. The author expresses a desire to advance the field through critical thinking and research, and mentions the influence of mentors in their academic journey."}


16
