In [13]:
from langchain_community.document_loaders import WebBaseLoader, TextLoader, PyPDFLoader
from langchain_openai import ChatOpenAI
import bs4
import os
from dotenv import load_dotenv

# Load environment variables from the project's .env file.
load_dotenv(r'../venv/.env')

# Expose the key stored under OPENAI_API_KEY2 as OPENAI_API_KEY (presumably the
# name the OpenAI client looks up -- confirm against langchain_openai docs).
# os.getenv returns None when the variable is missing, and assigning None into
# os.environ raises an opaque "str expected, not NoneType" TypeError, so fail
# early with an actionable message instead.
openai_api_key = os.getenv("OPENAI_API_KEY2")
if not openai_api_key:
    raise RuntimeError(
        "OPENAI_API_KEY2 is not set; define it in ../venv/.env or in the environment."
    )
os.environ["OPENAI_API_KEY"] = openai_api_key


#### Loading

In [2]:
# Plain-text source: read the local file into LangChain documents.
loader = TextLoader("input.txt")
text_docs = loader.load()

# Web source: fetch the blog post, restricting HTML parsing to elements whose
# CSS class is one of the post title / content / header classes.
post_sections = bs4.SoupStrainer(
    class_=("post-title", "post-content", "post-header")
)
web_loader = WebBaseLoader(
    web_paths=("https://lilianweng.github.io/posts/2023-06-23-agent/",),
    bs_kwargs={"parse_only": post_sections},
)
web_docs = web_loader.load()

# PDF source: read the local PDF into LangChain documents.
pdf_loader = PyPDFLoader("rag_pdf.pdf")
pdf_docs = pdf_loader.load()

#### Transform

In [6]:
# Append the web and text documents (in that order) onto the PDF documents so
# that `pdf_docs` holds the full corpus from here on.
# NOTE: this mutates `pdf_docs` in place, so re-running this cell without
# re-running the loading cell appends the same documents again.
pdf_docs.extend([*web_docs, *text_docs])

In [7]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
# Break the combined corpus into overlapping chunks: up to 1000 characters
# per chunk, with 200 characters shared between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
)
documents = text_splitter.split_documents(pdf_docs)


In [9]:
# Number of chunks the splitter produced from the combined documents.
len(documents)

88

#### Embeddings

In [10]:
from langchain_openai import OpenAIEmbeddings
from langchain_community.embeddings import OllamaEmbeddings
from langchain_community.vectorstores import Chroma 

# Embed every chunk with OpenAI embeddings and index the vectors in a Chroma
# vector store.
embedding_model = OpenAIEmbeddings()
db = Chroma.from_documents(documents=documents, embedding=embedding_model)

In [11]:
# querying vector database
query = "instructor name"
db.similarity_search(query)[0]

Document(page_content='academic misconduct to appropriate personnel. If you have any questions or concerns, please consult with the \ninstructor or TAs in this class.  \n \nCommitment to a Safe and Inclusive Learning Environment  \nThe Herbert Wertheim College of Engineering values varied perspectives and lived experiences within our \ncommunity and is committed to supporting the University’s core values, including the elimi nation of discrimination.  \nIt is expected that every person in this class will treat one another with dignity and respect regardless of race, creed, \ncolor, religion, age, disability, sex, sexual orientation, gender identity and expression, marital status, national origin, \npolitical opinions or affiliations, genetic information, and veteran status.', metadata={'page': 2, 'source': 'rag_pdf.pdf'})

In [21]:
# chatprompt template
from langchain_core.prompts import ChatPromptTemplate

# Prompt for the question-answering chain. `{context}` is filled with the
# retrieved documents and `{input}` with the user's question.
# The template text is kept flush-left on purpose: any indentation inside the
# triple-quoted string would be sent to the model verbatim.
prompt = ChatPromptTemplate.from_template(
    """Answer the following question only using the following context.
<context>
{context}
</context>
Question : {input}
"""
)


In [26]:

from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains.retrieval import create_retrieval_chain

# Chat model (library defaults) and a retriever view over the Chroma store.
llm = ChatOpenAI()
retriever = db.as_retriever()

# First build the chain that places the retrieved documents into the prompt
# and calls the model, then wrap it with the retriever so a single `invoke`
# performs retrieval followed by answering.
document_chain = create_stuff_documents_chain(llm=llm, prompt=prompt)
retrieval_chain = create_retrieval_chain(
    retriever=retriever,
    combine_docs_chain=document_chain,
)



In [28]:
# Ask for the TA names; the returned dict echoes the input and includes the
# retrieved context documents.
question = "who are the teaching assistants for this course CDA 5155: Computer Architecture Principles, just give me names nothing else"
retrieval_chain.invoke({"input": question})

{'input': 'who are the teaching assistants for this course CDA 5155: Computer Architecture Principles, just give me names nothing else',
 'context': [Document(page_content='CDA 5155: Computer Architecture Principles                                                                                                                                             Page 1 \nPrabhat Mishra, Fall  2023  CDA 5155: Computer Architecture Principles  \nSections: 28700  \nClass Periods:    Tuesday  3:00 – 4:55 pm and Thursday 4:05 – 4:55 pm  \nLocation:    CSE E121  \nAcademic Term:   Fall 2023  \n \nInstructor:  \nName : Prabhat Mishra  \nEmail Address : prabhat@ufl.edu  \nOffice Phone Number : 352 294 6658  \nOffice Hours:   Wednesday  3:00 – 5:00 pm in CSE 568   \n \nTeaching Assistants:   \nName : Anisha Ashok Wadhwani  \nEmail Address: a.wadhwani @ufl.edu  \nOffice Hours:   Monday 3:00 – 5:00 pm in CSE 309  \n \nName : Uma Saisree Avula  \nEmail Address: umasaisree.avula @ufl.edu  \nOffice Hours:   

In [25]:
# Ask which university the retrieved material refers to.
question = "In this context, what the university is being discussed"
retrieval_chain.invoke({"input": question})

{'input': 'In this context, what the university is being discussed',
 'context': [Document(page_content='governing software use.  Failure to do so can lead to monetary damages and/or criminal penalties for the individual \nviolator.  Because such violations are also against University policies and rul es, disciplinary action will be taken as \nappropriate.  We, the members of the University of Florida community, pledge to uphold ourselves and our peers to \nthe highest standards of honesty and integrity.  \n \nStudent Privacy  \nThere are federal laws protecting yo ur privacy with regards to grades earned in courses and on individual \nassignments.  For more information, please see:  https://registrar.ufl.edu/ferpa.html  \n \nCampus Resources:  \nHealth and Wellness  \nU Matt er, We Care:  \nYour well -being is important to the University of Florida.   The U Matter, We Care initiative is committed to \ncreating a culture of care on our campus by encouraging members of our community to 