In [None]:
## Tutorial: https://python.langchain.com/docs/integrations/llms/huggingface_endpoint/
## Inspired by: https://medium.com/@mohammed97ashraf/building-a-retrieval-augmented-generation-rag-model-with-gemma-and-langchain-a-step-by-step-f917fc6f753f

!pip install langchain
!pip install huggingface_hub
!pip install beautifulsoup4
!pip install sentence-transformers
!pip install chromadb
!pip install langchainhub

In [None]:
from langchain_community.llms import HuggingFaceEndpoint
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate
import os

## Can get huggingface token from: https://huggingface.co/settings/tokens
os.environ["HUGGINGFACEHUB_API_TOKEN"] = ''

## Select a model available on huggingface
model_selection = "mistralai/Mistral-7B-Instruct-v0.2"


In [41]:
#### Create LLM/huggingface connection with API token, model, and setting lengths/temperature...
llm = HuggingFaceEndpoint(
    repo_id=model_selection, max_length=128, temperature=0.5, token=os.environ["HUGGINGFACEHUB_API_TOKEN"]
)

                    max_length was transferred to model_kwargs.
                    Please make sure that max_length is what you intended.
                    token was transferred to model_kwargs.
                    Please make sure that token is what you intended.


Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [42]:
#### setup quesiton / response /
question = "Do you know if Hants Williams work at Stony Brook University? "
template = """Question: {question} Answer: Let's think step by step."""
prompt = PromptTemplate.from_template(template)

llm_chain = LLMChain(prompt=prompt, llm=llm)
print(llm_chain.run(question))

 First, we need to find out if Hants Williams is a real person. A simple internet search shows that Hants Williams is indeed a person, and he has been associated with various universities and research institutions. However, there is no clear indication that he has worked at Stony Brook University. It would be best to contact the university directly for confirmation. If you have more information or would like further assistance, please let me know.


In [43]:
##### Rag
from langchain_community.document_loaders import WebBaseLoader
##### Get some new Data
loader = WebBaseLoader("https://healthprofessions.stonybrookmedicine.edu/programs/ahi/faculty/hants_williams")
data = loader.load()
print(data)


[Document(page_content='\n\n\n\n\n\n\n\n\n\n\n\n\nHants Williams, PhD, RN | School of Health Professions\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n      Skip to main content\n    \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nHome\n\nContact Us\n\nGive\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nMaster of Science in Applied Health Informatics (MS/AHI) \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n          Home\n          \n\n\n\n\n\n\n\n\n\n          Curriculum\n          \n\n\n\n          Admissions\n          \n\n\n\n          Tuition, Financial Aid & Housing\n          \n\n\n\n          FAQs\n          \n\n\n\n          Apply\n          \n\n\n\n\n\n\n\n\n\n\n          Faculty and Staff\n          \n\n\n\n          Info Sessions\n          \n\n\n\n          Projects\n          \n\n\n\n          Alumni\n          \n\n\n\n          Apply\n          \n\n\n\n          Contact\n          \n\n\n\n          Give\n          \n\n\n

In [44]:
##### Parse the data
from langchain_community.document_loaders import TextLoader
from langchain_community.embeddings.sentence_transformer import (
    SentenceTransformerEmbeddings,
)
from langchain_community.vectorstores import Chroma
from langchain_text_splitters import CharacterTextSplitter

# split it into chunks
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
docs = text_splitter.split_documents(data)




In [45]:
# create the open-source embedding function
embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")


In [46]:
# load it into Chroma
db = Chroma.from_documents(docs, embedding_function)

In [47]:
from langchain import hub
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain.chains import RetrievalQA

retriever = db.as_retriever(search_type="mmr", search_kwargs={'k': 4, 'fetch_k': 20})
prompt = hub.pull("rlm/rag-prompt")

def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
)

In [48]:
## test the ragChain

rag_chain.invoke("Do you know if Hants Williams work at Stony Brook University?")


'\nYes, Hants Williams is a faculty member at Stony Brook University, specifically in the School of Health Professions and the Applied Health Informatics program.'