# Retriever Testing and Formatting
I'll use this to explore how to improve formatting of hte OpenSearch retriever

In [1]:
%load_ext autoreload
%autoreload 2

import sys
import os

# Assuming your notebook is in the 'notebooks' folder in the root directory
root_dir = os.path.dirname(os.getcwd())  # Navigate up one level to the root directory
sys.path.append(root_dir)  # Ap

In [8]:
from langchain.chains import ConversationalRetrievalChain
from datetime import datetime


In [2]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import json
from langchain.document_loaders import AsyncHtmlLoader
from langchain.document_transformers import BeautifulSoupTransformer, Html2TextTransformer
import re
from tqdm import tqdm
import pandas as pd
from joblib import Memory
import os
import boto3
import pandas as pd
from langchain.embeddings import HuggingFaceEmbeddings
from dotenv import load_dotenv
from langchain.document_loaders import DataFrameLoader
from langchain.schema import Document
from langchain.vectorstores import OpenSearchVectorSearch
from langchain.chains.query_constructor.base import AttributeInfo
from langchain.llms import OpenAI
from langchain.retrievers.self_query.base import SelfQueryRetriever
from langchain.chat_models import ChatAnthropic
from langchain.schema import HumanMessage
from requests_aws4auth import AWS4Auth
from opensearchpy import OpenSearch, RequestsHttpConnection
import boto3
from crawler.scrape import scrape_whole_domain,create_opensearch_index, generate_vectorstore
from dotenv import load_dotenv



In [3]:

anthropic_api_key = os.environ.get('ANTHROPIC_API_KEY')
secret_key = os.environ.get('AWS_ACCESS_KEY_ID')
access_key = os.environ.get('AWS_SECRET_KEY')
opensearch_endpoint = os.environ.get('OPENSEARCH_ENDPOINT')
opensearch_index = os.environ.get('OPENSEARCH_INDEX')
opensearch_https = os.environ.get('OPENSEARCH_HTTPS')

opensearch_admin = os.environ.get('OPENSEARCH_ADMIN')
opensearch_password = os.environ.get('OPENSEARCH_ADMIN_PASSWORD')


auth_creds = (opensearch_admin, opensearch_password) # For testing only. Don't store credentials in code.

embedding_model = HuggingFaceEmbeddings(model_name="BAAI/bge-small-en-v1.5")

opensearch_vectorstore = generate_vectorstore(auth_creds, embedding_model)

opensearch_vectorstore.similarity_search("hello", top_hits=5)

[Document(page_content='\n\n', metadata={'hash': 'd41d8cd98f00b204e9800998ecf8427e', 'source_url': 'https://www.citizensadvice.org.uk/about-us/our-work/our-campaigns/awareness-raising-campaigns/besw/', 'domain_description': 'Citizens Advice', 'scrape_date': '2024-01-12 11:44:59'}),
 Document(page_content='\n\n', metadata={'hash': 'd41d8cd98f00b204e9800998ecf8427e', 'source_url': 'https://www.citizensadvice.org.uk/work/employment-tribunals/employment-tribunals/before-you-go-to-the-tribunal/check-what-it-might-cost-to-make-an-employment-tribunal-claim/', 'domain_description': 'Citizens Advice', 'scrape_date': '2024-01-12 11:44:59'}),
 Document(page_content='\n\n', metadata={'hash': 'd41d8cd98f00b204e9800998ecf8427e', 'source_url': 'https://www.citizensadvice.org.uk/debt-and-money/action-your-creditor-can-take/bailiffs/how-bailiffs-should-treat-you/writing-a-complaint-letter-about-bailiffs/', 'domain_description': 'Citizens Advice', 'scrape_date': '2024-01-12 11:44:59'}),
 Document(page_c

In [6]:
from langchain.prompts import PromptTemplate, FewShotPromptTemplate

# example way of setting up few shot prompt templates
# create QA examples
EXAMPLES = [
{
    "question": "I’ve moved into a leasehold flat and have to pay service charges of £40 per week.  I get UC, can I get help to pay the charges?",
    "answer": """
    Yes, a housing costs element in UC can be paid for service charges that you have to pay to occupy your home.
    They must be related to the provision of adequate accommodation. Examples of service charges that may be included are:
    - Charges for using shared facilities, such as rubbish collection or communal lifts
    - Charges for using essential items in your home, such as domestic appliances
    - Charges window cleaning of upper floors
    """
}, {
    "question": "I get PIP as I’m disabled, but I also care for my elderly mum (aged 76).  Can I claim Carers Allowance?",
    "answer": """
    If your mum receives a disability benefit (middle/higher rate care component of DLA, daily living component of PIP, or AA),
    are not in gainful employment and you provide 35 hours a week care you might be able to claim CA.
    If your mum gets a severe disability addition/premium in Pension Credit or Housing Benefit this will stop if you are paid Carers Allowance.
    If you get the daily living component of PIP, the DWP might look again at your PIP award if the activities you are able to carry out as a carer
    for your mum suggest that you don’t need help with your own daily living activities. Being a carer and qualifying for PIP aren’t necessarily incompatible.
    It will depend on the reasons you get PIP and the tasks you carry out for your mum. For example, if you have a mental health condition and need prompting
    to carry out daily living tasks, but are physically able to carry them out there wouldn’t be a contradiction in receiving PIP and being a carer.
    """
}
]

# create a example template
EXAMPLE_TEMPLATE = """
User: {question}
AI: {answer}
"""

# create a prompt example from above template
EXAMPLE_PROMPT = PromptTemplate(
    input_variables=["question", "answer"],
    template=EXAMPLE_TEMPLATE
)

# now break our previous prompt into a prefix and suffix
# the prefix is our instructions
PREFIX = """
System: You are Caddy, a friendly and helpful AI that provides advice to advisors at Citizens Advice, a charity based in the United Kingdom.\
Advisors at Citizens Advice need to help citizens of the United Kingdom who come to Citizens Advice with a broad range of issues.\
Your role is to help the advisors with answering the questions that are given to them by citizens. You are not a replacement for human judgement\
but you can help humans make more informed decisions. Caddy is truthful, and creates action points for the advisor with clients options from a range of sources\
Caddy provides specific details from its context.\
If Caddy does not know the answer to a question, Caddy truthfully says they does not know.

Caddy: OK, got it, I'll be a helpful and truthful AI to support advisers to give accurate advise to citizens.

System: Advisors will ask you to provide advice on a citizen's question which can often be cross-cutting, do you know what that means?

Caddy: Yes, I understand that cross-cutting issues in this context means that issues will have multiple themes.

System: That's right, it's important to understand that an issue related to a disabled person falling behind on their energy bills relates to \
energy, debt, benefits and disability-based discrimination. Your advice to the advisor at Citizens Advice should take into account all of \
these themes.

Caddy: Okay, got it, I understand!

System: You will also encounter questions where citizens say they are having issues in specific locations in the United Kingdom. \
The United Kingdom is made up of four separate nations - England, Scotland, Wales and Northern Ireland - which can have different laws \
and therefore means that your answer must be relevant to the nation that the question has mentioned. For example, if a question refers to a town in Wales, \
such as Cardiff, then your answer must be relevant to the rules and laws in Wales.

Caddy: Yes, I understand that if a question references a location I will check which nation in the United Kingdom this is in and make sure that my answer \
is relevant to relevant nation.
"""
# and the suffix our user input and output indicator
SUFFIX = """
Adviser: Here are a few documents in <documents> tags:
<documents>
{context}
</documents>
Based on the above documents, provide a detailed answer for, {question}. Be concise in your response and make sure to include reference to any location names \
stated in the question, and make sure your answer is relevant to the laws and rules of the location specified in the question.

If the question discusses 'my client', your answer should refer to 'your client'. \
In your answer, refer to the documents you use as information rather than documents. \
Do not cite the url of your sources directly in your response.

If information is needed to definitively answer the question, phrase this as a step by step list of questions that the adviser should ask the client and use language like 'could be' instead if 'is'. In the list of questions, use simple language.

".

Use <b>bold</b> to highlight the most question relevant parts in your response.

Caddy:
"""

CORE_PROMPT = FewShotPromptTemplate(
    examples=EXAMPLES,
    example_prompt=EXAMPLE_PROMPT,
    prefix=PREFIX,
    suffix=SUFFIX,
    input_variables=["question", "context"],
    # example_separator="\n\n"
)

CONDENSE_QA_TEMPLATE = """
Given the following conversation and a follow up question, rephrase the follow up question
to be a standalone question.

Chat History:
{chat_history}
Follow Up Input: {question}
Standalone question:"""

STANDALONE_PROMPT = PromptTemplate.from_template(CONDENSE_QA_TEMPLATE)


In [7]:
STANDALONE_PROMPT = PromptTemplate.from_template(CONDENSE_QA_TEMPLATE)
CORE_PROMPT = FewShotPromptTemplate(
    examples=EXAMPLES,
    example_prompt=EXAMPLE_PROMPT,
    prefix=PREFIX,
    suffix=SUFFIX,
    input_variables=["question", "context"],
    # example_separator="\n\n"
)

In [33]:
llm = ChatAnthropic(
    temperature=0.2,
    max_tokens=500,
    anthropic_api_key=anthropic_api_key,
    verbose=True
    )



retriever = opensearch_vectorstore.as_retriever()

chain = ConversationalRetrievalChain.from_llm(
    llm=llm,
    max_tokens_limit=500,
    retriever=retriever,
    condense_question_prompt=STANDALONE_PROMPT,
    return_source_documents=True,
    return_generated_question=True,
    combine_docs_chain_kwargs={"prompt":CORE_PROMPT},
    verbose=True
)

ai_prompt_timestamp = datetime.now()


In [34]:
ai_response = chain({"question": prompt, "chat_history": history})




[1m> Entering new StuffDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3m
System: You are Caddy, a friendly and helpful AI that provides advice to advisors at Citizens Advice, a charity based in the United Kingdom.Advisors at Citizens Advice need to help citizens of the United Kingdom who come to Citizens Advice with a broad range of issues.Your role is to help the advisors with answering the questions that are given to them by citizens. You are not a replacement for human judgementbut you can help humans make more informed decisions. Caddy is truthful, and creates action points for the advisor with clients options from a range of sourcesCaddy provides specific details from its context.If Caddy does not know the answer to a question, Caddy truthfully says they does not know.

Caddy: OK, got it, I'll be a helpful and truthful AI to support advisers to give accurate advise to citizens.

System: Advisors will ask you to provide

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)



[1m> Finished chain.[0m

[1m> Finished chain.[0m


In [35]:
ai_response

{'question': 'can you help me build a bridge?',
 'chat_history': [],
 'answer': ' Unfortunately, there were no documents provided to reference in order to provide a detailed answer to the question "can you help me build a bridge?". Without any context or details about the specific situation, I cannot give a definitive response. \n\nHowever, here is a general outline of the type of information the advisor would need to gather from their client in order to provide relevant advice:\n\nThe advisor should ask their client:\n\n<b>- Where exactly they would like to build a bridge (to determine relevant laws/regulations)</b>\n\n- What purpose the bridge would serve (pedestrian, vehicle, etc.) \n\n- Whether they have approval from local authorities to build on the land\n\n- If they own the land on both sides of the bridge or have easements\n\n- What their budget is for the project\n\n- If they have engineering plans/drawings for the bridge\n\nBased on the client\'s responses, the advisor <b>cou

In [19]:
prompt = 'can you help me build a bridge?'

history = []

In [20]:
# search for documents using retriever

opensearch_vectorstore.similarity_search(prompt, top_hits=5)

[Document(page_content="\n\nWe use cookies to improve your experience of our website. You can find out\nmore or opt-out from some cookies.\n\nSkip to navigation Skip to main content Skip to footer\n\n  * Cymraeg\n  * Sign in\n\nSearch\n\n  * Benefits\n  * Work\n  * Debt and money\n  * Consumer\n  * Housing\n  * Family\n  * Law and courts\n  * Immigration\n  * Health\n  * More from us\n\n  * Home\n  * Consumer\n  * Home improvements\n  * Before you get building work done\n\n#  Before you get building work done\n\nThis advice applies to England.  See advice for See advice for Northern\nIreland, See advice for Scotland, See advice for Wales\n\nThese steps should help you save time, money and stress when you’re preparing\nto get building work, renovations or repairs done on your home. They’ll also\nhelp you avoid problems with builders, plumbers or other contractors - for\nexample decorators and electricians.\n\n##  Step 1: Check if you need permission or approval\n\nYou may have to get mo

In [25]:
retriever.get_relevant_documents(prompt, top_hits=5)

[Document(page_content="\n\nWe use cookies to improve your experience of our website. You can find out\nmore or opt-out from some cookies.\n\nSkip to navigation Skip to main content Skip to footer\n\n  * Cymraeg\n  * Sign in\n\nSearch\n\n  * Benefits\n  * Work\n  * Debt and money\n  * Consumer\n  * Housing\n  * Family\n  * Law and courts\n  * Immigration\n  * Health\n  * More from us\n\n  * Home\n  * Consumer\n  * Home improvements\n  * Before you get building work done\n\n#  Before you get building work done\n\nThis advice applies to England.  See advice for See advice for Northern\nIreland, See advice for Scotland, See advice for Wales\n\nThese steps should help you save time, money and stress when you’re preparing\nto get building work, renovations or repairs done on your home. They’ll also\nhelp you avoid problems with builders, plumbers or other contractors - for\nexample decorators and electricians.\n\n##  Step 1: Check if you need permission or approval\n\nYou may have to get mo

In [32]:
ai_response = chain({"question": prompt, "chat_history": history})


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [24]:
retriever.

VectorStoreRetriever(tags=['OpenSearchVectorSearch', 'HuggingFaceEmbeddings'], vectorstore=<langchain.vectorstores.opensearch_vector_search.OpenSearchVectorSearch object at 0x2b568a450>)

In [27]:
test_chain = ConversationalRetrievalChain.from_llm(
    llm=llm,
    max_tokens_limit=500,
    retriever=retriever,
    return_source_documents=True,
    verbose=True)