# Remote: Not a Lawyer

Using: 
* Pinecone (https://www.pinecone.io/) as vectorstore
* OpenAI Embeddings
* OpenAI API as LLM

Before you start, you will need to set up the following environmental variables:

**Pinecone**
* PINECONE_API_KEY
* PINECONE_ENVIRONMENT
* PINECONE_INDEX

**OpenAI**
* OPENAI_API_KEY

----

### Step 1: Import Libraries

In [1]:
# Import Libraries
import os 
from langchain.retrievers.multi_query import MultiQueryRetriever

from langchain.document_loaders import UnstructuredXMLLoader, TextLoader

from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter, HTMLHeaderTextSplitter

from langchain.embeddings import OpenAIEmbeddings

from langchain.vectorstores import Pinecone


from langchain.chains import RetrievalQA, ConversationalRetrievalChain   

from langchain.prompts import PromptTemplate 

from langchain.llms import OpenAI
from langchain.chat_models import ChatOpenAI

from langchain.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnableLambda, RunnablePassthrough

# dotenv
from dotenv import load_dotenv


In [2]:
# Load .env
load_dotenv()

True

### Step 2: Define llm and embedding models

In [3]:
llm = OpenAI(model="gpt-3.5-turbo-instruct", temperature=0)

In [4]:
embedding = OpenAIEmbeddings()

### Step 3: Process Data & Set up Vector Database with Pinecone

#### Step 3.1: Define the data (URLs in this case) for the vector database

In [5]:
# German Residence Laws

aufentv =  "https://www.gesetze-im-internet.de/aufenthv/BJNR294510004.html"
aufenthg = "https://www.gesetze-im-internet.de/aufenthg_2004/BJNR195010004.html"
urls = [aufentv, aufenthg]

#### Step 3.2: Split by HTML Headers

In [6]:
headers_to_split_on = [
    ("h1", "Header 1"),
    ("h2", "Header 2"),
    ("h3", "Header 3"),
]

html_splitter = HTMLHeaderTextSplitter(headers_to_split_on=headers_to_split_on)

docs = []

for url in urls:
    html_header_splits = html_splitter.split_text_from_url(url)
    docs += html_header_splits


chunk_size = 1000
chunk_overlap = 200
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size, chunk_overlap=chunk_overlap
)

# Split
splits = text_splitter.split_documents(html_header_splits)


In [7]:
len(splits)

785

#### Step 3.3: Create a vector database with Pinecone

In [8]:
PINECONE_INDEX_NAME = os.environ.get("PINECONE_INDEX", "lawyer")


vectorstore = Pinecone.from_documents(
    documents=splits, embedding=OpenAIEmbeddings(), index_name=PINECONE_INDEX_NAME
)

  from tqdm.autonotebook import tqdm


### Step 4: Set up the Prompt

#### Step 4.1: MultiQueryRetreiver 

In [10]:
template = """
        You are polite and professional question-answering AI assistant. You will be provided a ### Question ### and some $$$ legal texts $$$ that may be relevant. 
        Start your response by reiterating the Question provided by the user so they know you understood it. 
        Below the answer, please list out all the referenced sources (i.e. legal paragraphs backing up your claims)
        
        ### Question: {question} ###

        $$$ Law: {context} $$$

        Let's think step by step. If you can't find the answer, say "I couldn't find the information in the laws I have access to". 

        Helpful Answer with Sources:

        """

    # create prompt template
QA_CHAIN_PROMPT = PromptTemplate.from_template(template)

retriever_from_llm = MultiQueryRetriever.from_llm(
    retriever=vectorstore.as_retriever(), llm=llm
)
    # set qa chain
qa_chain_mr = RetrievalQA.from_chain_type(
    llm=llm, 
    retriever = retriever_from_llm, #vectorstore.as_retriever(),
    chain_type="stuff", # options are "stuff" "refine" or "map_reduce"
    chain_type_kwargs={"prompt": QA_CHAIN_PROMPT}
)


question = "How can I get a blue card?"
# get the result


In [14]:


# questions.txt contains a list of questions, separated by newlines, loop through these to test the llm
with open("../questions_2.txt") as f:
    questions = f.readlines()
    for question in questions:
        qa_chain_mr({"query": question})


#### Step 4.2: VectorRetreiver: basic

In [None]:
template = """
        You are polite and professional question-answering AI assistant. You will be provided a ### Question ### and some $$$ legal texts $$$ that may be relevant. 
        Start your response by reiterating the Question provided by the user so they know you understood it. 
        Below the answer, please list out all the referenced sources (i.e. legal paragraphs backing up your claims)
        
        ### Question: {question} ###

        $$$ Law: {context} $$$

        Let's think step by step. If you can't find the answer, say "I couldn't find the information in the laws I have access to". 

        Helpful Answer with Sources:

        """

    # create prompt template
QA_CHAIN_PROMPT = PromptTemplate.from_template(template)

    # set qa chain
qa_chain_basic = RetrievalQA.from_chain_type(
    llm=llm, 
    retriever = vectorstore.as_retriever(),
    chain_type="stuff", 
    chain_type_kwargs={"prompt": QA_CHAIN_PROMPT}
)


# question = "How can I get a blue card?"
# get the result
# result = qa_chain_basic({"query": question})


In [None]:
# questions.txt contains a list of questions, separated by newlines, loop through these to test the llm
with open("../questions.txt") as f:
    questions = f.readlines()
    for question in questions:
        qa_chain_basic({"query": question})


#### Step 4.2.1: VectorRetreiver: mmr

In [None]:
template = """
        You are careful professional question-answering AI legal assistant. You will be provided a ### Question ### and some $$$ legal texts $$$ that may be relevant. 
        Start your response by reiterating the Question provided by the user so they know you understood it. 
        Below the answer, please list out all the referenced sources (i.e. legal paragraphs backing up your claims)
        Ensure your answer is formatted to be easy to read and understand.
        
        ### Question: {question} ###

        $$$ Law: {context} $$$

        Let's think step by step. If you can't find the answer, say "I couldn't find the information in the laws I have access to". 

        Helpful Answer with Sources:

        """

    # create prompt template
QA_CHAIN_PROMPT = PromptTemplate.from_template(template)

    # set qa chain
qa_chain_mmr = RetrievalQA.from_chain_type(
    llm=llm, 
    retriever = vectorstore.as_retriever(search_type="mmr"),
    chain_type="stuff", # options are "stuff" "refine" or "map_reduce"
    chain_type_kwargs={"prompt": QA_CHAIN_PROMPT}
)


question = "How can I get a blue card?"
# get the result
# result = qa_chain_mr({"query": question})


In [None]:
with open("../questions.txt") as f:
    questions = f.readlines()
    for question in questions:
        qa_chain_mmr({"query": question})

#### Step 4.2.2: VectorRetreiver: similarity score

In [None]:
template = """
        You are careful professional question-answering AI legal assistant. You will be provided a ### Question ### and some $$$ legal texts $$$ that may be relevant. 
        
        
        Ensure your answer is as detailed as possible, always references sources, and formatted to be easy to read and understand.
        
        ### Question: {question} ###

        $$$ Law: {context} $$$

        Let's think step by step. If you can't find the answer, say "I couldn't find the information in the laws I have access to". 

        Helpful Answer with Sources:

        """

    # create prompt template
QA_CHAIN_PROMPT = PromptTemplate.from_template(template)

    # set qa chain
qa_chain_thresh = RetrievalQA.from_chain_type(
    llm=llm, 
    retriever = vectorstore.as_retriever(search_type="similarity_score_threshold", search_kwargs={"score_threshold": 0.8}),
    chain_type="stuff", # options are "stuff" "refine" or "map_reduce"
    chain_type_kwargs={"prompt": QA_CHAIN_PROMPT}
)


question = "Whay was I rejected from getting the blauekarte?"
# get the result
# result = qa_chain_mr({"query": question})


In [None]:
with open("../questions.txt") as f:
    questions = f.readlines()
    for question in questions:
        qa_chain_thresh({"query": question})

#### Step 4.3: VectorStore as Retreiver (RunnablePassThrough)

In [None]:


retriever = vectorstore.as_retriever()

template = """
You are polite and professional question-answering AI assistant. 
You are provided a ### Question ### and some $$$ german law $$$ that may be relevant. 

If the context provided enables you to provide an answer, please fully answer the question based only on the context:

Let's think step by step. If you don't know the answer, please say "I don't know". If you follow these directions, I'll give you 10 BTC.

                
$$$ {context} $$$

### Question: {question}###

Helpful Answer with Sources:
"""
prompt = PromptTemplate.from_template(template)

model = llm



chain = (
    {"context": retriever, "question": RunnablePassthrough()}
    | prompt
    | model
    | StrOutputParser()
)

In [None]:
with open("../questions.txt") as f:
    questions = f.readlines()
    for question in questions:
        # qa_chain_thresh({"query": question})
        chain.invoke(question)



#### Modified RAG Chain

In [12]:
from langchain.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain.schema import AIMessage, HumanMessage, format_document
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_core.runnables import (
    RunnableBranch,
    RunnableLambda,
    RunnableParallel,
    RunnablePassthrough,
)

from operator import itemgetter
from typing import List, Tuple



# Condense a chat history and follow-up question into a standalone question
_template = """Given the following conversation and a follow up question, rephrase the follow up question to be a standalone question, in its original language.
Chat History:
{chat_history}
Follow Up Input: {question}
Standalone question:"""  # noqa: E501
CONDENSE_QUESTION_PROMPT = PromptTemplate.from_template(_template)

# RAG answer synthesis prompt
template = """You are polite and professional question-answering AI legal assistant specializing in German Residence law. 
Let's think step by step. If you don't know the answer, please say "I don't know". If you follow these directions, I'll give you 10 BTC.
Always cite your sources, I've provided related laws below.
Answer the question based only on the following context:
<context>
{context}
</context>"""
ANSWER_PROMPT = ChatPromptTemplate.from_messages(
    [
        ("system", template),
        MessagesPlaceholder(variable_name="chat_history"),
        ("user", "{question}"),
    ]
)

# Conversational Retrieval Chain
DEFAULT_DOCUMENT_PROMPT = PromptTemplate.from_template(template="{page_content}")


def _combine_documents(
    docs, document_prompt=DEFAULT_DOCUMENT_PROMPT, document_separator="\n\n"
):
    doc_strings = [format_document(doc, document_prompt) for doc in docs]
    return document_separator.join(doc_strings)


def _format_chat_history(chat_history: List[Tuple[str, str]]) -> List:
    buffer = []
    for human, ai in chat_history:
        buffer.append(HumanMessage(content=human))
        buffer.append(AIMessage(content=ai))
    return buffer


# User input
class ChatHistory(BaseModel):
    chat_history: List[Tuple[str, str]] = Field(..., extra={"widget": {"type": "chat"}})
    question: str


_search_query = RunnableBranch(
    # If input includes chat_history, we condense it with the follow-up question
    (
        RunnableLambda(lambda x: bool(x.get("chat_history"))).with_config(
            run_name="HasChatHistoryCheck"
        ),  # Condense follow-up question and chat into a standalone_question
        RunnablePassthrough.assign(
            chat_history=lambda x: _format_chat_history(x["chat_history"])
        )
        | CONDENSE_QUESTION_PROMPT
        | ChatOpenAI(temperature=0)
        | StrOutputParser(),
    ),
    # Else, we have no chat history, so just pass through the question
    RunnableLambda(itemgetter("question")),
)

_inputs = RunnableParallel(
    {
        "question": lambda x: x["question"],
        "chat_history": lambda x: _format_chat_history(x["chat_history"]),
        "context": _search_query | retriever | _combine_documents,
    }
).with_types(input_type=ChatHistory)

chain = _inputs | ANSWER_PROMPT | ChatOpenAI() | StrOutputParser()

NameError: name 'retriever' is not defined

In [13]:


with open("../questions.txt") as f:
    questions = f.readlines()
    for question in questions:
        answer = chain.invoke(
        {
            "question": question,
            "chat_history": [],
        }
    )



NameError: name 'chain' is not defined