## Installs

In [None]:
# Install the LangChain packages, Chroma and tiktoken used by this notebook (-q: quiet output).
%pip install langchain langchain_community langchain-openai langchainhub chromadb tiktoken -q

In [None]:
! pip install nbstripout -q

In [None]:
! pip install langchain-text-splitters -q

## Warm up & Config

In [None]:
import os
import time
import json
import shutil
from pprint import pprint
import pandas as pd 
from operator import itemgetter

import langchain
print("langchain.__version__ ", langchain.__version__)

from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain_openai.embeddings import OpenAIEmbeddings
from langchain_core.documents import Document
from langchain_core.prompts import ChatPromptTemplate

from langchain_openai import ChatOpenAI
from langchain_core.load import loads, dumps

# for lcel
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough


from langsmith import Client

In [None]:
from dotenv import load_dotenv
# Load environment variables from the ".env" file (path is relative to the
# notebook's working directory); later cells read LANGSMITH_API_KEY from the environment.
load_dotenv(dotenv_path=".env")

In [None]:

# LangSmith client; the API key is taken from the LANGSMITH_API_KEY environment variable.
ls_client = Client(api_key=os.getenv("LANGSMITH_API_KEY"))

## Data load

In [None]:
# Directory holding the JSON documents read by the cells below
# (relative to the notebook's working directory).
raw_docs_base_dir = '../data/processed/p_jsons'

In [None]:
## check the len of each doc
# Collect the word count and character count of every document so the
# distributions can be inspected before choosing chunk sizes.
all_len = []       # words per document
all_char_len = []  # characters per document
for item in os.listdir(raw_docs_base_dir):
    # Explicit encoding: the platform default is not guaranteed to be UTF-8.
    with open(os.path.join(raw_docs_base_dir, item), 'r', encoding='utf-8') as f:
        data = json.load(f)
    # Build the space-joined document text once and reuse it for both counts.
    judgement_text = ' '.join(data['doc_judgement'])
    all_len.append(len(judgement_text.split()))
    all_char_len.append(len(judgement_text))

pprint(pd.Series(all_len).describe())
pprint(pd.Series(all_char_len).describe())

# Share of documents longer than 7000 words; guarded so that an empty
# directory reports 0.0 instead of raising ZeroDivisionError.
long_doc_count = sum(1 for n_words in all_len if n_words > 7000)
percent_long = (long_doc_count / len(all_len)) * 100 if all_len else 0.0
print('Percent len > 7k: ', percent_long)

# Basic RAG

## Utils

In [None]:
def custom_chunker(text: str) -> list[str]:
    """Split ``text`` into chunks, choosing the strategy from its length.

    * up to 3000 characters: returned unchanged as a single chunk;
    * more than 3000 and fewer than 12000 characters: one recursive split
      pass with ``chunk_size=3000`` and ``chunk_overlap=300``;
    * 12000 characters or more: a coarse pass (``chunk_size=9000``,
      ``chunk_overlap=900``) followed by the same 3000/300 pass applied to
      every coarse chunk.

    Args:
        text: Full document text.

    Returns:
        The chunk strings, in the order the splitters produce them.
    """
    separators = ["\n\n", "\n", ".", " "]
    num_chars = len(text)

    # `<=` so that a text of exactly 3000 characters is handled as a single
    # chunk rather than falling through to the long-document branch.
    if num_chars <= 3000:
        return [text]

    fine_splitter = RecursiveCharacterTextSplitter(chunk_size=3000,
                                                   chunk_overlap=300,
                                                   separators=separators)

    if num_chars < 12000:
        return fine_splitter.split_text(text)

    # Long documents: split coarsely first, then refine each coarse chunk.
    coarse_splitter = RecursiveCharacterTextSplitter(chunk_size=9000,
                                                     chunk_overlap=900,
                                                     separators=separators)
    final_chunks = []
    for coarse_chunk in coarse_splitter.split_text(text):
        final_chunks.extend(fine_splitter.split_text(coarse_chunk))

    return final_chunks

## Chunking

In [None]:
## exec: All chunks Extraction 

# chunks_all collects one (chunk_text, metadata) tuple per chunk of every document.
chunks_all = []
# sorted(): os.listdir returns entries in arbitrary order; sorting keeps the
# chunk order reproducible from one run to the next.
for item in sorted(os.listdir(raw_docs_base_dir)):
    # Explicit encoding: the platform default is not guaranteed to be UTF-8.
    with open(os.path.join(raw_docs_base_dir, item), 'r', encoding='utf-8') as f:
        data = json.load(f)
    doc_text = ' '.join(data['doc_judgement'])
    chunks = custom_chunker(doc_text)
    print(f"Document: {item}, Original Length: {len(doc_text)}, Number of Chunks: {len(chunks)}")
    for idx, chunk in enumerate(chunks):
        chunk_metadata = {
            'source_doc': item,           # file name the chunk came from
            'chunk_index': idx,           # position of the chunk within its document
            'chunk_length': len(chunk),
            'original_length': len(doc_text),
        }
        chunks_all.append((chunk, chunk_metadata))
    print("\n")

In [None]:
# True when at least one chunk is longer than 3000 characters.
any(chunk_meta['chunk_length'] > 3000 for _chunk_text, chunk_meta in chunks_all)

In [None]:
import random

# Wrap every (chunk_text, metadata) pair in a LangChain Document.
lc_documents = [
    Document(page_content=chunk_text, metadata=chunk_meta)
    for chunk_text, chunk_meta in chunks_all
]
print(len(lc_documents))

# Spot-check one document picked at random.
print(random.choice(lc_documents))

In [None]:
# intermediate dump of data

## Indexing

In [None]:
# Initialise the Chroma vector store backed by a directory on disk.

# Directory passed to Chroma as persist_directory.
tmp_db_dir = '/tmp/chroma_db_test_vf'

## Uncomment the lines below only to wipe an existing database directory;
## left commented, a directory from an earlier run can be loaded and reused.
# if os.path.exists(tmp_db_dir):
#     print(f"Removing existing db dir: {tmp_db_dir}")
#     shutil.rmtree(tmp_db_dir)

# Collection 'legal_mini_rag', embedded with OpenAIEmbeddings() (default arguments).
vector_store_chroma = Chroma(collection_name='legal_mini_rag', 
                             embedding_function=OpenAIEmbeddings(),
                             persist_directory=tmp_db_dir,
                             )

In [None]:
# Number of entries currently in the collection (read through the store's private `_collection` attribute).
print("Total documents:", vector_store_chroma._collection.count())

In [None]:
# Give every chunk a stable id built from its source file and chunk index, so
# that re-running this cell against the persisted collection updates the
# existing entries (matching ids) instead of appending duplicate copies.
chunk_ids = [
    f"{doc.metadata['source_doc']}::{doc.metadata['chunk_index']}"
    for doc in lc_documents
]
vector_store_chroma.add_documents(lc_documents, ids=chunk_ids)
# vector_store_chroma.persist() # to save them to disk  

In [None]:
## test collection 
my_collection = vector_store_chroma._collection
print('Total docs indexed: ', my_collection.count())

# Fetch a single stored entry together with its embedding vector.
random_embedding = my_collection.get(include=["embeddings"], limit=1)
# Report the vector dimensionality with len() on the first embedding: this
# works whether the client returns nested lists or a NumPy array (plain lists
# have no `.shape`) and prints a length, as the label says.
sample_embeddings = random_embedding['embeddings']
if sample_embeddings is not None and len(sample_embeddings) > 0:
    print('embedding len: ', len(sample_embeddings[0]))
else:
    # Nothing indexed yet, so there is no vector to measure.
    print('embedding len: ', 0)

## Retrieval

In [None]:
# Expose the Chroma store as a retriever; no arguments are passed, so the library defaults apply.
retriever_ = vector_store_chroma.as_retriever()

In [None]:
# Sample query used to sanity-check retrieval.
query_ = "real estate case outcomes and judgments?"

# Run the retriever on the sample query and keep the returned documents.
fetched_docs = retriever_.invoke(query_)

In [None]:
# Inspect what the retriever returned for the sample query.
pprint(fetched_docs)

## Generation

In [None]:
# Pull the "rlm/rag-prompt" prompt through the LangSmith client and print it for inspection.
prompt_ = ls_client.pull_prompt("rlm/rag-prompt")
print(prompt_)

In [None]:
# Chat model used by every chain in this notebook (gpt-3.5-turbo, temperature 0).
llm_ = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)

In [None]:
# chaining it together using LCEL

def format_content(docs):
    """Join the ``page_content`` of every document, separated by a blank line."""
    page_texts = (document.page_content for document in docs)
    return '\n\n'.join(page_texts)

# rag_chain
# Takes the question string as input. "context" is built by sending the
# question to the retriever and joining the returned documents with
# format_content; "question" is the input passed through unchanged. The
# resulting dict is fed to the prompt, then the LLM, then parsed to a string.

rag_chain = (
    {"context": retriever_ | format_content, "question": RunnablePassthrough()}
    | prompt_
    | llm_
    | StrOutputParser()
)

In [None]:
# asking a question using the RAG chain
# (query typo "whcih" corrected: the literal is what gets sent to the retriever and the LLM)

response = rag_chain.invoke("which legislations are being used in real estate cases?")
pprint(response)

# Optimizing RAG Components

## Advanced Query Transformation

### Multi Query Generation

In [None]:
# prompt for generating multiple queries
multi_query_template_ = """You are an AI language model assistant. Your task is to generate five 
different versions of the given user question to retrieve relevant documents from a vector 
database. By generating multiple perspectives on the user question, your goal is to help
the user overcome some of the limitations of the distance-based similarity search. 
Provide these alternative questions separated by newlines. Original question: {question}"""

multi_query_prompt_ = ChatPromptTemplate.from_template(multi_query_template_)

# Chain: prompt -> LLM -> plain string -> list of queries (one per line).
generate_new_queries_chain = (
    multi_query_prompt_
    | llm_
    | StrOutputParser() 
    # Split on newlines and drop blank lines, so that no empty string is
    # passed on as a query when the model separates its answers with blank lines.
    | (lambda x: [line.strip() for line in x.split('\n') if line.strip()])
)

In [None]:
## generate multiple queries for a sample question

# Sample question reused by the multi-query and RRF cells below.
tmp_question = 'which legislations are being used in real estate cases?'

# The chain's prompt expects a dict with a 'question' key.
generated_queries = generate_new_queries_chain.invoke({'question': tmp_question})

pprint(generated_queries)

In [None]:
# integrating it back into main Retrieval chain

def process_output_of_multi_query(doc_list):
    """Flatten per-query retrieval results and drop duplicate documents.

    Args:
        doc_list: Iterable of document lists (one list per generated query).

    Returns:
        The unique documents, in order of first appearance.
    """
    # Documents are compared through their serialized form (`dumps`).
    # dict.fromkeys removes duplicates while keeping first-seen order; a set
    # would hand the documents back in an arbitrary order.
    serialized_docs = (dumps(doc) for sublist in doc_list for doc in sublist)
    unique_docs = dict.fromkeys(serialized_docs)
    return [loads(doc) for doc in unique_docs]

# Multi-query retrieval: generate the query variants, run the retriever once
# per variant (`.map()`), then merge the per-query result lists with
# process_output_of_multi_query.
rag_chain_multi_query = (
    generate_new_queries_chain |
    retriever_.map() |
    process_output_of_multi_query)

# Try it on the sample question.
multi_query_context = rag_chain_multi_query.invoke({'question': tmp_question})

print("Docs fetched using multi query RAG: ", len(multi_query_context))
pprint(multi_query_context)

In [None]:
## integrating multi query context into Generation chain

# Input is a dict with a 'question' key: it feeds the multi-query retrieval
# chain for "context" and is picked out with itemgetter for "question".
mq_rag_chain = (

    # Render the retrieved documents to plain text with format_content, as the
    # basic rag_chain does; without it the prompt receives the repr of a list
    # of Document objects (metadata included) as its context.
    {"context": rag_chain_multi_query | format_content, "question": itemgetter('question')}
    | prompt_
    | llm_
    | StrOutputParser()
)

pprint(mq_rag_chain.invoke({'question': tmp_question}))


## conclusion on experiment: better in terms of variety with multi query RAG

### RAG Fusion (on top of Multi Query)

Re-ranking docs using technique: RRF (Reciprocal Rank Fusion)

RRF is a simple scoring method to merge ranked search results from multiple retrievers by giving higher weight to top-ranked items using reciprocal rank scoring.

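RRF_score(d) = Σ 1 / (k + rank_i(d)), summed over every ranked result list i in which document d appears; k is a smoothing constant (60 by default in the code below).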

In [None]:
## ip = [[a,b,c],[d,b,f],[g,d,a]]

def get_rrf_docs(docs, k = 60):
    """Fuse several ranked result lists with Reciprocal Rank Fusion (RRF).

    A document earns ``1 / (k + rank)`` for every list it appears in, with
    ranks counted from 1; the contributions are summed per document.

    Args:
        docs: Iterable of document lists, e.g. one list per generated query.
            Each list is assumed to be ordered best match first.
        k: Constant added to the rank in the denominator.

    Returns:
        List of ``(document, score)`` tuples sorted by descending fused score.
    """
    fusion_score_dict = {}

    for doc_set in docs:
        # Ranks start at 1 (the usual RRF convention), so the denominator is
        # never zero, even for k=0.
        for rank, doc in enumerate(doc_set, start=1):
            # The serialized form is the dictionary key, so the same document
            # returned by several queries accumulates a single score.
            doc_str = dumps(doc)
            fusion_score_dict[doc_str] = fusion_score_dict.get(doc_str, 0) + 1 / (k + rank)

    ranked_items = sorted(fusion_score_dict.items(), key=itemgetter(1), reverse=True)
    return [(loads(doc_str), score) for doc_str, score in ranked_items]

In [None]:
## integrate it back into main Retrieval chain

template_rrf = """You are a helpful assistant that generates multiple search queries based on a single input query. \n
Generate multiple search queries related to: {question} \n
Output:"""

rrf_prmpt_ = ChatPromptTemplate.from_template(template_rrf)

# Chain: prompt -> LLM -> plain string -> list of queries (one per line).
multi_queries_rrf_chain = (
    rrf_prmpt_ 
    | llm_ 
    | StrOutputParser() 
    # Split on newlines and drop blank lines, so that no empty string is
    # passed on to the retriever as a query.
    | (lambda x: [line.strip() for line in x.split('\n') if line.strip()])
)

# Retrieve once per generated query (`.map()`), then fuse the ranked lists
# with get_rrf_docs, which returns (document, score) pairs.
retrieval_chain_rrf = (
    multi_queries_rrf_chain |
    retriever_.map() |
    get_rrf_docs
)

docs =  retrieval_chain_rrf.invoke({'question': tmp_question})

# Label corrected: this cell reports the RRF retrieval, not the multi-query one.
print("Docs fetched using RRF retrieval: ", len(docs))
print("RRF Retrieval: ", docs)

In [None]:
## integrate into generation chain

# Input is a dict with a 'question' key: it feeds the RRF retrieval chain for
# "context" and is picked out with itemgetter for "question".
generation_chain = (

    # retrieval_chain_rrf yields (document, score) pairs: keep the documents in
    # fused-rank order and render them to plain text with format_content, so
    # the prompt does not receive the repr of a list of tuples.
    {"context": retrieval_chain_rrf | (lambda scored_docs: format_content([doc for doc, _score in scored_docs])),
     "question": itemgetter('question')}
    | prompt_
    | llm_
    | StrOutputParser()
)

# Invoke the RRF generation chain built in this cell (not mq_rag_chain), so the
# printed answer actually comes from the RRF pipeline.
pprint(generation_chain.invoke({'question': tmp_question}))


## conclusion on experiment: number of acts (entities) captured better than with RRF based RAG