## Installs

In [1]:
!pip install langchain langchain_community langchain-openai langchainhub chromadb tiktoken

Collecting langchain
  Downloading langchain-1.1.0-py3-none-any.whl.metadata (4.9 kB)
Collecting langchain_community
  Downloading langchain_community-0.4.1-py3-none-any.whl.metadata (3.0 kB)
Collecting langchain-openai
  Downloading langchain_openai-1.1.0-py3-none-any.whl.metadata (2.6 kB)
Collecting langchainhub
  Downloading langchainhub-0.1.21-py3-none-any.whl.metadata (659 bytes)
Collecting chromadb
  Downloading chromadb-1.3.5-cp39-abi3-macosx_11_0_arm64.whl.metadata (7.2 kB)
Collecting tiktoken
  Downloading tiktoken-0.12.0-cp311-cp311-macosx_11_0_arm64.whl.metadata (6.7 kB)
Collecting langchain-core<2.0.0,>=1.1.0 (from langchain)
  Downloading langchain_core-1.1.0-py3-none-any.whl.metadata (3.6 kB)
Collecting langgraph<1.1.0,>=1.0.2 (from langchain)
  Downloading langgraph-1.0.4-py3-none-any.whl.metadata (7.8 kB)
Collecting pydantic<3.0.0,>=2.7.4 (from langchain)
  Downloading pydantic-2.12.4-py3-none-any.whl.metadata (89 kB)
Collecting jsonpatch<2.0.0,>=1.33.0 (from langchain-

In [9]:
! pip install langchain-text-splitters -q


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.2[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [65]:
import os
import time
import json
from pprint import pprint
import pandas as pd

import langchain
print("langchain.__version__ ", langchain.__version__)

from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain_openai.embeddings import OpenAIEmbeddings
from langchain_core.documents import Document

langchain.__version__  1.1.0


In [58]:
from dotenv import load_dotenv
load_dotenv()

True

## Warm up & Config

In [12]:
raw_docs_base_dir = '../data/processed/p_jsons'

In [28]:
## check the len of each doc
all_len = []
all_char_len = []
for item in os.listdir(raw_docs_base_dir):
    with open(os.path.join(raw_docs_base_dir, item), 'r') as f:
        data = json.load(f)
        all_len.append(len(' '.join(data['doc_judgement']).split()))
        all_char_len.append(len(' '.join(data['doc_judgement'])))

pprint.pprint(pd.Series(all_len).describe())
pprint.pprint(pd.Series(all_char_len).describe())

print('Percent len > 7k: ', (len([item for item in all_len if item > 7000])/len(all_len)) * 100)

count      102.000000
mean      6559.382353
std       8863.852452
min        209.000000
25%       2441.750000
50%       3748.500000
75%       6385.750000
max      49445.000000
dtype: float64
count       102.000000
mean      43190.519608
std       58908.064283
min        1435.000000
25%       15841.750000
50%       23687.500000
75%       41734.750000
max      352851.000000
dtype: float64
Percent len > 7k:  22.54901960784314


## Utils

In [30]:
def custom_chunker(text: str):

    # based on len of doc, we can set different chunk size
    num_chars = len(text)

    if num_chars < 3000:
        return [text]
    
    elif num_chars > 3000 and num_chars < 12000:
        splitter = RecursiveCharacterTextSplitter(chunk_size=3000, 
                                                  chunk_overlap=300, 
                                                  separators=["\n\n", "\n", ".", " "])
        return splitter.split_text(text)
        
    else:
        coarse_splitter = RecursiveCharacterTextSplitter(chunk_size=9000,
                                                         chunk_overlap=900,
                                                         separators=["\n\n", "\n", ".", " "])
        coarse_chunks = coarse_splitter.split_text(text)
        fine_splitter = RecursiveCharacterTextSplitter(chunk_size=3000,
                                                       chunk_overlap=300,
                                                       separators=["\n\n", "\n", ".", " "])
        final_chunks = []
        for coarse_chunk in coarse_chunks:
            fine_chunks = fine_splitter.split_text(coarse_chunk)
            final_chunks.extend(fine_chunks)

        return final_chunks

## Chunking

In [32]:
## exec: All chunks Extraction 

chunks_all = []
for item in os.listdir(raw_docs_base_dir):
    with open(os.path.join(raw_docs_base_dir, item), 'r') as f:
        data = json.load(f)
        doc_text = ' '.join(data['doc_judgement'])
        chunks = custom_chunker(doc_text)
        print(f"Document: {item}, Original Length: {len(doc_text)}, Number of Chunks: {len(chunks)}")
        for idx, chunk in enumerate(chunks):
            chunk_metadata = {
                'source_doc': item,
                'chunk_index': idx,
                'original_length': len(doc_text)
            }
            chunks_all.append((chunk, chunk_metadata))
        print("\n")

Document: 103.json, Original Length: 22208, Number of Chunks: 11


Document: 20.json, Original Length: 29251, Number of Chunks: 14


Document: 98.json, Original Length: 15457, Number of Chunks: 8


Document: 77.json, Original Length: 60359, Number of Chunks: 31


Document: 61.json, Original Length: 98865, Number of Chunks: 50


Document: 36.json, Original Length: 69407, Number of Chunks: 33


Document: 41.json, Original Length: 17548, Number of Chunks: 10


Document: 16.json, Original Length: 13974, Number of Chunks: 8


Document: 57.json, Original Length: 11887, Number of Chunks: 6


Document: 94.json, Original Length: 35087, Number of Chunks: 18


Document: 82.json, Original Length: 33638, Number of Chunks: 18


Document: 6.json, Original Length: 70200, Number of Chunks: 36


Document: 7.json, Original Length: 38497, Number of Chunks: 21


Document: 83.json, Original Length: 16655, Number of Chunks: 8


Document: 95.json, Original Length: 33140, Number of Chunks: 16


Document: 56.js

In [33]:
chunks_all

[('SURINDER SINGH NIJJAR, J. 1. Leave granted. 2. These appeals impugn the final judgment and decree dated 21st\n        March, 2012 passed by the High Court of Judicature at  Madras\n        in OSA No. 44 & 45 of 2012 and M.P. No. 1  of  2012,  whereby\n        the letters  patent  appeals  of  the  Union  of  India  were\n        dismissed. The appellant had entered into agreements with the\n        respondents on 30th January, 1983 and 30th  March,  1984  for\n        supply of mono block concrete sleepers (in short “Sleepers”).\n         The agreements were renewed from time to  time  under  which\n        the Union of India agreed to pay specified rates  for  supply\n        of each sleeper.  The agreements/contracts also provided that\n        the rates payable shall be based on certain standard rates of\n        principal raw materials, such as cement, High  Tensile  Steel\n        (HTS)  wires,  molded  steel,  etc.  The  contracts   further\n        provided  that  whenever  t

In [39]:
lc_documents = [Document(page_content=item[0], metadata=item[1]) for item in chunks_all]
print(len(lc_documents))

import random
print(random.choice(lc_documents))

2237
page_content='of the forest. The need for ensuring  service  is  clearly  to  protect  the
interests of the owner of the forest who may have valid reasons not only  to
object to the issuance of regulatory or prohibitory directions, but to  also
enable him/her to raise a jurisdictional issue that the land in question  is
actually not a forest. The need for ensuring  service  is  also  to  prevent
damage to or destruction of a forest. 60.  Unfortunately, Chintamani missed these finer  details  because  it  was
perhaps not brought to the notice of this  Court  that  Section  35  of  the
Forest Act as applicable  to  the  State  of  Maharashtra  had  sub-sections
beyond sub-section (3). This Court proceeded on the basis of Section  35  of
the Indian Forest Act, 1927  as  it  existed  without  being  aware  of  the
amendments made by the State of  Maharashtra  and  the  erstwhile  State  of
Bombay. This, coupled with the factually incorrect view  that  two  hectares
of forest land[25] 

## Indexing

In [60]:
# initialize the chroma dir
vector_store_chroma = Chroma(collection_name='legal_mini_rag', 
                             embedding_function=OpenAIEmbeddings(),
                             persist_directory='/tmp/chroma_db_test'
                             )

In [62]:
vector_store_chroma.add_documents(lc_documents)
# vector_store_chroma.persist() # to save them to disk  

['30142bc0-8c78-4aaf-a21a-2546ab248580',
 'ae6e2f3a-7b66-45c3-a6e6-e397e31f9ae8',
 '5093d9d7-3fa7-4600-bba5-efd00c9139cf',
 '6bee6c18-d18e-46a6-a69f-63d3feef261b',
 '2a06979e-7aa6-411c-bd41-0c8bd91afbc5',
 '8eebb12e-083b-4d93-a5d7-137468dd272c',
 'b29f888a-5e6b-4922-8e04-c09bd8a6bf66',
 'b5d823f7-5388-4062-a7f7-334eaf48f591',
 'b575bbc5-39c2-4cfb-9eca-b4c4de8bc6f2',
 '01695557-0157-4d5e-a1e0-862a2b642e46',
 '5f2365fc-983f-43e6-ae78-b20579f7e7ad',
 '425dfdf4-c0f0-4266-ab32-37f729b0aaf2',
 'ced9570b-258d-4aec-96ae-a62f4ccf5690',
 'ea5ecd54-aa6f-485a-8c8f-550ce66f3b07',
 'f5e007fe-8320-4161-9f48-90b289e0df53',
 '2a26f053-5354-4576-8f8e-e5efe72cdd55',
 '73fd1398-1011-4a20-8e8c-f6daf707485e',
 '579c0d07-52e7-4103-beef-78f60b792eee',
 '8efc0d0b-8bd0-4ff3-9917-827fea8f08cb',
 '4535cb6c-452c-43d6-b69e-f66feba4e39b',
 '50c3c692-ad5a-4fb1-bb99-3d028dd7e157',
 '9b8491e2-97ac-477f-bbc8-fc2b10c718b9',
 'af25d173-e67a-4937-890a-b05964558ca8',
 'bf217c97-b5b6-48fb-b188-75e5c62b0076',
 '1858bdab-109e-

In [69]:
## test collection 
my_collection = vector_store_chroma._collection
print('Total docs indexed: ', my_collection.count())
random_embedding = my_collection.get(include=["embeddings"], limit=1)
print('embedding len: ', random_embedding['embeddings'].shape)

Total docs indexed:  2237
embedding len:  (1, 1536)


## Retrieval

## Generation