Import all the libraries

In [1]:
import sys
import os
from dotenv import load_dotenv
from openai import OpenAI
import anthropic
import json
import cohere
import gradio as gr
from keybert import KeyBERT
from transformers import AutoModel, AutoTokenizer
from langchain.schema.document import Document
from langchain.embeddings.base import Embeddings
from sentence_transformers import SentenceTransformer
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import TextLoader
from langchain.chains import RetrievalQA
from langchain.llms import Ollama


Reading the clean and extracted data


In [2]:
docs = []

# The input is read as JSON Lines: one JSON object per non-blank line, each
# carrying "page_content" and "metadata" keys that map onto a Document.
# The encoding is given explicitly so decoding does not depend on the
# platform's locale default (the corpus contains non-ASCII characters such as
# typographic quotes).
# NOTE(review): assumes the file was written as UTF-8 — confirm against the
# script that produced it.
with open('my_list5.txt', 'r', encoding='utf-8') as file:
    for line in file:
        line = line.strip()
        if line:  # skip blank lines
            data = json.loads(line)
            doc = Document(page_content=data["page_content"], metadata=data["metadata"])
            docs.append(doc)

In [3]:
# Sanity check: how many documents ended up in `docs`.
len(docs)

52500

In [4]:
# Preview only the first few documents: echoing the whole list would write
# every loaded document into the notebook output.
docs[:3]

[Document(metadata={'class': 0, 'tags': ['Violent crime', 'Explosives', 'Bomb', 'Lack of nonviolent purpose', 'Crime of violence']}, page_content="Drapeau’s cohorts, the cohort would be a “victim” of making the bomb. Further, firebombs are inherently dangerous. There is no peaceful purpose for making a bomb. Felony offenses that involve explosives qualify as “violent crimes” for purposes of enhancing the sentences of career offenders. See 18 U.S.C. § 924(e)(2)(B)(ii) (defining a “violent felony” as: “any crime punishable by imprisonment for a term exceeding one year ... that ... involves use of explosives”). Courts have found possession of a'bomb to be a crime of violence based on the lack of a nonviolent purpose for a bomb and the fact that, by its very nature, there is a substantial risk that the bomb would be used against the person or property of another. See United States v. Newman, 125 F.3d 863 (10th Cir.1997) (unpublished) (<HOLDING>); United States v. Dodge, 846 F.Supp. 181, ho

Chunking the data into chunks of size 700. Various sizes were tried, but 700 worked best: it maintains a longer context per chunk, which is ideal for legal RAG.

In [None]:
# Break each document into chunks of at most 700 characters, with a
# 100-character overlap between neighbouring chunks.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=700,
    chunk_overlap=100,
)
chunks = splitter.split_documents(docs)

The all-MiniLM-L6-v2 model below wasn't worth using: it is very basic and its embeddings are not rich enough, so query retrievals lacked depth.

In [None]:
# Baseline: embed the chunks with the small MiniLM sentence-transformer and
# index them in an in-memory FAISS store.
embedding = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
)
vectorstore = FAISS.from_documents(documents=chunks, embedding=embedding)


Instead, intfloat/e5-large-v2 was used: it is slower, but it gives significantly better retrieval results.

embedding = HuggingFaceEmbeddings(model_name="intfloat/e5-large-v2")


Running this embedding model on CUDA (GPU) for faster vectorization

In [None]:

# Custom embeddings class that uses GPU via sentence-transformers
class GPUHuggingFaceEmbeddings(Embeddings):
    """LangChain ``Embeddings`` adapter around a sentence-transformers model.

    The model is loaded once in the constructor onto ``device`` and reused for
    every document and query embedding.

    Args:
        model_name: Name or path of the sentence-transformers model to load.
        device: Device string handed to ``SentenceTransformer``. Defaults to
            ``"cuda"``; pass e.g. ``"cpu"`` to run without a GPU.
        query_prefix: Text prepended to every query before encoding.
        passage_prefix: Text prepended to every document before encoding.

    Both prefixes default to the empty string, i.e. texts are encoded
    unchanged.

    NOTE(review): the E5 model card recommends prefixing inputs with
    ``"query: "`` / ``"passage: "``. Enabling that here changes the vectors,
    so the index and every query embedder must be rebuilt with the same
    convention — confirm before switching it on.
    """

    def __init__(
        self,
        model_name: str = "intfloat/e5-large-v2",
        device: str = "cuda",
        query_prefix: str = "",
        passage_prefix: str = "",
    ):
        self.query_prefix = query_prefix
        self.passage_prefix = passage_prefix
        # Load the model directly onto the requested device.
        self.model = SentenceTransformer(model_name, device=device)

    def embed_documents(self, texts):
        """Embed a batch of documents.

        Args:
            texts: Iterable of strings to embed.

        Returns:
            One embedding (a list of floats) per input text, in input order.
            An empty input yields an empty list without calling the model.
        """
        texts = list(texts)
        if not texts:
            return []
        if self.passage_prefix:
            texts = [self.passage_prefix + text for text in texts]
        embeddings = self.model.encode(texts, show_progress_bar=True, convert_to_numpy=True)
        return embeddings.tolist()

    def embed_query(self, text):
        """Embed a single query string.

        Args:
            text: The query to embed.

        Returns:
            The query embedding as a list of floats.
        """
        if self.query_prefix:
            text = self.query_prefix + text
        embedding = self.model.encode(text, convert_to_numpy=True)
        return embedding.tolist()


def main(save_path: str = "faiss_e5_index1"):
    """Embed the notebook's ``chunks`` with E5 and persist the FAISS index.

    Reads the module-level ``chunks`` variable, so the chunking cell must have
    been run first.

    Args:
        save_path: Directory the index is written to. Defaults to
            ``"faiss_e5_index1"``.

    Returns:
        The FAISS vector store that was built, so callers can keep using it
        without reloading it from disk.
    """
    embedding = GPUHuggingFaceEmbeddings(model_name="intfloat/e5-large-v2")

    vectorstore = FAISS.from_documents(chunks, embedding)

    # Save vectorstore locally. exist_ok=True replaces the separate
    # exists-then-create check and is safe when the directory already exists.
    os.makedirs(save_path, exist_ok=True)
    vectorstore.save_local(save_path)

    print(f"FAISS index saved to {save_path}")
    return vectorstore

if __name__ == "__main__":
    main()
