In [None]:
# Install Groq and community packages
!pip install langchain chromadb pypdf sentence-transformers langchain-groq langchain-community

Collecting langchain-groq
  Using cached langchain_groq-0.3.2-py3-none-any.whl.metadata (2.6 kB)
Collecting groq<1,>=0.4.1 (from langchain-groq)
  Using cached groq-0.24.0-py3-none-any.whl.metadata (15 kB)
Using cached langchain_groq-0.3.2-py3-none-any.whl (15 kB)
Using cached groq-0.24.0-py3-none-any.whl (127 kB)
Installing collected packages: groq, langchain-groq
Successfully installed groq-0.24.0 langchain-groq-0.3.2


In [6]:
import os
os.environ["TRANSFORMERS_NO_TF"] = "1"
os.environ["TRANSFORMERS_NO_TF"] = "1"
os.environ["USE_TF"] = "0"

import json
import uuid
from langchain_groq import ChatGroq
from langchain_community.embeddings import SentenceTransformerEmbeddings

from langchain.vectorstores import Chroma
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
import chromadb
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [16]:
from dotenv import load_dotenv
import os

load_dotenv()
APIKey = os.getenv("GROQ_API_KEY")
APIKey

'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'

In [17]:
def setup_compliance_qa_chain(
    collection_name: str = "islamic_finance_guidelines",
    persist_directory: str = "./chroma_db"
):
    # Set Groq API key
    os.environ["GROQ_API_KEY"] = APIKey# Replace with your actual Groq API key

    # Initialize HuggingFace embeddings (since Groq doesn't provide embeddings yet)
    # Using a sentence transformer model that works well for semantic search
    embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")


    # Initialize ChromaDB client
    client = chromadb.PersistentClient(path=persist_directory)

    # Create a collection if it doesn't exist
    try:
        # Check if collection exists by attempting to get it
        collection = client.get_collection(name=collection_name)
        print(f"Using existing collection: {collection_name}")
    except Exception as e:
        print(f"Collection doesn't exist or error occurred: {e}")
        print(f"Creating new collection: {collection_name}")
        # Let Langchain create the collection when initializing Chroma

    # Initialize Chroma vector store using Langchain's integration
    vectorstore = Chroma(
        collection_name=collection_name,
        embedding_function=embeddings,
        persist_directory=persist_directory
    )

    # Set up the retriever
    retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 5})

    # Create the prompt
    prompt = PromptTemplate(
        input_variables=["context", "question"],
        template="""
You are a legal expert specializing in Islamic finance and general compliance, you will recieve a document (contract, guideline, etc.). Your task is to determine if the user's query is compliant with the relevant regulations or not.
You will assess whether the user's query complies with the relevant regulations or not.

Provide your response strictly in the following JSON format:
{{
  "answer": "yes" or "no",
  "reason": "a concise paragraph explaining why the answer is yes or no",
  "score": "green" (compliant), "yellow" (uncertain or partially compliant), or "red" (non-compliant),
  "alternative": "if the answer is no, propose a better solution"
}}

User Query:
{question}

Relevant Law Excerpts:
{context}

Now respond with the JSON:
"""
    )

    # Initialize Groq LLM
    llm = ChatGroq(
        model_name="llama3-70b-8192",  # or other models like "mixtral-8x7b-32768"
        temperature=0,
        max_tokens=1024
    )

    qa_chain = RetrievalQA.from_chain_type(
        llm=llm,
        retriever=retriever,
        chain_type="stuff",
        chain_type_kwargs={"prompt": prompt},
        return_source_documents=True
    )

    return qa_chain, vectorstore

In [4]:
def load_pdf_documents(pdf_directory):
    """
    Load documents from PDF files in a directory.
    Returns a list of Document objects suitable for adding to the vector store.

    Args:
        pdf_directory (str): Path to directory containing PDF files

    Returns:
        list: List of Document objects with text content and metadata
    """
    documents = []

    if not os.path.exists(pdf_directory):
        print(f"Directory {pdf_directory} does not exist.")
        return documents

    for filename in os.listdir(pdf_directory):
        if filename.lower().endswith('.pdf'):
            try:
                file_path = os.path.join(pdf_directory, filename)
                print(f"Loading PDF: {file_path}")

                # Use PyPDFLoader to extract text from the PDF
                loader = PyPDFLoader(file_path)
                pdf_documents = loader.load()

                # Add source information to metadata
                for doc in pdf_documents:
                    doc.metadata["source"] = file_path
                    doc.metadata["filename"] = filename

                documents.extend(pdf_documents)
                print(f"Extracted {len(pdf_documents)} pages from {filename}")

            except Exception as e:
                print(f"Error processing {filename}: {e}")

    return documents


def split_documents(documents, chunk_size=1000, chunk_overlap=200):
    """
    Split documents into smaller chunks for better retrieval.

    Args:
        documents (list): List of Document objects
        chunk_size (int): Maximum size of each text chunk
        chunk_overlap (int): Overlap between consecutive chunks

    Returns:
        list: List of chunked Document objects
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
    )

    chunked_documents = text_splitter.split_documents(documents)
    print(f"Split {len(documents)} documents into {len(chunked_documents)} chunks")

    return chunked_documents


def load_documents_from_directory(directory_path):
    """
    Load documents from text files in a directory.
    Returns a list of Document objects suitable for adding to the vector store.
    """
    from langchain.docstore.document import Document
    documents = []

    if not os.path.exists(directory_path):
        print(f"Directory {directory_path} does not exist.")
        return documents

    for filename in os.listdir(directory_path):
        if filename.endswith('.txt'):
            file_path = os.path.join(directory_path, filename)
            with open(file_path, 'r', encoding='utf-8') as file:
                content = file.read()
                doc = Document(
                    page_content=content,
                    metadata={"source": file_path}
                )
                documents.append(doc)

    return documents

def add_documents_to_vectorstore(vectorstore, documents):
    """Add documents to the vector store."""
    vectorstore.add_documents(documents)
    vectorstore.persist()
    print(f"Added {len(documents)} documents to the vector store")

In [5]:
def upsert_pdfs_to_vectordb(pdf_directory, collection_name="islamic_finance_guidelines", persist_directory="./chroma_db", chunk_size=1000, chunk_overlap=200):
    """
    Load PDFs from a directory, split them into chunks, and upsert them to the ChromaDB vector database.

    Args:
        pdf_directory (str): Path to directory containing PDF files
        collection_name (str): Name of the ChromaDB collection
        persist_directory (str): Directory to persist the ChromaDB data
        chunk_size (int): Maximum size of each text chunk
        chunk_overlap (int): Overlap between consecutive chunks

    Returns:
        tuple: (qa_chain, vectorstore) - The QA chain and vector store objects
    """
    # Load the PDFs
    print(f"Loading PDFs from {pdf_directory}...")
    documents = load_pdf_documents(pdf_directory)

    if not documents:
        print("No PDF documents found or loaded.")
        return None, None

    # Split documents into chunks
    print("Splitting documents into chunks...")
    chunked_docs = split_documents(documents, chunk_size, chunk_overlap)

    # Initialize embeddings
    print("Setting up embeddings and vector store...")
    embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

    # Create a new vectorstore with the documents
    print(f"Creating vector store with {len(chunked_docs)} document chunks...")
    vectorstore = Chroma.from_documents(
        documents=chunked_docs,
        embedding=embeddings,
        collection_name=collection_name,
        persist_directory=persist_directory
    )

    # Ensure data is persisted
    vectorstore.persist()
    print(f"Successfully added {len(chunked_docs)} document chunks to ChromaDB collection '{collection_name}'")

    # Set up the QA chain with the new vector store
    qa_chain, _ = setup_compliance_qa_chain(collection_name, persist_directory)

    return qa_chain, vectorstore

In [12]:
def run_compliance_query(query: str, qa_chain, output_path: str = "compliance_result.json"):
    result = qa_chain.invoke({"query": query})
    model_output = result["result"]

    try:
        output_json = json.loads(model_output)
    except json.JSONDecodeError as e:
        print("Failed to parse JSON:", e)
        output_json = {"error": "Model returned invalid JSON", "raw": model_output}

    with open(output_path, "w") as f:
        json.dump(output_json, f, indent=4)

    print(f"Saved result to {output_path}")
    return output_json

## ADD DATA INTO CHROMA VECTOR DATABASE

In [None]:
pdf_directory = "../data"
# Upsert PDFs to vector database
qa_chain, vectorstore = upsert_pdfs_to_vectordb(pdf_directory)

Loading PDFs from ../data...
Loading PDF: ../data\FI28ED_1_Salam and Parallel Salam (07).PDF
Extracted 26 pages from FI28ED_1_Salam and Parallel Salam (07).PDF
Loading PDF: ../data\FI5F55_1_Musharaka Financing(4).PDF
Extracted 30 pages from FI5F55_1_Musharaka Financing(4).PDF
Loading PDF: ../data\FI922A_1_Murabaha and Other Deferred Payment Sales (28).PDF
Extracted 21 pages from FI922A_1_Murabaha and Other Deferred Payment Sales (28).PDF
Loading PDF: ../data\FINANC_1_Istisna’a and Parallel Istisna’a (10).PDF
Extracted 42 pages from FINANC_1_Istisna’a and Parallel Istisna’a (10).PDF
Loading PDF: ../data\Ijarah (32).pdf
Extracted 41 pages from Ijarah (32).pdf
Loading PDF: ../data\SS - shariah-standards-2015-321-390 (1) 1.pdf
Extracted 70 pages from SS - shariah-standards-2015-321-390 (1) 1.pdf
Loading PDF: ../data\SS10 - Salam and Parallel Salam - revised standard.pdf
Extracted 24 pages from SS10 - Salam and Parallel Salam - revised standard.pdf
Loading PDF: ../data\SS8 - Murabahah - rev

  embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
  from .autonotebook import tqdm as notebook_tqdm
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


Creating vector store with 1288 document chunks...


  vectorstore.persist()


Successfully added 1288 document chunks to ChromaDB collection 'islamic_finance_guidelines'
Using existing collection: islamic_finance_guidelines


  vectorstore = Chroma(


## INFERENCE

In [9]:
qa_chain, vectorstore = setup_compliance_qa_chain()
query = "A bank offers a personal loan of $10,000 with a 10% interest rate. Is it halal?"
response = run_compliance_query(query, qa_chain)

Using existing collection: islamic_finance_guidelines
Saved result to compliance_result.json


In [10]:
response

{'answer': 'no',
 'reason': 'The bank offers a personal loan with a 10% interest rate, which is a form of Riba (usury) and is explicitly prohibited in Islamic finance. The relevant law excerpts do not provide any exceptions or justifications for such a transaction.',
 'score': 'red',
 'alternative': 'The bank could offer a Shariah-compliant financing product, such as a Murabahah or Ijarah-based product, where the profit is earned through a legitimate sale or lease transaction, and not through interest. The bank could also consider partnering with an Islamic bank or financial institution to offer Shariah-compliant products.'}

## INFERENCE FROM PDF FILE

In [21]:
pdf_directory = "../tests\MODEL MURABAHA FACILITY AGREEMENT.pdf"
documents = []

if not os.path.exists(pdf_directory):
    print(f"Directory {pdf_directory} does not exist.")
print(f"Loading PDF: {pdf_directory}")

# Use PyPDFLoader to extract text from the PDF
loader = PyPDFLoader(pdf_directory)
pdf_documents = loader.load()

# Add source information to metadata
for doc in pdf_documents:
    doc.metadata["source"] = pdf_directory
    doc.metadata["filename"] = pdf_directory

documents.extend(pdf_documents)
print(f"Extracted {len(pdf_documents)} pages from {pdf_directory}")

Loading PDF: ../tests\MODEL MURABAHA FACILITY AGREEMENT.pdf
Extracted 17 pages from ../tests\MODEL MURABAHA FACILITY AGREEMENT.pdf


In [9]:
text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200,
        length_function=len,
    )

chunked_documents = text_splitter.split_documents(documents)
print(f"Split {len(documents)} documents into {len(chunked_documents)} chunks")


Split 17 documents into 51 chunks


In [18]:
qa_chain, vectorstore = setup_compliance_qa_chain()
query_text = "\n".join([chunk.page_content for chunk in chunked_documents])
query = query_text[:8000]  # truncate if needed
response = run_compliance_query(query, qa_chain)

  embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
  from .autonotebook import tqdm as notebook_tqdm


Collection doesn't exist or error occurred: Collection [islamic_finance_guidelines] does not exists
Creating new collection: islamic_finance_guidelines


  vectorstore = Chroma(


Saved result to compliance_result.json
