In [8]:
import os
from dotenv import load_dotenv
from functools import lru_cache

load_dotenv()
os.environ['LANGCHAIN_API_KEY'] = os.getenv('LANGCHAIN_API_KEY')
os.environ['LANGCHAIN_TRACING_V2'] = 'true'
os.environ['Langchain_Project'] = os.getenv('LANGCHAIN_PROJECT')
os.environ['HF_TOKEN'] = os.getenv('HF_TOKEN')

In [None]:
from langchain_huggingface import HuggingFaceEmbeddings

# @lru_cache(maxsize=1)
# def getEmbeddings(model_name=None, model_kwargs=None, encode_kwargs=None):
#     """
#     Get the embeddings model.
#     """
#     return HuggingFaceEmbeddings(
#         model_name=model_name,
#         model_kwargs=model_kwargs or {},
#         encode_kwargs=encode_kwargs or {}
#     )

model_name = 'BAAI/bge-large-en-v1.5'
model_kwargs = {"device": "cuda"}
encode_kwargs = {'normalize_embeddings': False}

embeddings = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs
    )




In [10]:
# from langchain_openai import ChatOpenAI

# @lru_cache(maxsize=1)
# def getLLM():
#     """
#     Get the LLM model.
#     """
#     return ChatOpenAI(
#         temperature=0,
#         model_name="gpt-3.5-turbo",
#         openai_api_key=os.getenv('OPENAI_API_KEY')
#     )
# llm = getLLM()

In [11]:
from langchain_groq import ChatGroq
groq_api_key = os.getenv('GROQ_API_KEY')
llm = ChatGroq(api_key=groq_api_key, model="Gemma2-9b-it")
llm

ChatGroq(client=<groq.resources.chat.completions.Completions object at 0x0000024C11BB2540>, async_client=<groq.resources.chat.completions.AsyncCompletions object at 0x0000024C13768E30>, model_name='Gemma2-9b-it', model_kwargs={}, groq_api_key=SecretStr('**********'))

### Load and Split Documents in Chunks

In [12]:
from functools import lru_cache
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.document_loaders import PyMuPDFLoader, UnstructuredWordDocumentLoader

def load_file(file_path: str):
    """
    Dynamically load PDF or DOCX file and return documents.
    """
    ext = file_path.split('.')[-1].lower()
    if ext == 'pdf':
        loader = PyMuPDFLoader(file_path)
    elif ext == 'docx':
        loader = UnstructuredWordDocumentLoader(file_path)
    else:
        raise ValueError(f"Unsupported file type: {ext}")
    
    return loader.load()

def getDocuments(file_path: str, chunk_size: int = 1000, chunk_overlap: int = 200):
    """
    Load and chunk the document using RecursiveCharacterTextSplitter.
    """
    documents = load_file(file_path)

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )

    return text_splitter.split_documents(documents)

documents = getDocuments('bank-Regulatory-Agreement.pdf', 1000, 200)
documents[:5]

[Document(metadata={'producer': 'Adobe PDF Library 10.0', 'creator': 'Acrobat PDFMaker 10.1 for Word', 'creationdate': '2017-08-21T16:29:57-04:00', 'source': 'bank-Regulatory-Agreement.pdf', 'file_path': 'bank-Regulatory-Agreement.pdf', 'total_pages': 34, 'format': 'PDF 1.5', 'title': 'REGULATORY AGREEMENT', 'author': 'Joseph N. Center', 'subject': '', 'keywords': '', 'moddate': '2017-08-21T16:30:03-04:00', 'trapped': '', 'modDate': "D:20170821163003-04'00'", 'creationDate': "D:20170821162957-04'00'", 'page': 0}, page_content='1 \nTHIS REGULATORY AGREEMENT (“Agreement”), entered into as of the ____ day of __________, \n20__, by and between ___________________________ Housing Development Fund [Company, Inc.] \n[Corporation] (“HDFC”), a corporation formed pursuant to the Business Corporation Law and Article XI \nof the Private Housing Finance Law, having an address at ________________, and the City of New \nYork (“City”), acting by and through its Department of Housing Preservation and D

In [32]:
from langchain_community.vectorstores import FAISS
vectorstore = FAISS.from_documents(
    documents=documents,
    embedding=embeddings,
)
vectorstore.save_local('faiss_index')

In [15]:
vectorstore = FAISS.load_local('faiss_index', embeddings, allow_dangerous_deserialization=True)

In [16]:
query = "Household Limit?"
result = vectorstore.similarity_search(query, k=1)
print(result[0].page_content)

initial occupancy of an Apartment by such Household or any member thereof, (iii) any 
Household that already holds, or includes any person who holds, Shares allocated to another 
Apartment, unless such Household or person simultaneously sells such Shares to an Eligible 
Household in compliance with this Agreement, (iv) except as provided in the preceding clause 
of this sentence, any Household that has previously purchased, or includes any person who 
has previously purchased, a home, a condominium unit, or the shares attributable to a 
cooperative dwelling unit through a governmentally subsidized affordable housing program, or 
(v) a corporation, partnership, limited liability company, or other entity. 
 
“IRC” shall mean the Internal Revenue Code of 1986, as amended, and any rules or 
regulations promulgated pursuant thereto. 
 
“Income Limit” shall mean one hundred twenty percent (120%) of Median.


In [17]:
documents = getDocuments('regulatory-upload.docx', 250, 70)
documents[:5]

[Document(metadata={'source': 'regulatory-upload.docx'}, page_content='Regulatory Guidelines for Customer Data Management, Record Retention, and Financial Eligibility\n\nSection 1: Customer Data Privacy and Retention\n\n1.1 Data Retention Period'),
 Document(metadata={'source': 'regulatory-upload.docx'}, page_content='1.1 Data Retention Period\n\nAll customer Personally Identifiable Information (PII) must be retained for no longer than 7 years from the date of account closure, unless required by legal obligations.\n\n1.2 Data Encryption Standard'),
 Document(metadata={'source': 'regulatory-upload.docx'}, page_content='1.2 Data Encryption Standard\n\nAll customer data must be stored using at least AES-256 encryption both at rest and in transit.\n\n1.3 PII Access Control'),
 Document(metadata={'source': 'regulatory-upload.docx'}, page_content='1.3 PII Access Control\n\nOnly authorized personnel with role-based access control (RBAC) are permitted to access PII. Access logs must be retaine

In [18]:
uploaded_clauses = [{"text": doc.page_content} for doc in documents]
uploaded_clauses

[{'text': 'Regulatory Guidelines for Customer Data Management, Record Retention, and Financial Eligibility\n\nSection 1: Customer Data Privacy and Retention\n\n1.1 Data Retention Period'},
 {'text': '1.1 Data Retention Period\n\nAll customer Personally Identifiable Information (PII) must be retained for no longer than 7 years from the date of account closure, unless required by legal obligations.\n\n1.2 Data Encryption Standard'},
 {'text': '1.2 Data Encryption Standard\n\nAll customer data must be stored using at least AES-256 encryption both at rest and in transit.\n\n1.3 PII Access Control'},
 {'text': '1.3 PII Access Control\n\nOnly authorized personnel with role-based access control (RBAC) are permitted to access PII. Access logs must be retained for a minimum of 2 years.\n\n1.4 Third-Party Disclosure'},
 {'text': '1.4 Third-Party Disclosure\n\nNo customer data may be shared with external vendors unless a signed Data Processing Agreement (DPA) exists and customer consent has been 

In [19]:
len(uploaded_clauses)

18

In [20]:
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain

prompt_template = PromptTemplate(
    input_variables=["uploaded_clause", "match_1", "match_2"],
    template="""
    You are a compliance assistant. Analyze if the following uploaded clause complies with regulations.

    Uploaded Clause:
    "{uploaded_clause}"

    Top 2 Matching Regulatory Clauses:
    1. "{match_1}"
    2. "{match_2}"

    Think step-by-step:
    - Compare the uploaded clause with the regulations
    - Highlight similarities/differences
    - Return only a structured JSON:
    
    {{"compliant": true/false, "reasoning": "..."}}
"""
)

chain = LLMChain(
    llm=llm,
    prompt=prompt_template,
    # verbose=True
)

  chain = LLMChain(


In [21]:
uploaded_clauses[0]['text']

'Regulatory Guidelines for Customer Data Management, Record Retention, and Financial Eligibility\n\nSection 1: Customer Data Privacy and Retention\n\n1.1 Data Retention Period'

In [22]:
result_1 = vectorstore.similarity_search(uploaded_clauses[0]['text'], k=2)
result_1[0].page_content

'7 \n \n13. \nsuch other documents and records as HPD, the Manager, or the Monitor shall deem \nnecessary. \n \n“Restriction Period” shall mean a period commencing on the Commencement Date and \nexpiring upon the Expiration Date. \n \n“Retirement Plan” shall mean (i) a defined benefit pension plan, (ii) an Individual Retirement \nAccount established pursuant to IRC §408, (iii) a Roth Individual Retirement Account \nestablished pursuant to IRC §408A, (iv) a qualified profit-sharing plan established pursuant to \nIRC §401(k), (v) a Roth qualified profit-sharing plan established pursuant to IRC §401(k) and \n§402A, (vi) a tax-sheltered annuity established pursuant to IRC §403(b), (vii) a Roth tax-\nsheltered annuity established pursuant to IRC §403(b) and §402A, (viii) a deferred \ncompensation plan established pursuant to IRC §Section 457, (ix) a Roth deferred \ncompensation plan established pursuant to IRC §Section 457 and §402A, (x) a governmental'

In [23]:
llm_response = chain.invoke({
    "uploaded_clause": uploaded_clauses[0]['text'],
    "match_1": result_1[0].page_content,
    "match_2": result_1[1].page_content
})

print(llm_response)

{'uploaded_clause': 'Regulatory Guidelines for Customer Data Management, Record Retention, and Financial Eligibility\n\nSection 1: Customer Data Privacy and Retention\n\n1.1 Data Retention Period', 'match_1': '7 \n \n13. \nsuch other documents and records as HPD, the Manager, or the Monitor shall deem \nnecessary. \n \n“Restriction Period” shall mean a period commencing on the Commencement Date and \nexpiring upon the Expiration Date. \n \n“Retirement Plan” shall mean (i) a defined benefit pension plan, (ii) an Individual Retirement \nAccount established pursuant to IRC §408, (iii) a Roth Individual Retirement Account \nestablished pursuant to IRC §408A, (iv) a qualified profit-sharing plan established pursuant to \nIRC §401(k), (v) a Roth qualified profit-sharing plan established pursuant to IRC §401(k) and \n§402A, (vi) a tax-sheltered annuity established pursuant to IRC §403(b), (vii) a Roth tax-\nsheltered annuity established pursuant to IRC §403(b) and §402A, (viii) a deferred \nc

In [24]:
llm_response

{'uploaded_clause': 'Regulatory Guidelines for Customer Data Management, Record Retention, and Financial Eligibility\n\nSection 1: Customer Data Privacy and Retention\n\n1.1 Data Retention Period',
 'match_1': '7 \n \n13. \nsuch other documents and records as HPD, the Manager, or the Monitor shall deem \nnecessary. \n \n“Restriction Period” shall mean a period commencing on the Commencement Date and \nexpiring upon the Expiration Date. \n \n“Retirement Plan” shall mean (i) a defined benefit pension plan, (ii) an Individual Retirement \nAccount established pursuant to IRC §408, (iii) a Roth Individual Retirement Account \nestablished pursuant to IRC §408A, (iv) a qualified profit-sharing plan established pursuant to \nIRC §401(k), (v) a Roth qualified profit-sharing plan established pursuant to IRC §401(k) and \n§402A, (vi) a tax-sheltered annuity established pursuant to IRC §403(b), (vii) a Roth tax-\nsheltered annuity established pursuant to IRC §403(b) and §402A, (viii) a deferred \n

In [25]:
# If llm_response is already a dictionary, no need to clean or parse it

# print("Uploaded Clause:", llm_response.get("uploaded_clause"))
# print("Match 1:", parsed_response.get("match_1"))
# print("Match 2:", parsed_response.get("match_2"))
print("Reasoning:", llm_response.get("text"))

Reasoning: ```json
{"compliant": false, "reasoning": "The uploaded clause is incomplete and lacks specific details about data retention periods.  \n\n-  It only mentions a 'Data Retention Period' without defining its duration.\n\n-  The provided matching regulatory clauses focus on reporting requirements, restrictions periods, and record retention for  financial and operational matters, not specifically on customer data privacy and retention.\n\nTo determine compliance, we need a clear statement outlining how long customer data will be retained, in alignment with relevant privacy regulations like GDPR, CCPA, or other applicable laws."}
``` 



In [29]:
import uuid
result = []
unique_id = f"compliance_{uuid.uuid4().hex[:8]}"

for clause in uploaded_clauses:
    query = clause['text']
    
    matches = vectorstore.similarity_search(query, k=2)
    
    llm_response = chain.invoke({
        "uploaded_clause": query,
        "match_1": matches[0].page_content,
        "match_2": matches[1].page_content
    })
    
    result.append({
        "file_name": unique_id,
        "llm_response": llm_response
    })

In [27]:
result

[{'llm_response': {'uploaded_clause': 'Regulatory Guidelines for Customer Data Management, Record Retention, and Financial Eligibility\n\nSection 1: Customer Data Privacy and Retention\n\n1.1 Data Retention Period',
   'match_1': '7 \n \n13. \nsuch other documents and records as HPD, the Manager, or the Monitor shall deem \nnecessary. \n \n“Restriction Period” shall mean a period commencing on the Commencement Date and \nexpiring upon the Expiration Date. \n \n“Retirement Plan” shall mean (i) a defined benefit pension plan, (ii) an Individual Retirement \nAccount established pursuant to IRC §408, (iii) a Roth Individual Retirement Account \nestablished pursuant to IRC §408A, (iv) a qualified profit-sharing plan established pursuant to \nIRC §401(k), (v) a Roth qualified profit-sharing plan established pursuant to IRC §401(k) and \n§402A, (vi) a tax-sheltered annuity established pursuant to IRC §403(b), (vii) a Roth tax-\nsheltered annuity established pursuant to IRC §403(b) and §402A, 

In [31]:
import json
file_name = f"{unique_id}.json"
try:
    with open(file_name, 'w', encoding='utf-8') as json_file:
        json.dump(result, json_file, ensure_ascii=False, indent=4)
    print(f"Results saved successfully to {file_name}")
except Exception as e:
    raise RuntimeError(f"Failed to save the file {file_name}: {e}")

Results saved successfully to compliance_3cdb43e2.json


In [39]:
import os
import json
RESULT_FOLDER = "results"
file_id = 'fd45291b'
json_file_path = os.path.join(RESULT_FOLDER, f"compliance_{file_id}.json")
json_file_path = json_file_path.replace("\\", "/")
json_file_path
with open(json_file_path, 'r', encoding='utf-8') as json_file:
    data = json.load(json_file)

In [40]:
with open(json_file_path, "r", encoding="utf-8") as f:
        data = json.load(f)