In [10]:
import pdfplumber 
import re 
import json 
import nltk
from nltk.tokenize import sent_tokenize
import os
import time
import google.generativeai as genai
from dotenv import load_dotenv
import numpy as np
import faiss

In [11]:
# Fetch the NLTK 'punkt' and 'punkt_tab' packages; when they are already present
# the call only reports that the package is up to date.
# NOTE(review): presumably these are the tokenizer models that sent_tokenize
# (used by preprocess_text) relies on — confirm against the NLTK version in use.
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ANUSANTH\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\ANUSANTH\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [None]:
# Text extraction and clause preprocessing
def read_pdf(file_path):
    """Return the text of every page of a PDF, joined with newlines.

    Args:
        file_path: Path of the PDF file, passed to ``pdfplumber.open``.

    Returns:
        A single string with one entry per page separated by "\\n". A page
        whose ``extract_text()`` result is falsy contributes an empty string.
    """
    with pdfplumber.open(file_path) as document:
        page_texts = [(page.extract_text() or "") for page in document.pages]
    return "\n".join(page_texts)


def preprocess_text(text):
    """Split raw document text into a flat list of sentence-level clauses.

    Steps, in order:
      1. Collapse runs of spaces/tabs into one space and normalise "\\r\\n" / "\\r"
         line endings to "\\n".
      2. Replace a newline with a space unless what follows (after optional
         whitespace) is digits plus a dot, a parenthesised lowercase/digit
         token, or a run of five or more uppercase letters / whitespace
         characters, so wrapped lines are merged into the preceding block.
      3. Drop empty blocks, labels of fewer than three words ending in ":",
         and blocks starting with "Lessor", "Lessee" or "Date:" (any case).
      4. Strip a leading item marker and sentence-tokenize what remains.

    Args:
        text: Raw text of the document.

    Returns:
        List of sentences, in document order.
    """
    normalized = re.sub(r'[ \t]+', ' ', text)
    normalized = re.sub(r'\r\n|\r', '\n', normalized)

    # Only newlines that do NOT precede a new item or heading are turned into spaces.
    normalized = re.sub(r'\n(?!\s*(?:\d+\.|\([a-z\d]+\)|[A-Z\s]{5,}))', ' ', normalized)

    final_clauses = []
    for raw_block in normalized.split('\n'):
        candidate = raw_block.strip()

        is_short_label = len(candidate.split()) < 3 and candidate.endswith(':')
        is_party_or_date_line = re.match(r'^(Lessor|Lessee|Date:)', candidate, re.IGNORECASE) is not None
        if not candidate or is_short_label or is_party_or_date_line:
            continue

        # Remove a leading marker such as "1.", "1.1." or "(a)"; the dotted
        # form must end with a dot to match.
        unnumbered = re.sub(r'^\s*(\d+(\.\d+)*\.|\([a-zA-Z\d]+\))\s*', '', candidate)

        for sentence in sent_tokenize(unnumbered):
            final_clauses.append(sentence)

    return final_clauses

        

In [13]:
def _failed_batch_results(clauses, reason):
    """Build one placeholder result per clause for a batch that could not be analysed."""
    return [
        {
            "Clause": clause,
            "risky": False,
            "score": 0,
            "summary": reason,
            "reason": reason,
            "category": "Error",
        }
        for clause in clauses
    ]


def _index_analyses_by_id(analyses):
    """Map each clause id in the model's reply to its analysis dict, skipping malformed items."""
    if not isinstance(analyses, list):
        raise ValueError("expected a JSON array of analyses")

    indexed = {}
    for item in analyses:
        if not isinstance(item, dict) or not isinstance(item.get("analysis"), dict):
            continue
        try:
            # The model may echo ids back as strings ("0"); normalise to int so
            # they match the integer ids that were sent.
            indexed[int(item.get("id"))] = item["analysis"]
        except (TypeError, ValueError):
            continue
    return indexed


def analyze_clauses_in_batch(clauses, model_name="gemini-2.5-flash"):
    """Send all clauses to the generative model in one request and merge its analysis.

    Args:
        clauses: Sequence of clause strings.
        model_name: Name passed to ``genai.GenerativeModel``.

    Returns:
        A list with one dict per input clause, in input order. Each dict has a
        "Clause" key with the clause text plus the analysis fields returned by
        the model. Clauses the model did not return a usable analysis for, and
        every clause when the request or parsing fails, get
        ``risky=False``, ``score=0`` and ``category="Error"`` with an
        explanatory "summary"/"reason". An empty input returns ``[]`` without
        contacting the API.
    """
    if not clauses:
        return []

    model = genai.GenerativeModel(model_name=model_name)

    clauses_with_ids = [{"id": i, "text": clause} for i, clause in enumerate(clauses)]

    system_prompt = """
    You are an expert legal risk analyst. YOUR CLIENT IS THE PERSON SIGNING THE DOCUMENT (e.g., the Lessee or Occupant). 
    Your entire analysis must be from THEIR PERSPECTIVE, identifying risks that could negatively affect them. You will be given a JSON array of legal clauses, each with a unique "id". 
    Your task is to analyze every clause and return a single JSON array as your response. 
    Each object in your returned array must correspond to a clause from the input and contain:
    1. "id": The original ID of the clause.
    2. "analysis": An object containing your analysis with the following five fields:
        - "risky": A boolean (true/false).
        - "score": An integer from 0 (no risk) to 100 (critical risk).
        - "summary": A concise, one-sentence summary of the clause's meaning.
        - "reason": A concise, one-sentence explanation.
        - "category": One of ['Financial', 'Liability', 'Operational', 'Compliance', 'Termination', 'Data Privacy', 'Intellectual Property', 'Uncategorized'].

    Process all clauses provided in the input JSON and respond ONLY with the resulting JSON array.
    """

    prompt = f"{system_prompt}\n\nCLAUSES_JSON:\n{json.dumps(clauses_with_ids, indent=2)}"

    print(f"Sending {len(clauses)} clauses to the AI...")

    not_found = {
        "risky": False,
        "score": 0,
        "summary": "Analysis not found in batch response.",
        "reason": "Analysis not found in batch response.",
        "category": "Error",
    }

    try:
        response = model.generate_content(prompt)

        # The reply may be wrapped in a Markdown code fence; remove the fence markers.
        json_text = response.text.strip().replace("```json", "").replace("```", "")
        analysis_map = _index_analyses_by_id(json.loads(json_text))

        return [
            {"Clause": clause_data['text'], **analysis_map.get(clause_data['id'], not_found)}
            for clause_data in clauses_with_ids
        ]

    except (json.JSONDecodeError, ValueError, KeyError, TypeError) as e:
        print(f"--> CRITICAL ERROR: Could not parse the batch AI response. Error: {e}")
        # Return an error entry for every clause if the batch fails.
        return _failed_batch_results(clauses, "Batch analysis failed.")
    except Exception as e:
        print(f"--> An unexpected API error occurred: {e}")
        return _failed_batch_results(clauses, "API error.")

In [14]:
def calculate_overall_risk(analysis_results):
    """Aggregate per-clause analyses into a document-level risk summary.

    Args:
        analysis_results: List of per-clause dicts. A clause counts as risky
            when its "risky" value is truthy.

    Returns:
        Dict with:
          - "overall_risk_score": rounded mean score of the risky clauses
            (0 when there are none),
          - "risk_summary": text listing up to the three highest-scoring
            risky clauses,
          - "total_clauses": number of analysed clauses,
          - "risky_clause_count": number of risky clauses.

    Scores that are missing or not numeric are treated as 0, and a clause
    without a "summary" falls back to its "reason", so incomplete model output
    does not raise.
    """
    def _numeric_score(clause):
        # Model output is not validated upstream: the score may be absent or a string.
        try:
            return float(clause.get('score', 0))
        except (TypeError, ValueError):
            return 0.0

    risky_clauses = [r for r in analysis_results if r.get('risky')]
    if not risky_clauses:
        return {
            "overall_risk_score": 0,
            "risk_summary": "No significant risks were identified in this document.",
            "total_clauses": len(analysis_results),
            "risky_clause_count": 0
        }

    average_score = sum(_numeric_score(c) for c in risky_clauses) / len(risky_clauses)

    top_risks = sorted(risky_clauses, key=_numeric_score, reverse=True)[:3]
    risk_summary_points = [
        f"- {c.get('summary') or c.get('reason') or 'No summary provided.'} (Risk Score: {c.get('score', 0)})"
        for c in top_risks
    ]
    risk_summary = "The primary risks identified are:\n" + "\n".join(risk_summary_points)

    return {
        "overall_risk_score": round(average_score),
        "risk_summary": risk_summary,
        "total_clauses": len(analysis_results),
        "risky_clause_count": len(risky_clauses)
    }


In [15]:
# json export

def export_final_json(document_summary, clause_analysis, output_file="analysis.json"):
    """Write the analysis report to ``output_file`` as UTF-8 JSON.

    The top-level object has two keys, in this order: "document_summary" and
    "clause_by_clause_analysis". The file is indented by four spaces and
    non-ASCII characters are written unescaped.
    """
    report = dict(
        document_summary=document_summary,
        clause_by_clause_analysis=clause_analysis,
    )
    serialized = json.dumps(report, indent=4, ensure_ascii=False)
    with open(output_file, "w", encoding="utf-8") as handle:
        handle.write(serialized)

In [None]:
# --- Main pipeline: extract clauses, analyse them, export the report ---

# Read GOOGLE_API_KEY from the environment (load_dotenv also picks up a .env file)
# and configure the Gemini client; fail early when the key is missing.
load_dotenv()
api_key = os.getenv("GOOGLE_API_KEY")
if not api_key:
    raise ValueError("GOOGLE_API_KEY not found.")
genai.configure(api_key=api_key)

# os.path.join keeps the relative path valid on POSIX systems as well as on
# Windows (a literal backslash is not a path separator outside Windows).
pdf_file = os.path.join("samples", "Hostel_Rental_Rules.pdf")
output_file = "analysis.json"

print(f"Analysing {pdf_file}...")
raw_text = read_pdf(pdf_file)
clauses = preprocess_text(raw_text)

# One batched model call for all clauses, then a document-level roll-up.
clause_analysis = analyze_clauses_in_batch(clauses)
document_summary = calculate_overall_risk(clause_analysis)

export_final_json(document_summary, clause_analysis, output_file=output_file)
print(f"\nAnalysis complete. The file '{output_file}' has been saved!")


Analysing samples\Hostel_Rental_Rules.pdf...
Sending 18 clauses to the AI...

Analysis complete. The file 'analysis.json' has been saved!


In [None]:
# chatBot
class LegalAnalyzer:
    """Question answering over a single PDF using clause embeddings and a FAISS index.

    Attributes:
        clauses: Sentences produced by ``preprocess_text`` for the document.
        vector_store: FAISS L2 index over the clause embeddings, or ``None``
            when no clauses were extracted.
    """

    # Embedding model shared by indexing and querying so both live in the same vector space.
    EMBEDDING_MODEL = 'models/text-embedding-004'
    # NOTE(review): this differs from the "gemini-2.5-flash" model used for the
    # batch clause analysis — confirm this model name is still the intended one.
    CHAT_MODEL = "gemini-1.5-flash"

    def __init__(self, pdf_path):
        """Load the PDF, split it into clauses and build the vector index.

        Args:
            pdf_path: Path of the PDF document.

        Raises:
            FileNotFoundError: If ``pdf_path`` does not exist.
        """
        print(f"Initializing analyzer with document: {pdf_path}")
        if not os.path.exists(pdf_path):
            raise FileNotFoundError(f"The document was not found at: {pdf_path}")

        raw_text = read_pdf(pdf_path)
        self.clauses = preprocess_text(raw_text)

        self.vector_store = self._create_vector_store()
        print("Analyzer is ready.")

    def _create_vector_store(self, model=EMBEDDING_MODEL):
        """Embed every clause and return a FAISS ``IndexFlatL2`` over the vectors.

        Args:
            model: Embedding model name; remembered so questions are embedded
                with the same model.

        Returns:
            The populated index, or ``None`` when there are no clauses to embed.
        """
        self._embedding_model = model
        if not self.clauses:
            print("No clauses were extracted; the vector store was not created.")
            return None

        print(f"Creating vector store for {len(self.clauses)} clauses...")
        result = genai.embed_content(model=model, content=self.clauses, task_type="retrieval_document")
        # FAISS works on float32 matrices; Python floats would otherwise become float64.
        embeddings = np.asarray(result['embedding'], dtype=np.float32)
        index = faiss.IndexFlatL2(embeddings.shape[1])
        index.add(embeddings)
        return index

    def ask_question(self, question: str, k: int = 3) -> str:
        """Answer ``question`` using only the clauses most similar to it.

        Args:
            question: The user's question.
            k: Maximum number of clauses to retrieve as context.

        Returns:
            The model's answer with surrounding whitespace stripped, or
            "Vector store is not available." when no index was built.
        """
        if self.vector_store is None:
            return "Vector store is not available."

        embedding_model = getattr(self, "_embedding_model", self.EMBEDDING_MODEL)
        question_embedding = genai.embed_content(
            model=embedding_model, content=question, task_type="retrieval_query"
        )['embedding']

        # Never ask for more neighbours than there are clauses.
        top_k = max(1, min(k, len(self.clauses)))
        query = np.asarray([question_embedding], dtype=np.float32)
        _, indices = self.vector_store.search(query, top_k)

        # FAISS pads missing neighbours with -1, which as a list index would
        # silently select the last clause; keep only valid positions.
        relevant_clauses = [self.clauses[i] for i in indices[0] if 0 <= i < len(self.clauses)]
        context = "\n".join(relevant_clauses)

        rag_model = genai.GenerativeModel(model_name=self.CHAT_MODEL)
        prompt = f"Using ONLY the context below, answer the user's question.\n\nCONTEXT:\n{context}\n\nQUESTION: {question}\n\nANSWER:"
        response = rag_model.generate_content(prompt)
        return response.text.strip()

if __name__ == '__main__':

    # Read GOOGLE_API_KEY from the environment (.env supported) and configure the client.
    load_dotenv()
    api_key = os.getenv("GOOGLE_API_KEY")
    if not api_key:
        raise ValueError("GOOGLE_API_KEY not found in .env file.")
    genai.configure(api_key=api_key)

    # os.path.join keeps the relative path valid on POSIX systems as well as
    # on Windows (a literal backslash is not a path separator outside Windows).
    pdf_file_path = os.path.join("samples", "Hostel_Rental_Rules.pdf")
    analyzer = LegalAnalyzer(pdf_path=pdf_file_path)

    # Two sample questions answered from the indexed document.
    question_1 = "What is the penalty for late payment?"
    answer_1 = analyzer.ask_question(question_1)
    print(f"\nQ: {question_1}\nA: {answer_1}\n")

    question_2 = "Can I have visitors stay overnight?"
    answer_2 = analyzer.ask_question(question_2)
    print(f"Q: {question_2}\nA: {answer_2}\n")

Initializing analyzer with document: samples\Hostel_Rental_Rules.pdf
Creating vector store for 18 clauses...
Analyzer is ready.

Q: What is the penalty for late payment?
A: The penalty for late payment will be decided by management.

Q: Can I have visitors stay overnight?
A: No, visitors are not permitted to stay overnight without prior written approval from the hostel management.

