In [6]:
import pdfplumber 
import re 
import json 
import nltk
from nltk.tokenize import sent_tokenize
import os
import time
import google.generativeai as genai
from dotenv import load_dotenv

In [8]:
# Download the NLTK 'punkt' and 'punkt_tab' tokenizer packages.
# NOTE(review): presumably these are the data files sent_tokenize relies on;
# confirm against the installed NLTK version.
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ANUSANTH\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\ANUSANTH\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.


True

In [7]:
#text extraction
def read_pdf(file_path):
    """Return the text of every page of a PDF, joined with newlines.

    Args:
        file_path: Path of the PDF, passed straight to ``pdfplumber.open``.

    Returns:
        A single string with one entry per page separated by ``"\\n"``.
        A page whose ``extract_text()`` result is falsy (for example
        ``None``) contributes an empty string, so page boundaries are kept.
    """
    with pdfplumber.open(file_path) as document:
        page_texts = [page.extract_text() or "" for page in document.pages]
    return "\n".join(page_texts)

#preprocess text
def preprocess_text(text):
    """Normalise whitespace and split the text into sentence-level clauses.

    Every run of whitespace (including newlines) is collapsed to a single
    space and the ends are trimmed before the text is handed to
    ``sent_tokenize``.

    Args:
        text: Raw document text.

    Returns:
        Whatever ``sent_tokenize`` returns for the normalised text.
    """
    normalized = re.sub(r"\s+", " ", text).strip()
    return sent_tokenize(normalized)
        

In [13]:
def _error_analysis(message):
    """Build the placeholder analysis used when a clause has no usable AI result."""
    return {
        "risky": False,
        "score": 0,
        "summary": message,
        "reason": message,
        "category": "Error",
    }


def _strip_code_fence(raw_text):
    """Remove a Markdown code fence wrapped around the model's reply, if any."""
    text = raw_text.strip()
    # Only touch fences at the very start/end so backticks inside clause
    # text (within JSON strings) are left intact.
    text = re.sub(r"^```(?:json)?", "", text, flags=re.IGNORECASE)
    text = re.sub(r"```$", "", text)
    return text.strip()


def _index_analyses_by_id(analyses):
    """Map each clause id found in the parsed reply to its analysis dict."""
    if not isinstance(analyses, list):
        raise ValueError("expected a JSON array of clause analyses")
    indexed = {}
    for item in analyses:
        # Skip malformed entries instead of discarding the whole batch.
        if not isinstance(item, dict) or not isinstance(item.get("analysis"), dict):
            continue
        try:
            # The model sometimes echoes ids back as strings ("0"); normalise
            # them so they match the integer ids that were sent.
            clause_id = int(item["id"])
        except (KeyError, TypeError, ValueError):
            continue
        indexed[clause_id] = item["analysis"]
    return indexed


def analyze_clauses_in_batch(clauses, model_name="gemini-2.5-flash"):
    """Analyze all clauses for legal risk with a single generative-model request.

    Each clause is tagged with its list index as an id, the whole list is
    sent as JSON in one prompt, and the JSON array in the reply is matched
    back to the clauses by id.

    Args:
        clauses: Iterable of clause strings.
        model_name: Name passed to ``genai.GenerativeModel``.

    Returns:
        A list with one dict per input clause, in input order. Each dict has
        a ``"Clause"`` key holding the clause text plus the analysis fields
        returned by the model. Clauses without a usable analysis, or every
        clause when the request or parsing fails, get a placeholder with
        ``risky=False``, ``score=0``, ``category="Error"`` and an explanatory
        ``summary``/``reason``. An empty input returns ``[]`` without
        contacting the model.
    """
    clauses = list(clauses)
    if not clauses:
        # Nothing to analyze; avoid a pointless API round-trip.
        return []

    model = genai.GenerativeModel(model_name=model_name)

    clauses_with_ids = [{"id": i, "text": clause} for i, clause in enumerate(clauses)]

    system_prompt = """
    You are an expert legal risk analyst. You will be given a JSON array of legal clauses, each with a unique "id".
    Your task is to analyze every clause and return a single JSON array as your response.
    Each object in your returned array must correspond to a clause from the input and contain:
    1. "id": The original ID of the clause.
    2. "analysis": An object containing your analysis with the following five fields:
        - "risky": A boolean (true/false).
        - "score": An integer from 0 (no risk) to 100 (critical risk).
        - "summary": A concise, one-sentence summary of the clause's meaning.
        - "reason": A concise, one-sentence explanation.
        - "category": One of ['Financial', 'Liability', 'Operational', 'Compliance', 'Termination', 'Data Privacy', 'Intellectual Property', 'Uncategorized'].

    Process all clauses provided in the input JSON and respond ONLY with the resulting JSON array.
    """

    prompt = f"{system_prompt}\n\nCLAUSES_JSON:\n{json.dumps(clauses_with_ids, indent=2)}"

    print(f"Sending {len(clauses)} clauses to the AI...")

    try:
        response = model.generate_content(prompt)
        analyses = json.loads(_strip_code_fence(response.text))
        analysis_map = _index_analyses_by_id(analyses)
    except (json.JSONDecodeError, ValueError, KeyError) as e:
        print(f"--> CRITICAL ERROR: Could not parse the batch AI response. Error: {e}")
        # Return an error entry for every clause if the batch fails.
        failed = _error_analysis("Batch analysis failed.")
        return [{"Clause": clause, **failed} for clause in clauses]
    except Exception as e:
        # Deliberate best-effort: any API failure degrades to error entries
        # so the rest of the pipeline can still run.
        print(f"--> An unexpected API error occurred: {e}")
        failed = _error_analysis("API error.")
        return [{"Clause": clause, **failed} for clause in clauses]

    missing = _error_analysis("Analysis not found in batch response.")
    return [
        {"Clause": item["text"], **analysis_map.get(item["id"], missing)}
        for item in clauses_with_ids
    ]

In [14]:
def _numeric_score(result):
    """Return a clause's score as a number, treating missing or invalid values as 0."""
    score = result.get("score", 0)
    if isinstance(score, (int, float)):
        return score
    try:
        return float(score)
    except (TypeError, ValueError):
        return 0


def calculate_overall_risk(analysis_results):
    """Summarise per-clause analyses into a document-level risk overview.

    Only clauses whose ``"risky"`` value is truthy count towards the score.
    The analyses come from unvalidated model output, so a missing or
    non-numeric ``"score"`` is treated as 0 and a missing ``"summary"``
    falls back to ``"reason"`` instead of raising.

    Args:
        analysis_results: List of per-clause dicts.

    Returns:
        A dict with:
            - ``"overall_risk_score"``: rounded mean score of the risky
              clauses, or 0 when there are none.
            - ``"risk_summary"``: text listing up to the three
              highest-scoring risky clauses, or a "no risks" message.
            - ``"total_clauses"``: number of analysed clauses.
            - ``"risky_clause_count"``: number of risky clauses.
    """
    risky_clauses = [r for r in analysis_results if r.get('risky')]
    if not risky_clauses:
        return {
            "overall_risk_score": 0,
            "risk_summary": "No significant risks were identified in this document.",
            "total_clauses": len(analysis_results),
            "risky_clause_count": 0
        }

    average_score = sum(_numeric_score(c) for c in risky_clauses) / len(risky_clauses)

    # sorted() is stable, so equally scored clauses keep their document order.
    top_risks = sorted(risky_clauses, key=_numeric_score, reverse=True)[:3]
    risk_summary_points = [
        "- {} (Risk Score: {})".format(
            c.get('summary') or c.get('reason') or "No summary provided.",
            _numeric_score(c),
        )
        for c in top_risks
    ]
    risk_summary = "The primary risks identified are:\n" + "\n".join(risk_summary_points)

    return {
        "overall_risk_score": round(average_score),
        "risk_summary": risk_summary,
        "total_clauses": len(analysis_results),
        "risky_clause_count": len(risky_clauses)
    }


In [15]:
# json export

def export_final_json(document_summary, clause_analysis, output_file="analysis.json"):
    """Write the analysis report to a UTF-8 JSON file.

    The file holds one object with two keys: ``"document_summary"`` and
    ``"clause_by_clause_analysis"``, in that order. It is indented by four
    spaces and non-ASCII characters are written as-is rather than escaped.

    Args:
        document_summary: Document-level summary to store.
        clause_analysis: Per-clause analysis to store.
        output_file: Destination path; an existing file is overwritten.
    """
    payload = dict(
        document_summary=document_summary,
        clause_by_clause_analysis=clause_analysis,
    )
    with open(output_file, "w", encoding="utf-8") as handle:
        json.dump(payload, handle, indent=4, ensure_ascii=False)

In [17]:
# --- Main pipeline: configure the API, analyze the PDF, export the results ---



# Read the API key from the environment (load_dotenv is called first so a
# local .env file can supply it) and fail fast when it is missing.
load_dotenv()
api_key = os.getenv("GOOGLE_API_KEY")
if not api_key:
    raise ValueError("GOOGLE_API_KEY not found.")
genai.configure(api_key=api_key)

# Build the path with os.path.join so it resolves on POSIX as well as Windows
# (a literal backslash is only a path separator on Windows).
pdf_file = os.path.join("samples", "Hostel_Damage_Policy.pdf")
# Single source of truth for the report path used by the export and the final message.
output_file = "analysis.json"

print(f"Starting optimized analysis of {pdf_file}...")
raw_text = read_pdf(pdf_file)
clauses = preprocess_text(raw_text)

clause_analysis = analyze_clauses_in_batch(clauses)

document_summary = calculate_overall_risk(clause_analysis)

export_final_json(document_summary, clause_analysis, output_file=output_file)

print(f"\nAnalysis complete. The file '{output_file}' has been saved!")

Starting optimized analysis of samples\Hostel_Damage_Policy.pdf...
Sending 10 clauses to the AI...

Analysis complete. The file 'analysis.json' has been saved!
