In [1]:
import pdfplumber 
import re 
import json 
import nltk
from nltk.tokenize import sent_tokenize

In [8]:
# Fetch the NLTK resources named 'punkt' and 'punkt_tab' (presumably the
# tokenizer data that sent_tokenize relies on -- confirm against the NLTK docs).
# NOTE: the second call is the cell's final expression, so its return value is
# what the notebook shows as the cell result.
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ANUSANTH\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\ANUSANTH\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.


True

In [6]:
# Text extraction
def read_pdf(file_path):
    """Return the text of every page of the PDF at ``file_path`` as one string.

    The file is opened with ``pdfplumber.open`` and closed again when the
    ``with`` block exits. Pages are read in the order ``pdf.pages`` yields
    them and their texts are joined with a single newline.

    A page whose ``extract_text()`` result is falsy (for example ``None``)
    contributes an empty string, so it still produces a separator in the
    returned text.
    """
    with pdfplumber.open(file_path) as document:
        page_texts = [(page.extract_text() or "") for page in document.pages]
    return "\n".join(page_texts)

# Preprocess text
def preprocess_text(text):
    """Normalise whitespace in ``text`` and split it into sentences.

    Every run of whitespace (spaces, tabs, newlines) is collapsed to a single
    space and leading/trailing whitespace is removed. The cleaned string is
    then handed to ``sent_tokenize``; whatever that call returns is returned
    unchanged, with each sentence treated as one clause by the caller.
    """
    normalized = re.sub(r"\s+", " ", text).strip()
    return sent_tokenize(normalized)
        

In [10]:
# Rule-based scoring
# Maps a lower-case keyword (or phrase) to the risk weight assigned to any
# clause that mentions it. A clause's score is the highest weight among the
# keywords found in it.
RISK_KEYWORDS = {
    "liability": 80,
    "compensation": 75,
    "forfeiture": 70,
    "non-refundable": 70,
    "penalty": 65,
    "eviction": 80,
    "termination": 75,
    "disciplinary": 60,
    "not liable": 80,
    "responsibility": 50
}


def get_risk_score(input):
    """Return the highest risk weight of any keyword mentioned in ``input``.

    Matching is case-insensitive (the text is lower-cased and compared with
    the lower-case keys of ``RISK_KEYWORDS``). A keyword only counts when it
    begins at the start of a word, so "reliability" no longer triggers
    "liability"; suffixed forms such as "terminations" still match.

    Args:
        input: The clause text to score. The name shadows the ``input``
            builtin but is kept so existing keyword-argument callers work.

    Returns:
        The largest matching weight, or 0 when nothing matches or when
        ``input`` is empty or ``None``.
    """
    if not input:
        return 0

    lowered = input.lower()
    matched_weights = [
        weight
        for keyword, weight in RISK_KEYWORDS.items()
        # (?<!\w) rejects hits that start in the middle of a longer word;
        # re.escape keeps characters such as "-" literal.
        if re.search(r"(?<!\w)" + re.escape(keyword), lowered)
    ]
    # Seed with 0 so the result is never below the "no risk" baseline.
    return max([0, *matched_weights])

def analyze_clauses(clauses):
    """Score every clause and return one result record per clause.

    Each record is a dict with three keys, in this order:
    ``"Clause"`` (the clause as given), ``"risky"`` (``True`` when the score
    is greater than zero) and ``"score"`` (the value returned by
    ``get_risk_score`` for that clause). Records keep the order of
    ``clauses``.
    """
    def _build_record(clause_text):
        risk = get_risk_score(clause_text)
        return {"Clause": clause_text, "risky": risk > 0, "score": risk}

    return [_build_record(item) for item in clauses]

In [4]:
# JSON export
def export_json(results, output_file="analysis.json"):
    """Write ``results`` to ``output_file`` as pretty-printed JSON.

    The file is opened for writing as UTF-8 (replacing any existing content)
    and ``results`` is serialised with a four-space indent. Non-ASCII
    characters are written as-is rather than as ``\\uXXXX`` escapes.
    """
    with open(output_file, mode="w", encoding="utf-8") as handle:
        json.dump(results, handle, ensure_ascii=False, indent=4)

In [11]:
# Main pipeline: extract the PDF text, split it into clauses, score each
# clause and export the results as JSON.

# Forward slashes resolve on Windows as well as on POSIX systems, whereas a
# backslash-separated path only works on Windows.
pdf_file = "samples/Hostel_Damage_Policy.pdf"
# Single source of truth for the output path, so the message printed below
# always names the file that was actually written.
output_file = "analysis.json"

raw_text = read_pdf(pdf_file)
clauses = preprocess_text(raw_text)
analysis = analyze_clauses(clauses)
export_json(analysis, output_file)

print(f"Analysis complete. Saved to {output_file}")

Analysis complete. Saved to analysis.json
