In [None]:
# Mount Google Drive at /content/drive so the model archives stored there can be read.
# This import only exists inside Google Colab.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
! mkdir models2

In [None]:

# Unzip clause.zip into a directory named 'clause_models'
!unzip '/content/drive/MyDrive/6th Sem/Major Project/clause.zip' -d /content/models2/clause_models/

# Unzip anamoly.zip into a directory named 'anomaly_models'
!unzip '/content/drive/MyDrive/6th Sem/Major Project/anamoly.zip' -d /content/models2/anomaly_models/

# Unzip risk.zip into a directory named 'risk_models'
!unzip '/content/drive/MyDrive/6th Sem/Major Project/risk.zip' -d /content/models2/risk_models/

Archive:  /content/drive/MyDrive/6th Sem/Major Project/clause.zip
   creating: /content/models2/clause_models/clause_model/
  inflating: /content/models2/clause_models/clause_model/config.json  
  inflating: /content/models2/clause_models/clause_model/special_tokens_map.json  
  inflating: /content/models2/clause_models/clause_model/tokenizer.json  
  inflating: /content/models2/clause_models/clause_model/vocab.txt  
  inflating: /content/models2/clause_models/clause_model/tokenizer_config.json  
  inflating: /content/models2/clause_models/clause_model/training_args.bin  
  inflating: /content/models2/clause_models/clause_model/model.safetensors  
  inflating: /content/models2/clause_models/anomaly_iforest.joblib  
  inflating: /content/models2/clause_models/anomaly_lof.joblib  
  inflating: /content/models2/clause_models/baseline_embeddings.npy  
  inflating: /content/models2/clause_models/clause_labels.json  
  inflating: /content/models2/clause_models/risk_labels.json  
Archive:  /c

In [None]:
# [Cell 11] Main Inference Code (Clause Prediction Only)
import torch, joblib, json, numpy as np, re, getpass
# import google.generativeai as genai # REMOVED: Not needed for clause classification
from transformers import AutoTokenizer, AutoModelForSequenceClassification # AutoModel is not needed
# from sklearn.metrics.pairwise import cosine_similarity # REMOVED: Not needed for clause classification

# --- A. Config ---
# Gemini is not configured in this cell; only clause classification is exercised.
print("⚠️ Skipping Gemini Configuration.")

# --- B. Load Models ---
# Pick the GPU when one is visible to torch, otherwise fall back to CPU.
gpu_device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Main Acceleration: {gpu_device}")

# The tokenizer is loaded from the base LegalBERT checkpoint by name, not from the
# fine-tuned clause_model directory.
# NOTE(review): the extracted clause_model folder also contains tokenizer files
# (see the unzip log) — confirm the two are equivalent.
model_checkpoint = "nlpaueb/legal-bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# Class-index -> label-name mapping; JSON keys are strings, so they are cast to int
# to match the integer produced by argmax at inference time.
with open('/content/models2/clause_models/clause_labels.json', 'r') as f:
    clause_id2label = {int(k): v for k, v in json.load(f)['id2label'].items()}
# Risk label files are intentionally not loaded in this cell.

# Fine-tuned clause classifier, moved to the chosen device and put in eval mode.
model_clause = AutoModelForSequenceClassification.from_pretrained("/content/models2/clause_models/clause_model").to(gpu_device).eval()
# Risk, calibrator, embedding and isolation-forest artefacts are intentionally
# not loaded in this cell.

print("✅ Clause Model Loaded.")

# --- C. Functions ---
def classify_clause(text):
    """Return the predicted clause-type label for one clause.

    The text is tokenized (truncated to at most 256 tokens), passed through
    the fine-tuned clause classifier with gradient tracking disabled, and the
    index of the largest logit is looked up in ``clause_id2label``.

    Args:
        text: Clause text to classify.

    Returns:
        The label string for the winning class, or "Unknown" when the index
        has no entry in ``clause_id2label``.
    """
    encoded = tokenizer(text, return_tensors="pt", truncation=True, max_length=256)
    encoded = encoded.to(gpu_device)
    with torch.no_grad():
        output = model_clause(**encoded)
    best_idx = torch.argmax(output.logits, dim=1).item()
    if best_idx in clause_id2label:
        return clause_id2label[best_idx]
    return "Unknown"

# ALL OTHER FUNCTIONS REMOVED/COMMENTED OUT
# def score_risk(text): ...
# def get_embedding(text): ...
# def calculate_anomaly(emb): ...

def split_text(text):
    """Split a document into clauses on numbered markers and blank lines.

    The text is split wherever a line starts with a marker such as ``1.`` or
    ``2)`` or where a blank line occurs. Because the pattern uses capturing
    groups, ``re.split`` also returns the markers, the blank-line separators
    and ``None`` placeholders; these are filtered here.

    A marker is prefixed onto the text piece that immediately follows it.
    Pieces of five words or fewer are dropped, and their pending marker is
    dropped with them so it cannot be attached to a later, unrelated
    paragraph.

    Args:
        text: Raw document text.

    Returns:
        List of clause strings, each optionally prefixed with its marker.
    """
    pieces = re.split(r'\n\s*([0-9]+[\.\)]\s+)|(\n\s*\n+)', text)
    clean = []
    marker = ""
    for piece in pieces:
        if not piece:
            # None (unmatched group) or empty leading/trailing fragment.
            continue
        piece = piece.strip()
        if not piece:
            # Captured blank-line separator; carries no content.
            continue
        if re.match(r'^[0-9]+[\.\)]$', piece):
            marker = piece
            continue
        if len(piece.split()) > 5:
            clean.append(f"{marker} {piece}" if marker else piece)
        # The marker belonged to this piece whether it was kept or dropped.
        marker = ""
    return clean

# def gemini_analysis(text, c_type, risk): ...

# --- D. Run Manual Test ---
# Smoke test: split a small hand-written document and print the predicted clause
# type for each piece.
print("\n" + "="*40)
print("     CLAUSE PREDICTION ONLY MODE    ")
print("="*40)

# Two plausible clauses plus one deliberately nonsensical one.
input_text = """
1. The Tenant shall pay a monthly rent of Rs. 25,000 on or before the 5th of every month.

2. This agreement is subject to the jurisdiction of the courts in Mumbai.

3. The grand symphony of the cosmos echoes through the cheese sandwich.
"""

print(f"Analyzing Input...")
clauses = split_text(input_text)

for i, c in enumerate(clauses):
    try:
        # Only clause classification runs here; risk, embedding and anomaly
        # scoring are not part of this cell.
        ctype = classify_clause(c)

        print(f"\n[CLAUSE {i+1}]")
        # Only the first 50 characters are echoed to keep the output short.
        print(f"Text: \"{c[:50]}...\"")
        print(f"Predicted Type: **{ctype}**")

    except Exception as e:
        # A failure on one clause is reported and the loop continues with the next.
        print(f"Error processing clause {i+1}: {e}")

⚠️ Skipping Gemini Configuration.
Main Acceleration: cuda


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

✅ Clause Model Loaded.

     CLAUSE PREDICTION ONLY MODE    
Analyzing Input...

[CLAUSE 1]
Text: "1. The Tenant shall pay a monthly rent of Rs. 25,0..."
Predicted Type: **Minimum Commitment**

[CLAUSE 2]
Text: "2. This agreement is subject to the jurisdiction o..."
Predicted Type: **Governing Law**

[CLAUSE 3]
Text: "3. The grand symphony of the cosmos echoes through..."
Predicted Type: **Exclusivity**


In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch
import numpy as np

# --- Setup for Clause Risk Scoring ---

# Directory the risk archive was extracted into.
INFER_MODEL_PATH = "/content/models2/risk_models"

# --- REQUIRED DEFINITIONS ---
# Per-category multiplier applied to the model's probability in score_clause.
# SAMPLE_CATEGORIES is the key order of this dict and is indexed positionally
# against the model output.
# NOTE(review): assumes output index i of the risk model corresponds to
# SAMPLE_CATEGORIES[i] — confirm against the model's label mapping.
RISK_WEIGHTS = {
    "Uncapped Liability": 0.9,
    "Cap on Liability": 0.4,
    "Indemnity": 0.8,
    "Termination for Convenience": 0.5,
    "Confidentiality": 0.6,
    "Governing Law": 0.2,
    "Effective Date": 0.1
}
SAMPLE_CATEGORIES = list(RISK_WEIGHTS.keys())
# ------------------------------

# local_files_only=True: load strictly from the extracted directory, never the Hub.
infer_tokenizer = AutoTokenizer.from_pretrained(
    INFER_MODEL_PATH,
    local_files_only=True
)

# The model is not moved to a device in this cell; score_clause feeds it
# tokenizer output that is likewise not moved.
infer_model = AutoModelForSequenceClassification.from_pretrained(
    INFER_MODEL_PATH,
    local_files_only=True
)

infer_model.eval()

print("Model and tokenizer loaded for inference.")

# --- Functions (Re-included for full functionality) ---

def score_clause(text):
    """Score one clause against every risk category.

    The clause is tokenized (padded/truncated to 256 tokens), run through the
    risk model, and each logit is squashed independently with a sigmoid.
    For category ``i`` the risk score is ``probability[i] * RISK_WEIGHTS[cat]``.

    Bands: score >= 0.7 -> "High Risk", >= 0.4 -> "Moderate Risk",
    otherwise "Low Risk".

    Args:
        text: Clause text.

    Returns:
        A list with one dict per entry of ``SAMPLE_CATEGORIES`` (same order),
        holding ``category``, ``probability``, ``weight``, ``risk_score`` and
        ``risk_band``.
    """
    encoded = infer_tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding="max_length",
        max_length=256,
    )

    with torch.no_grad():
        output = infer_model(**encoded)

    # Independent per-label probabilities (multi-label head), first batch row.
    probabilities = torch.sigmoid(output.logits).cpu().numpy()[0]

    def _band(score):
        # Map a weighted score onto its textual band.
        if score >= 0.7:
            return "High Risk"
        if score >= 0.4:
            return "Moderate Risk"
        return "Low Risk"

    results = []
    # assumes model output index idx lines up with SAMPLE_CATEGORIES[idx] — TODO confirm
    for idx, category in enumerate(SAMPLE_CATEGORIES):
        probability = float(probabilities[idx])
        weight = RISK_WEIGHTS.get(category, 0.0)
        score = probability * weight
        results.append({
            "category": category,
            "probability": probability,
            "weight": weight,
            "risk_score": score,
            "risk_band": _band(score),
        })

    return results

def score_document(clauses):
    """Aggregate per-clause risk results into document-level metrics.

    For each clause the category with the highest weighted risk score is
    recorded; the document metrics are then derived from those maxima.

    Args:
        clauses: Iterable of clause strings.

    Returns:
        Dict with:
          * ``contract_risk_score``: plain mean of each clause's top risk score
            (0.0 when there are no clauses).
          * ``dispute_likelihood``: ``1 - prod(1 - p)`` over each clause's
            highest category probability (0.0 when there are no clauses).
          * ``structural_completeness``: fraction of categories that had a
            probability above 0.5 in at least one clause.
          * ``compliance_score``: ``1 - contract_risk_score``.
          * ``composite_score``: fixed-weight blend of the above.
          * ``clause_details``: per-clause top category plus the full breakdown.
          * ``category_hits``: per-category count of clauses above 0.5.
    """
    clause_details = []
    top_scores = []
    top_probs = []
    category_hits = dict.fromkeys(SAMPLE_CATEGORIES, 0)

    for clause_text in clauses:
        scored = score_clause(clause_text)

        # Category carrying the largest weighted score for this clause.
        worst = max(scored, key=lambda entry: entry["risk_score"])
        clause_details.append({
            "text": clause_text,
            "max_risk_category": worst['category'],
            "max_risk_score": worst['risk_score'],
            "max_risk_band": worst['risk_band'],
            "full_results": scored,
        })
        top_scores.append(worst['risk_score'])

        # Highest raw (unweighted) probability feeds the dispute-likelihood term.
        top_probs.append(max(entry["probability"] for entry in scored))

        # Coverage: a category counts as present when its probability exceeds 0.5.
        for entry in scored:
            if entry["probability"] > 0.5:
                category_hits[entry["category"]] += 1

    # ---------- Document-level metrics ----------
    if top_scores:
        contract_risk_score = float(np.mean(top_scores))
        # Probabilistic OR across clauses.
        dispute_likelihood = float(1 - np.prod(1 - np.array(top_probs)))
    else:
        contract_risk_score = 0.0
        dispute_likelihood = 0.0

    categories_present = sum(1 for hits in category_hits.values() if hits > 0)
    structural_completeness = categories_present / len(SAMPLE_CATEGORIES)

    compliance_score = float(1 - contract_risk_score)

    # (1 - compliance_score) equals contract_risk_score, so the risk term
    # effectively carries a combined weight of 0.60.
    composite = (
        0.40 * contract_risk_score +
        0.30 * dispute_likelihood +
        0.20 * (1 - compliance_score) +
        0.10 * (1 - structural_completeness)
    )

    return {
        "contract_risk_score": contract_risk_score,
        "dispute_likelihood": dispute_likelihood,
        "structural_completeness": structural_completeness,
        "compliance_score": compliance_score,
        "composite_score": composite,
        "clause_details": clause_details,
        "category_hits": category_hits,
    }

# --- Execution Block ---

test_clauses = [
    "This Agreement shall be governed by the laws of the State of New York.",
    "The liability of the provider shall be unlimited in the event of breach.",
    "This contract may be terminated by either party without cause by providing 30 days written notice.",
]

# Run the full document scoring
result = score_document(test_clauses)

print("\n" + "="*50)
print("🎯 DOCUMENT-LEVEL SUMMARY METRICS (Your First Request)")
print("="*50)

# Print the requested document-level summary metrics
summary_metrics = {
    'contract_risk_score': result['contract_risk_score'],
    'dispute_likelihood': result['dispute_likelihood'],
    'structural_completeness': result['structural_completeness'],
    'compliance_score': result['compliance_score'],
    'composite_score': result['composite_score'],
}
import pprint
pprint.pprint(summary_metrics)

print("\n" + "="*50)
print("📄 CLAUSE-LEVEL MAX RISK (Your Second Request)")
print("="*50)

# Print the requested clause-level risk score and band
for i, detail in enumerate(result['clause_details']):
    print(f"Clause {i+1}: \"{detail['text'][:50]}...\"")
    print(f"   Max Risk Category: {detail['max_risk_category']}")
    pprint.pprint({
        'risk_score': detail['max_risk_score'],
        'risk_band': detail['max_risk_band']
    })
    print("-" * 20)

Model and tokenizer loaded for inference.

🎯 DOCUMENT-LEVEL SUMMARY METRICS (Your First Request)
{'compliance_score': 0.6568963492910067,
 'composite_score': 0.562352660873214,
 'contract_risk_score': 0.3431036507089933,
 'dispute_likelihood': 0.9978253776832027,
 'structural_completeness': 0.42857142857142855}

📄 CLAUSE-LEVEL MAX RISK (Your Second Request)
Clause 1: "This Agreement shall be governed by the laws of th..."
   Max Risk Category: Governing Law
{'risk_band': 'Low Risk', 'risk_score': 0.19041726589202881}
--------------------
Clause 2: "The liability of the provider shall be unlimited i..."
   Max Risk Category: Uncapped Liability
{'risk_band': 'Moderate Risk', 'risk_score': 0.6709131717681885}
--------------------
Clause 3: "This contract may be terminated by either party wi..."
   Max Risk Category: Uncapped Liability
{'risk_band': 'Low Risk', 'risk_score': 0.16798051446676254}
--------------------


In [None]:
import torch
import numpy as np
import re
from transformers import AutoTokenizer, AutoModel
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity
from tqdm.auto import tqdm # Used for progress bar in Colab

# --- Configuration & Model Loading for Embeddings ---

MODEL_NAME = 'nlpaueb/legal-bert-base-uncased'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f'Device: {device}')

# Load Tokenizer and Base Embedding Model (required for anomaly)
# NOTE: this rebinds the global name `tokenizer`, which the clause-prediction
# cell also assigns (from the same checkpoint string).
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# AutoModel gives the bare encoder (no classification head); its
# last_hidden_state is mean-pooled into clause embeddings further down.
model = AutoModel.from_pretrained(MODEL_NAME).to(device).eval()
print(f'LegalBERT model loaded from {MODEL_NAME} for embeddings.')

# --- Functions (from Complete_Legal_Contract_Analysis_Colab.ipynb) ---

def segment_into_clauses(text):
    """Break a document into clause-like fragments.

    Splits on blank lines and on any whitespace that follows a period or a
    semicolon, then keeps only fragments longer than 30 characters after
    trimming. Short fragments (for example bare numbering such as "1.") are
    discarded by that length filter.

    Args:
        text: Raw document text.

    Returns:
        List of trimmed fragments, in document order.
    """
    pattern = r'\n\s*\n|(?<=[.;])\s+|(?<=\.)\s{2,}'
    kept = []
    for fragment in re.split(pattern, text):
        fragment = fragment.strip()
        if len(fragment) > 30:
            kept.append(fragment)
    return kept

def get_paragraph_embeddings(texts, batch_size=8):
    """Compute one mean-pooled encoder embedding per input text.

    Texts are processed in batches of ``batch_size``. Each batch is padded to
    its longest member and truncated to 512 tokens. For every row the last
    hidden states of the first ``n`` positions are averaged, where ``n`` is
    the sum of that row's attention mask.

    Args:
        texts: List of strings to embed.
        batch_size: Number of texts per forward pass.

    Returns:
        ``np.ndarray`` built from the pooled vectors, one row per text.
    """
    pooled = []
    for start in tqdm(range(0, len(texts), batch_size), desc="Embedding Clauses"):
        chunk = texts[start:start + batch_size]
        enc = tokenizer(chunk, padding=True, truncation=True, return_tensors='pt', max_length=512).to(device)
        with torch.no_grad():
            hidden = model(**enc).last_hidden_state.cpu().numpy()
        attn = enc.attention_mask.cpu().numpy()
        # Slicing to the mask sum assumes padding sits at the end of each row —
        # TODO confirm the tokenizer's padding side.
        for states, mask in zip(hidden, attn):
            n_tokens = int(mask.sum())
            pooled.append(states[:n_tokens].mean(axis=0))
    return np.array(pooled)

def build_baseline_embeddings_from_texts(texts):
    """Embed the reference texts; a thin alias for ``get_paragraph_embeddings``.

    Args:
        texts: List of baseline strings.

    Returns:
        Whatever ``get_paragraph_embeddings(texts)`` returns, using its
        default batch size.
    """
    baseline = get_paragraph_embeddings(texts)
    return baseline

def anomaly_ensemble(par_embs, baseline_embs, contamination=0.05):
    """Combine three outlier signals into one anomaly score per clause.

    Components (each min-max scaled to 0-1 across the clauses passed in, so
    the scores are relative to this document, not absolute):
      1. Semantic deviation: ``1 - max cosine similarity`` to any baseline
         embedding.
      2. Isolation Forest score, fitted on ``par_embs`` itself.
      3. Local Outlier Factor, fitted on ``par_embs`` itself.

    Final score: ``0.5 * semantic + 0.25 * iforest + 0.25 * lof``.

    Args:
        par_embs: 2-D array of clause embeddings, one row per clause.
        baseline_embs: 2-D array of reference embeddings.
        contamination: Expected outlier fraction passed to both estimators.

    Returns:
        1-D ``np.ndarray`` of combined scores, one per row of ``par_embs``.
        With fewer than two rows an all-zero array of matching length is
        returned: LOF needs at least one neighbour, and min-max scaling a
        single row would yield zeros for every component anyway.
    """
    n_samples = par_embs.shape[0]
    if n_samples < 2:
        return np.zeros(n_samples)

    # 1. Semantic deviation from the closest baseline text.
    sims = cosine_similarity(par_embs, baseline_embs)
    sem_dev = 1.0 - sims.max(axis=1)

    # 2. Isolation Forest; decision_function is higher for inliers, so negate.
    iso = IsolationForest(n_estimators=200, contamination=contamination, random_state=42)
    iso.fit(par_embs)
    iso_score = -iso.decision_function(par_embs)

    # 3. Local Outlier Factor. A point cannot be its own neighbour, so at most
    #    n_samples - 1 neighbours exist; cap at 20 for larger documents.
    n_neighbors = max(1, min(20, n_samples - 1))
    lof = LocalOutlierFactor(n_neighbors=n_neighbors, contamination=contamination)
    lof.fit(par_embs)
    lof_score = -lof.negative_outlier_factor_

    # Scale each component to 0-1 column-wise, then blend.
    comps = np.vstack([sem_dev, iso_score, lof_score]).T
    comps_scaled = MinMaxScaler().fit_transform(comps)

    # Weighted combination: 0.5*Sem + 0.25*ISO + 0.25*LOF
    combined = 0.5 * comps_scaled[:, 0] + 0.25 * comps_scaled[:, 1] + 0.25 * comps_scaled[:, 2]

    return combined

# --- Baseline Setup (from notebook cell 17) ---

# Reference clauses that define "normal" language for the semantic-deviation
# component: a clause's deviation is 1 minus its best cosine similarity to any
# of these three sentences.
baseline_texts = [
    'The Parties shall keep confidential information in strict confidence and not disclose to third parties.',
    'Each party will indemnify the other for losses arising from breach of obligations.',
    'This agreement is governed by the laws of India and disputes are resolved by arbitration.'
]
baseline_embs = build_baseline_embeddings_from_texts(baseline_texts)
print(f"Baseline embeddings computed: {baseline_embs.shape}")


# --- Test Document Analysis (from notebook cell 20) ---

new_document_text = (
    "This Confidentiality Agreement (the 'Agreement') is made and entered into on this 1st day of January, 2024, "
    "by and between Party A, a corporation organized under the laws of Delaware, having its principal place of business at Address A, "
    "and Party B, a corporation organized under the laws of New York, having its principal place of business at Address B. "
    "WHEREAS, Party A and Party B are considering a potential business transaction (the 'Transaction'), "
    "and in connection therewith, Party A may disclose to Party B certain confidential and proprietary information; "
    "NOW, THEREFORE, in consideration of the mutual covenants and agreements contained herein, "
    "the parties agree as follows: 1. Definition of Confidential Information. 'Confidential Information' shall mean "
    "all non-public information disclosed by Party A to Party B, whether orally or in writing, "
    "that is designated as confidential or that, by its nature, would reasonably be understood to be confidential. "
    "2. Non-Disclosure. Party B agrees to keep confidential all Confidential Information and not to disclose it to any third party "
    "without the prior written consent of Party A. 3. Use of Confidential Information. Party B agrees to use the Confidential Information "
    "solely for the purpose of evaluating the Transaction. 4. Return of Confidential Information. "
    "Upon termination of discussions or upon written request by Party A, Party B shall promptly return or destroy all Confidential Information. "
    "5. Governing Law. This Agreement shall be governed by and construed in accordance with the laws of the State of Delaware, "
    "without regard to its conflict of laws principles. 6. Entire Agreement. This Agreement constitutes the entire agreement "
    "between the parties concerning the subject matter hereof and supersedes all prior agreements, understandings, negotiations and discussions, "
    "whether oral or written, of the parties. Notwithstanding any other provision herein, Party B shall not be liable for incidental damages."
)

print("\n--- Processing New Document for Anomaly Scores ---")

# 1. Segment into clauses
new_clauses = segment_into_clauses(new_document_text)
print(f"Segmented into {len(new_clauses)} clauses.")

# 2. Compute embeddings
new_par_embs = get_paragraph_embeddings(new_clauses)

# 3. Compute anomaly scores
# Scores are min-max scaled across the clauses of this document, so they rank
# clauses relative to each other.
new_anom_scores = anomaly_ensemble(new_par_embs, baseline_embs)

print("\n✅ Clause Anomaly Scores:")
for i, (clause, score) in enumerate(zip(new_clauses, new_anom_scores)):
    # Truncate the clause text for cleaner output
    print(f"Clause {i+1} (Anomaly Score: {score:.4f}): \"{clause[:60]}...\"")

# Full score vector as a plain Python list (one value per clause).
print("\nAnomaly Score Array:")
print(new_anom_scores.tolist())

Device: cuda


pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

LegalBERT model loaded from nlpaueb/legal-bert-base-uncased for embeddings.


Embedding Clauses:   0%|          | 0/1 [00:00<?, ?it/s]

Baseline embeddings computed: (3, 768)

--- Processing New Document for Anomaly Scores ---
Segmented into 13 clauses.


Embedding Clauses:   0%|          | 0/2 [00:00<?, ?it/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]


✅ Clause Anomaly Scores:
Clause 1 (Anomaly Score: 0.4503): "This Confidentiality Agreement (the 'Agreement') is made and..."
Clause 2 (Anomaly Score: 0.4164): "WHEREAS, Party A and Party B are considering a potential bus..."
Clause 3 (Anomaly Score: 0.7419): "NOW, THEREFORE, in consideration of the mutual covenants and..."
Clause 4 (Anomaly Score: 0.7134): "Definition of Confidential Information...."
Clause 5 (Anomaly Score: 0.3670): "'Confidential Information' shall mean all non-public informa..."
Clause 6 (Anomaly Score: 0.1718): "Party B agrees to keep confidential all Confidential Informa..."
Clause 7 (Anomaly Score: 0.4702): "Use of Confidential Information...."
Clause 8 (Anomaly Score: 0.3219): "Party B agrees to use the Confidential Information solely fo..."
Clause 9 (Anomaly Score: 0.5808): "Return of Confidential Information...."
Clause 10 (Anomaly Score: 0.2958): "Upon termination of discussions or upon written request by P..."
Clause 11 (Anomaly Score: 0.4474): "This Agreem

In [None]:
import torch, joblib, json, numpy as np, re, pprint
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoModel
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity
from tqdm.auto import tqdm # Used for progress visualization

# --- A. Configuration & Device Setup ---
gpu_device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Main Acceleration: {gpu_device}")
model_checkpoint = "nlpaueb/legal-bert-base-uncased" # Base model for all tasks

# --- B. Load Models and Related Data ---

# 1. Clause Classification Model
# Bind every name up front so later code can reference them even when loading
# fails part-way. Otherwise a failure here leaves `clause_tokenizer` undefined
# and surfaces later as an unrelated NameError.
clause_tokenizer = None
clause_id2label = {}
model_clause = None
try:
    clause_tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
    # JSON keys are strings; cast to int to match the argmax index.
    with open('/content/models2/clause_models/clause_labels.json', 'r') as f:
        clause_id2label = {int(k): v for k, v in json.load(f)['id2label'].items()}
    model_clause = AutoModelForSequenceClassification.from_pretrained("/content/models2/clause_models/clause_model").to(gpu_device).eval()
    print("✅ Clause Model Loaded.")
except Exception as e:
    print(f"⚠️ Could not load Clause Model: {e}. Skipping clause classification.")
    # classify_clause checks for None and returns "N/A".
    model_clause = None

# 2. Risk Scoring Model
# On any failure `infer_model` is set to None, which score_risk checks before
# touching the tokenizer, weights or categories defined inside this try block.
try:
    INFER_MODEL_PATH = "/content/models2/risk_models"
    # The risk model uses its own tokenizer, loaded from the extracted directory
    # (local_files_only=True prevents any Hub lookup).
    risk_tokenizer = AutoTokenizer.from_pretrained(INFER_MODEL_PATH, local_files_only=True)
    infer_model = AutoModelForSequenceClassification.from_pretrained(INFER_MODEL_PATH, local_files_only=True).to(gpu_device).eval()

    # Per-category multiplier applied to the model probability; SAMPLE_CATEGORIES
    # is this dict's key order and is indexed positionally against model output.
    # NOTE(review): assumes output index i corresponds to SAMPLE_CATEGORIES[i] — confirm.
    RISK_WEIGHTS = {
        "Uncapped Liability": 0.9, "Cap on Liability": 0.4, "Indemnity": 0.8,
        "Termination for Convenience": 0.5, "Confidentiality": 0.6,
        "Governing Law": 0.2, "Effective Date": 0.1
    }
    SAMPLE_CATEGORIES = list(RISK_WEIGHTS.keys())
    print("✅ Risk Model Loaded.")
except Exception as e:
    print(f"⚠️ Could not load Risk Model: {e}. Skipping risk scoring.")
    infer_model = None

# 3. Anomaly Detection Model (Embeddings + Baseline)
# Loads the bare encoder and embeds three reference sentences. Any exception in
# this block (model load, helper definition, baseline embedding) disables
# anomaly scoring by setting both globals to None.
try:
    model_embedding = AutoModel.from_pretrained(model_checkpoint).to(gpu_device).eval()

    # Define and compute baseline embeddings (Replicating notebook steps)
    baseline_texts = [
        'The Parties shall keep confidential information in strict confidence and not disclose to third parties.',
        'Each party will indemnify the other for losses arising from breach of obligations.',
        'This agreement is governed by the laws of India and disputes are resolved by arbitration.'
    ]

    # We define get_paragraph_embeddings here for immediate use and later reference
    # The default arguments capture clause_tokenizer / model_embedding at definition time.
    def get_paragraph_embeddings(texts, batch_size=8, tokenizer=clause_tokenizer, model=model_embedding):
        """Return one mean-pooled last-hidden-state vector per text, as an np.ndarray."""
        all_mean = []
        # The progress-bar label is fixed, whatever texts are being embedded.
        for i in tqdm(range(0, len(texts), batch_size), desc="Building Baseline Embeddings"):
            batch = texts[i:i+batch_size]
            enc = tokenizer(batch, padding=True, truncation=True, return_tensors='pt', max_length=512).to(gpu_device)
            with torch.no_grad():
                out = model(**enc)
                last = out.last_hidden_state.cpu().numpy()
            masks = enc.attention_mask.cpu().numpy()
            for j in range(len(batch)):
                seq = last[j]
                mask = masks[j]
                # Average only the first `length` positions, where length is the
                # attention-mask sum (assumes padding is at the end — TODO confirm).
                length = int(mask.sum())
                mean_pool = seq[:length].mean(axis=0)
                all_mean.append(mean_pool)
        return np.array(all_mean)

    baseline_embs = get_paragraph_embeddings(baseline_texts)
    print("✅ Anomaly Embeddings/Baseline Ready.")
except Exception as e:
    print(f"⚠️ Could not load Embedding Model: {e}. Skipping anomaly scoring.")
    model_embedding = None
    baseline_embs = None

# --- C. Functions (Combined and Enhanced) ---

def split_text(text):
    """Split a document into clauses on numbered markers and blank lines.

    The text is split wherever a line starts with a marker such as ``1.`` or
    ``2)`` or where a blank line occurs. Because the pattern uses capturing
    groups, ``re.split`` also returns the markers, the blank-line separators
    and ``None`` placeholders; these are filtered here.

    A marker is prefixed onto the text piece that immediately follows it.
    Pieces of five words or fewer are dropped, and their pending marker is
    dropped with them so it cannot be attached to a later, unrelated
    paragraph.

    Args:
        text: Raw document text.

    Returns:
        List of clause strings, each optionally prefixed with its marker.
    """
    pieces = re.split(r'\n\s*([0-9]+[\.\)]\s+)|(\n\s*\n+)', text)
    clean = []
    marker = ""
    for piece in pieces:
        if not piece:
            # None (unmatched group) or empty leading/trailing fragment.
            continue
        piece = piece.strip()
        if not piece:
            # Captured blank-line separator; carries no content.
            continue
        if re.match(r'^[0-9]+[\.\)]$', piece):
            marker = piece
            continue
        if len(piece.split()) > 5:
            clean.append(f"{marker} {piece}" if marker else piece)
        # The marker belonged to this piece whether it was kept or dropped.
        marker = ""
    return clean

def classify_clause(text):
    """Predict the clause type for one clause.

    Returns "N/A" when the clause model failed to load (``model_clause`` is
    None). Otherwise the text is tokenized (truncated to 256 tokens), scored
    without gradient tracking, and the index of the largest logit is mapped
    through ``clause_id2label``; unmapped indices yield "Unknown".

    Args:
        text: Clause text to classify.

    Returns:
        Label string, "Unknown", or "N/A".
    """
    if model_clause is None:
        return "N/A"
    encoded = clause_tokenizer(text, return_tensors="pt", truncation=True, max_length=256)
    encoded = encoded.to(gpu_device)
    with torch.no_grad():
        output = model_clause(**encoded)
    best_idx = torch.argmax(output.logits, dim=1).item()
    if best_idx in clause_id2label:
        return clause_id2label[best_idx]
    return "Unknown"

def score_risk(text):
    """Return the highest weighted risk score for a clause and its band.

    Each model logit is passed through a sigmoid and multiplied by the weight
    of the category at the same position in ``SAMPLE_CATEGORIES``. The largest
    product is reported, banded as "High Risk" (>= 0.7), "Moderate Risk"
    (>= 0.4) or "Low Risk".

    Args:
        text: Clause text.

    Returns:
        ``{"risk_score": float, "risk_band": str}``. When the risk model is
        unavailable both values are the string "N/A". The keys are the same in
        both cases, so callers can always read ``result['risk_score']``.
    """
    if infer_model is None:
        # Same keys as the normal path; callers index 'risk_score'/'risk_band'.
        return {"risk_score": "N/A", "risk_band": "N/A"}

    inputs = risk_tokenizer(text, return_tensors="pt", truncation=True, padding="max_length", max_length=256).to(gpu_device)
    with torch.no_grad():
        logits = infer_model(**inputs).logits

    # Independent per-label probabilities for the single input row.
    probs = torch.sigmoid(logits).cpu().numpy()[0]

    # assumes model output index i lines up with SAMPLE_CATEGORIES[i] — TODO confirm
    best_score = 0.0
    for i, cat in enumerate(SAMPLE_CATEGORIES):
        risk_score = float(probs[i]) * RISK_WEIGHTS.get(cat, 0.0)
        if risk_score > best_score:
            best_score = risk_score

    if best_score >= 0.7:
        best_band = "High Risk"
    elif best_score >= 0.4:
        best_band = "Moderate Risk"
    else:
        best_band = "Low Risk"

    return {"risk_score": best_score, "risk_band": best_band}

def calculate_anomaly(text, baseline_embs, contamination=0.05):
    """Score how far a single clause sits from the baseline texts.

    Only the semantic-deviation signal is used here: the clause is embedded
    by averaging the encoder's last hidden state over all token positions,
    compared to every baseline embedding by cosine similarity, and
    ``1 - best similarity`` is doubled and capped at 1.0. Isolation Forest
    and LOF are not run here.

    Args:
        text: Clause text.
        baseline_embs: 2-D array of baseline embeddings, or None.
        contamination: Accepted for signature compatibility; not used.

    Returns:
        A float no greater than 1.0, or the string "N/A" when the embedding
        model or the baseline embeddings are unavailable.
    """
    if model_embedding is None or baseline_embs is None:
        return "N/A"

    # Embed the single clause (max 256 tokens).
    encoded = clause_tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=256).to(gpu_device)
    with torch.no_grad():
        hidden = model_embedding(**encoded).last_hidden_state
    clause_emb = hidden.mean(dim=1).cpu().numpy()

    # Distance to the closest baseline sentence.
    closest = cosine_similarity(clause_emb, baseline_embs).max(axis=1)
    deviation = float((1.0 - closest)[0])

    # Double the deviation and cap it at 1.0.
    return min(deviation * 2, 1.0)

# --- D. Document-Level Aggregation (Modified from Risk Scoring cell) ---

def score_document_summary(clause_risk_scores):
    """Roll clause-level max risk scores up into document-level metrics.

    Only entries whose ``risk_score`` is a Python ``float`` take part (this
    skips "N/A" placeholders).

    Args:
        clause_risk_scores: List of dicts each carrying a ``risk_score`` key.

    Returns:
        Dict with ``contract_risk_score`` (mean of the scores),
        ``dispute_likelihood`` (``1 - prod(1 - p)`` using a placeholder p of
        0.8 for a positive score and 0.2 otherwise),
        ``structural_completeness`` (fixed placeholder 0.5),
        ``compliance_score`` (``1 - contract_risk_score``) and
        ``composite_score`` (fixed-weight blend). With no usable scores a
        fixed all-clear dict is returned.
    """
    usable = [entry for entry in clause_risk_scores if isinstance(entry['risk_score'], float)]

    if not usable:
        return {'contract_risk_score': 0.0, 'dispute_likelihood': 0.0, 'structural_completeness': 0.0, 'compliance_score': 1.0, 'composite_score': 0.0}

    scores = [entry['risk_score'] for entry in usable]
    # Placeholder probabilities: the summary has no per-label probabilities.
    placeholder_probs = [0.8 if value > 0.0 else 0.2 for value in scores]

    contract_risk_score = float(np.mean(scores))

    # Probabilistic OR over the placeholder probabilities.
    dispute_likelihood = float(1 - np.prod(1 - np.array(placeholder_probs)))

    # Coverage is not tracked in this summary; a constant stands in for it.
    structural_completeness = 0.5

    compliance_score = float(1.0 - contract_risk_score)

    composite = (
        0.40 * contract_risk_score +
        0.30 * dispute_likelihood +
        0.20 * (1 - compliance_score) +
        0.10 * (1 - structural_completeness)
    )

    summary = {}
    summary["contract_risk_score"] = contract_risk_score
    summary["dispute_likelihood"] = dispute_likelihood
    summary["structural_completeness"] = structural_completeness
    summary["compliance_score"] = compliance_score
    summary["composite_score"] = composite
    return summary


# --- E. Run Full Pipeline ---
# Runs clause classification, risk scoring and anomaly scoring once per clause,
# stores the results, then prints a document summary and a per-clause risk list.

input_text = """
1. The Tenant shall pay a monthly rent of Rs. 25,000 on or before the 5th of every month.

2. The liability of the provider shall be unlimited in the event of breach.

3. This contract may be terminated by either party without cause by providing 30 days written notice.
"""

print("\n" + "="*70)
print("                    FULL LEGAL ANALYSIS PIPELINE                    ")
print("="*70)


def _format_score(value):
    """Format a numeric score to 4 decimals; pass placeholders such as "N/A" through."""
    if isinstance(value, (int, float)):
        return f"{value:.4f}"
    return str(value)


clauses = split_text(input_text)
full_results = []        # one record per clause, reused by the report below
clause_risk_scores = []  # For document summary calculation

for i, c in enumerate(clauses):
    print(f"\n[CLAUSE {i+1}]")
    print(f"Text: \"{c[:70]}...\"")

    # 1. Clause Prediction
    ctype = classify_clause(c)

    # 2. Risk Score
    r_score_detail = score_risk(c)
    # .get() keeps this loop working whichever keys the fallback dict carries.
    risk_score = r_score_detail.get('risk_score', 'N/A')
    risk_band = r_score_detail.get('risk_band', 'N/A')

    # 3. Anomaly Score (may be the string "N/A" when the embedding model is missing)
    anom_score = calculate_anomaly(c, baseline_embs)

    # Collect risk scores for document-level aggregation
    if isinstance(risk_score, float):
        clause_risk_scores.append(r_score_detail)

    full_results.append({
        "text": c,
        "clause_type": ctype,
        "risk_score": risk_score,
        "risk_band": risk_band,
        "anomaly_score": anom_score,
    })

    print(f" -> Predicted Type: **{ctype}**")
    print(f" -> Max Risk Score: **{_format_score(risk_score)}** ({risk_band})")
    print(f" -> Anomaly Score:  **{_format_score(anom_score)}**")

# --- F. Display Final Results ---

doc_summary = score_document_summary(clause_risk_scores)

print("\n" + "="*70)
print("🎯 DOCUMENT-LEVEL SUMMARY METRICS (Combined Output 1)")
print("="*70)
pprint.pprint(doc_summary)

print("\n" + "="*70)
print("📄 CLAUSE-LEVEL MAX RISK (Combined Output 2)")
print("="*70)
# Reuse the stored per-clause results; no second pass through the risk model.
for i, detail in enumerate(full_results):
    if detail['risk_score'] != 'N/A':
        print(f"Clause {i+1}: \"{detail['text'][:50]}...\"")
        pprint.pprint({
            'risk_score': detail['risk_score'],
            'risk_band': detail['risk_band']
        })
        print("-" * 20)

Main Acceleration: cuda
✅ Clause Model Loaded.
✅ Risk Model Loaded.


Building Baseline Embeddings:   0%|          | 0/1 [00:00<?, ?it/s]

✅ Anomaly Embeddings/Baseline Ready.

                    FULL LEGAL ANALYSIS PIPELINE                    

[CLAUSE 1]
Text: "1. The Tenant shall pay a monthly rent of Rs. 25,000 on or before the ..."
 -> Predicted Type: **Minimum Commitment**
 -> Max Risk Score: **0.1668** (Low Risk)
 -> Anomaly Score:  **0.4038**

[CLAUSE 2]
Text: "2. The liability of the provider shall be unlimited in the event of br..."
 -> Predicted Type: **Cap On Liability**
 -> Max Risk Score: **0.6950** (Moderate Risk)
 -> Anomaly Score:  **0.2102**

[CLAUSE 3]
Text: "3. This contract may be terminated by either party without cause by pr..."
 -> Predicted Type: **Termination For Convenience**
 -> Max Risk Score: **0.1625** (Low Risk)
 -> Anomaly Score:  **0.3341**

🎯 DOCUMENT-LEVEL SUMMARY METRICS (Combined Output 1)
{'compliance_score': 0.6586016282439231,
 'composite_score': 0.5524390230536461,
 'contract_risk_score': 0.34139837175607685,
 'dispute_likelihood': 0.992,
 'structural_completeness': 0.5}

📄 CLAUS

In [None]:
import torch, joblib, json, numpy as np, re, getpass
import google.generativeai as genai # NEW: Added for Gemini API
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoModel
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity
from tqdm.auto import tqdm
import pprint

# --- A. Configuration & Device Setup ---
# Use the GPU when torch can see one; every model and tensor below is moved to this device.
gpu_device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Main Acceleration: {gpu_device}")
model_checkpoint = "nlpaueb/legal-bert-base-uncased" # Base model for all tasks

# --- Gemini Configuration ---
# `gemini_model` stays None whenever configuration fails; gemini_analysis() checks
# for that and returns a placeholder instead of calling the API.
gemini_model = None
try:
    print("\nEnter Google AI API Key:")
    # getpass keeps the key out of the notebook source and the saved cell output.
    # Surrounding whitespace from a copy/paste is dropped.
    API_KEY = getpass.getpass().strip()
    # Reject an empty key here. Otherwise the cell reports success and the problem
    # only shows up later as one API error string per clause.
    if not API_KEY:
        raise ValueError("no API key was entered")
    genai.configure(api_key=API_KEY)
    gemini_model = genai.GenerativeModel('gemini-2.0-flash')
    print("✅ Gemini Configured.")
except Exception as e:
    gemini_model = None
    print(f"⚠️ Gemini configuration skipped due to error: {e}")
# ------------------------------------------------

# --- B. Load Models and Related Data ---

# 1. Clause Classification Model
# The tokenizer is also used by the embedding / anomaly code further down, so it is
# loaded exactly once, in its own guarded step. A missing label file or model
# directory then no longer triggers a second tokenizer load from inside an
# exception handler, where a failure would abort the whole cell.
try:
    clause_tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
except Exception as e:
    print(f"⚠️ Could not load tokenizer '{model_checkpoint}': {e}.")
    clause_tokenizer = None

# Safe defaults so both names exist whichever step below fails.
model_clause = None
clause_id2label = {}
try:
    if clause_tokenizer is None:
        raise RuntimeError("tokenizer is unavailable")
    with open('/content/models2/clause_models/clause_labels.json', 'r') as f:
        # JSON object keys are strings; the classifier's argmax yields ints.
        clause_id2label = {int(k): v for k, v in json.load(f)['id2label'].items()}
    model_clause = AutoModelForSequenceClassification.from_pretrained("/content/models2/clause_models/clause_model").to(gpu_device).eval()
    print("✅ Clause Model Loaded.")
except Exception as e:
    print(f"⚠️ Could not load Clause Model: {e}. Clause classification will be 'N/A'.")
    # classify_clause() returns "N/A" when model_clause is None.
    model_clause = None

# 2. Risk Scoring Model
# Loads the risk tokenizer and classifier from the unzipped model directory and
# defines the per-category weights that score_risk() multiplies with the model's
# sigmoid outputs.
try:
    INFER_MODEL_PATH = "/content/models2/risk_models"
    # local_files_only=True is passed to both loaders, so no hub download is requested.
    risk_tokenizer = AutoTokenizer.from_pretrained(INFER_MODEL_PATH, local_files_only=True)
    infer_model = AutoModelForSequenceClassification.from_pretrained(INFER_MODEL_PATH, local_files_only=True).to(gpu_device).eval()

    # Category name -> weight applied to that category's probability in score_risk().
    RISK_WEIGHTS = {
        "Uncapped Liability": 0.9, "Cap on Liability": 0.4, "Indemnity": 0.8,
        "Termination for Convenience": 0.5, "Confidentiality": 0.6,
        "Governing Law": 0.2, "Effective Date": 0.1
    }
    # NOTE(review): score_risk() pairs SAMPLE_CATEGORIES[i] with the model's i-th
    # output. Nothing here checks that this order (or the number of categories)
    # matches the label order the model was trained with -- confirm against the
    # model's label mapping.
    SAMPLE_CATEGORIES = list(RISK_WEIGHTS.keys())
    print("✅ Risk Model Loaded.")
except Exception as e:
    print(f"⚠️ Could not load Risk Model: {e}. Risk scoring will be 'N/A'.")
    # Only infer_model is reset here. score_risk() checks it first, so the other
    # names above may legitimately be undefined after a failure.
    infer_model = None

# 3. Anomaly Detection Model (Embeddings + Baseline)
try:
    # Encoder without a classification head; only its token embeddings are used.
    model_embedding = AutoModel.from_pretrained(model_checkpoint).to(gpu_device).eval()

    # Reference clauses that calculate_anomaly() compares each input clause against.
    baseline_texts = [
        'The Parties shall keep confidential information in strict confidence and not disclose to third parties.',
        'Each party will indemnify the other for losses arising from breach of obligations.',
        'This agreement is governed by the laws of India and disputes are resolved by arbitration.'
    ]

    def get_paragraph_embeddings(texts, batch_size=8, tokenizer=clause_tokenizer, model=model_embedding):
        """Embed each text as the mean of its leading token vectors.

        Texts are encoded in batches of ``batch_size`` (truncated to 512 tokens).
        For every text, the first ``k`` rows of the last hidden state are averaged,
        where ``k`` is the sum of that text's attention mask.
        Presumably the tokenizer pads on the right, so those ``k`` rows are the
        real tokens -- TODO confirm.

        Returns:
            np.ndarray built from one pooled vector per input text.
        """
        pooled = []
        batch_starts = range(0, len(texts), batch_size)
        for start in tqdm(batch_starts, desc="Building Baseline Embeddings"):
            chunk = texts[start:start + batch_size]
            encoded = tokenizer(
                chunk, padding=True, truncation=True, return_tensors='pt', max_length=512
            ).to(gpu_device)
            with torch.no_grad():
                hidden = model(**encoded).last_hidden_state.cpu().numpy()
            attention = encoded.attention_mask.cpu().numpy()
            # One (hidden-state matrix, mask) pair per text in the chunk.
            pooled.extend(
                token_vecs[:int(mask.sum())].mean(axis=0)
                for token_vecs, mask in zip(hidden, attention)
            )
        return np.array(pooled)

    baseline_embs = get_paragraph_embeddings(baseline_texts)
    print("✅ Anomaly Embeddings/Baseline Ready.")
except Exception as e:
    print(f"⚠️ Could not load Embedding Model: {e}. Skipping anomaly scoring.")
    # calculate_anomaly() returns 0.0 when either of these is None.
    model_embedding = None
    baseline_embs = None

# --- C. Functions (Combined and Enhanced) ---

def split_text(text):
    """Split contract text into clause strings.

    The text is cut at numbered-item markers that begin a line (``1.`` or ``2)``)
    and at blank lines. A marker is re-attached to the fragment that directly
    follows it. Fragments of five words or fewer are discarded.

    Args:
        text: Raw contract text. ``None`` or an empty string yields ``[]``.

    Returns:
        list[str]: Kept clauses in document order, each prefixed with its own
        numbering marker when it had one (e.g. ``"1. The Tenant shall ..."``).
    """
    if not text:
        return []

    # Both alternatives use a capture group, so re.split() also returns the
    # separators. The group that did not take part in a match comes back as None.
    pieces = re.split(r'\n\s*([0-9]+[\.\)]\s+)|(\n\s*\n+)', text)
    clean = []
    curr = ""  # numbering marker waiting for the fragment that follows it
    for p in pieces:
        if not p:
            continue
        p = p.strip()
        if not p:
            # A captured blank-line separator; it carries no text.
            continue
        if re.match(r'^[0-9]+[\.\)]$', p):
            curr = p
            continue
        if len(p.split()) > 5:
            clean.append(f"{curr} {p}" if curr else p)
        # A marker labels only the fragment right after it. Clear it even when that
        # fragment was too short to keep, so the number cannot be glued onto a
        # later, unrelated paragraph.
        curr = ""
    return clean

def classify_clause(text):
    """Predict the clause type of ``text``.

    Returns:
        str: ``"N/A"`` when no clause model is loaded; otherwise the label that
        ``clause_id2label`` maps the highest-scoring class index to, or
        ``"Unknown"`` when that index has no entry.
    """
    if model_clause is None:
        return "N/A"

    encoded = clause_tokenizer(text, return_tensors="pt", truncation=True, max_length=256)
    encoded = encoded.to(gpu_device)
    with torch.no_grad():
        class_scores = model_clause(**encoded).logits

    # One input text, so the argmax over the class dimension holds a single index.
    best_index = class_scores.argmax(dim=1).item()
    if best_index in clause_id2label:
        return clause_id2label[best_index]
    return "Unknown"

def score_risk(text):
    """Score ``text`` and report its single highest weighted category risk.

    Each category in ``SAMPLE_CATEGORIES`` is paired with the model output at the
    same index. That output's sigmoid probability is multiplied by the category's
    weight from ``RISK_WEIGHTS``, and the largest product is reported.
    NOTE(review): this assumes the model's output order matches
    ``SAMPLE_CATEGORIES`` -- confirm against the trained label mapping.

    Returns:
        dict: ``{"risk_score": float, "risk_band": str}``. The band is
        "High Risk" (score >= 0.7), "Moderate Risk" (>= 0.4) or "Low Risk".
        When no risk model is loaded the result is
        ``{"risk_score": 0.0, "risk_band": "N/A"}``.
    """
    if infer_model is None:
        return {"risk_score": 0.0, "risk_band": "N/A"}

    encoded = risk_tokenizer(
        text, return_tensors="pt", truncation=True, padding="max_length", max_length=256
    ).to(gpu_device)
    with torch.no_grad():
        raw_scores = infer_model(**encoded).logits

    # Independent per-category probabilities (sigmoid rather than softmax).
    category_probs = torch.sigmoid(raw_scores).cpu().numpy()[0]

    top_score = 0.0
    top_band = "Low Risk"
    for idx, category in enumerate(SAMPLE_CATEGORIES):
        weighted = float(category_probs[idx]) * RISK_WEIGHTS.get(category, 0.0)
        # Strict comparison: on ties the earliest category keeps the lead.
        if weighted > top_score:
            top_score = weighted
            if weighted >= 0.7:
                top_band = "High Risk"
            elif weighted >= 0.4:
                top_band = "Moderate Risk"
            else:
                top_band = "Low Risk"

    return {"risk_score": top_score, "risk_band": top_band}

def calculate_anomaly(text, baseline_embs, contamination=0.05):
    """Score how far a clause is from its closest baseline clause, in [0, 1].

    The clause is embedded as the mean of the embedding model's last hidden state
    over all token positions. The score is ``2 * (1 - best cosine similarity)``
    against ``baseline_embs``, clamped to the closed interval [0, 1].

    Args:
        text: Clause text to score.
        baseline_embs: Baseline embeddings to compare against (one row each).
            ``None`` or an empty collection disables scoring.
        contamination: Not used by this implementation; kept so existing call
            sites remain valid.

    Returns:
        float: 0.0 when the embedding model or baseline is unavailable, otherwise
        the clamped deviation score.
    """
    if model_embedding is None or baseline_embs is None or len(baseline_embs) == 0:
        return 0.0

    # Compute the embedding for the single clause
    inputs = clause_tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=256).to(gpu_device)
    with torch.no_grad():
        outputs = model_embedding(**inputs)
    par_embs = outputs.last_hidden_state.mean(dim=1).cpu().numpy()

    # Similarity to the closest baseline clause; one input clause -> row 0.
    sims = cosine_similarity(par_embs, baseline_embs)
    sem_dev = 1.0 - float(sims.max(axis=1)[0])

    # Clamp on both sides: rounding can push a cosine similarity slightly above 1,
    # which would otherwise produce a small negative score.
    return max(0.0, min(sem_dev * 2, 1.0))

def gemini_analysis(text, c_type, risk_band):
    """Ask Gemini for a one-sentence improvement suggestion for a clause.

    Args:
        text: The clause text, embedded verbatim in the prompt.
        c_type: Predicted clause type shown to the model.
        risk_band: Risk band label shown to the model.

    Returns:
        str: The stripped response text. When Gemini is not configured the
        placeholder "N/A (Gemini not configured)" is returned, and when the call
        fails the result is "Gemini API Error: <exception>". This function does
        not raise for API failures.
    """
    if not gemini_model:
        return "N/A (Gemini not configured)"

    request_text = f"""
    You are a contract analysis expert. Analyze the following contract clause:

    Clause: "{text}"
    Predicted Type: {c_type}
    Calculated Risk Band: {risk_band}

    Provide a concise, single-sentence suggestion to improve or clarify this clause, especially addressing the high risk factors if present. If the risk is low, suggest a minor clarification.
    Return ONLY the suggested sentence.
    """
    try:
        # Reading `.text` stays inside the try so a failure there is reported
        # the same way as a failed request.
        reply = gemini_model.generate_content(request_text)
        suggestion = reply.text.strip()
    except Exception as err:
        return f"Gemini API Error: {err}"
    return suggestion

def score_document_summary(clause_risk_scores):
    """Aggregate clause-level risk results into document-level metrics.

    Args:
        clause_risk_scores: Iterable of dicts with a ``'risk_score'`` key. Entries
            whose score is not a ``float`` are ignored.

    Returns:
        dict: ``contract_risk_score`` (mean clause score), ``dispute_likelihood``
        (``1 - prod(1 - p)`` with ``p`` = 0.8 for a positive score, else 0.2),
        ``structural_completeness`` (fixed placeholder 0.5), ``compliance_score``
        (``1 - contract_risk_score``) and ``composite_score`` (0.40/0.30/0.20/0.10
        weighted blend). A fixed all-default dict is returned when no usable
        score exists.
    """
    usable_scores = [
        entry['risk_score']
        for entry in clause_risk_scores
        if isinstance(entry['risk_score'], float)
    ]
    if not usable_scores:
        return {'contract_risk_score': 0.0, 'dispute_likelihood': 0.0, 'structural_completeness': 0.0, 'compliance_score': 1.0, 'composite_score': 0.0}

    # Coarse per-clause dispute probability: any positive risk counts as 0.8.
    dispute_probs = np.array([0.8 if score > 0.0 else 0.2 for score in usable_scores])

    contract_risk_score = float(np.mean(usable_scores))
    dispute_likelihood = float(1 - np.prod(1 - dispute_probs))
    structural_completeness = 0.5 # Placeholder
    compliance_score = float(1.0 - contract_risk_score)

    # Accumulate term by term, in the same order as the weighted formula.
    composite = 0.40 * contract_risk_score
    composite = composite + 0.30 * dispute_likelihood
    composite = composite + 0.20 * (1 - compliance_score)
    composite = composite + 0.10 * (1 - structural_completeness)

    summary = {}
    summary["contract_risk_score"] = contract_risk_score
    summary["dispute_likelihood"] = dispute_likelihood
    summary["structural_completeness"] = structural_completeness
    summary["compliance_score"] = compliance_score
    summary["composite_score"] = composite
    return summary


# --- D. Run Full Pipeline ---

# Sample contract used to exercise every stage of the pipeline.
input_text = """
1. The Tenant shall pay a monthly rent of Rs. 25,000 on or before the 5th of every month.

2. The liability of the provider shall be unlimited in the event of breach, including for gross negligence and willful misconduct.

3. This contract may be terminated by either party without cause by providing 30 days written notice.
"""

print("\n" + "="*70)
print("                    FULL LEGAL ANALYSIS PIPELINE                    ")
print("="*70)

clauses = split_text(input_text)
clause_risk_scores = [] # For document summary calculation

for i, c in enumerate(clauses):
    print(f"\n[CLAUSE {i+1}]")

    # 1. Clause Prediction
    ctype = classify_clause(c)

    # 2. Risk Score
    r_score_detail = score_risk(c)

    # 3. Anomaly Score
    anom_score = calculate_anomaly(c, baseline_embs)

    # 4. Gemini Improvement Suggestion
    improvement = gemini_analysis(c, ctype, r_score_detail['risk_band'])

    # Collect risk scores for document-level aggregation.
    # score_risk() reports "risk model unavailable" as risk_score 0.0 with
    # risk_band "N/A". A float check alone would let those placeholders be
    # averaged in as genuine zero-risk clauses, so the band is checked as well.
    if r_score_detail['risk_band'] != "N/A" and isinstance(r_score_detail['risk_score'], float):
        clause_risk_scores.append(r_score_detail)

    # Only the first 70 characters of the clause are shown as a preview.
    print(f"Text: \"{c[:70]}...\"")
    print(f" -> Predicted Type: **{ctype}**")
    print(f" -> Max Risk Score: **{r_score_detail['risk_score']:.4f}** ({r_score_detail['risk_band']})")
    print(f" -> Anomaly Score:  **{anom_score:.4f}**")
    print(f" -> ✨ Improvement: {improvement}")

# --- E. Display Final Results ---

# With no scored clauses, score_document_summary() returns its fixed default dict.
doc_summary = score_document_summary(clause_risk_scores)

print("\n" + "="*70)
print("🎯 DOCUMENT-LEVEL SUMMARY METRICS (Combined Output 1)")
print("="*70)
pprint.pprint(doc_summary)

Main Acceleration: cuda

Enter Google AI API Key:
··········
✅ Gemini Configured.
✅ Clause Model Loaded.
✅ Risk Model Loaded.


Building Baseline Embeddings:   0%|          | 0/1 [00:00<?, ?it/s]

✅ Anomaly Embeddings/Baseline Ready.

                    FULL LEGAL ANALYSIS PIPELINE                    

[CLAUSE 1]




Text: "1. The Tenant shall pay a monthly rent of Rs. 25,000 on or before the ..."
 -> Predicted Type: **Minimum Commitment**
 -> Max Risk Score: **0.1668** (Low Risk)
 -> Anomaly Score:  **0.4038**
 -> ✨ Improvement: Gemini API Error: 404 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash-latest:generateContent?%24alt=json%3Benum-encoding%3Dint: models/gemini-2.0-flash-latest is not found for API version v1beta, or is not supported for generateContent. Call ListModels to see the list of available models and their supported methods.

[CLAUSE 2]




Text: "2. The liability of the provider shall be unlimited in the event of br..."
 -> Predicted Type: **Cap On Liability**
 -> Max Risk Score: **0.7626** (High Risk)
 -> Anomaly Score:  **0.2406**
 -> ✨ Improvement: Gemini API Error: 404 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash-latest:generateContent?%24alt=json%3Benum-encoding%3Dint: models/gemini-2.0-flash-latest is not found for API version v1beta, or is not supported for generateContent. Call ListModels to see the list of available models and their supported methods.

[CLAUSE 3]
Text: "3. This contract may be terminated by either party without cause by pr..."
 -> Predicted Type: **Termination For Convenience**
 -> Max Risk Score: **0.1625** (Low Risk)
 -> Anomaly Score:  **0.3341**
 -> ✨ Improvement: Gemini API Error: 404 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash-latest:generateContent?%24alt=json%3Benum-encoding%3Dint: models/gemini-2.0-flash-latest is not fo

