In [2]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("nlpaueb/legal-bert-base-uncased")

def encode_pair(text_a, text_b):
    return tokenizer(text_a, text_b, truncation=True, padding="max_length", max_length=512)


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.02k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/222k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [14]:
%pip install sentence_transformers

Collecting sentence_transformers
  Downloading sentence_transformers-4.1.0-py3-none-any.whl.metadata (13 kB)
Collecting scikit-learn (from sentence_transformers)
  Downloading scikit_learn-1.6.1-cp311-cp311-win_amd64.whl.metadata (15 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn->sentence_transformers)
  Downloading threadpoolctl-3.6.0-py3-none-any.whl.metadata (13 kB)
Downloading sentence_transformers-4.1.0-py3-none-any.whl (345 kB)
Downloading scikit_learn-1.6.1-cp311-cp311-win_amd64.whl (11.1 MB)
   ---------------------------------------- 0.0/11.1 MB ? eta -:--:--
   -------- ------------------------------- 2.4/11.1 MB 12.2 MB/s eta 0:00:01
   ---------------- ----------------------- 4.5/11.1 MB 10.3 MB/s eta 0:00:01
   ----------------------- ---------------- 6.6/11.1 MB 10.1 MB/s eta 0:00:01
   ------------------------------- -------- 8.7/11.1 MB 10.1 MB/s eta 0:00:01
   -------------------------------------- - 10.7/11.1 MB 10.2 MB/s eta 0:00:01
   ----------------------

In [None]:
from pathlib import Path
import re
import numpy as np
from sentence_transformers import SentenceTransformer

# Load contracts from a directory
def load_contracts_from_txt(folder_path, max_files=10):
    contract_dir = Path(folder_path)
    contract_files = sorted(contract_dir.glob("*.txt"))[:max_files]
    return [file.read_text(encoding='utf-8', errors='ignore') for file in contract_files]

# Regex-based clause boundary detection
def detect_regex_boundaries(text):
    lines = text.splitlines()
    boundaries = []
    regex_patterns = [
        r'^\d+\.\s+.*$',                      # e.g., 1. Definitions
        r'^\d+(\.\d+)*\s+.+$',                # e.g., 2.1 Termination
        r'^(Section|Article)\s+\d+',          # Section 3
        r'^[A-Z\s]{5,}$',                     # ALL CAPS heading
        r'^WHEREAS\b',                        # WHEREAS
        r'^NOW,\s*THEREFORE\b'                # NOW, THEREFORE
    ]

    for i, line in enumerate(lines):
        for pattern in regex_patterns:
            if re.match(pattern, line.strip()):
                boundaries.append(i)
                break
    return sorted(set(boundaries))

# Semantic similarity-based clause detection
def detect_semantic_boundaries(text, model=None, threshold=0.5):
    if model is None:
        model = SentenceTransformer("all-MiniLM-L6-v2")

    sentences = [line.strip() for line in text.splitlines() if len(line.strip()) > 20]
    embeddings = model.encode(sentences)

    boundaries = []
    for i in range(len(embeddings) - 1):
        sim = cosine_similarity(embeddings[i], embeddings[i + 1])
        if sim < threshold:
            boundaries.append(i + 1)

    return boundaries, sentences

def cosine_similarity(u, v):
    return np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v))

# Merge regex + semantic boundaries
def merge_boundaries(regex_boundaries, semantic_boundaries):
    return sorted(set(regex_boundaries + semantic_boundaries))

def should_filter_clause(text):
    return (
        len(text.split()) < 6 or
        text.lower().startswith("source:") or
        re.match(r'^page\s+\d+', text.lower()) or
        "sec.gov" in text.lower()
    )

# Split text into clauses, optionally tracking clause numbers and headings
def segment_clauses(sentences, boundaries):
    clauses = []
    filtered_clauses = []
    start = 0
    current_clause = None

    for idx in boundaries:
        chunk = sentences[start:idx]
        if not chunk:
            start = idx
            continue

        text = " ".join(chunk).strip()
        if should_filter_clause(text):
            filtered_clauses.append(text)
            start = idx
            continue

        first_line = chunk[0]
        subclause_match = re.match(r'^\([a-zA-Z]\)', first_line.strip())
        heading_match = re.match(r'^(\d+(\.\d+)*\.?)\s+(.*)', first_line.strip())

        if heading_match:
            clause_number = heading_match.group(1)
            clause_title = first_line[:60] + "..."  # fallback preview title
            current_clause = {
                "clause_number": clause_number,
                "clause_title": clause_title,
                "clause_text": text
            }
            clauses.append(current_clause)

        elif subclause_match and current_clause:
            current_clause["clause_text"] += " " + text

        else:
            # Try to infer a title from the first line if no heading match is found
            fallback_line = first_line.strip()
            inferred_title = fallback_line[:60] + "..." if len(fallback_line.split()) > 3 else None

            current_clause = {
                "clause_number": None,
                "clause_title": inferred_title,
                "clause_text": text
            }
            clauses.append(current_clause)


        start = idx

    # Final chunk
    final_chunk = sentences[start:]
    if final_chunk:
        text = " ".join(final_chunk).strip()
        if should_filter_clause(text):
            filtered_clauses.append(text)
        elif re.match(r'^\([a-zA-Z]\)', final_chunk[0].strip()) and current_clause:
            current_clause["clause_text"] += " " + text
        else:
            clauses.append({
                "clause_number": None,
                "clause_title": None,
                "clause_text": text
            })

    return clauses, filtered_clauses

# Main execution
def process_contracts(folder_path, max_files=3, threshold=0.5):
    contracts = load_contracts_from_txt(folder_path, max_files)
    model = SentenceTransformer("all-MiniLM-L6-v2")
    all_clauses = []
    all_filtered = []

    for text in contracts:
        regex_bounds = detect_regex_boundaries(text)
        sem_bounds, sentences = detect_semantic_boundaries(text, model, threshold)
        merged_bounds = merge_boundaries(regex_bounds, sem_bounds)
        clauses, filtered = segment_clauses(sentences, merged_bounds)
        all_clauses.append(clauses)
        all_filtered.append(filtered)

    return all_clauses, all_filtered



In [49]:
if __name__ == "__main__":
    folder_path = "Datasets/CUAD/full_contract_txt"  # replace with your folder
    clauses, filtered = process_contracts(folder_path, max_files=3)

    for doc_idx, doc_clauses in enumerate(clauses):
        print(f"\n=== Document {doc_idx + 1} Clauses ===")
        for i, clause in enumerate(doc_clauses[:10]):  # change 10 to however many you want
            print(f"\nClause {i + 1}")
            print("Number:", clause["clause_number"])
            print("Title:", clause["clause_title"])
            print("Text:", clause["clause_text"][:300], "...")  # truncated for readability



=== Document 1 Clauses ===

Clause 1
Number: None
Title: None
Text: CO-BRANDING AND ADVERTISING AGREEMENT THIS CO-BRANDING AND ADVERTISING AGREEMENT (the "Agreement") is made as of June 21, 1999 (the "Effective Date") by and between I-ESCROW, INC., with its principal place of business at 1730 S. Amphlett Blvd., Suite 233, San Mateo, California 94402 ("i-Escrow"), an ...

Clause 2
Number: 2.1
Title: OVERVIEW.  As set forth herein, 2TheMart will promote Services to its auction users (buyers and sellers), and i-Escrow shall develop Co-Branded Site, and develop the Information Transfer Mechanism working with 2TheMart to make Services available seamlessly to Customers. Unless otherwise specified, each party shall be responsible for all development, hosting and other costs associated with the pages resident on their servers and all emails to users they send.
Text: 2.1 OVERVIEW.  As set forth herein, 2TheMart will promote Services to its auction users (buyers and sellers), and i-Escrow shall

In [46]:
segmented_contracts = process_contracts("Datasets/CUAD/full_contract_txt")


In [47]:
from pprint import pprint

# Run the pipeline
segmented_contracts = process_contracts("Datasets/CUAD/full_contract_txt")

# Print results for the first contract
for i, clause in enumerate(segmented_contracts[0]):
    print(f"\n--- Clause {i+1} ---\n{clause}\n")



--- Clause 1 ---
[{'clause_number': None, 'clause_title': None, 'clause_text': 'CO-BRANDING AND ADVERTISING AGREEMENT THIS CO-BRANDING AND ADVERTISING AGREEMENT (the "Agreement") is made as of June 21, 1999 (the "Effective Date") by and between I-ESCROW, INC., with its principal place of business at 1730 S. Amphlett Blvd., Suite 233, San Mateo, California 94402 ("i-Escrow"), and 2THEMART.COM, INC. having its principal place of business at 18301 Von Karman Avenue, 7th Floor, Irvine, California 92612 ("2TheMart"). (a) "CONTENT" means all content or information, in any medium, provided by a party to the other party for use in conjunction with the performance of its obligations hereunder, including without limitation any text, music, sound, photographs, video, graphics, data or software. Content provided by 2TheMart is referred to herein as "2TheMart Content" and Content provided by i-Escrow is referred to herein as "i-Escrow Content." (b) "CO-BRANDED SITE" means the web-site accessible t