In [115]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("nlpaueb/legal-bert-base-uncased")

def encode_pair(text_a, text_b):
    return tokenizer(text_a, text_b, truncation=True, padding="max_length", max_length=512)


In [116]:
#groudn truth implementation
import pandas as pd

# Load the master clause file
df = pd.read_csv("Datasets/CUAD/all_reshaped_clauses.csv")
df

Unnamed: 0,filename,clause_type,clause_text
0,GopageCorp_20140221_10-K_EX-10.1_8432966_EX-10...,Change of Control,"For purposes of the preceding sentence, and wi..."
1,DeltathreeInc_19991102_S-1A_EX-10.19_6227850_E...,Change of Control,The term of this Agreement shall be effective ...
2,EdietsComInc_20001030_10QSB_EX-10.4_2606646_EX...,Change of Control,"For purposes of this Agreement, ""Change in Con..."
3,MusclepharmCorp_20170208_10-KA_EX-10.38_989358...,Change of Control,Neither party shall voluntarily or by operatio...
4,TomOnlineInc_20060501_20-F_EX-4.46_749700_EX-4...,Change of Control,"Notwithstanding the foregoing, Skype or Skype ..."
...,...,...,...
8623,"INTERSECTENT,INC_05_11_2020-EX-10.1-SUPPLY AGR...",Warranty Duration,"If, upon inspecting and testing the API, INTER..."
8624,ULTRAGENYXPHARMACEUTICALINC_12_23_2013-EX-10.9...,Warranty Duration,In the event that the Product fails to conform...
8625,VERICELCORP_08_06_2019-EX-10.10-SUPPLY AGREEME...,Warranty Duration,Vericel shall be deemed to have accepted such ...
8626,"NETGEAR,INC_04_21_2003-EX-10.16-DISTRIBUTOR AG...",Warranty Duration,The warranty period for each Product is specif...


In [117]:
## Code to generate clause topic based on the CUAD topics listed ##

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report

# Optional: filter short clauses or low-signal entries
df = df[df['clause_text'].str.split().apply(len) > 5]

# Count clause type frequencies
counts = df['clause_type'].value_counts()

# Keep only clause types with at least 2 samples
valid_types = counts[counts >= 2].index
df_filtered = df[df['clause_type'].isin(valid_types)]

# Split and train
X = df_filtered['clause_text']
y = df_filtered['clause_type']
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)


# Pipeline: TF-IDF + Logistic Regression
model = Pipeline([
    ("tfidf", TfidfVectorizer(ngram_range=(1,2), max_features=5000)),
    ("clf", LogisticRegression(max_iter=1000))
])

model.fit(X_train, y_train)
y_pred = model.predict(X_test)

print(classification_report(y_test, y_pred))


                                    precision    recall  f1-score   support

        Affiliate License-Licensee       1.00      0.17      0.29        12
        Affiliate License-Licensor       0.00      0.00      0.00         5
                    Agreement Date       0.68      0.79      0.73        33
                   Anti-assignment       0.86      0.95      0.90        75
                      Audit Rights       0.93      0.95      0.94        43
                  Cap on Liability       0.70      0.89      0.78        55
                 Change of Control       0.82      0.58      0.68        24
 Competitive Restriction Exception       0.50      0.07      0.12        15
               Covenant not to Sue       0.84      0.80      0.82        20
                     Document Name       0.97      0.91      0.94        33
                    Effective Date       0.60      0.42      0.49        43
                       Exclusivity       0.42      0.50      0.46        36
           

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [130]:
def predict_clause_types(clauses, model):
    texts = [clause["clause_text"] for clause in clauses]
    probs = model.predict_proba(texts)
    preds = model.predict(texts)
    labels = model.classes_

    for i, clause in enumerate(clauses):
        clause["predicted_type"] = preds[i]
        clause["confidence"] = round(float(max(probs[i])), 3)  # max class probability

    return clauses


In [119]:
from pathlib import Path
import re
import numpy as np
from sentence_transformers import SentenceTransformer

# === UTILITY FUNCTIONS ===

def cosine_similarity(u, v):
    return np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v))

def load_contracts_from_txt(folder_path, max_files=15):
    contract_dir = Path(folder_path)
    contract_files = sorted(contract_dir.glob("*.txt"))[:max_files]
    return [file.read_text(encoding='utf-8', errors='ignore') for file in contract_files]

def smart_sent_split(text):
    return re.split(r'(?<=[.;:])\s+(?=[A-Z(])', text.strip())

def should_filter_clause(text):
    return (
        len(text.split()) < 6 or
        text.lower().startswith("source:") or
        re.match(r'^page\s+\d+', text.lower()) or
        "sec.gov" in text.lower()
    )

# === BOUNDARY DETECTION ===

def detect_regex_boundaries(text):
    lines = text.splitlines()
    boundaries = []
    patterns = [
        r'^\d+\.\s+.*$',                # e.g., 1. Definitions
        r'^\d+(\.\d+)*\s+.+$',          # e.g., 2.1 Termination
        r'^(Section|Article)\s+\d+',    # Section 3
        r'^[A-Z][A-Z\s,\(\)]+$',        # FULL CAPS line
        r'^WHEREAS\b', r'^NOW,\s*THEREFORE\b'
    ]
    for i, line in enumerate(lines):
        if any(re.match(pat, line.strip()) for pat in patterns):
            boundaries.append(i)
    return sorted(set(boundaries))

def detect_semantic_boundaries(text, model=None, threshold=0.45):
    if model is None:
        model = SentenceTransformer("all-MiniLM-L6-v2")
    sentences = [s.strip() for s in smart_sent_split(text) if len(s.strip()) > 20]
    embeddings = model.encode(sentences)
    boundaries = []
    for i in range(len(embeddings) - 1):
        sim = cosine_similarity(embeddings[i], embeddings[i + 1])
        if sim < threshold:
            boundaries.append(i + 1)
    return boundaries, sentences

def merge_boundaries(regex_bounds, semantic_bounds):
    return sorted(set(regex_bounds + semantic_bounds))

# === SEGMENTATION ===

def segment_clauses(sentences, boundaries):
    clauses = []
    filtered_clauses = []
    start = 0
    current_clause = None

    for idx in boundaries:
        chunk = sentences[start:idx]
        text = " ".join(chunk).strip()
        if not text or should_filter_clause(text):
            start = idx
            continue

        first_line = chunk[0]
        heading_match = re.match(r'^(\d+(\.\d+)*\.?)\s+(.*)', first_line.strip())
        subclause_match = re.match(r'^\([a-zA-Z]\)', first_line.strip())

        if heading_match:
            current_clause = {
                "clause_number": heading_match.group(1),
                "clause_title": heading_match.group(3)[:60] + "...",
                "clause_text": text
            }
            clauses.append(current_clause)
        elif subclause_match and current_clause:
            current_clause["clause_text"] += " " + text
        else:
            current_clause = {
                "clause_number": None,
                "clause_title": first_line[:60] + "...",
                "clause_text": text
            }
            clauses.append(current_clause)

        start = idx

    final_chunk = sentences[start:]
    if final_chunk:
        text = " ".join(final_chunk).strip()
        if not should_filter_clause(text):
            clauses.append({
                "clause_number": None,
                "clause_title": final_chunk[0][:60] + "...",
                "clause_text": text
            })

    return clauses, filtered_clauses


In [125]:
#Clause boundary supervised training    

def generate_boundary_training_data(df, window_size=2):
    rows = []
    for fname, group in df.groupby("filename"):
        full_text = "\n".join(group["clause_text"].tolist())
        sentences = smart_sent_split(full_text)
        
        # Identify boundary sentence indices
        clause_starts = set()
        idx = 0
        for clause in group["clause_text"]:
            clause_sentences = smart_sent_split(clause)
            if clause_sentences:
                clause_starts.add(idx)
                idx += len(clause_sentences)
        
        # Slide window and label
        for i in range(1, len(sentences) - 1):
            context = " ".join(sentences[i-window_size:i+window_size])
            label = 1 if i in clause_starts else 0
            rows.append({"context": context, "is_boundary": label})
    
    return pd.DataFrame(rows)


In [121]:
def predict_supervised_boundaries(sentences, model, window_size=2, threshold=0.5):
    boundaries = []
    for i in range(1, len(sentences)-1):
        context = " ".join(sentences[max(0, i-window_size):i+window_size])
        prob = model.predict_proba([context])[0][1]
        if prob >= threshold:
            boundaries.append(i)
    return boundaries


In [126]:
boundary_df = generate_boundary_training_data(df, window_size=2)

# Train a classifier (e.g., logistic regression)
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline

X_train, X_test, y_train, y_test = train_test_split(
    boundary_df['context'], boundary_df['is_boundary'], stratify=boundary_df['is_boundary'], test_size=0.2
)

boundary_clf = Pipeline([
    ("tfidf", TfidfVectorizer(max_features=5000)),
    ("clf", LogisticRegression(max_iter=1000))
])
boundary_clf.fit(X_train, y_train)


In [127]:
def process_contracts(folder_path, type_classifier, boundary_classifier, max_files=3, threshold=0.45, window_size=2):
    contracts = load_contracts_from_txt(folder_path, max_files)
    model = SentenceTransformer("all-MiniLM-L6-v2")  # for semantic boundaries
    all_clauses, all_filtered = [], []

    for text in contracts:
        # Step 1: Sentence tokenization
        sem_bounds, sentences = detect_semantic_boundaries(text, model, threshold)

        # Step 2: Regex and supervised boundaries
        regex_bounds = detect_regex_boundaries(text)
        supervised_bounds = predict_supervised_boundaries(sentences, boundary_classifier, window_size=window_size)

        # Step 3: Merge boundaries
        merged_bounds = merge_boundaries(regex_bounds + supervised_bounds, sem_bounds)

        # Step 4: Segment clauses
        clauses, filtered = segment_clauses(sentences, merged_bounds)

        # Step 5: Predict clause types
        typed_clauses = predict_clause_types(clauses, type_classifier)

        all_clauses.append(typed_clauses)
        all_filtered.append(filtered)

    return all_clauses, all_filtered


In [131]:
if __name__ == "__main__":
    folder_path = "Datasets/CUAD/full_contract_txt"

    clauses, filtered = process_contracts(
        folder_path=folder_path,
        type_classifier=model,              # clause type model
        boundary_classifier=boundary_clf,   # clause boundary model
        max_files=3
    )

    for doc_idx, doc_clauses in enumerate(clauses):
        print(f"\n=== Document {doc_idx + 1} Clauses ===")
        for i, clause in enumerate(doc_clauses):
            print(f"\nClause {i + 1}")
            print(f"Number        : {clause.get('clause_number')}")
            print(f"Title         : {clause.get('clause_title')}")
            print(f"Predicted Type: {clause.get('predicted_type', 'N/A')}")
            print(f"Confidence    : {clause.get('confidence', 'N/A')}")
            print(f"Text          : {clause.get('clause_text', '')[:500]}...")



=== Document 1 Clauses ===

Clause 1
Number        : None
Title         : CO-BRANDING AND ADVERTISING AGREEMENT

THIS CO-BRANDING AND ...
Predicted Type: Effective Date
Confidence    : 0.145
Text          : CO-BRANDING AND ADVERTISING AGREEMENT

THIS CO-BRANDING AND ADVERTISING AGREEMENT (the "Agreement") is made as of June 21, 1999 (the "Effective Date") by and between I-ESCROW, INC., with its principal place of business at 1730 S....

Clause 2
Number        : None
Title         : Amphlett Blvd., Suite 233, San Mateo, California 94402 ("i-E...
Predicted Type: Parties-Answer
Confidence    : 0.102
Text          : Amphlett Blvd., Suite 233, San Mateo, California 94402 ("i-Escrow"), and 2THEMART.COM, INC. having its principal place of business at 18301 Von Karman Avenue, 7th Floor, Irvine, California 92612 ("2TheMart").

1. (a) "CONTENT" means all content or information, in any medium, provided by a party to the other party for use in conjunction with the performance of its obligations h