# Automated Legal Contract Summarization
### using CUAD Dataset




In [1]:
!pip install transformers datasets rouge-score nltk torch accelerate sentencepiece


Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=723af11013fcab706ca8bc9ea7b3d3d692c9898ff6ecc15aea9df5e71402c10f
  Stored in directory: /root/.cache/pip/wheels/85/9d/af/01feefbe7d55ef5468796f0c68225b6788e85d9d0a281e7a70
Successfully built rouge-score
Installing collected packages: rouge-score
Successfully installed rouge-score-0.1.2


## Environment Setup and Library Imports


In [2]:
import pandas as pd
import numpy as np
import torch
import nltk

from transformers import BartTokenizer, BartForConditionalGeneration
from rouge_score import rouge_scorer
from nltk.tokenize import sent_tokenize

nltk.download("punkt")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [3]:
!wget https://zenodo.org/record/4595826/files/CUAD_v1.zip
!unzip CUAD_v1.zip -d cuad_data


--2026-01-06 18:23:01--  https://zenodo.org/record/4595826/files/CUAD_v1.zip
Resolving zenodo.org (zenodo.org)... 137.138.52.235, 188.185.43.153, 188.185.48.75, ...
Connecting to zenodo.org (zenodo.org)|137.138.52.235|:443... connected.
HTTP request sent, awaiting response... 301 MOVED PERMANENTLY
Location: /records/4595826/files/CUAD_v1.zip [following]
--2026-01-06 18:23:02--  https://zenodo.org/records/4595826/files/CUAD_v1.zip
Reusing existing connection to zenodo.org:443.
HTTP request sent, awaiting response... 200 OK
Length: 105883672 (101M) [application/octet-stream]
Saving to: ‘CUAD_v1.zip’


2026-01-06 18:23:35 (3.06 MB/s) - ‘CUAD_v1.zip’ saved [105883672/105883672]

Archive:  CUAD_v1.zip
   creating: cuad_data/CUAD_v1/
  inflating: cuad_data/CUAD_v1/CUAD_v1.json  
  inflating: cuad_data/CUAD_v1/CUAD_v1_README.txt  
   creating: cuad_data/CUAD_v1/full_contract_pdf/
   creating: cuad_data/CUAD_v1/full_contract_pdf/Part_I/
   creating: cuad_data/CUAD_v1/full_contract_pdf/Part_I/A

## Dataset Loading and Exploration


In [4]:
df = pd.read_csv("master_clauses.csv")
df.head()


Unnamed: 0,Filename,Document Name,Document Name-Answer,Parties,Parties-Answer,Agreement Date,Agreement Date-Answer,Effective Date,Effective Date-Answer,Expiration Date,...,Liquidated Damages,Liquidated Damages-Answer,Warranty Duration,Warranty Duration-Answer,Insurance,Insurance-Answer,Covenant Not To Sue,Covenant Not To Sue-Answer,Third Party Beneficiary,Third Party Beneficiary-Answer
0,CybergyHoldingsInc_20140520_10-Q_EX-10.27_8605...,['MARKETING AFFILIATE AGREEMENT'],MARKETING AFFILIATE AGREEMENT,"['BIRCH FIRST GLOBAL INVESTMENTS INC.', 'MA', ...","Birch First Global Investments Inc. (""Company""...","['8th day of May 2014', 'May 8, 2014']",5/8/14,['This agreement shall begin upon the date of ...,,['This agreement shall begin upon the date of ...,...,[],No,"[""COMPANY'S SOLE AND EXCLUSIVE LIABILITY FOR T...",Yes,[],No,[],No,[],No
1,EuromediaHoldingsCorp_20070215_10SB12G_EX-10.B...,['VIDEO-ON-DEMAND CONTENT LICENSE AGREEMENT'],VIDEO-ON-DEMAND CONTENT LICENSE AGREEMENT,"['EuroMedia Holdings Corp.', 'Rogers', 'Rogers...","Rogers Cable Communications Inc. (""Rogers""); E...","['July 11 , 2006']",7/11/06,"['July 11 , 2006']",7/11/06,"['The term of this Agreement (the ""Initial Ter...",...,[],No,[],No,[],No,[],No,[],No
2,FulucaiProductionsLtd_20131223_10-Q_EX-10.9_83...,['CONTENT DISTRIBUTION AND LICENSE AGREEMENT'],CONTENT DISTRIBUTION AND LICENSE AGREEMENT,"['Producer', 'Fulucai Productions Ltd.', 'Conv...","CONVERGTV, INC. (“ConvergTV”); Fulucai Product...","['November 15, 2012']",11/15/12,"['November 15, 2012']",11/15/12,[],...,[],No,[],No,[],No,[],No,[],No
3,GopageCorp_20140221_10-K_EX-10.1_8432966_EX-10...,['WEBSITE CONTENT LICENSE AGREEMENT'],WEBSITE CONTENT LICENSE AGREEMENT,"['PSiTech Corporation', 'Licensor', 'Licensee'...","PSiTech Corporation (""Licensor""); Empirical Ve...","['Feb 10, 2014']",2/10/14,"['Feb 10, 2014']",2/10/14,['The initial term of this Agreement commences...,...,[],No,[],No,[],No,[],No,[],No
4,IdeanomicsInc_20160330_10-K_EX-10.26_9512211_E...,['CONTENT LICENSE AGREEMENT'],CONTENT LICENSE AGREEMENT,"['YOU ON DEMAND HOLDINGS, INC.', 'Licensor', '...",Beijing Sun Seven Stars Culture Development Li...,"['December 21, 2015']",12/21/15,"['December 21, 2015']",12/21/15,"['The Term of this Agreement (the ""Term"") shal...",...,[],No,[],No,[],No,[],No,[],No


In [5]:
print(df.columns)


Index(['Filename', 'Document Name', 'Document Name-Answer', 'Parties',
       'Parties-Answer', 'Agreement Date', 'Agreement Date-Answer',
       'Effective Date', 'Effective Date-Answer', 'Expiration Date',
       'Expiration Date-Answer', 'Renewal Term', 'Renewal Term-Answer',
       'Notice Period To Terminate Renewal',
       'Notice Period To Terminate Renewal- Answer', 'Governing Law',
       'Governing Law-Answer', 'Most Favored Nation',
       'Most Favored Nation-Answer', 'Competitive Restriction Exception',
       'Competitive Restriction Exception-Answer', 'Non-Compete',
       'Non-Compete-Answer', 'Exclusivity', 'Exclusivity-Answer',
       'No-Solicit Of Customers', 'No-Solicit Of Customers-Answer',
       'No-Solicit Of Employees', 'No-Solicit Of Employees-Answer',
       'Non-Disparagement', 'Non-Disparagement-Answer',
       'Termination For Convenience', 'Termination For Convenience-Answer',
       'Rofr/Rofo/Rofn', 'Rofr/Rofo/Rofn-Answer', 'Change Of Control',
      

In [6]:
answer_cols = [col for col in df.columns if col.endswith("-Answer")]
print("Answer columns:", len(answer_cols))


Answer columns: 40


## Data Preprocessing and Contract Text Construction


In [7]:
def build_contract_text(row):
    parts = []
    for col in answer_cols:
        val = row[col]
        if isinstance(val, str) and len(val.strip()) > 30:
            parts.append(val.strip())
    return " ".join(parts)

df["context"] = df.apply(build_contract_text, axis=1)



In [8]:
IMPORTANT_CLAUSES = [
    "Parties-Answer",
    "Governing Law-Answer",
    "Termination For Convenience-Answer",
    "Confidentiality-Answer" if "Confidentiality-Answer" in answer_cols else None,
    "Indemnification-Answer" if "Indemnification-Answer" in answer_cols else None
]

IMPORTANT_CLAUSES = [c for c in IMPORTANT_CLAUSES if c in answer_cols]

def build_gold_summary(row):
    parts = []
    for col in IMPORTANT_CLAUSES:
        val = row[col]
        if isinstance(val, str) and len(val.strip()) > 30:
            parts.append(val.strip())
    return " ".join(parts)

df["gold_summary"] = df.apply(build_gold_summary, axis=1)


## Contract Sampling for Summarization


In [9]:
texts = df["context"].iloc[:10].tolist()
gold_summaries = df["gold_summary"].iloc[:10].tolist()

print("Contracts selected:", len(texts))


Contracts selected: 10


In [10]:
from transformers import BartTokenizer, BartForConditionalGeneration

tokenizer = BartTokenizer.from_pretrained("facebook/bart-large-cnn")
model = BartForConditionalGeneration.from_pretrained("facebook/bart-large-cnn")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

In [11]:
def chunk_text(text, tokenizer, max_tokens=512):
    tokens = tokenizer.encode(text)
    return [
        tokenizer.decode(tokens[i:i+max_tokens])
        for i in range(0, len(tokens), max_tokens)
    ]


In [12]:
def bart_summarize(text):
    summaries = []
    for chunk in chunk_text(text, tokenizer):
        inputs = tokenizer(
            chunk,
            return_tensors="pt",
            truncation=True,
            max_length=1024
        )
        summary_ids = model.generate(
            inputs["input_ids"],
            max_length=150,
            min_length=40,
            num_beams=4,
            early_stopping=True
        )
        summaries.append(
            tokenizer.decode(summary_ids[0], skip_special_tokens=True)
        )
    return " ".join(summaries)


In [13]:
import nltk
from nltk.tokenize import sent_tokenize

# Required downloads (run once)
nltk.download("punkt")
nltk.download("punkt_tab")

def extractive_summary(text, n=5):
    return " ".join(sent_tokenize(text)[:n])


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


In [14]:
bart_outputs = [bart_summarize(t) for t in texts]
extractive_outputs = [extractive_summary(t) for t in texts]


## Evaluation using ROUGE Metrics


In [15]:
from rouge_score import rouge_scorer
import numpy as np

scorer = rouge_scorer.RougeScorer(["rouge1", "rougeL"], use_stemmer=True)

def evaluate(preds, refs):
    r1, rL = [], []
    for p, r in zip(preds, refs):
        score = scorer.score(r, p)
        r1.append(score["rouge1"].fmeasure)
        rL.append(score["rougeL"].fmeasure)
    return {
        "ROUGE-1": np.mean(r1),
        "ROUGE-L": np.mean(rL)
    }

print("BART:", evaluate(bart_outputs, gold_summaries))
print("Extractive:", evaluate(extractive_outputs, gold_summaries))


BART: {'ROUGE-1': np.float64(0.47287380325344397), 'ROUGE-L': np.float64(0.4331551403467337)}
Extractive: {'ROUGE-1': np.float64(0.8814262740183793), 'ROUGE-L': np.float64(0.8814262740183793)}


In [16]:
out = pd.DataFrame({
    "Contract Text": texts,
    "Gold Summary": gold_summaries,
    "BART Summary": bart_outputs,
    "Extractive Summary": extractive_outputs
})

out.to_csv("final_legal_summarization.csv", index=False)


## Results

In [17]:
from google.colab import files
files.download("final_legal_summarization.csv")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>