# **Setup**


In [1]:
# Setup stuff
!git clone https://github.com/justin73939/ClauseWise.git

Cloning into 'ClauseWise'...
remote: Enumerating objects: 60, done.[K
remote: Counting objects: 100% (60/60), done.[K
remote: Compressing objects: 100% (37/37), done.[K
remote: Total 60 (delta 17), reused 47 (delta 10), pack-reused 0 (from 0)[K
Receiving objects: 100% (60/60), 1.12 MiB | 5.49 MiB/s, done.
Resolving deltas: 100% (17/17), done.


In [4]:
%cd /content/ClauseWise
!pip install -r requirements.txt
!pip install -q --upgrade transformers datasets accelerate sentencepiece peft huggingface_hub

/content/ClauseWise
Collecting PyPDF2 (from -r requirements.txt (line 16))
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1


In [8]:
# Imports
import os
from collections import defaultdict
from transformers import AutoTokenizer, AutoModelForSequenceClassification, get_linear_schedule_with_warmup
from transformers import T5ForConditionalGeneration, default_data_collator
from transformers import AutoModelForSeq2SeqLM, AutoModelForCausalLM
from google.colab import files
from tqdm import tqdm
import random
import pandas as pd
import re
from datasets import Dataset
import numpy as np
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import TensorDataset, DataLoader
from torch.nn import BCEWithLogitsLoss
import torch.nn.functional as F
from torch.optim import AdamW
from sklearn.metrics import f1_score
from peft import LoraConfig, get_peft_model
import evaluate
from clause_segmenter import ContractSegmenter, load_contract_text


# **Data Preprocessing**


In [None]:
# Upload your local CSV (downloaded from Atticus site)
uploaded = files.upload()

Saving master_clauses.csv to master_clauses.csv


In [None]:
# Load the master CSV you uploaded
df = pd.read_csv("data/master_clauses.csv")
print("Shape before flattening:", df.shape)

# Flatten: one clause per row
long_rows_classif = []
long_rows_textgen = []
for _, row in df.iterrows():
    filename = row["Filename"]
    for col in df.columns:
        if col.endswith("-Answer"):  # find every label column
            category = col.replace("-Answer", "").strip()
            text_col = category
            if text_col not in df.columns:
                continue
            text = row[text_col]
            answer = row[col]

            # skip empty entries
            if pd.isna(text) or text in ([], "", None): continue

            # clean stringified lists
            if isinstance(text, str):
                text = text.strip("[]").replace("'", "").strip()
            if isinstance(answer, str):
                answer = answer.strip("[]").replace("'", "").strip()

            # Separate the 8 columns that do not have yes/no answers for the text generator
            if category in ["Document Name", "Parties", "Agreement Date", "Effective Date", "Expiration Date", "Renewal Term",
                            "Notice Period To Terminate Renewal", "Governing Law"]:
                long_rows_textgen.append({
                  "document_name": filename,
                  "category": category,
                  "text": text,
                  "answer": answer
              })
                continue

            long_rows_classif.append({
                "document_name": filename,
                "category": category,
                "text": text,
                "answer": answer
            })

flat_df_classif = pd.DataFrame(long_rows_classif)
flat_df_textgen = pd.DataFrame(long_rows_textgen)
print("Flattened shape:", flat_df_classif.shape)
print("Flattened shape:", flat_df_textgen.shape)

# Clean up text artifacts
def clean_clause(t):
    if pd.isna(t): return ""
    t = re.sub(r"<omitted>", " ", t)
    t = re.sub(r"\[\*+\]", " ", t)
    t = re.sub(r"_+", " ", t)
    t = re.sub(r"\s+", " ", t)
    return t.strip()

flat_df_classif["text"] = flat_df_classif["text"].apply(clean_clause)
flat_df_classif["answer"] = flat_df_classif["answer"].apply(clean_clause)

flat_df_textgen["text"] = flat_df_textgen["text"].apply(clean_clause)
flat_df_textgen["answer"] = flat_df_textgen["answer"].apply(clean_clause)

# Drop empties and tiny fragments
flat_df_classif = flat_df_classif.dropna(subset=["text", "category"])
flat_df_classif = flat_df_classif[flat_df_classif["text"].str.len() > 5]
flat_df_classif = flat_df_classif.drop_duplicates(subset=["text", "category"])

flat_df_textgen = flat_df_textgen.dropna(subset=["text", "category"])
flat_df_textgen = flat_df_textgen[flat_df_textgen["text"].str.len() > 5]
flat_df_textgen = flat_df_textgen.drop_duplicates(subset=["text", "category"])

# Save cleaned file
flat_df_classif.to_csv("data/cuad_flattened_classification.csv", index=False)
print("Saved → data/cuad_flattened_classification.csv")
flat_df_textgen.to_csv("data/cuad_flattened_text_generation.csv", index=False)
print("Saved → data/cuad_flattened_text_generation.csv")

#print(flat_df_classif.head(50))
print(flat_df_textgen.head(50))



Shape before flattening: (510, 83)
Flattened shape: (16830, 4)
Flattened shape: (4080, 4)
Saved → data/cuad_flattened_classification.csv
Saved → data/cuad_flattened_text_generation.csv
                                        document_name  \
0   CybergyHoldingsInc_20140520_10-Q_EX-10.27_8605...   
1   CybergyHoldingsInc_20140520_10-Q_EX-10.27_8605...   
2   CybergyHoldingsInc_20140520_10-Q_EX-10.27_8605...   
3   CybergyHoldingsInc_20140520_10-Q_EX-10.27_8605...   
4   CybergyHoldingsInc_20140520_10-Q_EX-10.27_8605...   
5   CybergyHoldingsInc_20140520_10-Q_EX-10.27_8605...   
6   CybergyHoldingsInc_20140520_10-Q_EX-10.27_8605...   
7   CybergyHoldingsInc_20140520_10-Q_EX-10.27_8605...   
8   EuromediaHoldingsCorp_20070215_10SB12G_EX-10.B...   
9   EuromediaHoldingsCorp_20070215_10SB12G_EX-10.B...   
10  EuromediaHoldingsCorp_20070215_10SB12G_EX-10.B...   
11  EuromediaHoldingsCorp_20070215_10SB12G_EX-10.B...   
12  EuromediaHoldingsCorp_20070215_10SB12G_EX-10.B...   
13  EuromediaHold

In [None]:
df_class = pd.read_csv("data/cuad_flattened_classification.csv")
print(df_class.shape)
print(df_class.columns)
print(df_class.category.value_counts().head())

categories = df_class["category"].unique()
print(f"\nNumber of unique categories: {len(categories)}")
print(categories)

(3663, 4)
Index(['document_name', 'category', 'text', 'answer'], dtype='object')
category
Anti-Assignment                369
Cap On Liability               273
License Grant                  254
Audit Rights                   214
Termination For Convenience    181
Name: count, dtype: int64

Number of unique categories: 33
['Non-Disparagement' 'Anti-Assignment' 'Minimum Commitment'
 'License Grant' 'Audit Rights' 'Cap On Liability' 'Warranty Duration'
 'Most Favored Nation' 'Termination For Convenience'
 'Revenue/Profit Sharing' 'Unlimited/All-You-Can-Eat-License'
 'Uncapped Liability' 'Exclusivity' 'Affiliate License-Licensee'
 'Change Of Control' 'Non-Transferable License' 'Rofr/Rofo/Rofn'
 'Irrevocable Or Perpetual License' 'Competitive Restriction Exception'
 'Non-Compete' 'Price Restrictions' 'Covenant Not To Sue'
 'Volume Restriction' 'Joint Ip Ownership' 'Ip Ownership Assignment'
 'Post-Termination Services' 'Insurance' 'Affiliate License-Licensor'
 'No-Solicit Of Customers' 'No-

In [None]:
# Normalize answers
df_class["label"] = df_class["answer"].str.lower().map({"yes" : 1.0, "no" : 0.0}) # new column added
print(df_class.columns, "\n")
print(set(["label", "text"]).issubset(df_class.columns)) # Check if for every label there is a text

#print(df_class["category"])
print(len(df_class["text"]))
print(len(df_class["text"].unique()))

# For a multilabel model, change how the classification data is set up
# Format: {text : [array of labels encoded directly to categories in order]}

categories = df_class['category'].unique()
print(categories)
category_to_index = {c:i for i, c in enumerate(categories)}

# key=text, value=one-hot array of categories
num_categories = len(categories)
text_to_onehot = {}
for _, row in df_class.iterrows():
    text = row['text']
    category = row['category']
    label = row['label']  # 0 or 1

    if text not in text_to_onehot:
        text_to_onehot[text] = np.zeros(num_categories, dtype=float)

    # Update the corresponding category index with the label
    text_to_onehot[text][category_to_index[category]] = label

# Check to see if it is the same number of unique texts (should be)
print(len(text_to_onehot))


# Convert to final DataFrame
final_df_class = pd.DataFrame({
    'text': list(text_to_onehot.keys()),
    'labels': list(text_to_onehot.values())
})

print(final_df_class.head())


Index(['document_name', 'category', 'text', 'answer', 'label'], dtype='object') 

True
3663
3513
['Non-Disparagement' 'Anti-Assignment' 'Minimum Commitment'
 'License Grant' 'Audit Rights' 'Cap On Liability' 'Warranty Duration'
 'Most Favored Nation' 'Termination For Convenience'
 'Revenue/Profit Sharing' 'Unlimited/All-You-Can-Eat-License'
 'Uncapped Liability' 'Exclusivity' 'Affiliate License-Licensee'
 'Change Of Control' 'Non-Transferable License' 'Rofr/Rofo/Rofn'
 'Irrevocable Or Perpetual License' 'Competitive Restriction Exception'
 'Non-Compete' 'Price Restrictions' 'Covenant Not To Sue'
 'Volume Restriction' 'Joint Ip Ownership' 'Ip Ownership Assignment'
 'Post-Termination Services' 'Insurance' 'Affiliate License-Licensor'
 'No-Solicit Of Customers' 'No-Solicit Of Employees' 'Liquidated Damages'
 'Third Party Beneficiary' 'Source Code Escrow']
3513
                                                text  \
0  Company shall not specify the business practic...   
1  MA may not assi

# **Preparing data for the model**


In [None]:
# Split into train, validation, and test sets
texts = final_df_class["text"].tolist()

# Random state's number doesn't matter, it's purely a seed to reliably make operations reproducible
# Random state shuffles the data in a way depending on the number, the shuffling itself
# does not matter, so the number doesn't matter so long as it is consistent.
# 80% train, 10% validation, 10% test split
train_texts, val_test_texts = train_test_split(texts, test_size=0.2, random_state=0)
val_texts, test_texts = train_test_split(val_test_texts, test_size=0.5, random_state=0)

train_df = final_df_class[final_df_class["text"].isin(train_texts)].reset_index(drop=True)
val_df = final_df_class[final_df_class["text"].isin(val_texts)].reset_index(drop=True)
test_df = final_df_class[final_df_class["text"].isin(test_texts)].reset_index(drop=True)

print("Train:", len(train_df), "Val:", len(val_df), "Test:", len(test_df))

Train: 2810 Val: 351 Test: 352


In [None]:
# Tokenize inputs
clausewise_class = "ClauseWise/legalbert-clause-classifier"
tokenizer = AutoTokenizer.from_pretrained(clausewise_class)

def tokenize_texts(texts, max_length=512):
  return tokenizer(
      texts.to_list(),    # Pandas series is converted to a list
      padding="max_length", # Ensure all sequences are not shorter than max_length
      truncation=True,      # Ensure all sequences are not longer than max_length
      max_length=max_length,
      return_tensors="pt" # Returns pytorch tensors for the model
  )

# Convert texts and labels to tensors
train_encodings = tokenize_texts(train_df["text"])
val_encodings = tokenize_texts(val_df["text"])
test_encodings = tokenize_texts(test_df["text"])

# np.array used to speed up runtime when converting to tensor
train_labels = torch.tensor(np.array(list(train_df["labels"].values)), dtype=torch.float)
val_labels = torch.tensor(np.array(list(val_df["labels"].values)), dtype=torch.float)
test_labels = torch.tensor(np.array(list(test_df["labels"].values)), dtype=torch.float)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

In [None]:
# Create PyTorch datasets so the model can use them for training

# input_ids: Numerical representation of words after tokenization
# attention_mask: Tells the model which tokens are real and which are padding (if padding, ignore (basically))
train_dataset = TensorDataset(train_encodings["input_ids"], train_encodings["attention_mask"], train_labels)
val_dataset = TensorDataset(val_encodings["input_ids"], val_encodings["attention_mask"], val_labels)
test_dataset = TensorDataset(test_encodings["input_ids"], test_encodings["attention_mask"], test_labels)

# **Classification (LegalBERT) Model Fine-Tuning Loop**



In [None]:
# Create DataLoaders
batch_size = 8

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

loss_func = BCEWithLogitsLoss()

# Load model
model = AutoModelForSequenceClassification.from_pretrained(
    clausewise_class,
    num_labels=train_labels.shape[1],
    problem_type="multi_label_classification"
).to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

# Scheduler
epochs = 3
total_steps = len(train_loader) * epochs
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=int(0.1 * total_steps),
    num_training_steps=total_steps
)

# ====================
# TRAINING LOOP
# ====================
for epoch in range(epochs):
    print(f"\n=== Epoch {epoch + 1}/{epochs} ===")

    model.train()
    train_loss = 0

    for batch in train_loader:
        input_ids, attention_mask, labels = [x.to(device) for x in batch]

        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits

        loss = loss_func(logits, labels.float())
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        scheduler.step()

        train_loss += loss.item()

    avg_train_loss = train_loss / len(train_loader)
    print(f"Training Loss: {avg_train_loss:.4f}")

# ====================
# VALIDATION LOOP
# ====================
model.eval()
val_loss = 0
predictions = []
targets = []

with torch.no_grad():
    for batch in val_loader:
        input_ids, attention_mask, labels = [x.to(device) for x in batch]

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits

        loss = loss_func(logits, labels.float())
        val_loss += loss.item()

        pred = (torch.sigmoid(logits).cpu().numpy() > 0.5).astype(int)
        predictions.append(pred)
        targets.append(labels.cpu().numpy())

avg_val_loss = val_loss / len(val_loader)

# Convert correctly
predictions = np.vstack(predictions)
targets = np.vstack(targets)

f1 = f1_score(targets, predictions, average="micro")

print(f"Validation Loss: {avg_val_loss:.4f}")
print(f"Validation F1:   {f1:.4f}")

# ====================
# TEST LOOP
# ====================
print("\n========== TESTING MODEL ==========\n")

model.eval()
test_loss = 0
predictions = []
targets = []

with torch.no_grad():
    for batch in test_loader:
        input_ids, attention_mask, labels = [x.to(device) for x in batch]

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits

        loss = loss_func(logits, labels.float())
        test_loss += loss.item()

        pred = (torch.sigmoid(logits).cpu().numpy() > 0.5).astype(int)
        predictions.append(pred)
        targets.append(labels.cpu().numpy())

avg_test_loss = test_loss / len(test_loader)

predictions = np.vstack(predictions)
targets = np.vstack(targets)

f1 = f1_score(targets, predictions, average="micro")

print(f"Test Loss: {avg_test_loss:.4f}")
print(f"Test F1:   {f1:.4f}")


Using device: cuda


config.json:   0%|          | 0.00/643 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ClauseWise/legalbert-clause-classifier and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



=== Epoch 1/3 ===
Training Loss: 0.2179

=== Epoch 2/3 ===
Training Loss: 0.0847

=== Epoch 3/3 ===
Training Loss: 0.0617
Validation Loss: 0.0595
Validation F1:   0.6873


Test Loss: 0.0566
Test F1:   0.7186


# **Text Generator**


In [None]:
### Load and prepare data ###
df_text_gen = pd.read_csv("data/cuad_flattened_text_generation.csv")
df_text_gen["answer"] = df_text_gen["answer"].fillna("").str.strip()
df_text_gen = df_text_gen[df_text_gen["answer"].str.len() > 1]

# Input format
df_text_gen["input_text"] = df_text_gen.apply(
    lambda row: f"Extract {row["category"]} from the following clause:\n{row["text"]}",
    axis=1
)

print(f"Total examples: {len(df_text_gen)}\n")
print("\nCategory Distribution:")
print(df_text_gen["category"].value_counts())

### Split into train, validation, and test set ###
texts = df_text_gen["input_text"].tolist()

# 80% train, 10% validation, 10% test split
train_texts, val_test_texts = train_test_split(texts, test_size=0.2, random_state=0)
val_texts, test_texts = train_test_split(val_test_texts, test_size=0.5, random_state=0)

train_df = df_text_gen[df_text_gen["input_text"].isin(train_texts)].reset_index(drop=True)
val_df = df_text_gen[df_text_gen["input_text"].isin(val_texts)].reset_index(drop=True)
test_df = df_text_gen[df_text_gen["input_text"].isin(test_texts)].reset_index(drop=True)

print("Train:", len(train_df), "Val:", len(val_df), "Test:", len(test_df))

### Tokenize inputs and targets ###
tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base")

def tokenize_texts(input_texts, target_texts, max_input_length=512, max_target_length=128):
    input_encodings = tokenizer(
        input_texts.to_list(),
        padding="max_length",
        truncation=True,
        max_length=max_input_length,
        return_tensors="pt"
    )
    target_encodings = tokenizer(
        target_texts.to_list(),
        padding="max_length",
        truncation=True,
        max_length=max_target_length,
        return_tensors="pt"
    )

    # Replace padding tokens with -100 to ignore in loss calcualtion
    labels = target_encodings["input_ids"].clone()
    labels[labels==tokenizer.pad_token_id] = -100

    return input_encodings, labels

# Convert texts and labels to tensors
train_input_encodings, train_labels = tokenize_texts(train_df["input_text"], train_df["answer"])
val_input_encodings, val_labels = tokenize_texts(val_df["input_text"], val_df["answer"])
test_input_encodings, test_labels = tokenize_texts(test_df["input_text"], test_df["answer"])

# Create datasets
train_dataset = TensorDataset(train_input_encodings["input_ids"], train_input_encodings["attention_mask"], train_labels)
val_dataset = TensorDataset(val_input_encodings["input_ids"], val_input_encodings["attention_mask"], val_labels)
test_dataset = TensorDataset(test_input_encodings["input_ids"], test_input_encodings["attention_mask"], test_labels)

# Dataloaders
batch_size = 8

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

### Load Model ###
model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-base")

# Add LoRA
# LoRA basically uses a specific set of weights such that
# since the amount of data is not a lot, updating all the model's
# weights can risk having it overfit to it, therefoer only updating
# a particular subset of it. This does not heed performance or actual output.
USE_LORA = True
if USE_LORA:
    lora_config = LoraConfig(
        r=8,
        lora_alpha=16,
        target_modules=["q", "v"],
        lora_dropout=0.05,
        bias="none",
        task_type="SEQ_2_SEQ_LM"
    )
    model = get_peft_model(model, lora_config)
    model.print_trainable_parameters()

model.config.use_cache = False
model = model.to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4)

# Scheduler
epochs = 3
total_steps = len(train_loader) * epochs
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=100,
    num_training_steps=total_steps
)

### Training Loop ###
for epoch in range(epochs):
    print(f"\n=== Epoch {epoch+1}/{epochs} ===")

    model.train()
    train_loss = 0
    for batch in train_loader:
        input_ids, attention_mask, labels = [x.to(device) for x in batch]

        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)

        loss = outputs.loss
        loss.backward()
        torch.nn.utils.clip_grad_norm(model.parameters(), 1.0)

        optimizer.step()
        scheduler.step()

        train_loss += loss.item()

    avg_train_loss = train_loss / len(train_loader)
    print(f"Training Loss: {avg_train_loss:.4f}")




Total examples: 2499


Category Distribution:
category
Parties                501
Agreement Date         451
Governing Law          421
Effective Date         344
Expiration Date        324
Document Name          274
Renewal Term           157
Most Favored Nation     27
Name: count, dtype: int64
Train: 1999 Val: 250 Test: 250


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

Using device: cuda


config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

trainable params: 884,736 || all params: 248,462,592 || trainable%: 0.3561

=== Epoch 1/3 ===


  torch.nn.utils.clip_grad_norm(model.parameters(), 1.0)


Training Loss: 1.4470

=== Epoch 2/3 ===
Training Loss: 0.8774

=== Epoch 3/3 ===
Training Loss: 0.7846


In [None]:
### Validation Loop ###
model.eval()
val_loss = 0
predictions = []
targets = []

with torch.no_grad():
    for batch in val_loader:
      input_ids, attention_mask, labels = [x.to(device) for x in batch]

      outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
      val_loss += outputs.loss

      generated_ids = model.generate(
          input_ids=input_ids,
          attention_mask=attention_mask,
          max_length=128,
          num_beams=4,
          early_stopping=True
      )

      # Decode predictions and labels
      pred = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
      labels_decoded = labels.clone()
      labels_decoded[labels_decoded == -100] = tokenizer.pad_token_id
      target = tokenizer.batch_decode(labels_decoded, skip_special_tokens=True)

      predictions.extend(pred)
      targets.extend(target)

avg_val_loss = val_loss / len(val_loader)
print(f"Validation Loss: {avg_val_loss:.4f}")

# ROUGE scores (validation)
rouge = evaluate.load("rouge")
rouge_scores = rouge.compute(predictions=predictions, references=targets)
print(f"Validation ROUGE-L: {rouge_scores["rougeL"]:.4f}")

### Test Loop ###
model.eval()
test_loss = 0
predictions = []
targets = []
categories = []

with torch.no_grad():
    for i, batch in enumerate(test_loader):
        input_ids, attention_mask, labels = [x.to(device) for x in batch]

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        test_loss += outputs.loss

        generated_ids = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_length=128,
            num_beams=4,
            early_stopping=True
        )

        # Decode predictions and labels
        pred = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
        labels_decoded = labels.clone()
        labels_decoded[labels_decoded == -100] = tokenizer.pad_token_id
        target = tokenizer.batch_decode(labels_decoded, skip_special_tokens=True)

        predictions.extend(pred)
        targets.extend(target)

        batch_size_real = len(pred)
        start = i * batch_size
        batch_categories = test_df.iloc[start:(start + batch_size_real)]["category"].tolist()
        categories.extend(batch_categories)

avg_test_loss = test_loss / len(test_loader)
print(f"Test Loss: {avg_test_loss:.4f}")

# ROUGE scores (overall)
rouge_scores = rouge.compute(predictions=predictions, references=targets)
print(f"Test ROUGE-1: {rouge_scores["rouge1"]:.4f}")
print(f"Test ROUGE-2: {rouge_scores["rouge2"]:.4f}")
print(f"Test ROUGE-L: {rouge_scores["rougeL"]:.4f}")


# Perforamnce per-category
print("\n=== Performance per-category ===")
category_results = defaultdict(lambda: {"predictions": [], "targets": []})
for pred, target, cat in zip(predictions, targets, categories):
    category_results[cat]["predictions"].append(pred)
    category_results[cat]["targets"].append(target)

for category in sorted(category_results.keys()):
    cat_preds = category_results[category]["predictions"]
    cat_targets = category_results[category]["targets"]
    cat_rouge = rouge.compute(predictions=cat_preds, references=cat_targets)
    print(f"\n{category}:")
    print(f"  Samples: {len(cat_preds)}")
    print(f"  ROUGE-L: {cat_rouge['rougeL']:.4f}")

# Sample predictions
print("\n=== Sample Predictions ===")
for i in range(min(5, len(predictions))):
    print(f"\nExample {i+1} - {categories[i]}:")
    print(f"  Prediction: {predictions[i]}")
    print(f"  Actual:     {targets[i]}")


Validation Loss: 0.6885


Downloading builder script: 0.00B [00:00, ?B/s]

Validation ROUGE-L: 0.6788
Test Loss: 0.6222
Test ROUGE-1: 0.7116
Test ROUGE-2: 0.5087
Test ROUGE-L: 0.6706

=== Performance per-category ===

Agreement Date:
  Samples: 46
  ROUGE-L: 0.7085

Document Name:
  Samples: 27
  ROUGE-L: 0.9471

Effective Date:
  Samples: 41
  ROUGE-L: 0.6951

Expiration Date:
  Samples: 34
  ROUGE-L: 0.2451

Governing Law:
  Samples: 40
  ROUGE-L: 0.9464

Most Favored Nation:
  Samples: 4
  ROUGE-L: 0.0000

Parties:
  Samples: 46
  ROUGE-L: 0.5793

Renewal Term:
  Samples: 12
  ROUGE-L: 0.7107

=== Sample Predictions ===

Example 1 - Parties:
  Prediction: MOUNT KNOWLEDGE HOLDINGS INC. ("Company"); BIRCH FIRST GLOBAL INVESTMENTS INC. ("MA")
  Actual:     Birch First Global Investments Inc. ("Company"); Mount Kowledge Holdings Inc. ("Marketing Affiliate", "MA")

Example 2 - Effective Date:
  Prediction: 07/11/2006
  Actual:     07/11/2006

Example 3 - Agreement Date:
  Prediction: 02/10/2014
  Actual:     02/10/2014

Example 4 - Document Name:
  Prediction: 

# **Chatbot Loop**

In [1]:
# Load models from HuggingFace
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)
print("Loading models...")

# LegalBERT fine-tuned classifier (on the 33 yes/no categories)
legalbert_name = "ClauseWise/legalbert-clause-classifier"
legalbert_tokenizer = AutoTokenizer.from_pretrained(legalbert_name)
legalbert_model = AutoModelForSequenceClassification.from_pretrained(legalbert_name).to(device)

# FLAN-T5 fine-tuned extractor (on the 8 other categories)
flan_name = "ClauseWise/flan-t5-cuad-clause-extractor-lora"
flan_tokenizer = AutoTokenizer.from_pretrained(flan_name)
flan_model = AutoModelForSeq2SeqLM.from_pretrained(flan_name).to(device)

# Deepseek-R1 distilled reasoning model (open-source)
deepseek_name = "deepseek-ai/DeepSeek-R1-Distill-Qwen-14B"
deepseek_tokenizer = AutoTokenizer.from_pretrained(deepseek_name)
deepseek_model = AutoModelForCausalLM.from_pretrained(
    deepseek_name,
    torch_dtype=torch.float16,
    device_map="auto"
)

print("All models loaded.")

# Red flag clause types are programmer defined
RED_FLAG_TYPES = [
    #placeholder
]

# LegalBERT Clause Classification
def classify_clause(text):
  inputs = legalbert_tokenizer(text, return_tensors="pt", truncation=True, max_length=512).to(device)
  outputs = legalbert_model(**inputs)
  predicted_class = torch.argmax(outputs.logits).item()
  label = legalbert_model.config.id2label[predicted_class]
  return label

# FLAN-T5 Extraction
def extract_text(question, context):
  prompt = f"Question: {question}\nContext: {context}\nAnswer:"
  inputs = flan_tokenizer(prompt, return_tensors="pt", truncation=True).to(device)
  outputs = flan_model.generate(**inputs, max_length=250)
  return flan_tokenizer.decode(outputs[0], skip_special_tokens=True)

# Deepseek Reasoning
def deepseek_reasoning(prompt):
  inputs = deepseek_tokenizer(prompt, return_tensors="pt").to(device)
  outputs = deepseek_model.generate(
      **inputs,
      max_length=250,
      do_sample=True,
      temperature=0.3
  )
  return deepseek_tokenizer.decode(outputs[0], skip_special_tokens=True)



def analyze_contract_text(raw_text):
  #segmenter = ContractSegmenter()
  #clauses = segmenter.segment_contract(raw_text)
  #return clauses
  return ContractSegmenter().segment_contract(raw_text)

def route_user_query(user_input, clauses):
  """
  1. If the user asks abotu a specific clause, classify that clause
  2. If the user asks a question needing extraction, use FLAN-T5
  3. Pass everything into DeepSeek for reasoning summary.
  """
  combined_context = "\n\n".join(
      f"[Clause {c['clause_id']}] {c['text']}" for c in clauses
  )

  # Basic heuristics
  need_extraction = any(q in user_input.lower() for q in [
        "what", "when", "who", "how much", "define", "meaning", "obligation"
  ])

  need_classification = "classify" in user_input.lower() or "type" in user_input.lower()

  # Classification
  classification_results = {}
  if need_classification:
    for clause in clauses:
      classification_results[clause["clause_id"]] = classify_clause(clause["text"])

  # Extraction
  extracted_answer = extract_text(user_input, combined_context) if need_extraction else None

  # Deepseek Reasoning section
  reasoning_prompt = f"""
  You are a legal reasoning engine. The user asked:

  {user_input}

  Contract clauses:
  {combined_context}

  Clause classifications:
  {classification_results}

  Extracted answer (if any): {extracted_answer}

  Red-flag categories (designer defined):
  {RED_FLAG_TYPES}

  Provide a clear, direct, legally-reasonable answer to the user.
  Do NOT hallucinate facts not present in the clauses.
  """

  final_answer = deepseek_reasoning(reasoning_prompt)
  return {
      "classification": classification_results,
      "extraction": extracted_answer,
      "final": final_answer
  }


# Terminal Bot Chat Loop
def start_chatbot():
  print("\n ClauseWise Legal Assistant")
  print("Upload a contract or clause first. Type 'exit' to quit.\n")
  raw_text = input("\nPaste contract text (or path to .txt/.pdf):\n> ")
  if raw_text.strip().endswith(".txt") or raw_text.strip().endswith(".pdf"):
        raw_text = load_contract_text(raw_text.strip())

  clauses = analyze_contract_text(raw_text)

  while True:
    user_input = input("Ask something about the contract: ")
    if user_input.lower().strip() == "exit":
      break

    result = route_user_query(user_input, clauses)
    print("\n--- Final Answer (DeepSeek) ---")
    print(result["final"])

    if result["classification"]:
      print("\n--- Clause Classifications (LegalBERT) ---")
      for cid, label in result["classification"].items():
          print(f"Clause {cid}: {label}")

    if result["extraction"]:
      print("\n--- Extracted Answer (FLAN-T5) ---")
      print(result["extraction"])

# Run chatbot program
start_chatbot()

NameError: name 'torch' is not defined

# **Model Push**


In [None]:
# Push model to huggingface hub
from huggingface_hub import login

# Login (only once per session)
login()

# Push to your namespace
model.push_to_hub("ClauseWise/legalbert-clause-classifier")
tokenizer.push_to_hub("ClauseWise/legalbert-clause-classifier")

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

README.md: 0.00B [00:00, ?B/s]

Processing Files (0 / 0)      : |          |  0.00B /  0.00B            

New Data Upload               : |          |  0.00B /  0.00B            

  ...p4mafqt/model.safetensors:   0%|          | 14.2kB /  438MB            

CommitInfo(commit_url='https://huggingface.co/ClauseWise/legalbert-clause-classifier/commit/d43585357526a53848a1d541a0d34c233bf5b1f8', commit_message='Upload tokenizer', commit_description='', oid='d43585357526a53848a1d541a0d34c233bf5b1f8', pr_url=None, repo_url=RepoUrl('https://huggingface.co/ClauseWise/legalbert-clause-classifier', endpoint='https://huggingface.co', repo_type='model', repo_id='ClauseWise/legalbert-clause-classifier'), pr_revision=None, pr_num=None)

In [None]:
from huggingface_hub import login
login()
repo_id = "ClauseWise/flan-t5-cuad-clause-extractor-lora"

model.push_to_hub(repo_id)
tokenizer.push_to_hub(repo_id)


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

Processing Files (0 / 0)      : |          |  0.00B /  0.00B            

New Data Upload               : |          |  0.00B /  0.00B            

  ...adapter_model.safetensors:   0%|          | 9.86kB / 3.56MB            

README.md: 0.00B [00:00, ?B/s]

Processing Files (0 / 0)      : |          |  0.00B /  0.00B            

New Data Upload               : |          |  0.00B /  0.00B            

  .../tmpxl3s3l6g/spiece.model: 100%|##########|  792kB /  792kB            

CommitInfo(commit_url='https://huggingface.co/ClauseWise/flan-t5-cuad-clause-extractor-lora/commit/0a6926130862aa5237f98ed02f3004d140a7a76b', commit_message='Upload tokenizer', commit_description='', oid='0a6926130862aa5237f98ed02f3004d140a7a76b', pr_url=None, repo_url=RepoUrl('https://huggingface.co/ClauseWise/flan-t5-cuad-clause-extractor-lora', endpoint='https://huggingface.co', repo_type='model', repo_id='ClauseWise/flan-t5-cuad-clause-extractor-lora'), pr_revision=None, pr_num=None)