In [None]:
!pip install -q transformers datasets evaluate scikit-learn


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/491.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━[0m [32m450.6/491.2 kB[0m [31m13.3 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m10.4 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/116.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.6 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/183.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━

In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import evaluate
import numpy as np
import torch

# ✅ Load dataset
dataset = load_dataset("Kanishkagarwal6101/Legal_Analyzer_Final")
# Distinct values of the training split's "label" column, in sorted order;
# their count is used as the size of the classification head.
label_list = sorted({*dataset["train"]["label"]})
num_labels = len(label_list)

# ✅ Legal-BERT checkpoint and the tokenizer that ships with it
model_ckpt = "nlpaueb/legal-bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

def tokenize_function(examples):
    """Tokenize a batch of examples and attach their class ids as ``labels``.

    Args:
        examples: Batch handed over by ``map(..., batched=True)``; this
            function reads its ``"text"`` and ``"label"`` entries.

    Returns:
        The tokenizer output for the batch (truncated to at most 512 tokens)
        with an extra ``"labels"`` entry copied from ``examples["label"]``.
    """
    # Deliberately no padding here. Padding every example to 512 tokens makes
    # every batch pay for the full sequence length. The Trainer in this
    # notebook is constructed with the tokenizer, so its default collator pads
    # each batch to the longest sequence in that batch instead.
    tokens = tokenizer(examples["text"], truncation=True, max_length=512)
    tokens["labels"] = examples["label"]
    return tokens

# Tokenize every split in batches, then drop the raw text column.
tokenized_ds = dataset.map(tokenize_function, batched=True).remove_columns(["text"])

# ✅ Hold out 10% of train as validation when the dataset has no such split
if "validation" not in tokenized_ds:
    split = tokenized_ds["train"].train_test_split(test_size=0.1, seed=42)
    tokenized_ds["train"], tokenized_ds["validation"] = split["train"], split["test"]

# ✅ Pretrained encoder with a classification head sized to the label count
model = AutoModelForSequenceClassification.from_pretrained(
    model_ckpt,
    num_labels=num_labels,
)

# ✅ Metric objects read by compute_metrics
accuracy = evaluate.load("accuracy")
f1 = evaluate.load("f1")

def compute_metrics(eval_pred):
    """Turn one evaluation pass into accuracy and weighted F1.

    Args:
        eval_pred: Pair that unpacks into ``(logits, labels)``.

    Returns:
        Dict with keys ``"accuracy"`` and ``"f1"`` (weighted average).
    """
    scores, references = eval_pred
    # Highest-scoring class along the last axis is the predicted class id.
    predictions = np.argmax(scores, axis=-1)
    acc_result = accuracy.compute(predictions=predictions, references=references)
    f1_result = f1.compute(predictions=predictions, references=references, average="weighted")
    return {"accuracy": acc_result["accuracy"], "f1": f1_result["f1"]}

# ✅ Training setup
# Evaluate and checkpoint once per epoch; when training ends, reload the
# checkpoint with the best validation F1.
training_args = TrainingArguments(
    output_dir="./legal-bert-final",
    # `eval_strategy` is the current name of this option; the older
    # `evaluation_strategy` spelling is deprecated in transformers.
    eval_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    load_best_model_at_end=True,
    metric_for_best_model="f1"
)

# NOTE(review): newer transformers releases appear to prefer
# `processing_class=` over `tokenizer=` — confirm against the installed
# version before switching. Passing the tokenizer matters either way: it is
# what lets the Trainer pad batches and save the tokenizer with the model.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_ds["train"],
    eval_dataset=tokenized_ds["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

# ✅ Train
trainer.train()


README.md:   0%|          | 0.00/310 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/7.92M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/38831 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.02k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/222k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Map:   0%|          | 0/38831 [00:00<?, ? examples/s]

pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at nlpaueb/legal-bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.79k [00:00<?, ?B/s]

  trainer = Trainer(
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mkanishk6101[0m ([33mkanishk6101-purdue-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.8908,0.912893,0.696189,0.696216
2,0.6859,0.779164,0.740731,0.743808
3,0.5155,0.831585,0.752832,0.753517


TrainOutput(global_step=13107, training_loss=0.7600733491515298, metrics={'train_runtime': 2380.0084, 'train_samples_per_second': 44.051, 'train_steps_per_second': 5.507, 'total_flos': 2.758878892337357e+16, 'train_loss': 0.7600733491515298, 'epoch': 3.0})

In [None]:
# Write the fine-tuned Legal-BERT model and its tokenizer into one directory;
# the evaluation cell reloads both from this path.
save_dir = "./legal-bert-final"
trainer.save_model(save_dir)
tokenizer.save_pretrained(save_dir)


('./legal-bert-final/tokenizer_config.json',
 './legal-bert-final/special_tokens_map.json',
 './legal-bert-final/vocab.txt',
 './legal-bert-final/added_tokens.json',
 './legal-bert-final/tokenizer.json')

In [None]:
import torch
import nltk
import numpy as np
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# ✅ Setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# NOTE(review): nothing else in this cell calls NLTK; the download is kept so
# the cell's side effects stay the same.
nltk.download("punkt")

# ✅ Load model and tokenizer
model_path = "./legal-bert-final"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path).to(device)

# ✅ 18-class label map
# Hand-written; assumed to match the integer ids the model was fine-tuned on
# — TODO confirm against the dataset.
label2id = {
    "Business": 0, "Confidentiality": 1, "Consumers": 2, "Declarations": 3, "Economy": 4,
    "Education": 5, "Employment": 6, "Environment": 7, "External Relations": 8, "Fairness": 9,
    "Health": 10, "IP & Rights": 11, "Indemnification": 12, "Legal Governance": 13,
    "Miscellaneous": 14, "Payment": 15, "Social": 16, "Termination": 17
}
id2label = {v: k for k, v in label2id.items()}
if model.config.num_labels != len(label2id):
    # A size mismatch means predicted ids cannot be trusted to map to these names.
    print(f"⚠️ Model has {model.config.num_labels} output classes but the label map has {len(label2id)} entries")

# ✅ Clause-aware chunks (use what we just generated earlier)
import re
from PyPDF2 import PdfReader

reader = PdfReader("/content/independent_contractor_agreement.pdf")
# Extract each page's text once; pages that yield no text are skipped.
page_texts = [page.extract_text() for page in reader.pages]
text = "\n".join(page_text for page_text in page_texts if page_text)
# Split wherever a run of digits followed by "." and whitespace occurs (e.g. "1. ").
# NOTE(review): the pattern is not anchored to line starts, so it also splits on
# such sequences in the middle of a sentence.
clause_chunks = re.split(r"\n?\s*\d+\.\s+", text)
clause_chunks = [chunk.strip() for chunk in clause_chunks if chunk.strip()]
print(f"✅ Total clause-based chunks: {len(clause_chunks)}")

# ✅ Predict using Legal-BERT
predicted_ids = []
for clause in clause_chunks:
    # One clause per forward pass; clauses longer than 512 tokens are truncated.
    inputs = tokenizer(clause, return_tensors="pt", truncation=True, padding=True, max_length=512).to(device)
    with torch.no_grad():
        logits = model(**inputs).logits
        predicted_id = torch.argmax(logits, dim=1).item()
    predicted_ids.append(predicted_id)

predicted_labels = [id2label[i] for i in predicted_ids]

# ✅ Expected labels for clause-aware chunks (9)
# Ground truth is positional: entry i is the label for chunk i.
# NOTE(review): the BERT evaluation cell in this notebook labels the intro chunk
# "Business" while this one uses "Miscellaneous", so the two models are scored
# against different ground truth — reconcile before comparing their numbers.
expected_labels = [
    "Miscellaneous",            # Intro / Parties
    "Employment",          # Scope of Engagement
    "Termination",         # Term & Termination
    "Payment",             # Compensation
    "Confidentiality",     # Confidentiality
    "IP & Rights",         # Ownership of work
    "Indemnification",     # Indemnification
    "Legal Governance",    # Governing law
    "Miscellaneous"        # Final section
]

# Score only the chunks that have a ground-truth label. The predictions must be
# trimmed as well: otherwise sklearn raises on inconsistent lengths whenever the
# PDF splits into more chunks than there are expected labels.
expected_labels = expected_labels[:len(clause_chunks)]
n_eval = len(expected_labels)
true_ids = [label2id[label] for label in expected_labels]
eval_pred_ids = predicted_ids[:n_eval]

# ✅ Evaluation
acc = accuracy_score(true_ids, eval_pred_ids)
# Stored as `f1_weighted`, not `f1`: `f1` is the metric object that
# compute_metrics reads from the notebook namespace, and overwriting it with a
# float would break any later trainer evaluation.
prec, rec, f1_weighted, _ = precision_recall_fscore_support(
    true_ids, eval_pred_ids, average="weighted", zero_division=0
)

# ✅ Print metrics
print("\n📊 Legal-BERT Clause-Aware Evaluation:")
print(f"✅ Accuracy: {acc * 100:.2f}%")
print(f"✅ Precision: {prec * 100:.2f}%")
print(f"✅ Recall: {rec * 100:.2f}%")
print(f"✅ F1 Score: {f1_weighted * 100:.2f}%")

# ✅ Save results
df_eval = pd.DataFrame({
    "Clause": clause_chunks[:n_eval],
    "Predicted": predicted_labels[:n_eval],
    "Expected": expected_labels
})
df_eval.to_csv("legalbert_clause_aware_eval.csv", index=False)
print("\n📁 Results saved to legalbert_clause_aware_eval.csv")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


✅ Total clause-based chunks: 9

📊 Legal-BERT Clause-Aware Evaluation:
✅ Accuracy: 77.78%
✅ Precision: 72.22%
✅ Recall: 77.78%
✅ F1 Score: 74.07%

📁 Results saved to legalbert_clause_aware_eval.csv


In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import evaluate
import numpy as np
import torch

# ✅ Load your final cleaned dataset
dataset = load_dataset("Kanishkagarwal6101/Legal_Analyzer_Final")
# Sorted distinct values of the training split's "label" column; the number of
# them sets the width of the classifier head.
label_list = sorted({*dataset["train"]["label"]})
num_labels = len(label_list)

# ✅ General-purpose uncased BERT base checkpoint and its tokenizer
model_ckpt = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

# ✅ Tokenize
def tokenize_function(examples):
    """Tokenize a batch of examples and attach their class ids as ``labels``.

    Args:
        examples: Batch handed over by ``map(..., batched=True)``; this
            function reads its ``"text"`` and ``"label"`` entries.

    Returns:
        The tokenizer output for the batch (truncated to at most 512 tokens)
        with an extra ``"labels"`` entry copied from ``examples["label"]``.
    """
    # Deliberately no padding here. Padding every example to 512 tokens makes
    # every batch pay for the full sequence length. The Trainer in this
    # notebook is constructed with the tokenizer, so its default collator pads
    # each batch to the longest sequence in that batch instead.
    tokens = tokenizer(examples["text"], truncation=True, max_length=512)
    tokens["labels"] = examples["label"]
    return tokens

# Batched tokenization over every split, followed by removal of the raw text.
tokenized_ds = dataset.map(tokenize_function, batched=True).remove_columns(["text"])

# ✅ Create a 90/10 train/validation split unless one already exists
if "validation" not in tokenized_ds:
    split = tokenized_ds["train"].train_test_split(test_size=0.1, seed=42)
    tokenized_ds["train"], tokenized_ds["validation"] = split["train"], split["test"]

# ✅ BERT encoder with a classification head of num_labels outputs
model = AutoModelForSequenceClassification.from_pretrained(
    model_ckpt,
    num_labels=num_labels,
)

# ✅ Metric objects looked up by compute_metrics
accuracy = evaluate.load("accuracy")
f1 = evaluate.load("f1")

def compute_metrics(eval_pred):
    """Compute accuracy and weighted F1 for one evaluation pass.

    Args:
        eval_pred: Pair that unpacks into ``(logits, labels)``.

    Returns:
        Dict with keys ``"accuracy"`` and ``"f1"`` (weighted average).
    """
    raw_scores, gold = eval_pred
    # Class id with the largest score along the last axis.
    predicted = np.argmax(raw_scores, axis=-1)
    results = {}
    results["accuracy"] = accuracy.compute(predictions=predicted, references=gold)["accuracy"]
    results["f1"] = f1.compute(predictions=predicted, references=gold, average="weighted")["f1"]
    return results

# ✅ Training config
# Evaluate and checkpoint once per epoch; when training ends, reload the
# checkpoint with the best validation F1.
training_args = TrainingArguments(
    output_dir="./bert-final",
    # `eval_strategy` is the current name of this option; the older
    # `evaluation_strategy` spelling is deprecated in transformers.
    eval_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    load_best_model_at_end=True,
    metric_for_best_model="f1"
)

# ✅ Trainer
# NOTE(review): newer transformers releases appear to prefer
# `processing_class=` over `tokenizer=` — confirm against the installed
# version before switching. Passing the tokenizer matters either way: it is
# what lets the Trainer pad batches and save the tokenizer with the model.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_ds["train"],
    eval_dataset=tokenized_ds["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

# ✅ Train!
trainer.train()


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/38831 [00:00<?, ? examples/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.9361,0.904991,0.687951,0.686878
2,0.6969,0.803335,0.732235,0.738485
3,0.4989,0.865579,0.748713,0.750135


TrainOutput(global_step=13107, training_loss=0.7797921316714821, metrics={'train_runtime': 2364.6136, 'train_samples_per_second': 44.337, 'train_steps_per_second': 5.543, 'total_flos': 2.758878892337357e+16, 'train_loss': 0.7797921316714821, 'epoch': 3.0})

In [None]:
# Write the fine-tuned BERT model and its tokenizer into one directory; the
# evaluation cell reloads both from this path.
save_dir = "./bert-final"
trainer.save_model(save_dir)
tokenizer.save_pretrained(save_dir)


('./bert-final/tokenizer_config.json',
 './bert-final/special_tokens_map.json',
 './bert-final/vocab.txt',
 './bert-final/added_tokens.json',
 './bert-final/tokenizer.json')

In [None]:
import torch
import nltk
import numpy as np
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from PyPDF2 import PdfReader
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import re

# ✅ Device setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# NOTE(review): nothing else in this cell calls NLTK; the download is kept so
# the cell's side effects stay the same.
nltk.download("punkt")

# ✅ Load your BERT model
model_path = "./bert-final"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path).to(device)

# ✅ 18-class label mapping
# Hand-written; assumed to match the integer ids the model was fine-tuned on
# — TODO confirm against the dataset.
label2id = {
    "Business": 0, "Confidentiality": 1, "Consumers": 2, "Declarations": 3, "Economy": 4,
    "Education": 5, "Employment": 6, "Environment": 7, "External Relations": 8, "Fairness": 9,
    "Health": 10, "IP & Rights": 11, "Indemnification": 12, "Legal Governance": 13,
    "Miscellaneous": 14, "Payment": 15, "Social": 16, "Termination": 17
}
id2label = {v: k for k, v in label2id.items()}
if model.config.num_labels != len(label2id):
    # A size mismatch means predicted ids cannot be trusted to map to these names.
    print(f"⚠️ Model has {model.config.num_labels} output classes but the label map has {len(label2id)} entries")

# ✅ Load and chunk the contract using clause-aware logic
reader = PdfReader("/content/independent_contractor_agreement.pdf")
# Extract each page's text once; pages that yield no text are skipped.
page_texts = [page.extract_text() for page in reader.pages]
text = "\n".join(page_text for page_text in page_texts if page_text)
# Split wherever a run of digits followed by "." and whitespace occurs (e.g. "1. ").
# NOTE(review): the pattern is not anchored to line starts, so it also splits on
# such sequences in the middle of a sentence.
clause_chunks = re.split(r"\n?\s*\d+\.\s+", text)
clause_chunks = [chunk.strip() for chunk in clause_chunks if chunk.strip()]
print(f"✅ Total clause-based chunks: {len(clause_chunks)}")

# ✅ Predict
predicted_ids = []
for clause in clause_chunks:
    # One clause per forward pass; clauses longer than 512 tokens are truncated.
    inputs = tokenizer(clause, return_tensors="pt", truncation=True, padding=True, max_length=512).to(device)
    with torch.no_grad():
        logits = model(**inputs).logits
        predicted_id = torch.argmax(logits, dim=1).item()
    predicted_ids.append(predicted_id)

predicted_labels = [id2label[i] for i in predicted_ids]

# ✅ Ground truth labels for this PDF
# Ground truth is positional: entry i is the label for chunk i.
# NOTE(review): the Legal-BERT evaluation cell in this notebook labels the intro
# chunk "Miscellaneous" while this one uses "Business", so the two models are
# scored against different ground truth — reconcile before comparing numbers.
expected_labels = [
    "Business",            # Intro
    "Employment",          # Scope
    "Termination",         # Term
    "Payment",             # Compensation
    "Confidentiality",     # NDA
    "IP & Rights",         # Ownership
    "Indemnification",     # Liability
    "Legal Governance",    # Law
    "Miscellaneous"        # Final
]
expected_labels = expected_labels[:len(clause_chunks)]

# Trim predictions and chunks to the labelled prefix so all lists line up.
true_ids = [label2id[label] for label in expected_labels]
predicted_ids = predicted_ids[:len(expected_labels)]
predicted_labels = predicted_labels[:len(expected_labels)]
clause_chunks = clause_chunks[:len(expected_labels)]

# ✅ Evaluation
acc = accuracy_score(true_ids, predicted_ids)
# Stored as `f1_weighted`, not `f1`: `f1` is the metric object that
# compute_metrics reads from the notebook namespace, and overwriting it with a
# float would break any later trainer evaluation.
prec, rec, f1_weighted, _ = precision_recall_fscore_support(
    true_ids, predicted_ids, average="weighted", zero_division=0
)

# ✅ Print metrics
print("\n📊 BERT Clause-Aware Evaluation:")
print(f"✅ Accuracy: {acc * 100:.2f}%")
print(f"✅ Precision: {prec * 100:.2f}%")
print(f"✅ Recall: {rec * 100:.2f}%")
print(f"✅ F1 Score: {f1_weighted * 100:.2f}%")

# ✅ Save breakdown
df_eval = pd.DataFrame({
    "Clause": clause_chunks,
    "Predicted": predicted_labels,
    "Expected": expected_labels
})
df_eval.to_csv("bert_clause_aware_eval.csv", index=False)
print("\n📁 Saved to bert_clause_aware_eval.csv")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


✅ Total clause-based chunks: 9

📊 BERT Clause-Aware Evaluation:
✅ Accuracy: 77.78%
✅ Precision: 77.78%
✅ Recall: 77.78%
✅ F1 Score: 77.78%

📁 Saved to bert_clause_aware_eval.csv
