In [6]:
# STEP 1: Clone the CUAD GitHub repository
# `!` runs the command in a shell; the repo is cloned into ./cuad relative to
# the kernel's current working directory.
!git clone https://github.com/TheAtticusProject/cuad.git
# `%cd` changes the working directory of the kernel itself (not just a
# subshell), so every relative path used by later cells resolves inside cuad/.
# NOTE(review): re-running this cell in the same session would start the clone
# from inside cuad/ — restart the kernel before running it again.
%cd cuad

Cloning into 'cuad'...
remote: Enumerating objects: 30, done.[K
remote: Total 30 (delta 0), reused 0 (delta 0), pack-reused 30 (from 1)[K
Receiving objects: 100% (30/30), 17.78 MiB | 21.77 MiB/s, done.
Resolving deltas: 100% (10/10), done.
/content/cuad


In [7]:
# STEP 2: Extract data.zip into ./data so the dataset JSON can be read from disk
import zipfile

# The context manager closes the archive handle even if extraction fails.
with zipfile.ZipFile("data.zip", mode="r") as archive:
    archive.extractall(path="data")

In [8]:
# STEP 3: Load CUAD dataset from extracted JSON
import json
import os

json_path = "data/CUADv1.json"

# JSON text is UTF-8 by specification. Pinning the encoding keeps parsing
# independent of the platform's locale default, which open() would otherwise use.
with open(json_path, "r", encoding="utf-8") as f:
    cuad_data = json.load(f)

# The top-level "data" list holds one entry per contract.
print("✅ Loaded", len(cuad_data["data"]), "contracts")

✅ Loaded 510 contracts


In [9]:
# STEP 4: Convert to LLaMA 3.2 instruction-tuning format
output_path = "/content/cuad_llama_format.jsonl"

SYSTEM_PROMPT = "You are a legal AI assistant."
NO_ANSWER = "N/A"  # target text for questions that have no annotated answer

# Character budget for the contract excerpt placed in each prompt. Whole
# contracts are far longer than the 2048-token limit applied at tokenization
# time, and because the answer comes last in the text, truncation would cut it
# off. Rough heuristic (a few characters per token) — tune against the tokenizer.
# Set to None to embed the full contract.
MAX_CONTEXT_CHARS = 6000


def select_context(context, answer_start, answer_len, max_chars=MAX_CONTEXT_CHARS):
    """Return at most ``max_chars`` characters of ``context``.

    When the answer position is known the excerpt is centred on the answer
    span; otherwise (no answer, or position unknown) the head of the contract
    is used. Contexts that already fit are returned unchanged.
    """
    if max_chars is None or len(context) <= max_chars:
        return context
    if answer_start is None or answer_start < 0:
        return context[:max_chars]
    centre = answer_start + answer_len // 2
    # Clamp so the window never runs past either end of the contract.
    start = max(0, min(centre - max_chars // 2, len(context) - max_chars))
    return context[start:start + max_chars]


def format_example(question, context, answer_text):
    """Render one QA pair using the Llama 3 chat layout.

    Llama 3 models use header / <|eot_id|> special tokens rather than the
    Llama 2 style "<s>[INST] ... [/INST]" markup. <|begin_of_text|> is left out
    on purpose: the Llama 3 tokenizer prepends it when special tokens are
    enabled (the default), so including it here would duplicate it.
    """
    return (
        "<|start_header_id|>system<|end_header_id|>\n\n"
        f"{SYSTEM_PROMPT}<|eot_id|>"
        "<|start_header_id|>user<|end_header_id|>\n\n"
        f"{question}\n\n[CONTRACT]\n{context}\n[/CONTRACT]\n\n"
        "Return only the clause.<|eot_id|>"
        "<|start_header_id|>assistant<|end_header_id|>\n\n"
        f"{answer_text}<|eot_id|>"
    )


count = 0

# json.dumps escapes non-ASCII by default, so the file content is ASCII; the
# explicit encoding just removes any dependence on the locale default.
with open(output_path, "w", encoding="utf-8") as f_out:
    for document in cuad_data["data"]:
        for paragraph in document["paragraphs"]:
            context = paragraph["context"]
            for qa in paragraph["qas"]:
                answers = qa["answers"]
                if answers:
                    # Only the first annotated span is used as the target.
                    answer_text = answers[0]["text"]
                    # assumes SQuAD-style "answer_start" offsets — fall back to
                    # a text search if the key is absent.
                    answer_start = answers[0].get("answer_start")
                    if answer_start is None:
                        answer_start = context.find(answer_text)
                else:
                    answer_text = NO_ANSWER
                    answer_start = None

                excerpt = select_context(context, answer_start, len(answer_text))
                record = {"text": format_example(qa["question"], excerpt, answer_text)}
                f_out.write(json.dumps(record) + "\n")
                count += 1

print(f"✅ Successfully converted {count} examples.")
print(f"📁 Output file saved to: {output_path}")

✅ Successfully converted 20910 examples.
📁 Output file saved to: /content/cuad_llama_format.jsonl


In [10]:
# 1️⃣ Install dependencies
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git" \
            transformers datasets peft accelerate bitsandbytes trl tqdm

Collecting unsloth@ git+https://github.com/unslothai/unsloth.git (from unsloth[colab-new]@ git+https://github.com/unslothai/unsloth.git)
  Cloning https://github.com/unslothai/unsloth.git to /tmp/pip-install-dngamuyk/unsloth_7f3f00afc8814290bfee23b7562d292d
  Running command git clone --filter=blob:none --quiet https://github.com/unslothai/unsloth.git /tmp/pip-install-dngamuyk/unsloth_7f3f00afc8814290bfee23b7562d292d
  Resolved https://github.com/unslothai/unsloth.git to commit 46795df0d4279be6f275570fdfbc44d6091f496c
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone


In [11]:
# 2️⃣ Import libraries
import os

# Unsloth patches transformers / peft when it is imported, so it has to come
# before them for its optimizations to take effect.
from unsloth import FastLanguageModel
from unsloth import is_bfloat16_supported

from datasets import load_dataset
from peft import LoraConfig, get_peft_model
from tqdm.auto import tqdm
from transformers import AutoTokenizer, Trainer # Import Trainer from transformers
from transformers import TrainingArguments

In [12]:

# 3️⃣ Load your formatted dataset
# The generic "json" builder reads the JSONL file written by the conversion
# step; split="train" returns a single Dataset rather than a DatasetDict.
dataset = load_dataset(
    "json",
    data_files="/content/cuad_llama_format.jsonl",
    split="train",
)
print(f"✅ Loaded {len(dataset)} training samples")

Generating train split: 0 examples [00:00, ? examples/s]

✅ Loaded 20910 training samples


In [None]:
# 4️⃣ Tokenizer and Model setup
# The 3B instruct model belongs to the Llama 3.2 family; there is no
# "Llama-3-3b-instruct" repository on the Hugging Face Hub.
# NOTE(review): meta-llama repositories are gated — the session needs a Hub
# token with access granted before this cell will download anything.
base_model = "meta-llama/Llama-3.2-3B-Instruct"
max_seq_length = 2048

# FastLanguageModel.from_pretrained returns a (model, tokenizer) pair, so both
# are unpacked here; binding the result to `model` alone would leave a tuple
# that the LoRA and Trainer cells cannot use. trust_remote_code is not passed:
# Llama is supported natively and there is no reason to run repository code.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=base_model,
    max_seq_length=max_seq_length,
    load_in_4bit=True,
)

# padding="max_length" is used during preprocessing and needs a pad token;
# fall back to EOS when the checkpoint does not define one.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

In [None]:
# 5️⃣ Prepare dataset for training
def preprocess(example, max_length=2048):
    """Tokenize one record into fixed-length causal-LM training features.

    Reads the module-level ``tokenizer``.

    Args:
        example: mapping with a "text" key holding the full prompt + answer.
        max_length: sequence length to truncate / pad to (default 2048).

    Returns:
        dict with "input_ids", "attention_mask" and "labels", each a list of
        ``max_length`` ints. ``labels`` mirrors ``input_ids`` except at padded
        positions, which are set to -100.
    """
    enc = tokenizer(example["text"], truncation=True, padding="max_length", max_length=max_length)
    # -100 is the index the loss ignores; without this the model would be
    # trained to predict pad tokens for every short example.
    labels = [
        token_id if is_real else -100
        for token_id, is_real in zip(enc["input_ids"], enc["attention_mask"])
    ]
    return {"input_ids": enc["input_ids"], "attention_mask": enc["attention_mask"],
            "labels": labels}

# Tokenize every record; the raw "text" column is dropped so only model inputs
# remain in the dataset.
dataset = dataset.map(preprocess, batched=False, remove_columns=["text"])
# Hold out 5% for evaluation. The seed fixes the shuffle so the same examples
# end up in the eval split on every run.
dataset = dataset.train_test_split(test_size=0.05, seed=42)

In [None]:
# 6️⃣ PEFT / LoRA configuration
# A model loaded through FastLanguageModel gets its adapters from Unsloth's own
# get_peft_model, which prepares the 4-bit base model for training and applies
# Unsloth's patches; wrapping it with plain peft.get_peft_model skips that.
# Hyperparameters are unchanged from the previous LoraConfig.
model = FastLanguageModel.get_peft_model(
    model,
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    # Listed explicitly to keep the adapted layers the same as before
    # (PEFT's default selection for Llama) — TODO confirm; widen to the MLP
    # projections if more capacity is wanted.
    target_modules=["q_proj", "v_proj"],
)

In [None]:
# 7️⃣ Trainer setup
# transformers.Trainer takes its hyperparameters through a TrainingArguments
# object; it has no batch_size / lr / max_epochs / save_every_epoch / quantize
# keywords, so passing them directly raises TypeError.
training_args = TrainingArguments(
    output_dir="lawbotics-llama-3.2-3b",
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=4,   # effective batch of 8 sequences per device
    learning_rate=2e-4,
    num_train_epochs=3,
    save_strategy="epoch",           # one checkpoint at the end of every epoch
    # Mixed precision matched to what the GPU supports.
    bf16=is_bfloat16_supported(),
    fp16=not is_bfloat16_supported(),
)
# Quantization is not a Trainer option: the base model is already loaded in
# 4-bit by the model-setup cell.

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    # NOTE(review): recent transformers releases rename this keyword to
    # `processing_class` — switch if the installed version warns or rejects it.
    tokenizer=tokenizer,
)


In [None]:
# 8️⃣ Start fine-tuning
# Runs training on the `trainer` object built in the previous cell. The value
# returned by train() is not assigned to a variable here.
trainer.train()

In [None]:
# 9️⃣ Save the final model
save_dir = "lawbotics-llama-3.2-3b"
trainer.save_model(save_dir)
# save_dir is relative, so it resolves against the current working directory
# (changed by the `%cd cuad` in the clone cell). Report the resolved path
# instead of a hard-coded /content/... location.
print(f"✅ Training complete! Model saved at {os.path.abspath(save_dir)}")