This code was used to generate the results for model llama2-7b-hf, which is a base model (without instruct fine tuning).

#Step 1: Generate the prompt dataset

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
#change directory
%cd /content/drive/MyDrive/DOUTORADO/TESE/my_git_repo/subLegalBench_experiment_1

/content/drive/MyDrive/DOUTORADO/TESE/my_git_repo/subLegalBench_experiment_1


In [3]:
#list the files
!ls

code  evaluation.py    __pycache__  results  tasks.py
data  legalbench_data  README.md    tasks    utils.py


In [None]:
#!git clone https://github.com/HazyResearch/legalbench.git

In [4]:
from tqdm.auto import tqdm
import datasets

from tasks import TASKS, ISSUE_TASKS
from utils import generate_prompts

In [5]:
# Supress progress bars which appear every time a task is downloaded
datasets.utils.logging.set_verbosity_error()

In [6]:
!pip install datasets --upgrade



In [7]:
import os

In [8]:
import pandas as pd

In [129]:
task_name = 'maud_specific_performance'

In [78]:
legalbench_raw_path = "/content/drive/MyDrive/DOUTORADO/TESE/my_git_repo/subLegalBench_experiment_1/legalbench_data"

In [130]:
#join task_name to legalbench_raw_directory to form a file path
legalbench_dataset_path = os.path.join(legalbench_raw_path, task_name)

In [131]:
legalbench_dataset_path

'/content/drive/MyDrive/DOUTORADO/TESE/my_git_repo/subLegalBench_experiment_1/legalbench_data/maud_specific_performance'

In [132]:
os.listdir(legalbench_dataset_path)

['test.tsv', 'train.tsv']

In [133]:
#load both train.tsv and test.tsv inside legalbench_dataset_path
train_df = pd.read_csv(os.path.join(legalbench_dataset_path, "train.tsv"), sep="\t")
test_df = pd.read_csv(os.path.join(legalbench_dataset_path, "test.tsv"), sep="\t")

In [134]:
test_df.head()

Unnamed: 0,index,answer,text
0,0,B,Specific Performance. <omitted> Shareholder ag...
1,1,B,Section 10.5 Applicable Laws; Jurisdiction; Sp...
2,2,B,Section 11.14. Specific Performance. <omitted>...
3,3,B,Section 8.10. Specific Performance. The partie...
4,4,B,Section 8.11 Specific Performance. <omitted...


In [135]:
train_df.head()

Unnamed: 0,index,answer,text
0,0,B,Section 9.10 Specific Performance. The parties...


In [136]:
# Load base prompt
with open(f"tasks/{task_name}/base_prompt.txt") as in_file:
    prompt_template = in_file.read()
print(prompt_template)

Instruction: Read the segment of a merger agreement and answer the multiple-choice question by choosing the option that best characterizes the agreement.
Question: What is the wording of the Specific Performance clause regarding the parties’ entitlement in the event of a contractual breach?
Option A: "entitled to seek" specific performance
Option B: "entitled to" specific performance

Merger Agreement: Section 9.10 Specific Performance. The parties hereto hereby agree that irreparable damage would occur in the event that any provision of this Agreement were not performed in accordance with its specific terms or were otherwise breached, and that money damages or other legal remedies would not be an adequate remedy for any such damages. Accordingly, the parties acknowledge and agree that each party shall be entitled to, in accordance with the provisions of this Agreement, an injunction or injunctions, specific performance or other equitable relief to prevent breaches of this Agreement an

In [137]:
# test_df = dataset["test"].to_pandas()
prompt_dataset = generate_prompts(prompt_template=prompt_template, data_df=test_df)
print(prompt_dataset[0])

Instruction: Read the segment of a merger agreement and answer the multiple-choice question by choosing the option that best characterizes the agreement.
Question: What is the wording of the Specific Performance clause regarding the parties’ entitlement in the event of a contractual breach?
Option A: "entitled to seek" specific performance
Option B: "entitled to" specific performance

Merger Agreement: Section 9.10 Specific Performance. The parties hereto hereby agree that irreparable damage would occur in the event that any provision of this Agreement were not performed in accordance with its specific terms or were otherwise breached, and that money damages or other legal remedies would not be an adequate remedy for any such damages. Accordingly, the parties acknowledge and agree that each party shall be entitled to, in accordance with the provisions of this Agreement, an injunction or injunctions, specific performance or other equitable relief to prevent breaches of this Agreement an

# Step 2. Generate LLM answers.

In [19]:
!pip install -q transformers accelerate sentencepiece

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m128.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m96.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m57.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m40.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m17.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [20]:
!pip install -q transformers huggingface_hub
from huggingface_hub import login

# This will prompt for your access token
login(token="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")  # Get this from huggingface.co/settings/tokens

In [21]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# Load model in BF16 precision (best quality within 15GB)
model_id = "meta-llama/Llama-2-7b-hf"

# Load with explicit use_auth_token
tokenizer = AutoTokenizer.from_pretrained(
    model_id,
    use_auth_token=True
)

base_model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    use_auth_token=True
).eval()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/776 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/609 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

In [22]:
from peft import PeftModel

In [23]:
peft_model = PeftModel.from_pretrained(
    base_model,
    "LDS-Project/legal_llama2-7b_base",
    subfolder="checkpoint-297",
    token = True)

adapter_config.json:   0%|          | 0.00/935 [00:00<?, ?B/s]

checkpoint-297/adapter_model.safetensors:   0%|          | 0.00/80.0M [00:00<?, ?B/s]

In [24]:
from transformers import pipeline

In [25]:
merged_model = peft_model.merge_and_unload()

In [26]:
#getting the input context size of llama2
merged_model.config

LlamaConfig {
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "head_dim": 128,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 11008,
  "max_position_embeddings": 4096,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-05,
  "rope_scaling": null,
  "rope_theta": 10000.0,
  "tie_word_embeddings": false,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.55.0",
  "use_cache": true,
  "vocab_size": 32000
}

In [None]:
# Get the newline token ID
# newline_token_id = tokenizer("\n")["input_ids"][-1]

# def generate_deterministic(prompt):
#     """Generate text with:
#     - Temperature 0.0 (fully deterministic)
#     - Stops at newline
#     - No sampling warnings"""

#     inputs = tokenizer(prompt, return_tensors="pt").to("cuda")

#     # Generation parameters for deterministic output
#     outputs = model.generate(
#         **inputs,
#         max_new_tokens=100,
#         do_sample=False,  # Greedy decoding
#         temperature=None,  # Not used when do_sample=False
#         top_p=None,  # Not used when do_sample=False
#         eos_token_id=newline_token_id,  # Stop at newline
#         pad_token_id=tokenizer.eos_token_id
#     )

#     # Clean output - remove prompt and keep only generation
#     full_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
#     return full_text[len(prompt):].split("\n")[0]  # Extract only new content

In [27]:
# Get the newline token ID - more robust approach
newline_token_id = tokenizer.encode("\n", add_special_tokens=False)[-1]

def generate_deterministic(prompt):
    """Generate text with:
    - Temperature 0.0 (fully deterministic)
    - Stops at newline
    - No sampling warnings"""

    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")

    # Generation parameters for deterministic output
    outputs = merged_model.generate(
        **inputs,
        max_new_tokens=100,
        do_sample=False,  # Greedy decoding. Llama doesnt allow temperature = 0.0
        eos_token_id=newline_token_id,  # Stop at newline
        pad_token_id=tokenizer.eos_token_id
    )

    # Decode only the new tokens (excluding prompt)
    generated_ids = outputs[0][len(inputs["input_ids"][0]):]
    return tokenizer.decode(generated_ids, skip_special_tokens=True).split("\n")[0]

In [138]:
generate_deterministic(prompt_dataset[0])

The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


'A'

In [139]:
# Process your dataset
generations = []
for prompt in prompt_dataset:
    torch.cuda.empty_cache()  # Clear memory between generations
    generations.append(generate_deterministic(prompt))

The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.

In [140]:
from sklearn.metrics import balanced_accuracy_score
import pandas as pd
import numpy as np

# Clean and convert labels
def clean_label(label):
    if pd.isna(label) or str(label).strip() == '':
        return np.nan  # Will be filtered out
    cleaned = str(label).strip().lower()
    return 1 if cleaned in ('yes', 'y') else 0 if cleaned in ('no', 'n') else np.nan

# Convert to numeric (1=Yes, 0=No)
y_true = test_df['answer'].apply(clean_label)
y_pred = pd.Series(generations).apply(clean_label)

# Filter out invalid entries (NaN)
valid_mask = ~(np.isnan(y_true) | np.isnan(y_pred))
y_true_clean = y_true[valid_mask]
y_pred_clean = y_pred[valid_mask]

# Calculate balanced accuracy
if len(y_true_clean) > 0:
    balanced_acc = balanced_accuracy_score(y_true_clean, y_pred_clean)
    print(f"Balanced Accuracy: {balanced_acc:.4f}")
    print(f"Valid samples: {len(y_true_clean)}/{len(y_true)}")
else:
    print("Error: No valid samples after cleaning!")

Error: No valid samples after cleaning!


# Step 3. Evaluate LLM answers.

In [71]:
from evaluation import evaluate

In [141]:
evaluate(task_name, generations, test_df["answer"].tolist())

np.float64(0.5151515151515151)

In [33]:
generated_chat_path = "/content/drive/MyDrive/DOUTORADO/TESE/my_git_repo/subLegalBench_experiment_1/results/generated_chats/model_legal_llama2_7b_base_fine_tuned_ifd"

In [34]:
model_name = "legal_llama2_7b_base_fine_tuned_ifd"

In [35]:
task_group = "LJT"

In [142]:
# join task_name and model_name to form the name of generated_chat file
generated_chat_file = os.path.join(generated_chat_path, f"generated_chat_{task_name}_{model_name}_{task_group}.csv")

In [143]:
generated_chat_file

'/content/drive/MyDrive/DOUTORADO/TESE/my_git_repo/subLegalBench_experiment_1/results/generated_chats/model_legal_llama2_7b_base_fine_tuned_ifd/generated_chat_maud_specific_performance_legal_llama2_7b_base_fine_tuned_ifd_LJT.csv'

In [144]:
#save generations to csv
# import pandas as pd
df = pd.DataFrame(generations)
df.to_csv(generated_chat_file, index=False)



In [145]:
df

Unnamed: 0,0
0,A
1,A
2,A
3,A
4,A
...,...
173,A
174,A
175,A
176,A


In [None]:
# unassign colab
from google.colab import runtime
runtime.unassign()

In [None]:
# import os
# import pandas as pd

In [None]:
# Your GitHub-tracked directory (where files should be saved)
# GIT_TRACKED_DIR = "/content/drive/MyDrive/DOUTORADO/TESE/my_git_repo/subLegalBench_experiment_1/results/generated_chats/model_llama2_7b"

In [None]:
# filename = f"generated_chat_{task_name}_{model_name}.csv"

In [None]:
# pd.DataFrame(generations).to_csv(os.path.join(GIT_TRACKED_DIR, filename), index=False)

In [None]:
# ===== 3. GIT COMMIT =====
# Navigate to repo root
# REPO_ROOT = "/content/drive/MyDrive/DOUTORADO/TESE/my_git_repo/subLegalBench_experiment_1"
# os.chdir(REPO_ROOT)

In [None]:
# !apt-get install git -y
# !git config --global core.hooksPath /dev/null  # Disable problematic hooks
# !git config --global user.email "israelfama@yahoo.com"
# !git config --global user.name "israelfama"

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
git is already the newest version (1:2.34.1-1ubuntu1.15).
0 upgraded, 0 newly installed, 0 to remove and 35 not upgraded.


In [None]:
# Git operations (YOUR CORRECTED PATH)
# !git add results/generated_chats/model_llama2_7b/{filename}

In [None]:
# !git commit -m "Auto-commit: {task_name} results ({timestamp})"

^C


In [None]:
# !git push origin main