In [41]:
# Mount Google Drive inside the Colab runtime; the dataset and result paths
# used later in this notebook live under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [42]:
import lzma
import numpy as np
import os
import json
import torch
import argparse
from tqdm import tqdm

from transformers import AutoTokenizer, AutoModelForCausalLM

In [43]:
# Run on the GPU whenever CUDA is usable; otherwise stay on the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

In [44]:
# "None" prompt style: the instruction (and the optional input) are emitted
# verbatim, each followed by a newline, with no surrounding template text.
PROMPT_DICT_NONE = dict(
    prompt_input="{instruction}\n{input}\n",
    prompt_no_input="{instruction}\n",
)

In [45]:
# Used to get the ppl and loss for the whole input
def get_perplexity_and_embedding_whole_text(tokenizer, model, text, max_length):
    """Score ``text`` on its own with a causal language model.

    The text is tokenized (truncated to ``max_length`` tokens) and passed to
    ``model`` with the input ids doubling as labels; the perplexity is the
    exponential of the loss the model reports. Despite the function name, no
    embedding is returned.

    Args:
        tokenizer: Object whose ``encode(text, return_tensors="pt",
            truncation=True, max_length=...)`` returns a tensor of token ids.
        model: Callable as ``model(input_ids, labels=...)`` returning an
            object with a ``loss`` tensor.
        text: The string to score.
        max_length: Maximum number of tokens kept by the tokenizer.

    Returns:
        ``(perplexity, loss)`` as Python floats, or ``(0, 0)`` when scoring
        fails (the error is printed so failures are not silent).
    """
    try:
        # Send the inputs to wherever the model actually lives; fall back to
        # the notebook-level ``device`` for models without a ``device`` attribute.
        target_device = getattr(model, "device", None)
        if target_device is None:
            target_device = device

        input_ids = tokenizer.encode(
            text, return_tensors="pt", truncation=True, max_length=max_length
        ).to(target_device)

        with torch.no_grad():
            outputs = model(input_ids, labels=input_ids.contiguous())
        loss = outputs.loss
        perplexity = torch.exp(loss)

        return perplexity.to('cpu').item(), loss.to('cpu').item()

    # Catch Exception (not a bare except) so Ctrl-C / SystemExit still stop a
    # long-running loop instead of being turned into a (0, 0) score.
    except Exception as e:
        print(f"Error in whole-text perplexity: {e}")
        return 0, 0

In [46]:
def get_perplexity_and_embedding_part_text(tokenizer, model, text, target_span, max_length):
    """Score only ``target_span`` inside ``text`` with a causal language model.

    ``text`` and ``target_span`` are tokenized without special tokens, the
    last occurrence of the target token sequence is located in the text, and
    every other position is labelled ``-100`` so that only the target tokens
    carry labels. If the text is longer than ``max_length`` tokens, only the
    last ``max_length`` tokens are kept and the target positions are shifted
    accordingly. Despite the function name, no embedding is returned.

    Args:
        tokenizer: Object whose ``encode(text, truncation=False,
            add_special_tokens=False)`` returns a list of token ids.
        model: Callable as ``model(input_ids, labels=...)`` returning an
            object with a ``loss`` tensor.
        text: Full text (context followed by the target).
        target_span: Substring of ``text`` whose tokens should be scored.
        max_length: Maximum number of tokens fed to the model.

    Returns:
        ``(perplexity, loss)`` as Python floats, or ``(0, 0)`` when the target
        is empty, cannot be located, is truncated away, or scoring fails.
    """
    try:
        # Encode everything
        input_ids = tokenizer.encode(text, truncation=False, add_special_tokens=False)
        target_ids = tokenizer.encode(target_span, truncation=False, add_special_tokens=False)
        target_len = len(target_ids)

        # An empty target would leave every label at -100 and yield a NaN loss.
        if target_len == 0:
            print("Empty target span; nothing to score.")
            return 0, 0

        # Find target position (search from end)
        start_idx = -1
        for i in range(len(input_ids) - target_len, -1, -1):
            if input_ids[i:i + target_len] == target_ids:
                start_idx = i
                break

        if start_idx == -1:
            print("Target not found in input!")
            return 0, 0

        # Truncate input to max_length, dropping tokens from the left, and
        # shift the target window by the number of dropped tokens so the
        # labels still cover the target even when it is not the very last
        # thing in the text.
        end_idx = start_idx + target_len
        overflow = len(input_ids) - max_length
        if overflow > 0:
            input_ids = input_ids[overflow:]
            start_idx = max(0, start_idx - overflow)
            end_idx = end_idx - overflow
            if end_idx <= 0:
                print("Target was truncated out of the input!")
                return 0, 0

        # Create labels (-100 for non-target tokens)
        labels = [-100] * len(input_ids)
        labels[start_idx:end_idx] = input_ids[start_idx:end_idx]

        # Send the tensors to wherever the model actually lives; fall back to
        # the notebook-level ``device`` for models without a ``device`` attribute.
        target_device = getattr(model, "device", None)
        if target_device is None:
            target_device = device

        # Convert to tensors
        input_ids = torch.tensor([input_ids], device=target_device)
        labels = torch.tensor([labels], device=target_device)

        # Calculate loss
        with torch.no_grad():
            outputs = model(input_ids, labels=labels)

        loss = outputs.loss
        perplexity = torch.exp(loss)

        return perplexity.item(), loss.item()

    except Exception as e:
        print(f"Error in conditional perplexity: {e}")
        return 0, 0

In [7]:
# Drive folder holding the xz-compressed, alpaca-format dataset files.
# NOTE(review): absolute, user-specific path; adjust when running elsewhere.
alpaca_directory = "/content/drive/MyDrive/DOUTORADO/TESE/lawinstruct/english_only_data/alpaca_format"

In [8]:
# List the entries of the alpaca directory (the bare name on the last line
# displays the list as the cell output).
alpaca_files = os.listdir(alpaca_directory)
alpaca_files

['readme.gdoc',
 'sample_data',
 'BVADecisions-bva_decisions_label-train-0.jsonl_english_only_deduplicated_alpaca_format.json.xz',
 'CaseBriefs-case_briefs-train-0.jsonl_english_only_deduplicated_alpaca_format.json.xz',
 'EOIRPrivacy-eoir_privacy-train-0.jsonl_english_only_deduplicated_alpaca_format.json.xz',
 'LawngNli-lawng_nli_entailment-train-0.jsonl_english_only_deduplicated_alpaca_format.json.xz',
 'LexGLUE-ledgar-train-0.jsonl_english_only_deduplicated_alpaca_format.json.xz',
 'Littleton-littleton_events-train-0.jsonl_english_only_deduplicated_alpaca_format.json.xz',
 'Littleton-littleton_graph-train-0.jsonl_english_only_deduplicated_alpaca_format.json.xz',
 'MultiLexSum-long_to_short-train-0.jsonl_english_only_deduplicated_alpaca_format.json.xz',
 'MultiLexSum-long_to_tiny-train-0.jsonl_english_only_deduplicated_alpaca_format.json.xz',
 'MultiLexSum-short_to_tiny-train-0.jsonl_english_only_deduplicated_alpaca_format.json.xz',
 'NaturalInstructionsLegal-billsum_summarization-tra

In [206]:
# Dataset file (inside alpaca_directory) to score in this run.
filename = "NaturalInstructionsLegal-casehold_legal_answer_generation-train-0.jsonl_english_only_deduplicated_alpaca_format.json.xz"

In [207]:
# Full path of the selected dataset file.
alpaca_path = os.path.join(alpaca_directory, filename)

In [208]:
# Display the resolved input path as a sanity check.
alpaca_path

'/content/drive/MyDrive/DOUTORADO/TESE/lawinstruct/english_only_data/alpaca_format/NaturalInstructionsLegal-casehold_legal_answer_generation-train-0.jsonl_english_only_deduplicated_alpaca_format.json.xz'

In [12]:
# Model identifier handed to the from_pretrained calls that load the model
# and tokenizer; its last path component is also used in the output filename.
model_name_or_path = "gpt2"

In [13]:
# Token budget passed to process_file (sequences are cut to this many tokens
# before being fed to the model).
max_length = 1024

In [14]:
# Index of the first dataset record to process (passed to process_file).
start_idx = 0

In [15]:
# Exclusive end index of the records to process; process_file treats -1 as
# "through the end of the dataset".
end_idx = -1

In [16]:
# Prompt style; "none" makes process_file use the PROMPT_DICT_NONE templates.
prompt = "none"

In [17]:
# Load the causal LM, letting device_map="auto" decide device placement and
# caching downloaded weights under ../cache (relative to the working directory).
# NOTE(review): output_hidden_states=True triggers the "generation flags are
# not valid" warning shown in this cell's output, and the scoring functions
# visible in this notebook only read outputs.loss. Confirm whether hidden
# states are needed before keeping this flag.
model = AutoModelForCausalLM.from_pretrained(model_name_or_path, device_map="auto", cache_dir='../cache', output_hidden_states=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
The following generation flags are not valid and may be ignored: ['output_hidden_states']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['output_hidden_states']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


In [18]:
# Load the tokenizer matching the model, using the same cache directory.
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, cache_dir='../cache')

In [19]:
# Put the model in evaluation mode before scoring; as the last expression of
# the cell, the call also displays the module summary.
model.eval()

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [149]:
# Drive folder where the IFD result files are written.
# NOTE(review): absolute, user-specific path; adjust when running elsewhere.
ifd_directory = "/content/drive/MyDrive/DOUTORADO/TESE/lawinstruct/english_only_data/superfiltering_test/ifd"

In [209]:
# Strip every trailing extension (".xz", ".json", ...) from the dataset filename.
# The loop stops when os.path.splitext reports no extension, rather than when
# the name no longer contains a dot: splitext treats a leading dot as part of
# the root, so a "'.' in name" condition would spin forever on names such as
# ".hidden".
base_name = filename
while True:
    root, ext = os.path.splitext(base_name)
    if not ext:
        break
    base_name = root
model_name = model_name_or_path.split('/')[-1]  # Extract model name
# Output file: <dataset base name>_ifd_<model name>.jsonl inside ifd_directory.
ifd_path = os.path.join(ifd_directory, f"{base_name}_ifd_{model_name}.jsonl")

In [210]:
# Display the resolved output path as a sanity check.
ifd_path

'/content/drive/MyDrive/DOUTORADO/TESE/lawinstruct/english_only_data/superfiltering_test/ifd/NaturalInstructionsLegal-casehold_legal_answer_generation-train-0_ifd_gpt2.jsonl'

In [64]:
def _ifd_ratio(ppl_condition, ppl_direct):
    """Return ``ppl_condition / ppl_direct``, or NaN when either score is unusable."""
    # The scoring helpers return 0 as their failure sentinel, so a zero (or
    # missing) value on either side means "no score", not an IFD of 0.
    try:
        if not ppl_condition or not ppl_direct:
            return np.nan
        return ppl_condition / ppl_direct
    except (ZeroDivisionError, TypeError):
        return np.nan


def process_file(
    alpaca_path,
    max_length=1024,
    start_idx=0,
    end_idx=-1,
    prompt="none",
    ifd_path=None
):
    """Compute perplexity-based IFD scores for an xz-compressed alpaca-format file.

    The file is expected to decompress to a JSON list of records with an
    ``instruction`` and an ``output`` key and an optional ``input`` key. For
    every record in ``data[start_idx:end_idx]`` the output is scored twice
    with the notebook-level ``tokenizer`` and ``model``: on its own and
    conditioned on the formatted instruction/input prompt. The ratio
    ``ppl_A_condition / ppl_A_direct`` is stored as ``ifd_ppl``.

    Args:
        alpaca_path: Path to the ``.xz``-compressed JSON file.
        max_length: Token budget used for both scoring passes.
        start_idx: Index of the first record to process.
        end_idx: Exclusive end index; ``-1`` means "through the last record".
        prompt: Prompt style; only ``"none"`` (``PROMPT_DICT_NONE``) exists.
        ifd_path: Optional path where the results are written.

    Returns:
        A list with one dict per processed record: the original record plus
        ``ppl_A_direct``, ``ppl_A_condition``, ``ifd_ppl``,
        ``instruction_tokens``, ``input_tokens`` and ``output_tokens``.
        Records with an empty output get NaN scores and zero token counts.
        ``None`` is returned when the input file cannot be loaded.

    Raises:
        ValueError: If ``prompt`` names an unsupported prompt style.
    """
    # Fail fast on an unknown prompt style instead of hitting a NameError on
    # the first record after the (potentially large) file has been loaded.
    if prompt != 'none':
        raise ValueError(f"Unsupported prompt style: {prompt!r} (only 'none' is available)")
    prompt_no_input = PROMPT_DICT_NONE["prompt_no_input"]
    prompt_input = PROMPT_DICT_NONE["prompt_input"]

    # Load data
    try:
        with lzma.open(alpaca_path, "rt", encoding="utf-8") as f:
            data = json.load(f)
        print("File decompressed and JSON loaded successfully.")
    except FileNotFoundError:
        print(f"Error: File not found at {alpaca_path}")
        return None
    except json.JSONDecodeError:
        print(f"Error: Could not decode JSON from the decompressed file.")
        return None
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        return None

    # Process data
    end_idx = end_idx if end_idx != -1 else len(data)
    sampled_data = data[start_idx:end_idx]
    new_data = []

    for data_i in tqdm(sampled_data):
        instruct_i = data_i['instruction']
        output_i = data_i['output']
        # Treat a missing or null input the same as an empty one.
        input_i = data_i.get('input') or ''

        # Nothing to score: keep the record but mark every score as NaN.
        if output_i == '':
            new_data.append({
                **data_i,
                'ppl_A_direct': np.nan,
                'ppl_A_condition': np.nan,
                'ifd_ppl': np.nan,
                'instruction_tokens': 0,
                'input_tokens': 0,
                'output_tokens': 0
            })
            continue

        # Count (untruncated) tokens for each component
        instruction_tokens = len(tokenizer.encode(instruct_i, truncation=False))
        output_tokens = len(tokenizer.encode(output_i, truncation=False))
        input_tokens = len(tokenizer.encode(input_i, truncation=False)) if input_i else 0

        if input_i == '':
            prompt_to_use = prompt_no_input.format_map({'instruction': instruct_i})
        else:
            prompt_to_use = prompt_input.format_map({'instruction': instruct_i, 'input': input_i})
        whole_text = prompt_to_use + output_i

        # Both passes get the same token budget. The conditioned pass keeps
        # the answer in view when it truncates, so shrinking the direct
        # budget by the answer length would score the two perplexities on
        # different numbers of answer tokens (down to a single token, and a
        # NaN loss, for answers of max_length tokens or more).
        ppl_out_alone, _ = get_perplexity_and_embedding_whole_text(
            tokenizer, model, output_i, max_length
        )
        ppl_out_condition, _ = get_perplexity_and_embedding_part_text(
            tokenizer, model, whole_text, output_i, max_length
        )

        new_data.append({
            **data_i,
            'ppl_A_direct': ppl_out_alone,
            'ppl_A_condition': ppl_out_condition,
            'ifd_ppl': _ifd_ratio(ppl_out_condition, ppl_out_alone),
            'instruction_tokens': instruction_tokens,
            'input_tokens': input_tokens,
            'output_tokens': output_tokens
        })

    # Save to file if path provided. The result is written as one indented
    # JSON array (read it back with json.load), whatever extension the path has.
    if ifd_path:
        out_dir = os.path.dirname(ifd_path)
        if out_dir:
            # Do not lose a long scoring run to a missing output folder.
            os.makedirs(out_dir, exist_ok=True)
        with open(ifd_path, "w", encoding="utf-8") as fw:
            json.dump(new_data, fw, indent=4)
        print(f"Results saved to {ifd_path}")

    return new_data

In [211]:
# Bind the result to a name so the notebook does not echo the entire list of
# scored records as the cell output; the results are also written to ifd_path.
ifd_results = process_file(alpaca_path=alpaca_path, max_length=max_length, start_idx=start_idx, end_idx=end_idx, prompt=prompt, ifd_path=ifd_path)

Output hidden; open in https://colab.research.google.com to view.

In [212]:
# Reload the saved results; the file holds a single JSON document (written
# with json.dump), so it is read with json.load rather than line by line.
with open(ifd_path, "rt", encoding="utf-8") as f:
  data = json.load(f)

In [213]:
# Number of records whose 'ifd_ppl' is NaN (a missing key counts as NaN too).
nan_count = len([
    record for record in data
    if np.isnan(record.get('ifd_ppl', np.nan))
])

In [214]:
# Display how many records have a NaN IFD score.
nan_count

0

In [156]:
# Collect the records whose 'ifd_ppl' is NaN (a missing key counts as NaN too).
nan_entries = list(
    filter(lambda record: np.isnan(record.get("ifd_ppl", np.nan)), data)
)

In [160]:
# Report how many NaN-scored records exist, then print the text, token counts
# and perplexities of at most the first ten for manual inspection.
print(f"Found {len(nan_entries)} entries with NaN 'ifd_ppl':\n")
for idx, entry in enumerate(nan_entries[:10], 1):  # Show first 10 for brevity
    print(f"Entry {idx}:")
    # .get() is used for the fields that may be absent from a record.
    print(f"Instruction: {entry['instruction']} (Tokens: {entry.get('instruction_tokens', 'N/A')})")
    print(f"Input: {entry.get('input', '')} (Tokens: {entry.get('input_tokens', 'N/A')})")
    print(f"Output: {entry['output']} (Tokens: {entry.get('output_tokens', 'N/A')})")
    print(f"ppl_A_direct: {entry['ppl_A_direct']}")
    print(f"ppl_A_condition: {entry['ppl_A_condition']}")
    print(f"ifd_ppl: {entry['ifd_ppl']}")
    print("-"*50 + "\n")

Found 10 entries with NaN 'ifd_ppl':

Entry 1:
Instruction:  (Tokens: 0)
Input: Definition: In this task, you will be shown a prompt from a judicial decision and multiple holding statements derived from citations following text in a legal decision.  Holdings represent the governing legal rule when the law is applied to a particular set of facts. There are five answer choices for each citing text. The correct answer is the holding statement that corresponds to the citing text. You should write an incorrect option. Even though there exist multiple wrong answers, we only need a single wrong answer. There is a <HOLDING> token in the position of the citing text prompt where the holding statement was extracted.
Explain your answer first.

Input: or indirectly by publication, dissemination, solicitation, indorsement or circulation or in any other way to induce directly or indirectly any person to enter or not enter into any obligation or acquire any title or interest in any merchandise or to 