In [None]:
!apt-get install -y zip unzip
!pip install --upgrade pip
!pip install datasets transformers bitsandbytes>=0.39.0 accelerate>=0.20.0 optimum>=1.20.0 gdown packaging ninja  -q
!gdown --folder https://drive.google.com/drive/folders/1-gHn8PGabr54NKOCBN7zaLAM6iuMJI3-?usp=sharing
!unzip ./Andre_legal_processing/MD_splits.zip -d ./Andre_legal_processing
!ninja --version
!echo $?

In [None]:
!pip install flash-attn --no-build-isolation

In [None]:
#!pip install faiss-gpu-cu12
#!pip install auto-gptq --no-build-isolation

In [1]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [2]:
import os
os.environ['HF_HOME'] = './HF_cache'
os.environ['HF_HUB_CACHE'] = './HF_cache'

import torch
torch.set_grad_enabled(False)

from pathlib import Path
import pandas as pd
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import Dataset, load_from_disk

In [3]:
plain_text_save_path = Path('???')

md_docs_save_path = Path('./MD_Docs')
md_docs_splits_save_path = Path('./MD_splits')

## Split MD files into train/val/test
I will do them 80/10/10 %.\
The test will only be used for the final evaluation.

In [None]:
from random import sample, seed
from shutil import copy
from tqdm import tqdm

In [None]:
seed(0)
md_files = list(md_docs_save_path.glob('*.md'))
# shuffle the files
shuffled_md_files = sample(md_files, len(md_files))
train_size = int(0.8 * len(md_files))
val_size = int(0.1* len(md_files))
test_size = len(md_files) - train_size - val_size

In [None]:
test_set = shuffled_md_files[:test_size]
val_set = shuffled_md_files[test_size:test_size+val_size]
train_set = shuffled_md_files[test_size+val_size:]

In [None]:
sets = {'train': train_set, 'val': val_set, 'test': test_set}
for set_name, set_files in sets.items():
  save_dir = md_docs_splits_save_path / set_name
  for file_ in tqdm(set_files):
    copy(file_, save_dir)

100%|██████████| 342/342 [01:35<00:00,  3.56it/s]
100%|██████████| 42/42 [00:15<00:00,  2.70it/s]
100%|██████████| 44/44 [00:14<00:00,  2.96it/s]


## Create the dataset

In [4]:
doc_splits = {'train':md_docs_splits_save_path/'train', 'val':md_docs_splits_save_path/'val'}

There is a problem including file paths using load_dataset. I will load using pandas first.

In [5]:
def load_split(split_path):
  '''
  input is a Path
  output is a pandas dataframe
  '''
  file_paths = [f for f in split_path.glob('*.md')]
  docs_frame = pd.DataFrame(columns=['file_name', 'path', 'text'])

  for p in tqdm(file_paths):
    f = p.open('r', encoding="utf-8")
    path = str(p)
    file_name = p.stem
    text = f.read()
    row = pd.DataFrame([file_name, path, text]).T
    row.columns = ['file_name', 'path', 'text']
    docs_frame = pd.concat([docs_frame, row])

  docs_frame = docs_frame.reset_index(drop=True)

  return docs_frame

In [6]:
train_frame = load_split(doc_splits['train'])
val_frame = load_split(doc_splits['val'])
train_frame.head()

100%|██████████| 342/342 [00:00<00:00, 1077.96it/s]
100%|██████████| 42/42 [00:00<00:00, 1187.63it/s]


Unnamed: 0,file_name,path,text
0,R (on the application of AS) v Liverpool City ...,MD_splits/train/R (on the application of AS) v...,# R (on the application of AS) v Liverpool Cit...
1,R (on the application of Y) v Secretary of Sta...,MD_splits/train/R (on the application of Y) v ...,# R (on the application of Y) v Secretary of S...
2,R (on the application of SPM) v Secretary of S...,MD_splits/train/R (on the application of SPM) ...,for Refugee Women) v Secretary of State for th...
3,Re JR147_s Application for Judicial Review,MD_splits/train/Re JR147_s Application for Jud...,# Re JR147's Application for Judicial Review [...
4,Sheikh v The Law Society of England and Wales ...,MD_splits/train/Sheikh v The Law Society of En...,# Sheikh v The Law Society of England and Wale...


In [7]:
train_dataset = Dataset.from_pandas(train_frame)
train_dataset

Dataset({
    features: ['file_name', 'path', 'text'],
    num_rows: 342
})

## Process the docs

Load the model and tokenizer:

In [8]:
from transformers import set_seed, BitsAndBytesConfig, DataCollatorWithPadding
set_seed(0)

In [9]:
model_name = "meta-llama/Llama-3.2-3B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name, padding_side="left")
# Most LLMs don't have a pad token by default
tokenizer.pad_token = tokenizer.eos_token
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    attn_implementation="flash_attention_2", # FlashAttention-2 can be combined with other optimization techniques like quantization
    #attn_implementation="sdpa",
    quantization_config=BitsAndBytesConfig(load_in_8bit=True)
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [10]:
!nvidia-smi

Wed Dec 18 12:20:52 2024       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.127.05             Driver Version: 550.127.05     CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA RTX A4000               On  |   00000000:09:00.0 Off |                  Off |
| 50%   64C    P2             38W /  140W |    3703MiB /  16376MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [11]:
# according to one analysis the effective context lenght of Llama is 32k so I use that.
# reduced max_len due to GPU OOM issues.
max_length = int(10e3)
stride = int(.3 * max_length)

with open('./instruction.txt', 'r') as f:
    instructions = f.read()


def split_text_add_instruction_tokenize(examples):
    '''
    Splits input text into chuncks of size max_length by tokenizing it,
    maps that back to the raw text then adds the instruction template.
    Finally tokenizes the new text to get the final model input.
    '''
    result = tokenizer(
        examples["text"],
        max_length=max_length,
        padding=False,
        stride=stride,
        truncation=True,
        return_overflowing_tokens=True,
        return_offsets_mapping=True, #  returns (char_start, char_end) for each token.
    )

    # Extract mapping between new and old indices
    sample_maps = result.pop("overflow_to_sample_mapping")
    for key, values in examples.items():
        if key != 'text':
            result[key] = [values[i] for i in sample_maps]

    # make the raw text match the tokenized ids and add the instruction
    offsets_mappings = result.pop('offset_mapping')
    result['text'] = []
    for off_maps, samp_id in zip(offsets_mappings, sample_maps):
        # remove special tokens
        off_maps = torch.tensor(off_maps)
        non_special_mask = off_maps[:,0] != off_maps[:,1]
        non_special_tokens_mappings = off_maps[non_special_mask]
        # extract matching text
        text_start = non_special_tokens_mappings.min()
        text_end = non_special_tokens_mappings.max()
        original_text = examples["text"][samp_id]
        document_section = original_text[text_start:text_end]
        # add the instruction prompt
        intsruction_and_document = (
            instructions +
            'The section of the document:\n<section>' +
            document_section +
            '</section>\nJSON Answer:\n{'
        )
        result['text'].append(intsruction_and_document)

    final_result = tokenizer(
        result["text"],
        truncation=False,
        padding=False,
        # can't convert to tensor here since it will try to collate into batches
        #return_tensors="pt"
    )
    final_result['file_name'] = result['file_name']
    final_result['path'] = result['path']
    final_result['text'] = result['text']

    return final_result

In [12]:
tokenized_save_path = './my_HF_Arrow_Dataset/trainset_tokenized'


trainset_tokenized = train_dataset.map(split_text_add_instruction_tokenize, batched=True, batch_size=300)
trainset_tokenized = trainset_tokenized.with_format("torch")
trainset_tokenized.save_to_disk(tokenized_save_path)


# trainset_tokenized = load_from_disk(tokenized_save_path)
# trainset_tokenized = trainset_tokenized.with_format("torch")

Map:   0%|          | 0/342 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/786 [00:00<?, ? examples/s]

In [13]:
trainset_tokenized

Dataset({
    features: ['file_name', 'path', 'text', 'input_ids', 'attention_mask'],
    num_rows: 786
})

In [14]:
# for text in tokenizer.batch_decode(trainset_tokenized['input_ids'][:3], skip_special_tokens=True):
#     print('_'*50)
#     print(text)

In [15]:
# for i in range(4):
#     print('_'*50)
#     print(trainset_tokenized['text'][i])

In [16]:
model.generation_config

GenerationConfig {
  "bos_token_id": 128000,
  "do_sample": true,
  "eos_token_id": [
    128001,
    128008,
    128009
  ],
  "temperature": 0.6,
  "top_p": 0.9
}

In [17]:
from transformers import GenerationConfig
generation_config = GenerationConfig(
    max_new_tokens=400,
    do_sample=True,
    eos_token_id=model.config.eos_token_id,
    bos_token_id=model.config.bos_token_id,
    stop_strings= '}',
    #num_return_sequences=3,
)

In [18]:
# default is to pad to the longest sequence in the batch
# since we have set side left when loading the tokenizer it should do left padding
dynamic_padder = DataCollatorWithPadding(tokenizer=tokenizer)

def generate_answer(context):
    model_inputs = {k:context[k] for k in ['input_ids','attention_mask']}
    batched_inputs = dynamic_padder(model_inputs)
    batched_inputs = {k:v.cuda() for k,v in batched_inputs.items()}
    outputs = model.generate(
            **batched_inputs,
            generation_config=generation_config,
            tokenizer=tokenizer
        )
    answer = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    return {'model_answer': answer}


In [19]:
answers = trainset_tokenized.map(generate_answer, batched=True, batch_size=3)
answers.save_to_disk('./my_HF_Arrow_Dataset/trainset_answers')

Map:   0%|          | 0/786 [00:00<?, ? examples/s]

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for

Saving the dataset (0/1 shards):   0%|          | 0/786 [00:00<?, ? examples/s]

In [20]:
# browse the answer
train_case_names = train_frame['file_name'].to_list()

count = 0
for case_name in train_case_names:
    this_case_answers = answers.filter(
                        lambda r: [name == case_name for name in r['file_name']],
                        batched=True,
                        batch_size=1000,
      )

    print('_'*50)
    print(case_name)
    print()
    for ans in this_case_answers['model_answer']:
        query, response = ans.split('</section>\nJSON Answer:\n')
        print(response)
        print('*'*50)

    count += 1
    if count > 3:
        break

Filter:   0%|          | 0/786 [00:00<?, ? examples/s]

__________________________________________________
R (on the application of AS) v Liverpool City Council

{ 
"reasoning": "The court has considered the Claimant's history of trauma and his asylum application and the Defendant's duties, the court concluded that the Claimant had a triable issue that permission should be granted. The court accepted the Claimant's evidence and rejected the Defendant's argument about the balance of convenience in favor of granting interim relief.",

"important_factors": "the Claimant's history of trauma, the Defendant's previous acceptance of the Claimant's asserted age, recent evidence of the Claimant's birth certificate, the Defendant's duties towards a former relevant child, the mandatory character of the order sought, the Claimant's asylum application, the balance of convenience argument made by the Defendant.",

"answer": "acquittal"
}
**************************************************


Filter:   0%|          | 0/786 [00:00<?, ? examples/s]

__________________________________________________
R (on the application of Y) v Secretary of State for the Home Department

{ "reasoning": "The court's final verdict is a dismissal of the claimant's claim, specifically the rationality of the decision regarding the claimant's initial age assessment. This indicates that the court accepted the defendant's justification for the assessment, finding no irrationality in the decision.", 

"important_factors": "The facts and evidence presented by both the claimant and the defendant, including the assessment of the claimant's age, the application of the Dublin III Regulation, and the evaluation of the claimant's risk of absconding, were key factors in the court's decision.", 

"answer": "acquittal" }
**************************************************


Filter:   0%|          | 0/786 [00:00<?, ? examples/s]

__________________________________________________
R (on the application of SPM) v Secretary of State for the Home Department R (on the application of

{ "answer": "neither", "reasoning": "The court held that the Defendant's decision to open Derwentside IRC without adequate legal aid services does not amount to a hindrance, impediment or breaching of the duty under section 1 of LASPO to secure that legal aid is made available. The law of access to justice allows for flexible arrangements, including provision of services by telephone or other electronic means.", "important_factors": "The Defendant has statutory powers to detain people for immigration purposes. The Claimants have not joined the Lord Chancellor to this claim, and confirmed in their reply that they do not allege any breach of duty by the Lord Chancellor in the provision of legal aid. The law of access to justice allows for flexible arrangements, including provision of services by telephone or other electronic means." }
***

Filter:   0%|          | 0/786 [00:00<?, ? examples/s]

__________________________________________________
Re JR147_s Application for Judicial Review

{ "reasoning": "The key factors to consider in the case are the applicant's claim of age, the respondent's refusal to consider the issue, the applicant's experience with human trafficking and the availability of an appeal through the First-tier Tribunal.", "important_factors": "The applicant claims to be a Kuwaiti Bidun of 17/04/2003, the respondent has refused to consider the age issue, the applicant was subjected to a coercive immigration interview and the applicant can appeal the decision through the First-tier Tribunal.", "answer": "conviction" }
**************************************************
{ "reasoning": "The court should exercise its jurisdiction because the matter can no longer be dealt with by a specialist tribunal, as it has been found that the applicant is a victim of modern slavery/trafficking and the Greek authorities treated him as an adult. However, the respondent's assess

## Debug the prompt

In [33]:
train_case_names = list(trainset_tokenized['file_name'])

count = 0
for case_name in tqdm(train_case_names[40:]):
    # this might be only one chunk
    document_chuncks = trainset_tokenized.filter(
                        lambda r: [name == case_name for name in r['file_name']],
                        batched=True,
                        batch_size=1000,
      )
    answer = document_chuncks.map(generate_answer, batched=True, batch_size=3)

    print('_'*50)
    print(case_name)
    print()
    for ans in answer['model_answer']:
        query, response = ans.split('</section>\nJSON Answer:\n')
        print(response)
        print('*'*50)

    count += 1
    if count > 3:
        break

  0%|          | 0/746 [00:00<?, ?it/s]

Filter:   0%|          | 0/786 [00:00<?, ? examples/s]

Map:   0%|          | 0/2 [00:00<?, ? examples/s]

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  0%|          | 1/746 [00:51<10:36:17, 51.24s/it]

__________________________________________________
HJ (by her litigation friend) v A local authority

{ "answer": "conviction", "reasoning": "Since the Defendant was deemed to have a duty to assess HJ as a child in need, but failed to do so after being aware of the preliminary assessment by RBG that HJ was an adult, the Defendant can be considered guilty of a breach of its statutory duty.", "important_factors": "The Defendant's failure to assess HJ, despite being aware of the preliminary assessment, is the key issue here. The Defendant's refusal was based on RBG's responsibility, but this was not a valid reason. The guidance is clear that a local authority should not pass the child on to another authority while there is a dispute about who is responsible. The Defendant's submission that the Claimant had an alternative remedy to seeking judicial review of the Defendant's refusal to assess the Defendant as a child is not supported by the law." }
******************************************

Filter:   0%|          | 0/786 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  0%|          | 2/746 [01:40<10:22:22, 50.19s/it]

__________________________________________________
R (on the application of NN) v Secretary of State for the Home Department_ R (on the application of

{ "reasoning": "The court grants general interim relief for the claimants' claim that the Secretary of State's policy of ending support 45 days after a conclusive determination is unlawful. The court has jurisdiction to grant this form of relief due to its wide scope under s 37 of the Senior Courts Act 1981. The balance of convenience comes down in favour of granting this relief due to the serious risk of irreparable harm to a significant number of vulnerable victims of modern slavery if their support ends after 45 days.", 

"important_factors": "The claimants' application for judicial review challenging the Secretary of State's policy of ending support 45 days after a conclusive determination. The Secretary of State submitted that individual interim relief has been ordered for the claimants and that general interim relief would make no

Filter:   0%|          | 0/786 [00:00<?, ? examples/s]

Map:   0%|          | 0/2 [00:00<?, ? examples/s]

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  0%|          | 3/746 [02:47<11:54:48, 57.72s/it]

__________________________________________________
PHAN v HM ADVOCATE - 2018 JC 195

{ "answer": "neither", "important_factors": "The court must decide whether the Human Trafficking and Exploitation (Scotland) Act 2015 is incompatible with the EU Human Trafficking Directive because the minuter lacks access to a defence for being trafficked into the UK and compelled to commit the offence. The Directive provides a defence for victims of trafficking who have been compelled to commit an offence as a direct consequence of being a victim of trafficking. However, the Act does not provide such a defence, leaving victims in Scotland without the protected status afforded by the Directive.", "reasoning": "The court must determine whether the absence of a statutory defence in the 2015 Act for being trafficked into the UK and compelled to commit an offence is incompatible with the EU Human Trafficking Directive. The Directive requires Member States to provide for the possibility of not imposing pen

Filter:   0%|          | 0/786 [00:00<?, ? examples/s]

Map:   0%|          | 0/2 [00:00<?, ? examples/s]

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  0%|          | 3/746 [03:34<14:46:04, 71.55s/it]

__________________________________________________
PHAN v HM ADVOCATE - 2018 JC 195

{ "answer": "conviction", "reasoning": "The court held that the Lord Advocate's discretion not to prosecute met the requirement of Art 8 of the Directive that national authorities be entitled not to prosecute victims of trafficking. The court also found that the respondent's guidelines, which contained a presumption against the prosecution of victims of trafficking, were compliant with the Directive.", "important_factors": "The court held that the Lord Advocate's discretion not to prosecute met the requirement of Art 8 of the Directive, and the respondent's guidelines were compliant with the Directive." }
**************************************************
{  
  "reasoning": "Given the procedural guidelines and rules in place for handling human trafficking cases, the defence team did not contest the court proceedings and did not raise any points that could be seen as the 'correct' verdict. The trial ult




In [30]:
print(query)

Your role is that of a legal lawyer specializing in modern slavery cases. 
You are given part of a legal document reporting the result of a court case. The document is delimited by <section> tags.
Be logical and factual when answering. Provide step by step reasoning.
Only use the information provided in this section of the document.

According to this section of the document, is the court's final verdict of the defendant a conviction (“guilty”), an acquittal (“not guilty”), or neither?

The JSON format of the answer is given in the example output below:
{
"reasoning": "Write your reasoning here. Keep it under 70 words and answer concisely.",

"important_factors": "Write the important deciding factors for the case here. Keep it under 70 words and answer concisely.",
	
"answer": "Write your final choice here. Choose exactly one of the three options: conviction, acquittal, neither."
}
Do not write an introduction or summary. Keep the whole answer under 180 words. Respond only with valid J

In [None]:
!nvidia-smi

Mon Dec 16 15:29:36 2024       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 560.35.03              Driver Version: 560.35.03      CUDA Version: 12.6     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA RTX A4000               On  |   00000000:07:00.0 Off |                  Off |
| 41%   36C    P8             14W /  140W |       2MiB /  16376MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

``` json
"tetd": "kkc"
"tetd": "kkc"
"tetd": "kkc"
```