***LEGAL ASSISTANCE BOT BASED ON LLM (LLAMA) | Fine-Tuned over Legal Texts***

**TANMAY SINGH**\
**CSAI | IIIT-D**\
**CLASS OF '25**

***P.S***: This file is **NOT** the original source code, but only a **demonstration file** which mimics the actual workflow

***Importing the Dependencies***

In [1]:
!pip install peft
!pip install tqdm
!pip install torch
!pip install gdown
!pip install wandb
!pip install pynvml
!pip install gradio
!pip install datasets
!pip install accelerate
!pip install transformers
!pip install mplcyberpunk
!pip install -U bitsandbytes
!pip install --upgrade transformers



In [2]:
import warnings
warnings.filterwarnings("ignore")

In [3]:
import os
import re
import time
import json
import torch
import wandb
import gdown
import accelerate
import gradio as gr
import numpy as num
import pandas as pan
import mplcyberpunk as mcy
import matplotlib.pyplot as mtp

from tqdm.auto import tqdm
from sklearn.metrics import *
from huggingface_hub import login
from datasets import load_dataset, Dataset, DatasetDict
from sklearn.model_selection import train_test_split
from pynvml import nvmlInit, nvmlDeviceGetHandleByIndex, nvmlDeviceGetMemoryInfo, nvmlShutdown
from peft import (LoraConfig, get_peft_model, prepare_model_for_kbit_training,
                  TaskType, PeftModelForCausalLM, PeftModel)
from transformers import (AutoTokenizer, AutoModelForCausalLM,
                          Trainer, TrainingArguments, BitsAndBytesConfig, DataCollatorForLanguageModeling)

In [None]:
login(token='') #Access Token Removed

In [None]:
wandb.login(key="") #Access Token Removed

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /teamspace/studios/this_studio/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mtanmay21569[0m ([33miiitd-sachin[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

***Setting up the Variables***

In [6]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

In [7]:
device

device(type='cuda', index=0)

In [8]:
model_name = "meta-llama/Meta-Llama-3.1-8B-Instruct"

In [9]:
checkpoints_dir = "./model_checkpoints/"
data_dir = "./data/"

os.makedirs(checkpoints_dir, exist_ok=True)
os.makedirs(data_dir, exist_ok=True)

In [10]:
base_model = "meta_llama"
peft_model = f'QLoRa-{base_model}-ft_model'
peft_output_dir = checkpoints_dir + peft_model
peft_output_dir

'./model_checkpoints/QLoRa-meta_llama-ft_model'

***Downloading the Dataset***

In [11]:
links = [
    "https://drive.google.com/uc?id=1C5CmT7p6aa8mNIW8nyXDbLAGRMITkjOT",
    "https://drive.google.com/uc?id=1OsJEK1FFSdHLUpEPAGtyv_YF4WEDwvzL",
    "https://drive.google.com/uc?id=1AR_NgDNqmRllO3L5YbU3Fw7nrv3Ov9Rc"
]

In [12]:
for link in links:
    drive_link = link
    gdown.download(drive_link, output = data_dir, quiet=False)

Downloading...
From: https://drive.google.com/uc?id=1C5CmT7p6aa8mNIW8nyXDbLAGRMITkjOT
To: /teamspace/studios/this_studio/Intern_Project/data/dataset1.json
100%|██████████| 675k/675k [00:00<00:00, 76.2MB/s]
Downloading...
From: https://drive.google.com/uc?id=1OsJEK1FFSdHLUpEPAGtyv_YF4WEDwvzL
To: /teamspace/studios/this_studio/Intern_Project/data/dataset2.json
100%|██████████| 2.18M/2.18M [00:00<00:00, 129MB/s]
Downloading...
From: https://drive.google.com/uc?id=1AR_NgDNqmRllO3L5YbU3Fw7nrv3Ov9Rc
To: /teamspace/studios/this_studio/Intern_Project/data/dataset3.json
100%|██████████| 1.28M/1.28M [00:00<00:00, 102MB/s]


***Preprocessing the Dataset***

In [13]:
formatted_dataset = []

In [14]:
with open("./data/dataset1.json", "r") as f:
    dataset1 = json.load(f)

with open("./data/dataset2.json", "r") as f:
    dataset2 = json.load(f)

with open("./data/dataset3.json", "r") as f:
    dataset3 = json.load(f)

In [15]:
for data in dataset1:
    formatted_dataset.append({
        "instruction": "Answer the legal question based on Indian Penal Code(IPC).",
        "input"      : data["question"],
        "output"     : data["answer"],
        "source"     : "IPC"
    })

In [16]:
for data in dataset2:
    formatted_dataset.append({
        "instruction": "Answer the legal question based on Criminal Procedure Code(CRPC).",
        "input"      : data["question"],
        "output"     : data["answer"],
        "source"     : "CRPC"
    })

In [17]:
for data in dataset3:
    formatted_dataset.append({
        "instruction": "Answer the legal question based on Indian Constitution.",
        "input"      : data["question"],
        "output"     : data["answer"],
        "source"     : "Constitution"
    })

In [18]:
with open("./data/dataset_formatted.json", "w") as f:
    json.dump(formatted_dataset, f, indent=2)

***Preparing the Dataset for Fine-tuning***

In [19]:
dataset = load_dataset("json", data_files="./data/dataset_formatted.json")

Generating train split: 0 examples [00:00, ? examples/s]

In [20]:
sources = [data["source"] for data in formatted_dataset]

training_set, testing_set = train_test_split(
    formatted_dataset, test_size=0.1, random_state=42, stratify=sources
)

In [21]:
train_dataset = Dataset.from_list(training_set)
test_dataset = Dataset.from_list(testing_set)

dataset = DatasetDict({
    "train": train_dataset,
    "test": test_dataset
})

In [22]:
dataset

DatasetDict({
    train: Dataset({
        features: ['instruction', 'input', 'output', 'source'],
        num_rows: 13088
    })
    test: Dataset({
        features: ['instruction', 'input', 'output', 'source'],
        num_rows: 1455
    })
})

***Initialising the Tokeniser***

In [23]:
tokenizer = AutoTokenizer.from_pretrained(model_name, use_auth_token=True, use_fast=True, trust_remote_code=True)

In [24]:
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"

***Pre-trained Model***

In [25]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=False,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16
)

base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    trust_remote_code=True,
    low_cpu_mem_usage=True,
    torch_dtype=torch.float16,
    quantization_config=bnb_config,
    device_map="auto"
)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

***QLoRA Config for Fine-tuning***

In [26]:
lora_config = LoraConfig(
    r=16,
    lora_alpha=64,
    lora_dropout=0.2,
    bias="none",
    init_lora_weights=True,
    task_type=TaskType.CAUSAL_LM,
    target_modules=["q_proj", "v_proj", "k_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
)

In [27]:
model = prepare_model_for_kbit_training(base_model)
model.gradient_checkpointing_enable()

peft_model = get_peft_model(model, lora_config)

In [28]:
peft_model

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(128256, 4096)
        (layers): ModuleList(
          (0-31): 32 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.2, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): lora

***Helper Functions***

In [29]:
def tokenise_samples(sample):
    prompt = (
        f"<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n"
        f"{sample['instruction']}\n\n{sample['input']}<|eot_id|>"
        f"<|start_header_id|>assistant<|end_header_id|>\n{sample['output']}<|eot_id|>"
    )

    tokenized = tokenizer(
        prompt,
        truncation=True,
        padding="max_length",
        max_length=512
    )

    tokenized["labels"] = tokenized["input_ids"].copy()
    return tokenized

In [30]:
tokenized_dataset = dataset.map(
    lambda samples: tokenise_samples(samples),
    remove_columns=dataset['train'].column_names
)

Map:   0%|          | 0/13088 [00:00<?, ? examples/s]

Map:   0%|          | 0/1455 [00:00<?, ? examples/s]

In [31]:
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)

***Fine-tuning the Model***

In [32]:
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

In [33]:
train_dataset = tokenized_dataset["train"].shuffle(seed=42).select(range(500))
val_dataset   = tokenized_dataset["test"].shuffle(seed=42).select(range(50))

In [34]:
training_args = TrainingArguments(
    output_dir=peft_output_dir,                  
    overwrite_output_dir=True,                  
    per_device_train_batch_size=4,             
    per_device_eval_batch_size=4,
    num_train_epochs=5,
    learning_rate=0.0001,
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    logging_dir=checkpoints_dir + "logs/",       
    fp16=True            
)

In [35]:
peft_model.config.use_cache = False
peft_model.config.pad_token_id = tokenizer.pad_token_id

In [36]:
trainer = Trainer(
    model=peft_model,
    tokenizer=tokenizer,              
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator
)

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [37]:
start = time.time()
trainer.train()
end = time.time()

print(f"Time taken to fine-tune the model: {((end - start) / 60):.2f} minutes")



Epoch,Training Loss,Validation Loss
1,1.3598,1.080491
2,0.7904,1.082584
3,0.4535,1.242537
4,0.2605,1.461627
5,0.173,1.746478


Time taken to fine-tune the model: 82.62 minutes


***Generating Inference***

In [42]:
peft_model.eval()

def generate_response(instruction, query):
    prompt = (
        f"<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n"
        f"{instruction}\n\n{query}<|eot_id|>"
        f"<|start_header_id|>assistant<|end_header_id|>\n"
    )

    inputs = tokenizer(prompt, return_tensors="pt").to(peft_model.device)

    with torch.no_grad():
        outputs = peft_model.generate(
            **inputs,
            max_new_tokens=256,
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=tokenizer.eos_token_id
        )

    response = tokenizer.decode(
        outputs[0],
        skip_special_tokens=True
    ).split("<|start_header_id|>assistant<|end_header_id|>\n")[-1].strip()

    return response

In [None]:
demo = gr.Interface(
    fn=generate_response,
    inputs=[
        gr.Textbox(label="Instruction", value="Answer the legal question."),
        gr.Textbox(
            label="Question",
            value="What is 'Man', 'Woman', 'Person', and 'Public' according to the Indian Penal Code?"
        )
    ],
    outputs=gr.Textbox(label="Response"),
    title="LLM-based Legal Assistant",
    description="Hello! I'm a legal assistance bot. I'll answer your queries to legal matters based on IPC, CRPC & Constitution Data. Let's get started :)"
)

In [44]:
demo.launch(share=True)

* Running on local URL:  http://127.0.0.1:7861


* Running on public URL: https://2f4375a3bce0e2241d.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


