In [1]:
%%capture
%pip install accelerate peft bitsandbytes transformers langchain langchain_community langchain_huggingface nest_asyncio faiss-gpu

In [15]:
import json
import os
import torch
import bitsandbytes
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import LoraConfig

In [3]:
from transformers import BitsAndBytesConfig
from huggingface_hub import login

from langchain.text_splitter import CharacterTextSplitter
from langchain_community.document_loaders import TextLoader

from langchain_community.embeddings.huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS

from langchain.prompts import PromptTemplate
from langchain.schema.runnable import RunnablePassthrough
from langchain.chains import LLMChain
from langchain_huggingface import HuggingFacePipeline

from langchain.docstore.document import Document
from transformers import AutoTokenizer, AutoModelForCausalLM

import nest_asyncio

In [11]:
from huggingface_hub import login

In [12]:
# Read the Hugging Face access token from Kaggle's secret store (secret name
# "huggingface_token") so the token itself never appears in the notebook.
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
hug_token = user_secrets.get_secret("huggingface_token")

In [13]:
# Authenticate this session with the Hugging Face Hub using the token held in hug_token.
login(token=hug_token)

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [16]:
# Read the Q&A dataset from the Kaggle input directory and parse it as UTF-8 JSON
with open('/kaggle/input/labour-law-json/formatted_data_ebp.json', 'r', encoding='utf-8') as file:
    data = json.loads(file.read())

In [17]:
# Build one Document per question/answer pair; every pair is tagged with the
# title of the dataset entry it came from.
documents = []
for entry in data:
    entry_title = entry['Title']
    documents.extend(
        Document(
            page_content=f"Question: {qa['question']}\nAnswer: {qa['answer']}",
            metadata={"title": entry_title},
        )
        for qa in entry['documents']
    )


In [18]:
# Split every Q&A document into chunks that will be embedded and indexed in FAISS.
# split_documents() runs the same split_text() over each document's page_content
# and carries the source document's metadata over to every chunk, giving each
# chunk its own copy instead of one dict shared by all chunks of a document.
# NOTE(review): the retrieval output in this notebook shows passages well over
# 300 characters, so chunk_size is evidently not a hard limit with this
# splitter's default separator — confirm whether a finer split is wanted.
text_splitter = CharacterTextSplitter(chunk_size=300, chunk_overlap=50)
split_docs = text_splitter.split_documents(documents)

In [19]:
# Embed the chunked documents and load them into an in-memory FAISS index.
# The HuggingFaceEmbeddings class from langchain_community is deprecated (it is
# what triggers the warn_deprecated message in this cell's output); the
# maintained class lives in langchain_huggingface, which this notebook already
# installs, so it is imported here and takes precedence over the older import.
from langchain_huggingface import HuggingFaceEmbeddings

embedding_model_name = 'sentence-transformers/all-mpnet-base-v2'
embeddings = HuggingFaceEmbeddings(model_name=embedding_model_name)
db = FAISS.from_documents(split_docs, embeddings)

# Report how many vectors ended up in the index as a quick sanity check.
print("DB index", db.index.ntotal)
print("Vector Storing Done")

  warn_deprecated(


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

DB index 56
Vector Storing Done


In [20]:
# Create a retriever from the FAISS index
# as_retriever() is called without arguments, so the vector store's default
# search settings are used.
retriever = db.as_retriever()

In [21]:
# Function to perform retrieval
def perform_retrieval(query, verbose=True):
    """Return the answer text of the documents most relevant to ``query``.

    The query is sent to the notebook-level ``retriever``. For each returned
    document, only the text after the first "Answer:" marker is kept; a
    document without the marker is kept whole.

    Args:
        query: Text to search the vector store with.
        verbose: When true (the default), print every extracted answer as
            "Result N: <answer>".

    Returns:
        A list of whitespace-stripped answer strings, in the order the
        retriever returned them.
    """
    contexts = []
    for rank, result in enumerate(retriever.invoke(query), start=1):
        # partition() splits on the FIRST marker only, so an answer that itself
        # contains "Answer:" is kept intact instead of being cut short.
        _, marker, tail = result.page_content.partition("Answer:")
        answer = (tail if marker else result.page_content).strip()
        if verbose:
            print(f"Result {rank}: {answer}")
        contexts.append(answer)
    return contexts

In [22]:
# Fetch the TinyLlama chat checkpoint from the Hugging Face Hub.
# The tokenizer and the causal-LM weights are both loaded from the same repo id.
model_name = 'unsloth/tinyllama-chat'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

config.json:   0%|          | 0.00/719 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.20G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/129 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.34k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/438 [00:00<?, ?B/s]

In [23]:
# Function to generate text based on the input question and context
def generate_text(question, context, max_new_tokens=512):
    """Generate an answer to ``question`` grounded in the retrieved ``context``.

    Builds a "Context / Question / Answer:" prompt, runs it through the
    notebook-level ``model`` and ``tokenizer``, and returns only the text the
    model produced after the prompt.

    Args:
        question: The user's question.
        context: Retrieved passages, already joined into a single string.
        max_new_tokens: Upper bound on the number of tokens to generate. This
            budget is independent of the prompt length, so a long context does
            not eat into the space left for the answer.

    Returns:
        The generated answer with surrounding whitespace removed.
    """
    prompt = (f"Context: {context}\n"
              f"Question: {question}\n"
              "Please generate the answer to the question based on the context provided which is the retrieved data.\n"
              "Answer:")

    # Tokenize the prompt and place the tensors on the same device as the model.
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    prompt_length = inputs["input_ids"].shape[1]

    output = model.generate(**inputs, max_new_tokens=max_new_tokens, num_return_sequences=1)

    # The returned sequence starts with the prompt tokens. Decode only what
    # follows them instead of searching the decoded text for "Answer:", which
    # picks the wrong span whenever the context or question contains that marker.
    new_tokens = output[0][prompt_length:]
    return tokenizer.decode(new_tokens, skip_special_tokens=True).strip()

In [24]:
# Test the retrieval and generation with a sample query
sample_query = "What should be the minimum age of child labour?"
# Fetch the answer passages for the query (perform_retrieval also prints each one).
retrieved_contexts = perform_retrieval(sample_query)
# Merge the retrieved passages into one space-separated string for the prompt.
combined_context = " ".join(retrieved_contexts)
generated_answer = generate_text(sample_query, combined_context)

print("Generated Answer:", generated_answer)

Result 1: Under s.48 and s.49 of the Child Rights Law 2019, a child under the age of 14 shall not be employed. If 
free compulsory education requires a child to be in school until after the age of 14, they shall also not 
be employed. If a child is employable under the Child Rights Law, they may engage in employment in 
accordance with existing labour laws.
Result 2: The Child Rights Law defines hazardous work as one of the worst forms of child labour and prohibits 
it for all children under 18 years.1 However, other labour laws, such as the Factories Act, contain 
provisions which are not consistent with the Child Right Law, as they provide for the possibility for 
children from the age of 16 years to be employed in hazardous work, sometimes as a general rule 
(and not as an exception) without the necessary safeguards provided for in international labour 
standards. 
Section 49(a) of the Child Rights Law provides that the Ministry shall establish what types of work 
shall be considere