# Dependencies

In [None]:
pip install sentence-transformers langchain_community unstructured faiss-cpu Together together


# Code

In [None]:
import json


# Read the curated question/answer pairs shipped with the Kaggle dataset.
with open("/kaggle/input/final-database/legal_database.json", "r", encoding="utf-8") as f:
    qa_data = json.load(f)

# `questions` keeps the dataset order (it is what gets embedded later); `answers` maps a
# question string to its answer, so a repeated question keeps only its last answer.
questions = []
answers = {}
for qa_pair in qa_data:
    questions.append(qa_pair["question"])
    answers[qa_pair["question"]] = qa_pair["answer"]


In [None]:
from sentence_transformers import SentenceTransformer


# Bi-encoder used to match a user question against the curated question bank.
model = SentenceTransformer(
    model_name_or_path="sentence-transformers/msmarco-distilbert-base-v4"
)

# One embedding per stored question, returned as a single torch tensor.
question_embeddings = model.encode(questions, convert_to_tensor=True)

# Keep a local copy of the model so later cells can reload it from disk.
model.save("qa_model")


In [None]:
import torch


# Store the embeddings together with the questions they were computed from, so the
# row order of the tensor can always be mapped back to question text.
torch.save(
    dict(embeddings=question_embeddings, questions=questions),
    "question_embeddings.pt",
)


In [None]:
import numpy as np
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import DirectoryLoader, PyPDFDirectoryLoader


# A single splitter is shared by both loaders: chunk_size=512 with an overlap of 100.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=100)

# Plain-text and PDF sources are read from the same dataset directory.
txt_loader = DirectoryLoader('/kaggle/input/indian-law', glob="*.txt")
pdf_loader = PyPDFDirectoryLoader('/kaggle/input/indian-law')

txt_documents = txt_loader.load_and_split(text_splitter)
pdf_documents = pdf_loader.load_and_split(text_splitter)

# TXT chunks come first, then PDF chunks; only the raw text of each chunk is kept.
all_documents = [*txt_documents, *pdf_documents]
chunks = []
for document in all_documents:
    chunks.append(document.page_content)

# dtype=object stores the strings as Python objects, so loading needs allow_pickle=True.
np.save("legal_chunks.npy", np.array(chunks, dtype=object))

print(f"✅ Processed {len(chunks)} legal text chunks from TXT & PDF.")


In [None]:
# Spot check: display the second chunk to eyeball what the splitter produced.
chunks[1]

In [None]:
from langchain_community.embeddings import HuggingFaceEmbeddings



# Wrap InLegalBERT as a LangChain embedding model.
embedding_model = HuggingFaceEmbeddings(model_name="law-ai/InLegalBERT")

# embed_documents encodes the whole list in batches instead of running one forward pass
# per chunk through embed_query. The result is stored as float32, the dtype FAISS
# operates on, which also halves the size of the saved file.
legal_embeddings = np.asarray(
    embedding_model.embed_documents(list(chunks)), dtype="float32"
)

np.save("legal_embeddings.npy", legal_embeddings)


In [None]:
# Persist the model object held by the LangChain wrapper (`embedding_model.client`) into
# ./legal_model; a later cell reloads it from "/kaggle/working/legal_model".
model_path = "legal_model"
embedding_model.client.save_pretrained(model_path) 

In [None]:
import faiss

# Reload the chunk embeddings saved earlier. The file holds a plain numeric matrix, so
# pickle support is not needed (and is safer left off). FAISS operates on contiguous
# float32 data, so convert explicitly instead of relying on the wrapper to cast.
legal_embeddings = np.ascontiguousarray(
    np.load("/kaggle/working/legal_embeddings.npy"), dtype="float32"
)
dim = legal_embeddings.shape[1]

# Exact (brute-force) L2 index over all chunk vectors.
faiss_index = faiss.IndexFlatL2(dim)
faiss_index.add(legal_embeddings)

faiss.write_index(faiss_index, "legal_faiss.index")


In [None]:
import os
from getpass import getpass

import together

# Credentials must not be committed with the notebook. Read the key from the
# TOGETHER_API_KEY environment variable and fall back to an interactive prompt.
# NOTE: the key that used to be hard-coded here has been exposed and should be revoked.
together.api_key = os.environ.get("TOGETHER_API_KEY") or getpass("Together API key: ")

In [None]:
import json
import torch
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer, util
from langchain_community.embeddings import HuggingFaceEmbeddings


# Retrieval artefacts written by the earlier cells.
# NOTE: allow_pickle=True and weights_only=False both unpickle arbitrary objects; that is
# acceptable only because these files were produced by this notebook itself.
faiss_index = faiss.read_index("/kaggle/working/legal_faiss.index")
legal_chunks = np.load("/kaggle/working/legal_chunks.npy", allow_pickle=True)

data = torch.load("question_embeddings.pt", weights_only=False)
question_embeddings, questions = data["embeddings"], data["questions"]

# Models: the QA bi-encoder and the legal-text embedding model, both from local copies.
model = SentenceTransformer("/kaggle/working/qa_model")
embedding_model = HuggingFaceEmbeddings(model_name="/kaggle/working/legal_model")



def find_closest_match(user_question):
    """Find the stored question most similar to `user_question`.

    The question is encoded with the global `model` and compared by cosine similarity
    against the global `question_embeddings`.

    Returns:
        A `(question, score)` tuple: the best-matching entry of the global `questions`
        list and its cosine similarity as a Python float.
    """
    query_vector = model.encode(user_question, convert_to_tensor=True)

    # The similarity matrix has a single row (one query); work on that row directly.
    similarity_row = util.pytorch_cos_sim(query_vector, question_embeddings)[0]
    top_idx = int(similarity_row.argmax())

    return questions[top_idx], similarity_row[top_idx].item()


def search_legal_docs(query,top_k=5):
    """Answer `query` with the LLM, grounded on the most relevant legal chunk.

    The query is embedded with the global `embedding_model`, the `top_k` nearest chunks
    are fetched from the global `faiss_index`, those chunks are re-ranked by cosine
    similarity to the query, and the best one is placed in the prompt sent to Together.

    Args:
        query: The user's question.
        top_k: Number of nearest chunks to fetch from FAISS before re-ranking.

    Returns:
        The stripped completion text, or an "Error: ..." string when no chunk could be
        retrieved or the API response contains no choices.
    """
    # FAISS operates on float32 row vectors.
    query_embedding = np.asarray(
        embedding_model.embed_query(query), dtype="float32"
    ).reshape(1, -1)
    _, indices = faiss_index.search(query_embedding, top_k)

    # FAISS pads the result with -1 when the index holds fewer than top_k vectors.
    # Indexing with -1 would silently return the LAST chunk, so drop those slots.
    retrieved_chunks = [legal_chunks[idx] for idx in indices[0] if idx >= 0]
    if not retrieved_chunks:
        return "Error: No relevant legal text found."

    # Re-rank the candidates by cosine similarity; embed them in one batch.
    query_vector = torch.tensor(query_embedding, dtype=torch.float32)
    chunk_vectors = torch.tensor(
        embedding_model.embed_documents(list(retrieved_chunks)), dtype=torch.float32
    )
    scores = util.pytorch_cos_sim(query_vector, chunk_vectors)[0]
    best_chunk = retrieved_chunks[int(scores.argmax())]

    model_name = "mistralai/Mistral-7B-Instruct-v0.2"  # Choose another if needed

    prompt = f"""
    As a legal chatbot specializing in the Indian Penal Code, you are tasked with providing highly accurate and contextually appropriate responses. Ensure your answers meet these criteria:
- First of all state the law in which the context comes in. Then Respond in a 5 bullet-point format to clearly delineate distinct aspects of the legal query.
- Each point should accurately reflect the legal provision in question, avoiding over-specificity unless directly relevant to the user's query.
- Clarify the general applicability of the legal rules or sections mentioned, highlighting any common misconceptions or frequently misunderstood aspects.
- Limit responses to essential information that directly addresses the user's question, providing concise yet comprehensive explanations.
- Avoid assuming specific contexts or details not provided in the query, focusing on delivering universally applicable legal interpretations unless otherwise specified.


CONTEXT: {best_chunk}

QUESTION: {query}
ANSWER:


    """

    response = together.Complete.create(
        model=model_name,
        prompt=prompt,
        max_tokens=400,
        temperature=1
    )

    if "choices" in response and response["choices"]:
        return response["choices"][0]["text"].strip()
    else:
        return "Error: No response received from Together AI."

def get_answer(user_question):
    """Route a user question to the curated answer bank or to document retrieval.

    The similarity returned by `find_closest_match` decides the route:
      * score >= 0.7       -> the stored answer from the global `answers` mapping
      * 0.3 < score < 0.7  -> the text produced by `search_legal_docs`
      * anything else      -> a fixed apology message

    Returns:
        The formatted reply as a string.
    """
    matched_question, score = find_closest_match(user_question)

    # Confident match: serve the curated answer.
    if score >= 0.7:
        return f"✅ Exact Match Found({score:.2f}):\n{answers[matched_question]}"

    # Too dissimilar (a NaN score also lands here): give up.
    if not score > 0.3:
        return "Sorry cant give response at the moment."

    # Middle band: fall back to retrieval over the legal documents.
    generated_text = search_legal_docs(user_question)
    return f"⚖️ Legal Text Found:\n{generated_text}"


# Smoke test: send one sample question through get_answer and print the reply.
user_question = "i got in an accident. What should i do?"
response = get_answer(user_question)
print(response)


## location awareness

In [None]:
import requests
import json
import re

import os
from getpass import getpass

# Credentials must not be committed with the notebook. Read the key from the
# GOMAPS_API_KEY environment variable and fall back to an interactive prompt.
# NOTE: the key that used to be hard-coded here has been exposed and should be rotated.
api_key = os.environ.get("GOMAPS_API_KEY") or getpass("GoMaps API key: ")
query = "law firms in dharwad"

# Pass the query through `params` so requests URL-encodes it (it contains spaces), and
# keep the key out of the `url` variable. A timeout and an explicit status check prevent
# a hung or failed request from turning into a confusing error further down.
url = "https://maps.gomaps.pro/maps/api/place/textsearch/json"
http_response = requests.get(url, params={"query": query, "key": api_key}, timeout=30)
http_response.raise_for_status()
response = http_response.json()


# Collects one formatted summary string per place returned by the search.
places = []
def get_photo_href(photo_info):
    """Return the first attribution URL found in a place photo entry.

    Args:
        photo_info: A photo dict from the places response (may be empty or None). Its
            "html_attributions" value is expected to be a list of HTML anchor strings.

    Returns:
        The value of the first non-empty `href="..."` attribute found, or the string
        "No attribution link" when there is none.
    """
    if not photo_info:
        return "No attribution link"

    # `or []` also covers an explicit null value for the key.
    for attribution in photo_info.get("html_attributions") or []:
        # Check the match object before using it: an attribution such as `href=""`
        # contains the marker text but does not match the pattern, which previously
        # raised AttributeError on `.group(1)`.
        match = re.search(r'href="([^"]+)"', attribution)
        if match:
            return match.group(1)

    return "No attribution link"


# Build one summary line per search result.
for place in response.get("results", []):
    name = place.get("name", "Unknown")
    address = place.get("formatted_address", "No address found")
    rating = place.get("rating", "No rating")

    # "photos" can be missing OR present as an empty list; `.get("photos", [{}])[0]`
    # only covered the first case and raised IndexError on the second.
    photos = place.get("photos") or [{}]
    reference = get_photo_href(photos[0])

    places.append(f"{name}, Address: {address}, Rating: {rating}, reference:{reference}")


# Blank line between places so the LLM prompt keeps them visually separate.
places_text = "\n\n".join(places)


# Ask the LLM to pick and present the three most relevant places from the list above.
prompt = f"""
Based on the given legal places, provide the reference, name, address ,rating  of the most relevant ones to the user:

{places_text}

Only return the 3 most relevant results in a user-friendly way.
"""

# temperature=0 keeps the selection deterministic for a given list of places.
response = together.Complete.create(
    model="mistralai/Mistral-7B-Instruct-v0.2",
    prompt=prompt,
    max_tokens=500,
    temperature=0,
)

# Print the first completion, or an error line when the reply carries no choices.
print(
    response["choices"][0]["text"].strip()
    if "choices" in response and response["choices"]
    else "Error: No response received from Together AI."
)




## legal advice from lawyers

In [None]:
import json
import numpy as np
from langchain_community.embeddings import HuggingFaceEmbeddings

# Load your dataset
with open("/kaggle/input/legal-advice/answers_data.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# Keep only the fields needed downstream and merge each thread's answers into one
# newline-separated string.
# Built with a comprehension on purpose: the previous loop bound a notebook-level
# variable named `answers` for every entry, replacing the question -> answer mapping
# that get_answer() reads, so re-running the chatbot afterwards failed.
processed_data = [
    {
        "question_url": entry["question_url"],
        "full_text": entry["full_text"],
        "joined_answers": "\n".join(entry["answers"]),
    }
    for entry in data
]

# Save cleaned and processed data
with open("processed_lawyer_data.json", "w", encoding="utf-8") as f:
    json.dump(processed_data, f, indent=2, ensure_ascii=False)


In [None]:
import os
# Set WANDB_DISABLED for this process; presumably this keeps the fine-tuning run further
# down from trying to log to Weights & Biases (TODO confirm it is still honoured).
os.environ["WANDB_DISABLED"] = "true"

## summarize question and answer before embedding

In [None]:
import json

# Load the JSON data
# The file is written as UTF-8 with ensure_ascii=False, so it can contain raw non-ASCII
# characters; read it back with the same explicit encoding instead of the platform default.
with open('/kaggle/working/processed_lawyer_data.json', 'r', encoding='utf-8') as file:
    data = json.load(file)



In [None]:
from transformers import BartTokenizer, BartForConditionalGeneration
# Fetch the BART CNN summarisation checkpoint and its tokenizer by hub name.
# NOTE: this rebinds the notebook-level name `model`, which find_closest_match() reads
# as a global; that function will not work after this cell until `model` is reloaded.
tokenizer = BartTokenizer.from_pretrained("facebook/bart-large-cnn")
model = BartForConditionalGeneration.from_pretrained("facebook/bart-large-cnn")


In [None]:
# Keep a local copy of the summariser; the summarisation cell loads model and tokenizer
# from "/kaggle/working/bart_summarizer".
model.save_pretrained("bart_summarizer")
tokenizer.save_pretrained("bart_summarizer")

In [None]:
import pandas as pd
from transformers import BartTokenizer, BartForConditionalGeneration
from tqdm import tqdm
import torch

# Records to summarise: one row per lawyer Q&A thread.
df = pd.read_json("/kaggle/working/processed_lawyer_data.json")  # Or use pd.read_csv(...) depending on your format

# Summariser and tokenizer come from the local copy saved earlier.
tokenizer = BartTokenizer.from_pretrained("/kaggle/working/bart_summarizer")
model = BartForConditionalGeneration.from_pretrained("/kaggle/working/bart_summarizer", forced_bos_token_id=0)

# Inference only: switch to eval mode and use the GPU when one is available.
model.eval()
model.to("cuda" if torch.cuda.is_available() else "cpu")

# Function to summarize a single text
def summarize(text, max_input_length=1024, max_output_length=350):
    """Summarise one legal text with the global BART `model` and `tokenizer`.

    Args:
        text: The text to summarise; it is appended to a fixed instruction header.
        max_input_length: Token budget for the prompt; longer inputs are truncated.
        max_output_length: Maximum number of tokens to generate.

    Returns:
        The decoded summary string with special tokens removed.
    """
    prompt = (
    "You are a legal assistant. Summarize the following legal situation by extracting:\n"
    "- The legal dispute and its current status\n"
    "- Actions taken by the people involved\n"
    "- The legal question or help being asked\n\n"
    "Text:\n" + text
    )
    # Honour the caller's max_input_length (it used to be ignored in favour of a
    # hard-coded 1024).
    inputs = tokenizer(
        prompt, max_length=max_input_length, truncation=True, return_tensors="pt"
    ).to(model.device)

    summary_ids = model.generate(
        inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        num_beams=6,
        max_length=max_output_length,
        # Never ask for a minimum that exceeds the requested maximum.
        min_length=min(120, max_output_length),
        no_repeat_ngram_size=3,
        length_penalty=1.2,
        early_stopping=True
    )

    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)
# Start both summary columns as empty strings; a row whose summarisation fails keeps them.
for summary_column in ("fulltext_summary", "answers_summary"):
    df[summary_column] = ""

# Summarise row by row (question text first, then the joined answers). A failure on one
# row is reported and skipped so that a single bad record cannot abort the whole run.
for row in tqdm(range(len(df))):
    try:
        question_text = df.loc[row, "full_text"]
        answer_text = df.loc[row, "joined_answers"]

        df.at[row, "fulltext_summary"] = summarize(question_text)
        df.at[row, "answers_summary"] = summarize(answer_text)
    except Exception as e:
        print(f"Error at index {row}: {e}")

# Persist the frame, summaries included, as a list of JSON records.
df.to_json("summarized_legal_data.json", orient="records", indent=2)


In [None]:
# Reload the summarised records from disk, so later cells can start from this file
# without re-running the summarisation loop.
df = pd.read_json("/kaggle/working/summarized_legal_data.json")

In [None]:
# Keep only the two summaries and the source URL; the raw text columns are dropped.
final_legal_advice_data=df[["answers_summary", "fulltext_summary", "question_url"]]

In [None]:
# Write the trimmed records; the fine-tuning section reads this file back.
final_legal_advice_data.to_json("final_summarized_data.json", orient="records", indent=2)

## fine tuning inlegalbert

In [None]:
import json

# Load the summarised records as a list of dicts. `data` is read as a global by the
# training cell, the FAISS-building cell and get_similar_advice().
with open("/kaggle/working/final_summarized_data.json", "r") as f:
    data = json.load(f)


In [None]:
from sentence_transformers import SentenceTransformer, models

# Load INLegalBERT model (from HuggingFace or local path)
# Inputs longer than 512 tokens are truncated by the transformer module.
word_embedding_model = models.Transformer("law-ai/InLegalBERT", max_seq_length=512)

# Pooling layer (library default settings) that turns token embeddings into one
# fixed-size sentence vector of the transformer's embedding dimension.
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())

# Chain transformer + pooling into a SentenceTransformer.
# NOTE: this rebinds the notebook-level name `model` once more.
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])


In [None]:
from sentence_transformers import losses, InputExample, SentenceTransformer
import torch
from torch.utils.data import Dataset, DataLoader


# Every record is a positive (situation summary, advice summary) pair. Rows whose
# summarisation failed were left as empty strings, so they are skipped here.
train_examples = [
    InputExample(texts=[item["fulltext_summary"], item["answers_summary"]])
    for item in data
    if item["fulltext_summary"] and item["answers_summary"]
]
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=4)

# The data contains positive pairs only. CosineSimilarityLoss with every label set to 1
# has no negatives, so it is minimised by mapping all texts to the same vector, which
# destroys retrieval quality. MultipleNegativesRankingLoss is designed for positive-only
# pairs: the other pairs in each batch act as negatives (a larger batch_size therefore
# helps if GPU memory allows).
train_loss = losses.MultipleNegativesRankingLoss(model=model)

# Step 6: Train the model
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=3,
    warmup_steps=100,
    show_progress_bar=True
)


In [None]:
# Persist the fine-tuned sentence model; the next section reloads it from
# "/kaggle/working/fine-tuned-inlegalbert".
model.save("fine-tuned-inlegalbert")


## creating faiss index and cross encoder for similarity search

In [None]:
from sentence_transformers import SentenceTransformer

# Reload the fine-tuned model from disk; get_similar_advice() reads `model` as a global.
model = SentenceTransformer("/kaggle/working/fine-tuned-inlegalbert")


In [None]:
import faiss
# Parallel lists drawn from the summarised records.
# NOTE: `answers` here rebinds the notebook-level name that get_answer() expects to be
# the question -> answer mapping.
full_texts = [record["fulltext_summary"] for record in data]
answers = [record["answers_summary"] for record in data]  # in case you want to use later

# Embed every situation summary, then L2-normalise the matrix in place so that the
# inner product computed by the index equals cosine similarity.
full_text_embeddings = model.encode(full_texts, show_progress_bar=True, convert_to_numpy=True)
faiss.normalize_L2(full_text_embeddings)

# Exact inner-product index sized to the embedding width.
dimension = full_text_embeddings.shape[1]
index = faiss.IndexFlatIP(dimension)
index.add(full_text_embeddings)

# Persist the index and the texts it was built from (same row order).
faiss.write_index(index, "final_faiss_fulltext.index")
with open("full_texts.json", "w") as f:
    json.dump(full_texts, f)

In [None]:
from sentence_transformers import CrossEncoder

# Load a cross-encoder model trained for relevance ranking
# get_similar_advice() uses it to score (user question, situation summary) pairs.
reranker = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")


## legal advice

In [None]:
import numpy as np
import faiss
# Reload the advice index from disk; get_similar_advice() reads `index` as a global.
index=faiss.read_index("/kaggle/working/final_faiss_fulltext.index")
def get_similar_advice(user_question, top_k=10, threshold=0.2):
    """Retrieve lawyer advice given for situations similar to `user_question`.

    The question is embedded with the global `model`, L2-normalised and searched in the
    global FAISS `index`. Hits whose similarity reaches `threshold` are then re-ordered
    by the global cross-encoder `reranker`.

    Args:
        user_question: The user's question as free text.
        top_k: Number of nearest neighbours to request from the index.
        threshold: Minimum index similarity a hit needs in order to be returned.

    Returns:
        A list of dicts with keys "answers", "url", "similarity" and "rerank_score",
        sorted by rerank score (best first). When nothing qualifies, a one-element list
        holding a placeholder entry is returned instead.
    """
    no_match = [{
        "answers": "No relevant legal advice found for your query.",
        "url": None,
        "similarity": 0,
        "rerank_score": 0
    }]

    # Step 1: Encode query
    query_embedding = model.encode(user_question, convert_to_numpy=True)
    query_embedding = np.array(query_embedding).reshape(1, -1).astype('float32')
    faiss.normalize_L2(query_embedding)

    # Step 2: Nearest-neighbour search
    D, I = index.search(query_embedding, top_k)

    candidates = []
    for score, position in zip(D[0], I[0]):
        position = int(position)
        similarity = float(score)
        # FAISS pads missing results with index -1. A plain `< len(data)` check lets -1
        # through and silently picks data[-1], so the lower bound is checked as well.
        if not 0 <= position < len(data):
            continue
        # Drop weak hits before reranking so the cross-encoder only scores useful pairs.
        if not similarity >= threshold:
            continue
        item = data[position].copy()
        item["similarity"] = similarity
        candidates.append(item)

    # The reranker must not be called with an empty batch.
    if not candidates:
        return no_match

    # Step 3: Rerank
    rerank_pairs = [(user_question, item["fulltext_summary"]) for item in candidates]
    scores = reranker.predict(rerank_pairs)
    for item, score in zip(candidates, scores):
        # Plain float keeps the result printable/JSON-serialisable.
        item["rerank_score"] = float(score)

    # Step 4: Sort by rerank score
    candidates.sort(key=lambda item: item["rerank_score"], reverse=True)

    return [
        {
            "answers": item["answers_summary"],
            "url": item.get("question_url", None),
            "similarity": round(item["similarity"], 3),
            "rerank_score": round(item["rerank_score"], 3)
        }
        for item in candidates
    ]


In [None]:
# Smoke test: look up advice for one sample question and print each hit in ranked order.
user_question = "can i get forced to get divorce because of normal fights"
responses = get_similar_advice(user_question)

for rank, advice in enumerate(responses, start=1):
    print(f"\n--- Response {rank} ---")
    print("Answers:\n", advice["answers"])
    print("URL:", advice["url"])
    print("Similarity:", advice["similarity"])
    print("Rerank Score:", advice["rerank_score"])
