In [None]:
# Install necessary libraries
!pip install transformers faiss-cpu sentence-transformers --quiet
import torch
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics.pairwise import cosine_similarity
import faiss
import numpy as np
# Toy corpus of legal cases. Each record carries a numeric id, a short title,
# a one-sentence summary (the text that gets embedded for retrieval) and a
# placeholder for the full legal document.
cases = [
    {
        "case_id": 1,
        "title": "Fraud Investigation",
        "summary": "A case involving financial fraud where an individual defrauded investors.",
        "full_text": "Detailed legal text about the fraud investigation...",
    },
    {
        "case_id": 2,
        "title": "Tax Evasion",
        "summary": "A case related to a company that evaded taxes for several years, resulting in legal action.",
        "full_text": "Full legal document about the tax evasion case...",
    },
    {
        "case_id": 3,
        "title": "Breach of Contract",
        "summary": "A business partnership ended due to a breach of contract, leading to a lawsuit.",
        "full_text": "Complete legal document regarding breach of contract...",
    },
    {
        "case_id": 4,
        "title": "Personal Injury Claim",
        "summary": "A personal injury claim filed against a company for unsafe work conditions.",
        "full_text": "Full legal document related to the personal injury claim...",
    },
    {
        "case_id": 5,
        "title": "Trademark Infringement",
        "summary": "A dispute over trademark infringement between two clothing brands.",
        "full_text": "Full legal document detailing the trademark infringement case...",
    },
    {
        "case_id": 6,
        "title": "Cybercrime Investigation",
        "summary": "A case involving hacking and theft of data from a major corporation.",
        "full_text": "Legal document detailing the cybercrime investigation...",
    },
    {
        "case_id": 7,
        "title": "Real Estate Fraud",
        "summary": "A case about fraudulent real estate transactions involving falsified property documents.",
        "full_text": "Complete legal text regarding real estate fraud...",
    },
    {
        "case_id": 8,
        "title": "Insurance Dispute",
        "summary": "A dispute between an insurance company and a claimant over coverage of damages.",
        "full_text": "Legal text detailing the insurance dispute case...",
    },
]

# Summaries in the same order as `cases`, so position i in this list (and in
# anything built from it) refers to cases[i].
case_summaries = [record["summary"] for record in cases]
# Load the Legal-BERT tokenizer and encoder used to embed summaries and queries.
tokenizer = AutoTokenizer.from_pretrained("nlpaueb/legal-bert-base-uncased")
model = AutoModel.from_pretrained("nlpaueb/legal-bert-base-uncased")

# Use the GPU when one is present, otherwise stay on the CPU. Hard-coding
# 'cuda' makes the script fail on machines without a CUDA device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)


def embed_text(text):
    """Return mean-pooled Legal-BERT embeddings for ``text``.

    Args:
        text: A single string, or a list of strings to embed as one batch.

    Returns:
        A CPU ``torch.Tensor`` with one row per input text (one row for a
        single string), holding the average of the encoder's last hidden
        states over the real (non-padding) tokens of that text.
    """
    inputs = tokenizer(
        text,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=512,
    )
    # Follow the model to whichever device it currently lives on.
    inputs = {key: val.to(model.device) for key, val in inputs.items()}

    with torch.no_grad():
        outputs = model(**inputs)

    hidden = outputs.last_hidden_state
    # Weight each position by the attention mask so padding added for batched
    # inputs does not dilute the average. For a single string the mask is all
    # ones, which makes this a plain mean over the sequence axis.
    mask = inputs["attention_mask"].unsqueeze(-1).to(hidden.dtype)
    token_counts = mask.sum(dim=1).clamp(min=1)
    pooled = (hidden * mask).sum(dim=1) / token_counts
    return pooled.cpu()
# Embed every summary individually, then stack the per-summary tensors.
summary_vectors = [embed_text(summary) for summary in case_summaries]
case_embeddings = torch.stack(summary_vectors)

# Collapse to a 2-D array with one row per summary, the layout passed to FAISS.
case_embeddings_np = case_embeddings.numpy().reshape(len(case_summaries), -1)

# Build a flat L2 index sized to the embedding width and load the vectors.
# Row i of the index corresponds to cases[i].
embedding_dim = case_embeddings_np.shape[1]
index = faiss.IndexFlatL2(embedding_dim)
index.add(case_embeddings_np)
def search_cases(query, index, cases, top_n=5):
    """Return the cases whose indexed embeddings are closest to ``query``.

    Args:
        query: Free-text query; embedded with ``embed_text``.
        index: FAISS index whose row i corresponds to ``cases[i]``.
        cases: Sequence of case records, aligned with ``index``.
        top_n: Maximum number of cases to return.

    Returns:
        A list of at most ``top_n`` entries of ``cases`` in the order FAISS
        returned them. Empty when ``top_n`` is not positive or the index
        holds no vectors.
    """
    # Never ask for more neighbours than the index holds: FAISS pads the
    # missing slots with label -1, and cases[-1] would silently return the
    # last case as a bogus hit.
    k = min(top_n, index.ntotal)
    if k <= 0:
        return []

    query_embedding = embed_text(query).numpy().reshape(1, -1)
    _distances, indices = index.search(query_embedding, k)

    # Drop any remaining -1 placeholder labels defensively.
    return [cases[i] for i in indices[0] if i >= 0]
# Read a query from the user and fetch the five closest cases.
user_query = input("Enter your legal query: ")
results = search_cases(user_query, index, cases, top_n=5)

# Print each hit as title, summary and a preview of the document, followed
# by two blank lines.
for result in results:
    preview = result['full_text'][:500]  # at most the first 500 characters
    print(
        f"Case Title: {result['title']}",
        f"Summary: {result['summary']}",
        f"Full Text: {preview}...",
        "\n",
        sep="\n",
    )