# Question-and-Answer Prompt for Legal Documents

In [4]:
# Passages that the retrieval step ranks against the query.
docs = [
    (
        "The appellant having been convicted under Section 80 of the Karnataka "
        "Police Act, 1963 (for short, ‘the 1963 Act’) has filed the present appeal."
    ),
    (
        "Notice in the appeal was issued on 27.02.2023 limited to the extent of "
        "consideration as to whether the appellant can be granted benefit of probation."
    ),
    (
        "The brief facts of the case are that FIR dated 16.8.2007 was registered "
        "against 24 accused persons including the appellant under sections 79 and 80 "
        "of the 1963 Act as they were found to be indulging in gambling."
    ),
    (
        "The charge sheet was filed and the Trial Court vide order dated 21.8.2007 "
        "convicted them under Section 79 & 80 of the 1963 Act and sentenced them to "
        "undergo imprisonment for a period of one yeareach under both the provisions "
        "along with a fine of ₹ 600/- after the accused had pleaded guilty."
    ),
]

# One-element list: the retrieval step reads the scores of this first (only) query.
query = ["On what Section the appelant was convicted?"]


In [10]:
from sentence_transformers import SentenceTransformer, util

# Load the sentence-embedding model used to rank the passages against the query
model = SentenceTransformer("sentence-transformers/multi-qa-MiniLM-L6-cos-v1")

# Embed the query and every candidate passage with the same model
query_emb = model.encode(query)
doc_emb = model.encode(docs)

# Dot-product similarity between the query and each passage; row 0 belongs to
# the single query and is converted to a plain Python list
scores = util.dot_score(query_emb, doc_emb)[0].cpu().tolist()

# Pair each passage with its score and order the pairs from best to worst match
doc_score_pairs = sorted(
    zip(docs, scores),
    key=lambda pair: pair[1],
    reverse=True,
)


In [11]:
# Show the single best-matching passage together with its similarity score
# (the pairs were sorted by descending score when they were built)
doc_score_pairs[:1]

[('The charge sheet was filed and the Trial Court vide order dated 21.8.2007 convicted them under Section 79 & 80 of the 1963 Act and sentenced them to undergo imprisonment for a period of one yeareach under both the provisions along with a fine of ₹ 600/- after the accused had pleaded guilty.',
  0.6195369958877563)]

In [12]:
from transformers import AutoTokenizer

# Checkpoint for the question-answering stage; the same identifier is reused
# when the QA model weights are loaded, so tokenizer and model stay matched.
# Model page: https://huggingface.co/deepset/minilm-uncased-squad2
model_ckpt = "deepset/minilm-uncased-squad2"
# Tokenizer belonging to that checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)


Downloading (…)okenizer_config.json:   0%|          | 0.00/107 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/477 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [31]:
# Question to answer, kept as a one-element batch
question = ["On what Section the appelant were convicted?"]

# Passage the answer is extracted from, also a one-element batch
context = [
    (
        "The appellant having been convicted under Section 80 of the Karnataka "
        "Police Act, 1963 (for short, ‘the 1963 Act’) has filed the present appeal."
    ),
]

# Encode question and passage together as one paired input of PyTorch tensors
inputs = tokenizer(question, context, return_tensors="pt")

# Decode the first (only) row back to text to inspect what was encoded
print(tokenizer.decode(inputs.input_ids[0]))

[CLS] on what section the appelant were convicted? [SEP] the appellant having been convicted under section 80 of the karnataka police act, 1963 ( for short, ‘ the 1963 act ’ ) has filed the present appeal. [SEP]


In [15]:
import torch
from transformers import AutoModelForQuestionAnswering
# Load the question-answering model from the same checkpoint as the tokenizer.
# NOTE(review): this rebinds `model`, which earlier held the SentenceTransformer
# used for retrieval; any later sentence-embedding call needs that model reloaded.
model = AutoModelForQuestionAnswering.from_pretrained(model_ckpt)

# Inference only, so gradient tracking is switched off for the forward pass
with torch.no_grad():
    outputs=model(**inputs)

# Prints the whole model output, including the raw start/end logits
print(outputs)

Downloading pytorch_model.bin:   0%|          | 0.00/133M [00:00<?, ?B/s]

QuestionAnsweringModelOutput(loss=None, start_logits=tensor([[ 0.3214, -4.5442, -5.8110, -6.0528, -6.1171, -5.7079, -6.1160, -6.4792,
         -6.1341, -5.9327, -6.6333,  0.3214, -3.3670, -1.9377, -5.9480, -3.6266,
         -4.4707, -3.3391,  0.8898,  6.0573,  4.2786, -4.5633,  2.9414,  4.2382,
         -1.8567, -3.4903, -5.2006, -0.2059, -3.8126, -4.6771, -5.4918, -5.0669,
          0.0161,  1.3732, -0.5214, -4.2869, -4.8108, -3.8870, -5.5576, -5.7487,
         -4.5185, -5.3228, -6.0512, -5.7679,  0.3214]]), end_logits=tensor([[ 0.4426, -6.6505, -6.3572, -5.6178, -6.1253, -6.5285, -6.1588, -5.4673,
         -6.2583, -6.3197, -5.1253,  0.4426, -5.5266, -5.9630, -4.4588, -5.6780,
         -6.4042, -5.4548, -4.3415, -0.8966,  5.9446, -3.0777, -2.6579, -1.2025,
         -0.3754,  3.2214,  0.1343,  5.6785, -3.1645, -6.6984, -5.9521, -5.0107,
         -4.6704, -4.1363, -1.2422,  1.3008,  0.5916,  1.5918, -5.2049, -5.8271,
         -4.7443, -5.9232, -4.1225, -2.4251,  0.4426]]), hidden_state

In [32]:
# Pull the start/end scores out of the model output
start_logits, end_logits = outputs.start_logits, outputs.end_logits

# The logits should report the same size as the input IDs (one score per token)
print(f"Input IDs: {inputs['input_ids'].shape}")
print(f"Start Logits: {start_logits.shape}")
print(f"End Logits: {end_logits.shape}")

Input IDs: torch.Size([1, 45])
Start Logits: torch.Size([1, 45])
End Logits: torch.Size([1, 45])


In [33]:
# preprocess data
# Decoded text of the encoded question/passage pair, kept for reference
sentence = tokenizer.decode(inputs["input_ids"][0])

# One label per model token. Splitting the decoded text on whitespace merges
# word pieces back into whole words and produces fewer labels than there are
# logits, so labels and scores would no longer line up position by position.
x = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0].tolist())

# Start / end scores of the first (only) sequence as NumPy arrays
y = start_logits.detach().cpu().numpy()[0]
y2 = end_logits.detach().cpu().numpy()[0]

assert len(x) == len(y) == len(y2), "token labels must align with the logits"

In [34]:
# Choose the answer span jointly: score every (start, end) pair of the first
# sequence and rule out pairs whose end token comes before the start token.
# Taking the two argmaxes independently can yield such a pair, which slices to
# an empty span and decodes to an empty answer.
start_scores = start_logits[0]
end_scores = end_logits[0]
span_scores = start_scores[:, None] + end_scores[None, :]

positions = torch.arange(span_scores.size(0), device=span_scores.device)
valid_spans = positions[None, :] >= positions[:, None]  # True where end >= start
span_scores = span_scores.masked_fill(~valid_spans, float("-inf"))

# argmax returns a flat index; unravel it into (start, end) and make the end
# exclusive so it can be used directly as a slice bound
best_span = int(span_scores.argmax())
start_idx, end_idx = divmod(best_span, span_scores.size(1))
end_idx += 1

# Decode the tokens inside the selected span
answer_span = inputs["input_ids"][0][start_idx:end_idx]
answer = tokenizer.decode(answer_span)

print(f"Q: {question}")
print(f"A: {answer}")

Q: ['On what Section the appelant were convicted?']
A: section 80
