In [101]:
!pip install datasets transformers -q

In [105]:
from datasets import load_from_disk
classification_answers = load_from_disk("/content/drive/MyDrive/Andre_legal_refference/trainset_answers")
classification_answers

Dataset({
    features: ['file_name', 'path', 'text', 'input_ids', 'attention_mask', 'model_answer'],
    num_rows: 786
})

## Seperate into conviction / acquittal

In [106]:
def clean_responses(sample):
  '''
  sanetizes the json and creates a valid dict as much as possible.
  the model could still not have one of the 3 main keys !
  '''
  all_model_answers = sample['model_answer']
  all_clean_answers = []
  for model_answer in all_model_answers:
    query, response = model_answer.split('</section>\nJSON Answer:')
    # sanetize the json
    response = response.replace('“', '"').replace('”', '"')

    all_json_keys = ['"reasoning":', '"answer":', '"important_factors":']
    # [(key, start, end)]
    json_key_idxs = []
    for json_key in all_json_keys:
      start_idx = response.find(json_key)
      if start_idx != -1:
        json_key_idxs.append([json_key, start_idx+len(json_key), -1])
      else:
        json_key_idxs.append([json_key, -1, -1])

    # put them inorder
    json_key_idxs = sorted(json_key_idxs, key=lambda e: e[1])
    # add end index
    for i in range(len(json_key_idxs)-1):
      if json_key_idxs[i][1] != -1:
        json_key_idxs[i][2] = json_key_idxs[i+1][1] - len(json_key_idxs[i+1][0])

    answer_parts = {e[0][1:-2]:response[e[1]:e[2]] for e in json_key_idxs}
    answer_parts = {k:v.strip().strip('",}').lower() for k, v in answer_parts.items()}
    all_clean_answers.append(answer_parts)

  return {'clean_answer_dict': all_clean_answers}

In [107]:
classification_answers = classification_answers.map(clean_responses, batched=True, batch_size=300, num_proc=4)
classification_answers

Dataset({
    features: ['file_name', 'path', 'text', 'input_ids', 'attention_mask', 'model_answer', 'clean_answer_dict'],
    num_rows: 786
})

In [108]:
for d in classification_answers['clean_answer_dict'][:5]:
  print()
  print(d)


{'answer': 'acquittal', 'important_factors': "the claimant's history of trauma, the defendant's previous acceptance of the claimant's asserted age, recent evidence of the claimant's birth certificate, the defendant's duties towards a former relevant child, the mandatory character of the order sought, the claimant's asylum application, the balance of convenience argument made by the defendant.", 'reasoning': "the court has considered the claimant's history of trauma and his asylum application and the defendant's duties, the court concluded that the claimant had a triable issue that permission should be granted. the court accepted the claimant's evidence and rejected the defendant's argument about the balance of convenience in favor of granting interim relief."}

{'answer': 'acquittal', 'important_factors': "the facts and evidence presented by both the claimant and the defendant, including the assessment of the claimant's age, the application of the dublin iii regulation, and the evalua

In [109]:
from tqdm import tqdm
from datasets.utils.logging import disable_progress_bar
disable_progress_bar()

train_case_names = set(classification_answers['file_name'])

no_answer_count = 0

default_no_answer = 'No Answer.'
all_answers_md = ''

for case_name in tqdm(train_case_names):

    this_case_answers = classification_answers.filter(
                        lambda r: [name == case_name for name in r['file_name']],
                        batched=True,
                        batch_size=1000,
      )

    total_chunks = len(this_case_answers['clean_answer_dict'])
    votes= {'acquittal':0, 'conviction':0, 'neither':0}
    document_answer_md = f'# {case_name}\n'
    for idx, ans_dict in enumerate(this_case_answers['clean_answer_dict']):

        #remove spaces from answer
        ans_dict['answer'] = "".join(ans_dict['answer'].split())
        if ans_dict['answer'] == '':
          del ans_dict['answer']

        document_answer_md += (f'## Part {idx+1} of {total_chunks}\n' +
                              '### Reasoning:\n' + ans_dict.get('reasoning', default_no_answer) + '\n' +
                              '### Important Factors:\n' + ans_dict.get('important_factors', default_no_answer ) + '\n' +
                              '### Answer:\n' + ans_dict.get('answer', default_no_answer) + '\n')


        if "answer" not in ans_dict or ans_dict['answer'] not in votes.keys():
          no_answer_count += 1
        else:
          votes[ans_dict['answer']] += 1

    final_answer = max(votes, key= lambda e: votes[e])
    document_answer_md += '## Final Answer:\n'+ str(final_answer)+ '\n\n\n'

    all_answers_md += document_answer_md


100%|██████████| 342/342 [03:51<00:00,  1.48it/s]


In [111]:
with open('/content/drive/MyDrive/Andre_legal_refference/all_results.md', 'w') as f:
  f.write(all_answers_md)

In [None]:
document_answer_md

'# R v Tang - (2008) 25 BHRC 35\n## Part 1 of 6\n### Reasoning:\nThe court\'s final verdict is a conviction because the complainant is held to be a slave by the definition in s 270.1, as the powers of ownership attached to right of ownership were exercised over them.\n### Important Factors:\nThe complainants were subjected to exploitation, confined to the premises, and their passports were retained. This indicates slavery under the definition in s 270.1.\n### Answer:\nconviction\n## Part 2 of 6\n### Reasoning:\nThe critical issue that the Court of Appeal differed from the primary judge is the approach to the\nexercised powers in determining whether the respondent was exercising a power attaching to a right of ownership.\nIn determining this issue, the jury must consider the powers that the respondent exercised, as well as the state of\nmind of the respondent. The Court of Appeal also considered how to distinguish between exploitation under slavery\nand exploitation not under slavery, b

In [112]:
no_answer_count

48

In [116]:
count = 0
for case_name in tqdm(train_case_names):

    this_case_answers = classification_answers.filter(
                        lambda r: [name == case_name for name in r['file_name']],
                        batched=True,
                        batch_size=1000,
      )


    count += 1

100%|██████████| 342/342 [00:03<00:00, 100.15it/s]


In [117]:
count

342

In [115]:
classification_answers

Dataset({
    features: ['file_name', 'path', 'text', 'input_ids', 'attention_mask', 'model_answer', 'clean_answer_dict'],
    num_rows: 786
})