## Imports and Device

In [1]:
# ! pip install transformers

import pandas as pd
import numpy as np
import torch
import os, gc
import re

from transformers import AutoTokenizer
from transformers import AutoModelForSeq2SeqLM

from torch import cuda, nn, optim
# from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Seed torch's global random number generator with a fixed value.
manual_seed = 595
torch.manual_seed(manual_seed)

# Use the GPU when torch reports CUDA as available, otherwise the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(device)

cuda


## Read the Previous Predictions

In [3]:
# run locally
# text_path = 'to_annotate/'
# Relative path of the CSV that holds the previous predictions (read with pd.read_csv below).
file = 'allard_a_preds.csv'

In [4]:
# Load the previous predictions into a DataFrame.
df = pd.read_csv(file)

# Show the (rows, columns) shape and preview the first six rows.
print(df.shape)
df.head(6)

(532, 25)


Unnamed: 0,raw_file_str,cleaned_case_with_newlines,full_file,metadata,content,citation,file_number,language,year,ltb_location,...,Did the decision state the landlord was represented?,Did the tenant propose a payment plan?,"If the tenant did propose a payment plan, did the member accept the proposed payment plan?",Did the decision state the tenant was represented?,"If the tenant had a history of arrears, did the decision mention a history of the tenant making payments on those arrears (separate from any payments made in response to the present eviction notice/hearing)?",Did the decision state that the tenant had children living with them?,Was the tenant employed at the time of the hearing?,"If the tenant was not employed, did the decision state the tenant was receiving any form of government assistance (e.g. OW, childcare benefits, ODSP, OSAP)?","If the tenant was employed, did the decision state any doubts about the stability of employment e.g. lack of guaranteed hours, contract work, etc.?",Did the member find the tenant had sufficient income to pay rent?
0,Metadata:\nDate:\t2022-02-24\nFile number:\t\n...,Metadata:\nDate: 2022-02-24\nFile number:\nSWL...,Metadata: Date: 2022-02-24 File number: SWL-57...,Date: 2022-02-24 File number: SWL-57718-22 Ci...,Order under Section 77 Residential Tenancies A...,"Drier v Hill, 2022 CanLII 128599 (ON LTB)",SWL-57718-22,English,2022,6 Dunsmere Drive Kitchener,...,Yes,No,Not stated,No,Not stated,No,Not stated,Not stated,Not stated,No
1,Metadata:\nDate:\t2022-02-02\nFile number:\t\n...,Metadata:\nDate: 2022-02-02\nFile number:\nSWL...,Metadata: Date: 2022-02-02 File number: SWL-57...,Date: 2022-02-02 File number: SWL-57618-22 Ci...,Order under Section 78(6) Residential Tenancie...,"Waterloo Region Housing v Underwood, 2022 CanL...",SWL-57618-22,English,2022,49 Holborn Drive Kitchener,...,No,No,Not stated,No,Not stated,No,Not stated,Not stated,Not stated,No
2,Metadata:\nDate:\t2022-02-23\nFile number:\t\n...,Metadata:\nDate: 2022-02-23\nFile number:\nSOL...,Metadata: Date: 2022-02-23 File number: SOL-26...,Date: 2022-02-23 File number: SOL-26921-22 Ci...,Order under Section 78(6) Residential Tenancie...,"Anastasakis v Iwashita, 2022 CanLII 128519 (ON...",SOL-26921-22,English,2022,"R, 1306 King St E Hamilt",...,Yes,No,Not stated,No,Not stated,No,Not stated,Not stated,Not stated,No
3,Metadata:\nDate:\t2022-02-04\nFile number:\t\n...,Metadata:\nDate: 2022-02-04\nFile number:\nCEL...,Metadata: Date: 2022-02-04 File number: CEL-04...,Date: 2022-02-04 File number: CEL-04513-22 Ci...,Order under Section 78(6) Residential Tenancie...,"Virk v Hashey, 2022 CanLII 88013 (ON LTB)",CEL-04513-22,English,2022,Juniper Crescent Brampt,...,Yes,Yes,Not stated,No,Not stated,No,Not stated,Not stated,Not stated,No
4,Metadata:\nDate:\t2022-01-19\nFile number:\t\n...,Metadata:\nDate: 2022-01-19\nFile number:\nCEL...,Metadata: Date: 2022-01-19 File number: CEL-04...,Date: 2022-01-19 File number: CEL-04413-22 Ci...,Order under Section 78(6) Residential Tenancie...,"Grey Bruce Property Rentals Inc v Thompson, 20...",CEL-04413-22,English,2022,9Th Avenue East Owen Sound,...,Yes,No,Not stated,No,Not stated,No,Not stated,Not stated,Not stated,No
5,Metadata:\nDate:\t2022-02-24\nFile number:\t\n...,Metadata:\nDate: 2022-02-24\nFile number:\nCEL...,Metadata: Date: 2022-02-24 File number: CEL-04...,Date: 2022-02-24 File number: CEL-04570-22 Ci...,Order under Section 78(6) Residential Tenancie...,"Mckendry v Pinkerton, 2022 CanLII 127268 (ON LTB)",CEL-04570-22,English,2022,8 Codringt Street Barrie,...,Yes,No,Not stated,No,Not stated,No,Not stated,Not stated,Not stated,No


In [5]:
# List the column labels; the question columns are named by their full question text.
df.columns

Index(['raw_file_str', 'cleaned_case_with_newlines', 'full_file', 'metadata',
       'content', 'citation', 'file_number', 'language', 'year',
       'ltb_location', 'hearing_date', 'decision_date', 'url',
       'adjudicating_member', 'outcome_span',
       'Did the decision state the landlord was represented?',
       'Did the tenant propose a payment plan?',
       'If the tenant did propose a payment plan, did the member accept the proposed payment plan?',
       'Did the decision state the tenant was represented?',
       'If the tenant had a history of arrears, did the decision mention a history of the tenant making payments on those arrears (separate from any payments made in response to the present eviction notice/hearing)?',
       'Did the decision state that the tenant had children living with them?',
       'Was the tenant employed at the time of the hearing?',
       'If the tenant was not employed, did the decision state the tenant was receiving any form of government ass

In [6]:
# Questions to be answered by the `led3` model; each string is also a column name in `df`.
led3_qs = [
    'Did the decision state that the tenant had children living with them?',
    'Was the tenant employed at the time of the hearing?',
]

In [7]:
# Raw case text column used as model input.
raw_file_text = df['raw_file_str']
# Peek at the first 200 characters of the entry labelled 0.
raw_file_text[0][:200]

'Metadata:\nDate:\t2022-02-24\nFile number:\t\nSWL-57718-22\n\nCitation:\tDrier v Hill, 2022 CanLII 128599 (ON LTB), <https://canlii.ca/t/jv57m>, retrieved on 2023-05-16\nContent:\n\n\n\n\xa0\nOrder under Section 77\nRe'

## Initialize the Tokenizer and Load the LED Model

In [8]:
# @article{Beltagy2020Longformer,
#   title={Longformer: The Long-Document Transformer},
#   author={Iz Beltagy and Matthew E. Peters and Arman Cohan},
#   journal={arXiv:2004.05150},
#   year={2020},
# }

# Tokenizer and model architecture both come from the "allenai/led-base-16384" checkpoint.
led_tokenizer = AutoTokenizer.from_pretrained("allenai/led-base-16384")
# NOTE(review): gradient_checkpointing is normally a training-time memory option and
# use_cache=False presumably slows generation — confirm both are wanted for inference-only use.
led3 = AutoModelForSeq2SeqLM.from_pretrained("allenai/led-base-16384", 
                                              gradient_checkpointing=True, use_cache=False)
# Overwrite the weights with the locally saved state dict, mapped onto `device`.
# NOTE(review): torch.load unpickles the file; only load checkpoints from a trusted source.
led3.load_state_dict(torch.load('led_3epoch_law_allqs.pt', map_location=device))

<All keys matched successfully>

## Prepare the Prediction Input

In [9]:
def q_prompt(raw_texts, q_lst, q_no):
    """Build one ``Question: ... Text: ...`` prompt per raw case text.

    For every raw text the function:
      1. drops everything up to and including the first ``'Content:'`` marker
         (the whole text is kept when the marker is absent),
      2. turns newlines, non-breaking spaces and tabs into spaces and shortens
         runs of spaces,
      3. cuts the text at the first occurrence of ``'Schedule 1'``.

    Args:
        raw_texts: iterable of raw case strings (a list, a pandas Series, ...).
        q_lst: sequence of question strings.
        q_no: index into ``q_lst`` of the question to put in the prompt.

    Returns:
        list[str]: one prompt per element of ``raw_texts``, in the same order.
    """
    content_marker = 'Content:'
    input_texts = []

    # Iterate over the values directly: ``raw_texts[i]`` is a label lookup on
    # a pandas Series and fails when the index is not exactly 0..n-1.
    for full_text in raw_texts:
        marker_idx = full_text.find(content_marker)
        if marker_idx == -1:
            # str.find returns -1 when the marker is missing; slicing from
            # ``-1 + len(marker)`` would silently drop the first 7 characters.
            text = full_text
        else:
            text = full_text[marker_idx + len(content_marker):]

        text = text.replace('\n', ' ')
        text = text.replace('\xa0', ' ')
        text = text.replace('\t', ' ')
        # Left exactly as written: this fixed chain of replacements may leave
        # very long runs of spaces partly uncollapsed.
        # NOTE(review): presumably mirrors the preprocessing used when the
        # model was fine-tuned — confirm before making it more thorough.
        text = text.replace('   ', ' ').replace('  ', ' ').replace('  ', ' ').replace('  ', ' ')

        # Keep only the part before the first 'Schedule 1' (no-op if absent).
        text = text.partition('Schedule 1')[0]

        input_texts.append(f'Question: {q_lst[q_no]} Text: {text}')

    return input_texts


def q_preprocess(raw_texts, q_lst, q_no, tokenizer):
    """Turn raw case texts into tokenized question prompts.

    The prompts are built by ``q_prompt`` and encoded in one call to
    ``tokenizer.batch_encode_plus`` without special tokens and without
    token type ids. No truncation or padding is requested here.

    Args:
        raw_texts: raw case strings forwarded to ``q_prompt``.
        q_lst: sequence of question strings.
        q_no: index of the question in ``q_lst``.
        tokenizer: object exposing ``batch_encode_plus``.

    Returns:
        Whatever ``tokenizer.batch_encode_plus`` returns for the prompts.
    """
    prompts = q_prompt(raw_texts, q_lst, q_no)
    encode_options = {
        "add_special_tokens": False,
        "return_token_type_ids": False,
    }
    return tokenizer.batch_encode_plus(prompts, **encode_options)
    

## Create the Dataset

In [10]:
class CaseDataset(Dataset):
    """Dataset over pre-tokenized prompts.

    ``inputs`` must support ``inputs['input_ids']`` and
    ``inputs['attention_mask']``, each indexable by example position.
    """

    # Fields copied into every example, in this order.
    _FIELDS = ("input_ids", "attention_mask")

    def __init__(self, inputs):
        self.inputs = inputs

    def __len__(self):
        # The number of examples is the number of input id sequences.
        return len(self.inputs["input_ids"])

    def __getitem__(self, idx):
        # Return the idx-th entry of each field as a plain dict.
        return {field: self.inputs[field][idx] for field in self._FIELDS}


def collate_fn_led(batch, pad_token_id=None):
    """Pad a list of tokenized examples into batch tensors for the LED model.

    Args:
        batch: list of dicts, each with ``'input_ids'`` and
            ``'attention_mask'`` as lists of ints.
        pad_token_id: id used to pad ``input_ids``. When omitted, the
            module-level ``led_tokenizer.pad_token_id`` is used, so the
            function can still be passed to ``DataLoader`` as-is.

    Returns:
        dict with LongTensors ``'input_ids'`` and ``'attention_mask'``, both
        padded (batch-first) to the longest sequence in the batch.
    """
    if pad_token_id is None:
        pad_token_id = led_tokenizer.pad_token_id

    batch_input = [torch.LongTensor(example['input_ids']) for example in batch]
    batch_mask = [torch.LongTensor(example['attention_mask']) for example in batch]

    padded_batch_input_ids = pad_sequence(batch_input, batch_first=True, padding_value=pad_token_id)
    # Attention masks use 1 for real tokens and 0 for padding; -100 is the
    # ignore-index convention for labels, not a valid mask value.
    padded_batch_att_mask = pad_sequence(batch_mask, batch_first=True, padding_value=0)

    return {"input_ids": padded_batch_input_ids, "attention_mask": padded_batch_att_mask}

def collate_fn_longt5(batch, pad_token_id=None):
    """Pad a list of tokenized examples into batch tensors for a LongT5 model.

    Args:
        batch: list of dicts, each with ``'input_ids'`` and
            ``'attention_mask'`` as lists of ints.
        pad_token_id: id used to pad ``input_ids``. When omitted, the
            module-level ``longt5_tokenizer.pad_token_id`` is used.
            NOTE(review): ``longt5_tokenizer`` is not defined in the visible
            part of this notebook, so the default only works once it has been
            created elsewhere.

    Returns:
        dict with LongTensors ``'input_ids'`` and ``'attention_mask'``, both
        padded (batch-first) to the longest sequence in the batch.
    """
    if pad_token_id is None:
        pad_token_id = longt5_tokenizer.pad_token_id

    batch_input = [torch.LongTensor(example['input_ids']) for example in batch]
    batch_mask = [torch.LongTensor(example['attention_mask']) for example in batch]

    padded_batch_input_ids = pad_sequence(batch_input, batch_first=True, padding_value=pad_token_id)
    # Attention masks use 1 for real tokens and 0 for padding; -100 is the
    # ignore-index convention for labels, not a valid mask value.
    padded_batch_att_mask = pad_sequence(batch_mask, batch_first=True, padding_value=0)

    return {"input_ids": padded_batch_input_ids, "attention_mask": padded_batch_att_mask}

def to_device(data, device):
    """Return a new dict in which every value of ``data`` has been moved with ``.to(device)``.

    The input mapping itself is not modified; keys keep their original order.
    """
    return {key: data[key].to(device) for key in data}

## Prepare the Functions for Prediction

In [11]:
@torch.no_grad()
def answer(model, loader, tokenizer):
    """Generate an answer string for every example yielded by ``loader``.

    The model is switched to eval mode, each batch is moved to the
    module-level ``device`` and passed to ``model.generate``; the generated
    sequences are decoded and the literal markers ``</s>``, ``<pad>`` and
    ``<s>`` are stripped from the text.

    Args:
        model: model exposing ``eval()`` and ``generate(...)``.
        loader: iterable of dict batches with ``'input_ids'`` and
            ``'attention_mask'`` tensors.
        tokenizer: tokenizer providing ``pad_token_id`` and ``batch_decode``.

    Returns:
        list[str]: one cleaned prediction per example, in loader order.
    """
    all_preds = []
    model.eval()
    for batch in loader:
        batch = to_device(batch, device)
        # NOTE(review): top_k normally only matters when sampling is enabled,
        # which is not requested here — confirm whether it has any effect.
        outputs = model.generate(input_ids=batch["input_ids"],
                                 attention_mask=batch["attention_mask"],
                                 return_dict_in_generate=True,
                                 pad_token_id=tokenizer.pad_token_id,
                                 max_length=512,
                                 top_k=15)

        # Token id 0 is dropped before decoding; which special token that id
        # denotes depends on the tokenizer — TODO confirm. Remaining special
        # tokens are removed textually below.
        decode_texts = tokenizer.batch_decode([seq[seq != 0] for seq in outputs['sequences']])
        all_preds.extend(
            decode.replace('</s>', '').replace('<pad>', '').replace('<s>', '')
            for decode in decode_texts
        )

    return all_preds


## Predict the Answers and Replace the Previous Answers

In [12]:
def get_pred_dataloader(raw_texts, q_lst, q_no, tokenizer, batch_size=64):
    """Build a non-shuffled DataLoader of tokenized prompts for one question.

    Padding is derived from the tokenizer that is passed in, so the function
    works for any tokenizer exposing ``pad_token_id`` instead of comparing
    against module-level tokenizer globals (one of which, ``longt5_tokenizer``,
    is not defined in the visible notebook and would raise ``NameError``).

    Args:
        raw_texts: raw case strings.
        q_lst: sequence of question strings.
        q_no: index of the question in ``q_lst``.
        tokenizer: tokenizer used both for encoding and for the pad id.
        batch_size: number of examples per batch (default 64, the value that
            was previously hard-coded).

    Returns:
        torch.utils.data.DataLoader yielding dicts with padded LongTensors
        ``'input_ids'`` and ``'attention_mask'``.
    """
    input_toks = q_preprocess(raw_texts, q_lst, q_no, tokenizer)
    dataset = CaseDataset(input_toks)
    pad_token_id = tokenizer.pad_token_id

    def _collate(batch):
        # Pad ids with the tokenizer's pad id and masks with 0 (= ignore).
        ids = [torch.LongTensor(example['input_ids']) for example in batch]
        masks = [torch.LongTensor(example['attention_mask']) for example in batch]
        return {
            "input_ids": pad_sequence(ids, batch_first=True, padding_value=pad_token_id),
            "attention_mask": pad_sequence(masks, batch_first=True, padding_value=0),
        }

    # shuffle=False keeps predictions aligned with the order of raw_texts.
    return DataLoader(dataset,
                      batch_size=batch_size,
                      collate_fn=_collate,
                      shuffle=False)

In [13]:
def answer_qs(raw_texts, q_lst, q_no, tokenizer, model):
    """Predict the answers to question ``q_lst[q_no]`` for all raw texts.

    Builds the prediction DataLoader, prints the 1-based question number with
    the question text, and returns the list of predictions from ``answer``.
    """
    pred_loader = get_pred_dataloader(raw_texts, q_lst, q_no, tokenizer)

    print(f'Q{q_no+1}: {q_lst[q_no]}')

    return answer(model, pred_loader, tokenizer)

In [14]:
# del train_df, train_loader, count_parameters
# gc.collect()

In [15]:
# Maps each question string to its list of predicted answers.
preds_dict = dict()

In [16]:
# Move the model to the selected device, then answer every question in
# led3_qs for all cases; predictions are stored under the question text.
led3.to(device)
for q_idx, question in enumerate(led3_qs):
    preds_dict[question] = answer_qs(
        raw_file_text, led3_qs, q_idx, led_tokenizer, led3
    )

# Drop the model and tokenizer references and reclaim memory.
del led3, led_tokenizer
gc.collect()
# gc.collect() frees the Python objects, but torch keeps the released GPU
# blocks in its caching allocator; empty_cache() hands them back.
if torch.cuda.is_available():
    torch.cuda.empty_cache()

Q1: Did the decision state that the tenant had children living with them?




Q2: Was the tenant employed at the time of the hearing?


NameError: name 'led3_tokenizer' is not defined

In [17]:
# Sanity check: number of answered questions and the question strings used as keys.
print(len(preds_dict))
preds_dict.keys()

2


dict_keys(['Did the decision state that the tenant had children living with them?', 'Was the tenant employed at the time of the hearing?'])

In [18]:
# One column per question (named by the question text), built from the prediction lists.
led_df = pd.DataFrame(preds_dict)
led_df

Unnamed: 0,Did the decision state that the tenant had children living with them?,Was the tenant employed at the time of the hearing?
0,No,Not stated
1,No,Not stated
2,No,Not stated
3,No,Yes
4,No,Not stated
...,...,...
527,No,Yes
528,No,Not stated
529,No,Not stated
530,No,Not stated


In [19]:
# Save the new predictions on their own, without writing the index column.
led_df.to_csv('led_sup_preds.csv', index=False)

In [21]:
# Overwrite the same-named question columns of df with the new predictions.
# The assignment mutates df in place; values are matched by index label.
for column_name in led_df.columns:
    df[column_name] = led_df[column_name]
df

Unnamed: 0,raw_file_str,cleaned_case_with_newlines,full_file,metadata,content,citation,file_number,language,year,ltb_location,...,Did the decision state the landlord was represented?,Did the tenant propose a payment plan?,"If the tenant did propose a payment plan, did the member accept the proposed payment plan?",Did the decision state the tenant was represented?,"If the tenant had a history of arrears, did the decision mention a history of the tenant making payments on those arrears (separate from any payments made in response to the present eviction notice/hearing)?",Did the decision state that the tenant had children living with them?,Was the tenant employed at the time of the hearing?,"If the tenant was not employed, did the decision state the tenant was receiving any form of government assistance (e.g. OW, childcare benefits, ODSP, OSAP)?","If the tenant was employed, did the decision state any doubts about the stability of employment e.g. lack of guaranteed hours, contract work, etc.?",Did the member find the tenant had sufficient income to pay rent?
0,Metadata:\nDate:\t2022-02-24\nFile number:\t\n...,Metadata:\nDate: 2022-02-24\nFile number:\nSWL...,Metadata: Date: 2022-02-24 File number: SWL-57...,Date: 2022-02-24 File number: SWL-57718-22 Ci...,Order under Section 77 Residential Tenancies A...,"Drier v Hill, 2022 CanLII 128599 (ON LTB)",SWL-57718-22,English,2022,6 Dunsmere Drive Kitchener,...,Yes,No,Not stated,No,Not stated,No,Not stated,Not stated,Not stated,No
1,Metadata:\nDate:\t2022-02-02\nFile number:\t\n...,Metadata:\nDate: 2022-02-02\nFile number:\nSWL...,Metadata: Date: 2022-02-02 File number: SWL-57...,Date: 2022-02-02 File number: SWL-57618-22 Ci...,Order under Section 78(6) Residential Tenancie...,"Waterloo Region Housing v Underwood, 2022 CanL...",SWL-57618-22,English,2022,49 Holborn Drive Kitchener,...,No,No,Not stated,No,Not stated,No,Not stated,Not stated,Not stated,No
2,Metadata:\nDate:\t2022-02-23\nFile number:\t\n...,Metadata:\nDate: 2022-02-23\nFile number:\nSOL...,Metadata: Date: 2022-02-23 File number: SOL-26...,Date: 2022-02-23 File number: SOL-26921-22 Ci...,Order under Section 78(6) Residential Tenancie...,"Anastasakis v Iwashita, 2022 CanLII 128519 (ON...",SOL-26921-22,English,2022,"R, 1306 King St E Hamilt",...,Yes,No,Not stated,No,Not stated,No,Not stated,Not stated,Not stated,No
3,Metadata:\nDate:\t2022-02-04\nFile number:\t\n...,Metadata:\nDate: 2022-02-04\nFile number:\nCEL...,Metadata: Date: 2022-02-04 File number: CEL-04...,Date: 2022-02-04 File number: CEL-04513-22 Ci...,Order under Section 78(6) Residential Tenancie...,"Virk v Hashey, 2022 CanLII 88013 (ON LTB)",CEL-04513-22,English,2022,Juniper Crescent Brampt,...,Yes,Yes,Not stated,No,Not stated,No,Yes,Not stated,Not stated,No
4,Metadata:\nDate:\t2022-01-19\nFile number:\t\n...,Metadata:\nDate: 2022-01-19\nFile number:\nCEL...,Metadata: Date: 2022-01-19 File number: CEL-04...,Date: 2022-01-19 File number: CEL-04413-22 Ci...,Order under Section 78(6) Residential Tenancie...,"Grey Bruce Property Rentals Inc v Thompson, 20...",CEL-04413-22,English,2022,9Th Avenue East Owen Sound,...,Yes,No,Not stated,No,Not stated,No,Not stated,Not stated,Not stated,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
527,Metadata:\nDate:\t2022-02-07\nFile number:\t\n...,Metadata:\nDate: 2022-02-07\nFile number:\nSWL...,Metadata: Date: 2022-02-07 File number: SWL-57...,Date: 2022-02-07 File number: SWL-57644-22 Ci...,Order under Section 78(6) Residential Tenancie...,"Marda Management Inc. v Derose, 2022 CanLII 97...",SWL-57644-22,English,2022,71 Tecumseh Road W Windsor,...,Yes,Yes,Not stated,No,Not stated,No,Yes,Not stated,Not stated,No
528,Metadata:\nDate:\t2022-01-21\nFile number:\t\n...,Metadata:\nDate: 2022-01-21\nFile number:\nHOL...,Metadata: Date: 2022-01-21 File number: HOL-12...,Date: 2022-01-21 File number: HOL-12856-22 Ci...,Order under Section 77 Residential Tenancies A...,"Farahani v Plourde, 2022 CanLII 78831 (ON LTB)",HOL-12856-22,English,2022,Rfa Crescent West Kingst,...,Yes,No,Not stated,No,Not stated,No,Not stated,Not stated,Not stated,No
529,Metadata:\nDate:\t2022-02-16\nFile number:\t\n...,Metadata:\nDate: 2022-02-16\nFile number:\nSOL...,Metadata: Date: 2022-02-16 File number: SOL-26...,Date: 2022-02-16 File number: SOL-26900-22 Ci...,Order under Section 77 Residential Tenancies A...,"Retsinas v Napieraj, 2022 CanLII 108707 (ON LTB)",SOL-26900-22,English,2022,1 Buffalo Street Brantford,...,Yes,No,Not stated,No,Not stated,No,Not stated,Not stated,Not stated,No
530,Metadata:\nDate:\t2022-02-09\nFile number:\t\n...,Metadata:\nDate: 2022-02-09\nFile number:\nCEL...,Metadata: Date: 2022-02-09 File number: CEL-04...,Date: 2022-02-09 File number: CEL-04532-22 Ci...,Order under Section 77 Residential Tenancies A...,"Beers v Banks, 2022 CanLII 87905 (ON LTB)",CEL-04532-22,English,2022,Albot Street Port Mcnicoll,...,No,No,Not stated,No,Not stated,No,Not stated,Not stated,Not stated,No


In [22]:
# Write the updated predictions to a new CSV, without the index column.
df.to_csv('allard_a_preds_532.csv', index=False)