## Imports and Device

In [23]:
# ! pip install transformers

import pandas as pd
import numpy as np
import torch
import os, gc
import re

from transformers import AutoTokenizer
from transformers import AutoModelForSeq2SeqLM

from torch import cuda, nn, optim
# from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence


In [24]:
# Seed PyTorch's global RNG so repeated runs start from the same state.
manual_seed = 595
torch.manual_seed(manual_seed)

# Use the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(device)

cpu


## Read the Cleaned Data

### Define the paths

In [25]:
# run locally
# Directory whose files are read as case texts, and the annotation workbook
# loaded with pandas. Both are relative to the notebook's working directory.
text_path = "../../to_annotate/"
file = "../../annotated_data.xlsx"

In [26]:
# # run on Google Colab
# from google.colab import drive
# drive.mount('/content/gdrive')
# text_path = '/content/gdrive/My Drive/595/formatted_cases/'
# file = '/content/gdrive/My Drive/595/annotated_data.xlsx'
# REGEX = r';+'
# sup_path = '/content/gdrive/My Drive/595/annotated_sup/'
# multi_path = text_path + 'multiple_files/'

### Clean the Dataframe

In [27]:
# The sheet contains generic "If yes to the previous question, ..." headers;
# map them to self-contained question texts.
renamed_questions = {
    'If yes to the previous question, did the decision state these conditions would make moving particularly burdensome?':
    'If any of the children had mental, medical or physical conditions, did the decision state these conditions would make moving particularly burdensome?',
    'If yes to the previous question, which of the following were applicable to the tenant?':
    'If the tenant had difficulty finding alternative housing for any reason, which of the following were applicable to the tenant?',
}

# Load the workbook, apply the renames, then drop the first two and the last
# two columns by position.
df = (
    pd.read_excel(file)
    .rename(columns=renamed_questions)
    .iloc[:, 2:-2]
)

print(df.shape)

(702, 50)


In [28]:
# Show the remaining column labels (each label is a full annotation question).
df.columns

Index(['What is the file number of the case?',
       'What was the date of the hearing? [mm/dd/yyyy]',
       'What was the date of the decision? [mm/dd/yyyy]',
       'Who was the member adjudicating the decision?',
       'What was the location of the landlord tenant board?',
       'Did the decision state the landlord was represented?',
       'Did the decision state the landlord attended the hearing?',
       'Did the decision state the tenant was represented?',
       'Did the decision state the tenant attended the hearing?',
       'Did the decision state the landlord was a not-for-profit landlord (e.g. Toronto Community Housing)?',
       'Did the decision state the tenant was collecting a subsidy?',
       'What was the outcome of the case?',
       'What was the length of the tenancy, or in other words, how long had the tenants lived at the residence in question? ',
       'What was the monthly rent?',
       'What was the amount of the rental deposit? ',
       'If any rent 

In [29]:
# remove columns that have too little information
little_info_col = [15, 16, 26, 27, 28, 29, 30, 31, 41, 43, 45]
to_del = [df.columns[i] for i in little_info_col]
for col in to_del:
    del df[col]
to_del

['If any rent increases occurred, what was the rent after the increase(s)?',
 'If any rent increases occurred, when did the rent increase(s) come into effect? ',
 'How many total children did the tenant have living with them? ',
 'How many total children aged 17 or younger did the tenant have living with them?',
 'How many total children aged 13 or younger did the tenant have living with them? ',
 'How many total children aged 4 or younger did the tenant have living with them?',
 'Did the decision state any of the children had mental, medical or physical conditions?',
 'If any of the children had mental, medical or physical conditions, did the decision state these conditions would make moving particularly burdensome?',
 'If a payment plan was ordered, what was the length of the payment plan? ',
 'If the tenant had difficulty finding alternative housing for any reason, which of the following were applicable to the tenant?',
 'If the tenant was given prior notice for the eviction, how mu

In [30]:
# Confirm the column count after the deletions, then drop the temporary list
# of removed labels and trigger a garbage-collection pass.
print(df.shape)
del to_del
gc.collect()

(702, 39)


1756

In [31]:
# The surviving column labels are the question texts themselves.
questions = df.columns.tolist()
questions

['What is the file number of the case?',
 'What was the date of the hearing? [mm/dd/yyyy]',
 'What was the date of the decision? [mm/dd/yyyy]',
 'Who was the member adjudicating the decision?',
 'What was the location of the landlord tenant board?',
 'Did the decision state the landlord was represented?',
 'Did the decision state the landlord attended the hearing?',
 'Did the decision state the tenant was represented?',
 'Did the decision state the tenant attended the hearing?',
 'Did the decision state the landlord was a not-for-profit landlord (e.g. Toronto Community Housing)?',
 'Did the decision state the tenant was collecting a subsidy?',
 'What was the outcome of the case?',
 'What was the length of the tenancy, or in other words, how long had the tenants lived at the residence in question? ',
 'What was the monthly rent?',
 'What was the amount of the rental deposit? ',
 'What was the total amount of arrears?',
 'Over how many months did the arrears accumulate? ',
 'If the tenan

In [33]:
# Questions grouped by the model used to answer them in the prediction loops.
# Each entry is the literal question text that gets embedded in the prompt.

# Questions answered with the 3-epoch LED model (`led3`).
led3_qs = ['What was the location of the landlord tenant board?',
           'Did the decision state the landlord was represented?',
           'Did the decision state that the tenant had children living with them?',
           'Was the tenant employed at the time of the hearing?',
           'Did the tenant propose a payment plan?',
           'If the tenant did propose a payment plan, did the member accept the proposed payment plan?'
          ]

# Questions for the 2-epoch LED model (`led2`); currently empty, so its
# prediction loop does nothing.
led2_qs = []

# Questions answered with the LongT5 model (`longt5`).
longt5_qs = ['Did the decision state the tenant was represented?',
             'If the tenant was not employed, did the decision state the tenant was receiving any form of government assistance (e.g. OW, childcare benefits, ODSP, OSAP)?',
             'If the tenant was employed, did the decision state any doubts about the stability of employment e.g. lack of guaranteed hours, contract work, etc.?',
             'Did the member find the tenant had sufficient income to pay rent?']


In [34]:
# Collect the case files in a stable order. os.listdir returns entries in
# arbitrary order, so sort them to keep prediction rows reproducible, and skip
# anything that is not a regular file (open() would fail on a sub-directory).
file_names = sorted(
    name for name in os.listdir(text_path)
    if os.path.isfile(os.path.join(text_path, name))
)

# The loop variable is deliberately not called `file`: that name holds the
# path of the annotation workbook, and overwriting it here would break a
# re-run of the cell that loads the dataframe.
raw_file_text = []
for case_name in file_names:
    with open(os.path.join(text_path, case_name)) as t:
        raw_file_text.append(t.read())

## Initialize the Tokenizers and Load the Models

In [35]:
# @article{Beltagy2020Longformer,
#   title={Longformer: The Long-Document Transformer},
#   author={Iz Beltagy and Matthew E. Peters and Arman Cohan},
#   journal={arXiv:2004.05150},
#   year={2020},
# }

# Each model below is built from its public base checkpoint and then has its
# weights replaced by a local state dict; map_location=device maps the loaded
# tensors onto the selected device.
# NOTE(review): torch.load unpickles the checkpoint file, so only load .pt
# files from a trusted source.
# NOTE(review): gradient_checkpointing=True / use_cache=False look like
# training-time settings carried over — confirm they are wanted for generation.

# LED (3 epochs) and the tokenizer shared by both LED models.
led_tokenizer = AutoTokenizer.from_pretrained("allenai/led-base-16384")
led3 = AutoModelForSeq2SeqLM.from_pretrained("allenai/led-base-16384", 
                                              gradient_checkpointing=True, use_cache=False)
led3.load_state_dict(torch.load('led_3epoch_law_allqs.pt', map_location=device))
led3.to(device)

# LED (2 epochs); it shares `led_tokenizer` with `led3`.
led2 = AutoModelForSeq2SeqLM.from_pretrained("allenai/led-base-16384", 
                                              gradient_checkpointing=True, use_cache=False)
led2.load_state_dict(torch.load('led_2epoch_law_allqs.pt', map_location=device))
led2.to(device)

# LongT5 (2 epochs, per the checkpoint file name) with its own tokenizer.
longt5_tokenizer = AutoTokenizer.from_pretrained("google/long-t5-local-base")
longt5 = AutoModelForSeq2SeqLM.from_pretrained("google/long-t5-local-base")
longt5.load_state_dict(torch.load('longT5_2epoch_law_allqs.pt', map_location=device))
longt5.to(device)


<All keys matched successfully>

## Prepare the Prediction Input

In [38]:
def q_prompt(raw_texts, q_lst, q_no):
    """Build one ``Question: ... Text: ...`` prompt per raw case text.

    For every document the function keeps what follows the first ``Content:``
    marker, turns newlines, tabs and non-breaking spaces into plain spaces,
    collapses short runs of spaces, and cuts the text at the first occurrence
    of ``Schedule 1``.

    Args:
        raw_texts: sequence of raw case-file strings.
        q_lst: list of question strings.
        q_no: index into ``q_lst`` of the question to ask.

    Returns:
        A list of prompt strings, one per entry of ``raw_texts`` and in the
        same order.
    """
    marker = 'Content:'
    input_texts = []

    for full_text in raw_texts:
        # str.find returns -1 when the marker is missing. Slicing from
        # -1 + len(marker) would silently drop the first 7 characters, so keep
        # the whole document in that case.
        marker_idx = full_text.find(marker)
        if marker_idx == -1:
            text = full_text
        else:
            text = full_text[marker_idx + len(marker):]

        text = text.replace('\n', ' ')
        text = text.replace('\xa0', ' ')
        text = text.replace('\t', ' ')
        # Kept exactly as written (it does not fully collapse very long runs
        # of spaces): presumably the checkpoints were fine-tuned on text
        # cleaned this way — confirm before changing.
        text = text.replace('   ', ' ').replace('  ', ' ').replace('  ', ' ').replace('  ', ' ')

        # Drop everything from the first 'Schedule 1' onwards.
        s_idx = text.find('Schedule 1')
        if s_idx != -1:
            text = text[:s_idx]

        input_texts.append(f'Question: {q_lst[q_no]} Text: {text}')

    return input_texts


def q_preprocess(raw_texts, q_lst, q_no, tokenizer):
    """Turn raw case texts into tokenized prompts for one question.

    Args:
        raw_texts: sequence of raw case-file strings.
        q_lst: list of question strings.
        q_no: index into ``q_lst`` of the question to ask.
        tokenizer: object exposing ``batch_encode_plus``.

    Returns:
        Whatever ``tokenizer.batch_encode_plus`` returns for the prompts,
        encoded without special tokens and without token type ids.
    """
    prompts = q_prompt(raw_texts, q_lst, q_no)
    encode_options = {
        "add_special_tokens": False,
        "return_token_type_ids": False,
    }
    return tokenizer.batch_encode_plus(prompts, **encode_options)
    

## Create the Dataset

In [39]:
# `collate_fn` and `answer` read a module-level `tokenizer`, but no cell in
# this notebook defines it, so a fresh kernel fails here with a NameError.
# Bind it explicitly to the LED tokenizer, which matches the (1, 2) pad/sep
# ids recorded in this cell's output.
tokenizer = led_tokenizer
PAD = tokenizer.pad_token_id
SEP = tokenizer.sep_token_id
PAD, SEP

(1, 2)

In [52]:
class CaseDataset(Dataset):
    """Map-style dataset over tokenized prompts.

    ``inputs`` must support ``inputs["input_ids"]`` and
    ``inputs["attention_mask"]``, each indexable by example position.
    """

    _FIELDS = ("input_ids", "attention_mask")

    def __init__(self, inputs):
        self.inputs = inputs

    def __len__(self):
        # One example per encoded prompt.
        return len(self.inputs["input_ids"])

    def __getitem__(self, idx):
        # Return the un-padded ids and mask of a single example.
        return {field: self.inputs[field][idx] for field in self._FIELDS}


def collate_fn(batch):
    """Pad a list of examples into rectangular batch tensors.

    Args:
        batch: list of dicts with ``input_ids`` and ``attention_mask`` lists
            (as produced by ``CaseDataset.__getitem__``).

    Returns:
        Dict with ``input_ids`` and ``attention_mask`` LongTensors, padded on
        the right to the longest example in the batch.
    """
    batch_input = [torch.LongTensor(example['input_ids']) for example in batch]
    batch_mask = [torch.LongTensor(example['attention_mask']) for example in batch]

    # Token ids are padded with the pad id of the module-level `tokenizer`.
    padded_batch_input_ids = pad_sequence(batch_input, batch_first=True,
                                          padding_value=tokenizer.pad_token_id)
    # Padding positions must be 0 in the attention mask (1 = attend,
    # 0 = ignore). -100 is the ignore index for *labels*, not a mask value.
    padded_batch_att_mask = pad_sequence(batch_mask, batch_first=True, padding_value=0)

    return {"input_ids": padded_batch_input_ids, "attention_mask": padded_batch_att_mask}


def to_device(data, device):
    """Return a new dict with every value of ``data`` moved via ``.to(device)``.

    The input dict itself is not modified.
    """
    return {key: data[key].to(device) for key in data}

## Prepare the Functions for Evaluation

In [53]:
@torch.no_grad()
def answer(model, loader):
    """Generate and decode answers for every batch yielded by ``loader``.

    Relies on the module-level ``tokenizer`` (pad id and decoding), ``device``
    and ``to_device``.

    Args:
        model: object exposing ``eval()`` and ``generate(...)``.
        loader: iterable of dicts with ``input_ids`` and ``attention_mask``.

    Returns:
        List of decoded strings, one per generated sequence, with the literal
        ``</s>``, ``<pad>`` and ``<s>`` markers removed.
    """
    model.eval()
    all_preds = []

    for batch in loader:
        batch = to_device(batch, device)
        # NOTE(review): top_k normally only matters when sampling is enabled,
        # and do_sample is not set here — confirm whether sampling was intended.
        outputs = model.generate(input_ids=batch["input_ids"],
                                 attention_mask=batch["attention_mask"],
                                 return_dict_in_generate=True,
                                 pad_token_id=tokenizer.pad_token_id,
                                 max_length=512,
                                 top_k=15)

        # Drop token id 0 from every generated sequence before decoding.
        kept_ids = [seq[seq != 0] for seq in outputs['sequences']]
        for decoded in tokenizer.batch_decode(kept_ids):
            cleaned = decoded.replace('</s>', '').replace('<pad>', '').replace('<s>', '')
            all_preds.append(cleaned)

    return all_preds


## Predict the Answers

In [54]:
def get_pred_dataloader(raw_texts, q_lst, q_no, tokenizer):
    """Build the prediction DataLoader for one question.

    Tokenizes the prompts for question ``q_lst[q_no]``, wraps them in a
    ``CaseDataset`` and serves them in order, 8 examples per batch, padded by
    ``collate_fn``.
    """
    dataset = CaseDataset(q_preprocess(raw_texts, q_lst, q_no, tokenizer))
    # shuffle=False keeps predictions aligned with the order of raw_texts.
    return DataLoader(dataset, batch_size=8, shuffle=False, collate_fn=collate_fn)

In [55]:
def answer_qs(raw_texts, q_lst, q_no, tokenizer, model):
    """Predict the answer to one question for every case text.

    ``collate_fn`` and ``answer`` take no tokenizer argument; they read a
    module-level ``tokenizer`` for the pad id and for decoding. Without
    rebinding, the ``tokenizer`` passed here would only be used for encoding,
    so a model with a different vocabulary (e.g. LongT5 vs. LED) would be
    padded and decoded with the wrong tokenizer. The module-level name is
    therefore pointed at ``tokenizer`` for the duration of the call and
    restored afterwards.

    Args:
        raw_texts: sequence of raw case-file strings.
        q_lst: list of question strings.
        q_no: index into ``q_lst`` of the question to ask.
        tokenizer: tokenizer matching ``model``.
        model: seq2seq model used for generation.

    Returns:
        List of predicted answer strings, one per entry of ``raw_texts``.
    """
    namespace = globals()
    had_previous = 'tokenizer' in namespace
    previous = namespace.get('tokenizer')
    namespace['tokenizer'] = tokenizer
    try:
        loader = get_pred_dataloader(raw_texts, q_lst, q_no, tokenizer)

        print(f'Q{q_no+1}: {q_lst[q_no]}')

        # Batches are collated while `answer` iterates the loader, so the
        # rebinding above covers both padding and decoding.
        return answer(model, loader)
    finally:
        if had_previous:
            namespace['tokenizer'] = previous
        else:
            del namespace['tokenizer']

In [56]:
# del train_df, train_loader, count_parameters
# gc.collect()

In [59]:
# Maps each question text to its list of predictions (one per case file).
preds_dict = dict()

In [62]:
# Answer every LED (3 epochs) question and store the predictions under the
# question text.
for i, question in enumerate(led3_qs):
    preds_dict[question] = answer_qs(raw_file_text, led3_qs, i, led_tokenizer, led3)
    

In [62]:
# Answer every LED (2 epochs) question; a no-op while `led2_qs` is empty.
for i, question in enumerate(led2_qs):
    preds_dict[question] = answer_qs(raw_file_text, led2_qs, i, led_tokenizer, led2)

In [62]:
# Answer every LongT5 question. Both the loop range and the result key must
# come from `longt5_qs`: looping over len(led3_qs) runs past the end of
# `longt5_qs`, and keying by `led3_qs[i]` overwrites the LED predictions with
# answers to different questions.
for i in range(len(longt5_qs)):
    preds_dict[longt5_qs[i]] = answer_qs(
        raw_file_text, longt5_qs, i, longt5_tokenizer, longt5
    )

In [61]:
# Display the predictions collected so far (question text -> list of answers).
preds_dict

{}

In [64]:
# Tabulate the predictions: one column per question, one row per case file.
# NOTE(review): this rebinds `df`, which earlier held the annotation sheet;
# cells that expect the annotations must not be re-run after this one without
# reloading the sheet.
df = pd.DataFrame(preds_dict)
df