## Imports and Device

In [1]:
# ! pip install transformers

import pandas as pd
import numpy as np
import torch
import os, gc
import re

from transformers import AutoTokenizer
from transformers import AutoModelForSeq2SeqLM

from torch import cuda, nn, optim
# from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
manual_seed = 595
torch.manual_seed(manual_seed)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


## Read the Cleaned Data

### Define the paths

In [3]:
# run locally
text_path = '../formatted_cases/'
file = '../annotated_data.xlsx'
REGEX = r';+'
sup_path = '../annotated_sup/'
multi_path = text_path + 'multiple_files/'

In [4]:
# # run on Google Colab
# from google.colab import drive
# drive.mount('/content/gdrive')
# text_path = '/content/gdrive/My Drive/595/formatted_cases/'
# file = '/content/gdrive/My Drive/595/annotated_data.xlsx'
# REGEX = r';+'
# sup_path = '/content/gdrive/My Drive/595/annotated_sup/'
# multi_path = text_path + 'multiple_files/'

### Clean the Dataframe

In [5]:
df = pd.read_excel(file)
df['What is the file number of the case?'] = df['What is the file number of the case?'].str.replace(' and ', ';')
df['What is the file number of the case?'] = df['What is the file number of the case?'].str.replace(' ', ';')
df['What is the file number of the case?'] = df['What is the file number of the case?'].str.replace('/', ';')
df['What is the file number of the case?'] = df['What is the file number of the case?'].str.strip(';')
df['What is the file number of the case?'] = df['What is the file number of the case?'].apply(lambda x: re.sub(REGEX, ';', x))
df['What is the file number of the case?'] = df['What is the file number of the case?'].str.replace('File;number:;', '')
df['What is the file number of the case?'] = df['What is the file number of the case?'].str.replace('TET-89650-18;TET-89650-18', 'TET-89650-18;TEL-90138-18')
df = df.fillna('Not stated')
df = df.replace('Not applicable', 'Not stated')
df.rename(columns={
    'If yes to the previous question, did the decision state these conditions would make moving particularly burdensome?':
    'If any of the children had mental, medical or physical conditions, did the decision state these conditions would make moving particularly burdensome?',
    'If yes to the previous question, which of the following were applicable to the tenant?':
    'If the tenant had difficulty finding alternative housing for any reason, which of the following were applicable to the tenant?'    
}, inplace=True)

df = df.iloc[:, 2:-2]

print(df.shape)
df.head(6)

(702, 50)


Unnamed: 0,What is the file number of the case?,What was the date of the hearing? [mm/dd/yyyy],What was the date of the decision? [mm/dd/yyyy],Who was the member adjudicating the decision?,What was the location of the landlord tenant board?,Did the decision state the landlord was represented?,Did the decision state the landlord attended the hearing?,Did the decision state the tenant was represented?,Did the decision state the tenant attended the hearing?,Did the decision state the landlord was a not-for-profit landlord (e.g. Toronto Community Housing)?,...,"If the tenant did propose a payment plan, did the member accept the proposed payment plan?","If a payment plan was ordered, what was the length of the payment plan?","Did the decision mention the tenant’s difficulty finding alternative housing for any reason e.g.physical limitations, reliance on social assistance, etc.?","If the tenant had difficulty finding alternative housing for any reason, which of the following were applicable to the tenant?",Did the decision state the tenant was given prior notice for the eviction?,"If the tenant was given prior notice for the eviction, how much notice was given?",Did the decisions state postponement would result in the tenant accruing additional arrears?,Which other specific applications of the landlord or the tenant were mentioned?,Did the decision mention the validity of an N4 eviction notice?,Were there detail(s) in the decision not captured by this questionnaire that should be included?
0,CEL-87788-19,2019-10-16 00:00:00,2020-06-04 00:00:00,Sonia Anwar-Ali,Toronto,Yes,Not stated,No,Not stated,No,...,Not stated,12,No,Not stated,No,Not stated,No,L2: Application to End a Tenancy and Evict a T...,No,Tenant was a single mother with no support fro...
1,CEL-90549-19,2020-01-22 00:00:00,2020-01-10 00:00:00,Shelby Whittick,Mississauga,Yes,Yes,No,Yes,No,...,No,Not stated,No,Not stated,Yes,Not stated,Yes,No other specific applications were mentioned,No,Not stated
2,TEL-94478-18,2018-10-31 00:00:00,2018-11-21 00:00:00,Ruth Carey (Vice Chair),Toronto,Yes,Yes,No,Yes,No,...,Not stated,Not stated,No,Not stated,Yes,Not stated,No,N13: Notice to End your Tenancy Because the La...,No,Previous decision TEL-92736-18 < This decision...
3,TEL-94493-18,2018-10-31 00:00:00,2018-11-21 00:00:00,Ruth Carey (Vice Chair),Toronto,Yes,Yes,No,Yes,No,...,Yes,1,No,Not stated,Yes,Not stated,No,No other specific applications were mentioned,No,There were 7 previous application for non-paym...
4,CEL-72994-18,2018-03-07 00:00:00,2018-03-14 00:00:00,Avril Cardoso,Mississauga,Yes,No,Yes,No,No,...,No,Not stated,No,Not stated,Yes,Not stated,No,No other specific applications were mentioned,No,Third Application by Landlord in past 6 months...
5,CEL-73021-18,2018-06-15 00:00:00,2018-06-18 00:00:00,Avril Cardoso,Mississauga,Yes,No,No,No,No,...,Not stated,Not stated,No,Not stated,Yes,Not stated,No,L1: Application to Evict a Tenant for Non-paym...,No,Tenant did not show up because hearing took pl...


In [6]:
# df.columns   #`Timestamp` is not the time of the case

In [7]:
df_unique = df.drop_duplicates(subset=['What is the file number of the case?'])
df_unique = df_unique.reset_index(drop=True)

print(df_unique.shape)
df_unique.head(6)

(682, 50)


Unnamed: 0,What is the file number of the case?,What was the date of the hearing? [mm/dd/yyyy],What was the date of the decision? [mm/dd/yyyy],Who was the member adjudicating the decision?,What was the location of the landlord tenant board?,Did the decision state the landlord was represented?,Did the decision state the landlord attended the hearing?,Did the decision state the tenant was represented?,Did the decision state the tenant attended the hearing?,Did the decision state the landlord was a not-for-profit landlord (e.g. Toronto Community Housing)?,...,"If the tenant did propose a payment plan, did the member accept the proposed payment plan?","If a payment plan was ordered, what was the length of the payment plan?","Did the decision mention the tenant’s difficulty finding alternative housing for any reason e.g.physical limitations, reliance on social assistance, etc.?","If the tenant had difficulty finding alternative housing for any reason, which of the following were applicable to the tenant?",Did the decision state the tenant was given prior notice for the eviction?,"If the tenant was given prior notice for the eviction, how much notice was given?",Did the decisions state postponement would result in the tenant accruing additional arrears?,Which other specific applications of the landlord or the tenant were mentioned?,Did the decision mention the validity of an N4 eviction notice?,Were there detail(s) in the decision not captured by this questionnaire that should be included?
0,CEL-87788-19,2019-10-16 00:00:00,2020-06-04 00:00:00,Sonia Anwar-Ali,Toronto,Yes,Not stated,No,Not stated,No,...,Not stated,12,No,Not stated,No,Not stated,No,L2: Application to End a Tenancy and Evict a T...,No,Tenant was a single mother with no support fro...
1,CEL-90549-19,2020-01-22 00:00:00,2020-01-10 00:00:00,Shelby Whittick,Mississauga,Yes,Yes,No,Yes,No,...,No,Not stated,No,Not stated,Yes,Not stated,Yes,No other specific applications were mentioned,No,Not stated
2,TEL-94478-18,2018-10-31 00:00:00,2018-11-21 00:00:00,Ruth Carey (Vice Chair),Toronto,Yes,Yes,No,Yes,No,...,Not stated,Not stated,No,Not stated,Yes,Not stated,No,N13: Notice to End your Tenancy Because the La...,No,Previous decision TEL-92736-18 < This decision...
3,TEL-94493-18,2018-10-31 00:00:00,2018-11-21 00:00:00,Ruth Carey (Vice Chair),Toronto,Yes,Yes,No,Yes,No,...,Yes,1,No,Not stated,Yes,Not stated,No,No other specific applications were mentioned,No,There were 7 previous application for non-paym...
4,CEL-72994-18,2018-03-07 00:00:00,2018-03-14 00:00:00,Avril Cardoso,Mississauga,Yes,No,Yes,No,No,...,No,Not stated,No,Not stated,Yes,Not stated,No,No other specific applications were mentioned,No,Third Application by Landlord in past 6 months...
5,CEL-73021-18,2018-06-15 00:00:00,2018-06-18 00:00:00,Avril Cardoso,Mississauga,Yes,No,No,No,No,...,Not stated,Not stated,No,Not stated,Yes,Not stated,No,L1: Application to Evict a Tenant for Non-paym...,No,Tenant did not show up because hearing took pl...


In [8]:
info_lst = df_unique.columns[2:-2]

raw_file_text = []

for i in range(len(df_unique)):
    file_no = df_unique.iloc[i,0]
    if not os.path.isfile(text_path+file_no+'.txt'):
        print(f'{file_no} not found. Going to the supplement directory.')
        # passed_cases.append(file_no)
        if not os.path.isfile(sup_path+file_no+'.txt'):
            print(f'{file_no} not found. Going to the multiple directory.')
            with open (multi_path+file_no+'.txt') as t:
                # file_no_lst = file_no.split(';')
                # print(file_no_lst)
                raw_file_text.append(t.read())
        else:
            with open (sup_path+file_no+'.txt') as t:
                raw_file_text.append(t.read())
    else:
        with open (text_path+file_no+'.txt') as t:
            # cases_info[-1]['text'] = t.read()
            raw_file_text.append(t.read())
            # raw_file_name.append(file_no+'.txt')

TET-89650-18;TEL-90138-18 not found. Going to the supplement directory.
TNL-00793-18;TNL-01183-18 not found. Going to the supplement directory.
TNL-00793-18;TNL-01183-18 not found. Going to the multiple directory.
TNL-03299-18;TNT-00589-17 not found. Going to the supplement directory.
TNL-03299-18;TNT-00589-17 not found. Going to the multiple directory.
TNL-04435-18;TNL-03907-18 not found. Going to the supplement directory.
HOL-02144-17;HOT-02146-17 not found. Going to the supplement directory.
TEL-87475-18;TET-86819-17;TET-88355-18 not found. Going to the supplement directory.
TEL-87475-18;TET-86819-17;TET-88355-18 not found. Going to the multiple directory.
SWL-08112-17;SWL-08113-17 not found. Going to the supplement directory.
SWL-12547-18;SWL-12548-18 not found. Going to the supplement directory.
SWL-12547-18;SWL-12548-18 not found. Going to the multiple directory.
SWL-13901-18;SWT-14627-18 not found. Going to the supplement directory.
TEL-77442-17;TET-77790-17 not found. Going to 

In [9]:
# remove columns that have too little information
little_info_col = [15, 16, 26, 27, 28, 29, 30, 31, 41, 43, 45]
to_del = [df_unique.columns[i] for i in little_info_col]
for col in to_del:
    del df_unique[col]
to_del

['If any rent increases occurred, what was the rent after the increase(s)?',
 'If any rent increases occurred, when did the rent increase(s) come into effect? ',
 'How many total children did the tenant have living with them? ',
 'How many total children aged 17 or younger did the tenant have living with them?',
 'How many total children aged 13 or younger did the tenant have living with them? ',
 'How many total children aged 4 or younger did the tenant have living with them?',
 'Did the decision state any of the children had mental, medical or physical conditions?',
 'If any of the children had mental, medical or physical conditions, did the decision state these conditions would make moving particularly burdensome?',
 'If a payment plan was ordered, what was the length of the payment plan? ',
 'If the tenant had difficulty finding alternative housing for any reason, which of the following were applicable to the tenant?',
 'If the tenant was given prior notice for the eviction, how mu

In [10]:
del to_del
gc.collect()

1055

### Split the Train Dataframe and Validation Dataframe

In [11]:
# train_df = df_unique.iloc[:620, :]
val_df = df_unique.iloc[620:, :].reset_index(drop=True)
val_df.shape

(62, 39)

## Initialize the Tokenizer and Load the Model

In [12]:
# @article{Beltagy2020Longformer,
#   title={Longformer: The Long-Document Transformer},
#   author={Iz Beltagy and Matthew E. Peters and Arman Cohan},
#   journal={arXiv:2004.05150},
#   year={2020},
# }

tokenizer = AutoTokenizer.from_pretrained("allenai/led-base-16384")
model = AutoModelForSeq2SeqLM.from_pretrained("allenai/led-base-16384", 
                                              gradient_checkpointing=True, use_cache=False)
model.load_state_dict(torch.load('../led_3epoch_law_allqs.pt', map_location=device))


<All keys matched successfully>

## Prepare the Validation Set

In [13]:
def prompt(dataframe, raw_texts):
    input_texts = []
    outputs = []
    # long_cases = 0
    
    questions = dataframe.columns
    
    for q_no in range(len(questions)):
        answers = dataframe.iloc[:,q_no]
        # print(len(raw_texts), len(answers))
        assert len(raw_texts) == len(answers)

        for i in range(len(answers)):
            full_text = raw_texts[i]
            text = full_text[full_text.find('Content:')+len('Content:'):]

            # if len(text) > 26000:
            #     # print(len(text))
            #     text = text[:26000]
            #     long_cases += 1

            text = text.replace('\n', ' ')
            text = text.replace('\xa0', ' ')
            text = text.replace('\t', ' ')
            text = text.replace('   ', ' ').replace('  ', ' ').replace('  ', ' ').replace('  ', ' ')
            # text
            # for word in stop_words:
            #     text = text.replace(' '+word+' ', ' ')

            if 'Schedule 1' in text:
                s_idx = text.find('Schedule 1')
                text = text[:s_idx]

            input_text = f'Question: {questions[q_no]} Text: {text}'  
            input_texts.append(input_text)

            output = str(answers[i])
            outputs.append(output)
        
    # print(len(input_texts), len(outputs))
    # print(input_texts[0], outputs[0]) 
    # print(long_cases)
    return input_texts, outputs


def preprocess(dataframe, tokenizer, raw_texts):
    input_texts, outputs = prompt(dataframe, raw_texts)   
    
    input_toks = tokenizer.batch_encode_plus(input_texts,
                                             add_special_tokens=False, 
                                             return_token_type_ids=False)
    output_toks = tokenizer.batch_encode_plus(outputs, 
                                              add_special_tokens=False,
                                              return_token_type_ids=False)
    # print(len(q1_train_input['input_ids']), len(q1_train_output['input_ids']))
    return input_toks, output_toks
    

In [14]:
# train_raw_texts = raw_file_text[:620]
val_raw_texts = raw_file_text[620:]
val_input, val_output = preprocess(val_df, tokenizer, val_raw_texts)

## Create the Dataset

In [15]:
PAD = tokenizer.pad_token_id
SEP = tokenizer.sep_token_id
PAD, SEP

(1, 2)

In [16]:
class CaseDataset(Dataset):

    def __init__(self, inputs, outputs):
        self.inputs = inputs
        self.outputs = outputs

    def __len__(self):
        return len(self.inputs["input_ids"])

    def __getitem__(self, idx):
        input_ids = self.inputs['input_ids'][idx]
        attention_mask = self.inputs['attention_mask'][idx]

        target_ids = self.outputs['input_ids'][idx]
        # target_attention_mask = self.outputs['attention_mask'][idx]
        return {"input_ids": input_ids, "attention_mask":attention_mask, "output_ids":target_ids}


def collate_fn(batch):
    batch_input = [torch.LongTensor(example['input_ids']) for example in batch]
    batch_output = [torch.LongTensor(example['output_ids']) for example in batch]
    batch_mask = [torch.LongTensor(example['attention_mask']) for example in batch]

    padded_batch_input_ids = pad_sequence(batch_input, batch_first=True, padding_value=tokenizer.pad_token_id)
    padded_batch_label = pad_sequence(batch_output, batch_first=True, padding_value=tokenizer.pad_token_id)
    padded_batch_att_mask = pad_sequence(batch_mask, batch_first=True, padding_value=-100)

    return {"input_ids": padded_batch_input_ids, "attention_mask": padded_batch_att_mask, "labels": padded_batch_label}


def to_device(data, device):
    new_data = {}
    for k in data:
        new_data[k] = data[k].to(device)
    return new_data

## Prepare the Functions for Evaluation

In [17]:
@torch.no_grad()
def answer(model, loader):
    all_preds = []
    all_labels = []
    model.eval()
    for batch in loader:
        batch = to_device(batch, device)
        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]
        labels = batch["labels"]
        outputs = model.generate(input_ids=input_ids, 
                                 attention_mask=attention_mask, 
                                 return_dict_in_generate=True, 
                                 pad_token_id=tokenizer.pad_token_id, 
                                 max_length=512, 
                                 top_k=15)
        
        decode_texts = tokenizer.batch_decode([l[l != 0] for l in outputs['sequences']])
        gold_texts = tokenizer.batch_decode([l[l != 0] for l in labels])
        # print(decode_texts, gold_texts)
        for gold, decode in zip(gold_texts, decode_texts):
            l = gold.replace('</s>', '').replace('<pad>','').replace('<s>', '')
            p = decode.replace('</s>', '').replace('<pad>','').replace('<s>', '')

            # if '<pad>' in gold:
            #     l_pad_idx = gold.index('<pad>')
            #     l = gold[:l_pad_idx].replace(' ', '').replace('</s>', '').replace('<pad>','').replace('<s>', '')
            # else:
            #     l = gold.replace(' ', '').replace('</s>', '').replace('<pad>','').replace('<s>', '')
            
            # if '<pad>' in decode:
            #     p_pad_idx = decode.index('<pad>')
            #     p = decode[:p_pad_idx].replace(' ', '').replace('</s>', '').replace('<pad>','').replace('<s>', '')
            # else:
            #     p = decode.replace(' ', '').replace('</s>', '').replace('<pad>','').replace('<s>', '')

            # print(l, p)
            all_labels.append(l)
            all_preds.append(p)
    
    return all_preds, all_labels


def accuracy(sys, gold):
    total = 0
    correct = 0
    for s, g in zip(sys, gold):
        if s == g:
            correct += 1
        total += 1
            
    accuracy = correct / total
    return accuracy, correct, total


## Evaluate the Model on Validation Set

In [18]:
def q_prompt(dataframe, q_no, raw_texts):
    input_texts = []
    outputs = []
    # long_cases = 0
    
    questions = dataframe.columns
    answers = dataframe.iloc[:,q_no]
    # print(len(raw_texts), len(answers))
    assert len(raw_texts) == len(answers)

    for i in range(len(answers)):
        full_text = raw_texts[i]
        text = full_text[full_text.find('Content:')+len('Content:'):]

        # if len(text) > 26000:
        #     # print(len(text))
        #     text = text[:26000]
        #     long_cases += 1

        text = text.replace('\n', ' ')
        text = text.replace('\xa0', ' ')
        text = text.replace('\t', ' ')
        text = text.replace('   ', ' ').replace('  ', ' ').replace('  ', ' ').replace('  ', ' ')
        # text
        # for word in stop_words:
        #     text = text.replace(' '+word+' ', ' ')

        if 'Schedule 1' in text:
            s_idx = text.find('Schedule 1')
            text = text[:s_idx]

        input_text = f'Question: {questions[q_no]} Text: {text}'  
        input_texts.append(input_text)

        output = str(answers[i])
        outputs.append(output)
        
    # print(len(input_texts), len(outputs))
    # print(input_texts[0], outputs[0]) 
    # print(long_cases)
    return input_texts, outputs

In [19]:
def q_preprocess(dataframe, q_no, tokenizer, raw_texts):
    input_texts, outputs = q_prompt(dataframe, q_no, raw_texts)   
    
    input_toks = tokenizer.batch_encode_plus(input_texts,
                                             add_special_tokens=False, 
                                             return_token_type_ids=False)
    output_toks = tokenizer.batch_encode_plus(outputs, 
                                              add_special_tokens=False,
                                              return_token_type_ids=False)
    # print(len(q1_train_input['input_ids']), len(q1_train_output['input_ids']))
    return input_toks, output_toks
    

In [25]:
def get_test_dataloader(df, q_no, tokenizer, raw_texts):
    input_toks, output_toks = q_preprocess(df, q_no, tokenizer, raw_texts)
    dataset = CaseDataset(input_toks, output_toks)
    dataloader = DataLoader(dataset, 
                            batch_size=4, 
                            collate_fn=collate_fn, 
                            shuffle=False)
    return dataloader

In [26]:
def answer_qs(val_df, q_no, tokenizer):
    loader = get_test_dataloader(val_df, q_no, tokenizer, val_raw_texts)
    # print(len(loader))
    
    questions = val_df.columns
    print(f'Q{q_no+1}: {questions[q_no]}')
    
    preds, golds = answer(model, loader)
    acc, correct, total = accuracy(preds, golds)
    acc = round(acc, 5)
    
    print(f"Accuracy for this question is: {acc*100}%")
    print('')
    
    return acc, preds

In [27]:
# del train_df, train_loader, count_parameters
# gc.collect()

## Evaluate

In [28]:
model.to(device)

LEDForConditionalGeneration(
  (led): LEDModel(
    (shared): Embedding(50265, 768, padding_idx=1)
    (encoder): LEDEncoder(
      (embed_tokens): Embedding(50265, 768, padding_idx=1)
      (embed_positions): LEDLearnedPositionalEmbedding(16384, 768)
      (layers): ModuleList(
        (0-5): 6 x LEDEncoderLayer(
          (self_attn): LEDEncoderAttention(
            (longformer_self_attn): LEDEncoderSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (query_global): Linear(in_features=768, out_features=768, bias=True)
              (key_global): Linear(in_features=768, out_features=768, bias=True)
              (value_global): Linear(in_features=768, out_features=768, bias=True)
            )
            (output): Linear(in_features=768, out_features=768, bias=True)
          )
     

In [29]:
acc_lst = []
with open ('LED_allqs_preds_3epochs.txt', 'w', encoding='utf-8') as p:
    for i in range(0, val_df.shape[1]):
        p.write(f'Q{i+1}: {val_df.columns[i]}\n')
        acc, preds = answer_qs(val_df, i, tokenizer)
        acc_lst.append(acc)
        print(preds)
        print(' ')
        p.write(str(preds)+'\n')
        p.write('\n')
        del preds
        gc.collect()
avg_acc = sum(acc_lst) / len(acc_lst)
avg_acc

Q1: What is the file number of the case?




Accuracy for this question is: 46.774%

['TSL-056-19', 'TEL-80084-17', 'TEL-80169-17', 'TEL-80248-17', 'TEL-80320-17', 'TEL-80483-17-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV', 'TSL-05944-19-T-T-T-T-19-T-T-T-T-T-T-T-T44-19-TL-TAL-T-T-T-T-T44-19-T-T-T-T-T-T-T-T-T-T-T-T-T-T-T-T-TNL-T-T-T-T-T-T-T-T-T-T-T-T-T-T-T-T-T-TNL-T-T-T-T-T-T-T-T-T-T-TNL-T-T-T-T-TNL-T-TNL-T-T44-T-T-TAL-TNL-TNL-T-T44-T-T-TNL-T-TNL-T-T-T44-T-T-T-T-TNL-T-T-TNL-T-T-T-T-T-T-T-TNL-T-T-T-T-TNL-T-TNL-TNL-TNL-TNL-TNL-TNL-TNL-TNL-TNL-T-TNL-05944-19-T-TNL-TNL-T-TNL-TNL-TNL-

0.6943748717948716

In [30]:
# del preds
# gc.collect()

In [31]:
# continue
# with open ('/content/gdrive/My Drive/595/longT5_allqs_preds.txt', 'a', encoding='utf-8') as p:
#     for i in range(28, val_df.shape[1]):
#         p.write(f'Q{i+1}: {val_df.columns[i]}')
#         acc, preds = answer_qs(val_df, i, tokenizer)
#         acc_lst.append(acc)
#         print(preds)
#         print(' ')
#         p.write(str(preds)+'\n')
#         p.write('\n')
# avg_acc = sum(acc_lst) / len(acc_lst)

In [32]:
assert len(acc_lst) == 39
acc_lst

[0.46774,
 0.6129,
 0.48387,
 0.75806,
 0.93548,
 0.8871,
 0.56452,
 0.30645,
 0.64516,
 0.98387,
 0.93548,
 0.56452,
 0.80645,
 0.40323,
 0.27419,
 0.14516,
 0.06452,
 0.59677,
 0.95161,
 0.91935,
 0.79032,
 0.98387,
 0.72581,
 0.75806,
 0.75806,
 0.85484,
 0.80645,
 0.82258,
 0.87097,
 0.96774,
 0.69355,
 0.82258,
 0.82258,
 0.85484,
 0.93548,
 0.48387,
 0.37097,
 0.8871,
 0.56452]

In [33]:
q1_loader = get_test_dataloader(val_df, 0, tokenizer, val_raw_texts)
# print(len(loader))

questions = val_df.columns
print(f'Q1: {questions[0]}')

preds, golds = answer(model, q1_loader)
preds, golds

Q1: What is the file number of the case?


(['TSL-056-19',
  'TEL-80084-17',
  'TEL-80169-17',
  'TEL-80248-17',
  'TEL-80320-17',
  'TEL-80483-17-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV-RV',
  'TSL-05944-19-T-T-T-T-19-T-T-T-T-T-T-T-T44-19-TL-TAL-T-T-T-T-T44-19-T-T-T-T-T-T-T-T-T-T-T-T-T-T-T-T-TNL-T-T-T-T-T-T-T-T-T-T-T-T-T-T-T-T-T-TNL-T-T-T-T-T-T-T-T-T-T-TNL-T-T-T-T-TNL-T-TNL-T-T44-T-T-TAL-TNL-TNL-T-T44-T-T-TNL-T-TNL-T-T-T44-T-T-T-T-TNL-T-T-TNL-T-T-T-T-T-T-T-TNL-T-T-T-T-TNL-T-TNL-TNL-TNL-TNL-TNL-TNL-TNL-TNL-TNL-T-TNL-05944-19-T-TNL-TNL-T-TNL-TNL-TNL-TNL-T44-T-T-TNL-T-T44-T-T-T