## Imports and Device

In [1]:
# ! pip install transformers
# ! pip3 install wandb
! pip install rouge_score

import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
import torch
import os
import re

from wordcloud import WordCloud
import matplotlib.pyplot as plt

from transformers import AutoTokenizer, LongformerTokenizer, RobertaTokenizer, LongformerModel
from transformers import AutoModelForSeq2SeqLM
# from transformers import LongformerModel

from torch import cuda, nn, optim
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
import rouge_score
# import wandb




  from .autonotebook import tqdm as notebook_tqdm


In [2]:
manual_seed = 595
torch.manual_seed(manual_seed)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cpu


## Read the Cleaned Data

### Define the paths

In [3]:
# run locally
text_path = '../formatted_cases/'
file = '../../annotated_data.xlsx'
REGEX = r';+'
sup_path = '../annotated_sup/'
multi_path = text_path + 'multiple_files/'

In [4]:
# # run on Google Colab
# from google.colab import drive
# drive.mount('/content/gdrive')
# text_path = '/content/gdrive/My Drive/595/formatted_cases/'
# file = '/content/gdrive/My Drive/595/annotated_data.xlsx'
# REGEX = r';+'
# sup_path = '/content/gdrive/My Drive/595/annotated_sup/'
# multi_path = text_path + 'multiple_files/'

In [5]:
# wandb.login()
# wandb.init(project="RTB_Cases", entity="qmygrace")

### Clean the Dataframe

In [6]:
df = pd.read_excel(file)
df['What is the file number of the case?'] = df['What is the file number of the case?'].str.replace(' and ', ';')
df['What is the file number of the case?'] = df['What is the file number of the case?'].str.replace(' ', ';')
df['What is the file number of the case?'] = df['What is the file number of the case?'].str.replace('/', ';')
df['What is the file number of the case?'] = df['What is the file number of the case?'].str.strip(';')
df['What is the file number of the case?'] = df['What is the file number of the case?'].apply(lambda x: re.sub(REGEX, ';', x))
df['What is the file number of the case?'] = df['What is the file number of the case?'].str.replace('File;number:;', '')
df['What is the file number of the case?'] = df['What is the file number of the case?'].str.replace('TET-89650-18;TET-89650-18', 'TET-89650-18;TEL-90138-18')
df = df.fillna('Not stated')
df = df.replace('Not applicable', 'Not stated')
df.rename(columns={
    'If yes to the previous question, did the decision state these conditions would make moving particularly burdensome?':
    'If any of the children had mental, medical or physical conditions, did the decision state these conditions would make moving particularly burdensome?',
    'If yes to the previous question, which of the following were applicable to the tenant?':
    'If the tenant had difficulty finding alternative housing for any reason, which of the following were applicable to the tenant?'    
}, inplace=True)

df = df.iloc[:, 2:-2]

print(df.shape)
df.head(6)

(702, 50)


Unnamed: 0,What is the file number of the case?,What was the date of the hearing? [mm/dd/yyyy],What was the date of the decision? [mm/dd/yyyy],Who was the member adjudicating the decision?,What was the location of the landlord tenant board?,Did the decision state the landlord was represented?,Did the decision state the landlord attended the hearing?,Did the decision state the tenant was represented?,Did the decision state the tenant attended the hearing?,Did the decision state the landlord was a not-for-profit landlord (e.g. Toronto Community Housing)?,...,"If the tenant did propose a payment plan, did the member accept the proposed payment plan?","If a payment plan was ordered, what was the length of the payment plan?","Did the decision mention the tenant’s difficulty finding alternative housing for any reason e.g.physical limitations, reliance on social assistance, etc.?","If the tenant had difficulty finding alternative housing for any reason, which of the following were applicable to the tenant?",Did the decision state the tenant was given prior notice for the eviction?,"If the tenant was given prior notice for the eviction, how much notice was given?",Did the decisions state postponement would result in the tenant accruing additional arrears?,Which other specific applications of the landlord or the tenant were mentioned?,Did the decision mention the validity of an N4 eviction notice?,Were there detail(s) in the decision not captured by this questionnaire that should be included?
0,CEL-87788-19,2019-10-16 00:00:00,2020-06-04 00:00:00,Sonia Anwar-Ali,Toronto,Yes,Not stated,No,Not stated,No,...,Not stated,12,No,Not stated,No,Not stated,No,L2: Application to End a Tenancy and Evict a T...,No,Tenant was a single mother with no support fro...
1,CEL-90549-19,2020-01-22 00:00:00,2020-01-10 00:00:00,Shelby Whittick,Mississauga,Yes,Yes,No,Yes,No,...,No,Not stated,No,Not stated,Yes,Not stated,Yes,No other specific applications were mentioned,No,Not stated
2,TEL-94478-18,2018-10-31 00:00:00,2018-11-21 00:00:00,Ruth Carey (Vice Chair),Toronto,Yes,Yes,No,Yes,No,...,Not stated,Not stated,No,Not stated,Yes,Not stated,No,N13: Notice to End your Tenancy Because the La...,No,Previous decision TEL-92736-18 < This decision...
3,TEL-94493-18,2018-10-31 00:00:00,2018-11-21 00:00:00,Ruth Carey (Vice Chair),Toronto,Yes,Yes,No,Yes,No,...,Yes,1,No,Not stated,Yes,Not stated,No,No other specific applications were mentioned,No,There were 7 previous application for non-paym...
4,CEL-72994-18,2018-03-07 00:00:00,2018-03-14 00:00:00,Avril Cardoso,Mississauga,Yes,No,Yes,No,No,...,No,Not stated,No,Not stated,Yes,Not stated,No,No other specific applications were mentioned,No,Third Application by Landlord in past 6 months...
5,CEL-73021-18,2018-06-15 00:00:00,2018-06-18 00:00:00,Avril Cardoso,Mississauga,Yes,No,No,No,No,...,Not stated,Not stated,No,Not stated,Yes,Not stated,No,L1: Application to Evict a Tenant for Non-paym...,No,Tenant did not show up because hearing took pl...


In [7]:
# df.columns   #`Timestamp` is not the time of the case

In [8]:
df_unique = df.drop_duplicates(subset=['What is the file number of the case?'])
df_unique = df_unique.reset_index(drop=True)

print(df_unique.shape)
df_unique.head(6)

(682, 50)


Unnamed: 0,What is the file number of the case?,What was the date of the hearing? [mm/dd/yyyy],What was the date of the decision? [mm/dd/yyyy],Who was the member adjudicating the decision?,What was the location of the landlord tenant board?,Did the decision state the landlord was represented?,Did the decision state the landlord attended the hearing?,Did the decision state the tenant was represented?,Did the decision state the tenant attended the hearing?,Did the decision state the landlord was a not-for-profit landlord (e.g. Toronto Community Housing)?,...,"If the tenant did propose a payment plan, did the member accept the proposed payment plan?","If a payment plan was ordered, what was the length of the payment plan?","Did the decision mention the tenant’s difficulty finding alternative housing for any reason e.g.physical limitations, reliance on social assistance, etc.?","If the tenant had difficulty finding alternative housing for any reason, which of the following were applicable to the tenant?",Did the decision state the tenant was given prior notice for the eviction?,"If the tenant was given prior notice for the eviction, how much notice was given?",Did the decisions state postponement would result in the tenant accruing additional arrears?,Which other specific applications of the landlord or the tenant were mentioned?,Did the decision mention the validity of an N4 eviction notice?,Were there detail(s) in the decision not captured by this questionnaire that should be included?
0,CEL-87788-19,2019-10-16 00:00:00,2020-06-04 00:00:00,Sonia Anwar-Ali,Toronto,Yes,Not stated,No,Not stated,No,...,Not stated,12,No,Not stated,No,Not stated,No,L2: Application to End a Tenancy and Evict a T...,No,Tenant was a single mother with no support fro...
1,CEL-90549-19,2020-01-22 00:00:00,2020-01-10 00:00:00,Shelby Whittick,Mississauga,Yes,Yes,No,Yes,No,...,No,Not stated,No,Not stated,Yes,Not stated,Yes,No other specific applications were mentioned,No,Not stated
2,TEL-94478-18,2018-10-31 00:00:00,2018-11-21 00:00:00,Ruth Carey (Vice Chair),Toronto,Yes,Yes,No,Yes,No,...,Not stated,Not stated,No,Not stated,Yes,Not stated,No,N13: Notice to End your Tenancy Because the La...,No,Previous decision TEL-92736-18 < This decision...
3,TEL-94493-18,2018-10-31 00:00:00,2018-11-21 00:00:00,Ruth Carey (Vice Chair),Toronto,Yes,Yes,No,Yes,No,...,Yes,1,No,Not stated,Yes,Not stated,No,No other specific applications were mentioned,No,There were 7 previous application for non-paym...
4,CEL-72994-18,2018-03-07 00:00:00,2018-03-14 00:00:00,Avril Cardoso,Mississauga,Yes,No,Yes,No,No,...,No,Not stated,No,Not stated,Yes,Not stated,No,No other specific applications were mentioned,No,Third Application by Landlord in past 6 months...
5,CEL-73021-18,2018-06-15 00:00:00,2018-06-18 00:00:00,Avril Cardoso,Mississauga,Yes,No,No,No,No,...,Not stated,Not stated,No,Not stated,Yes,Not stated,No,L1: Application to Evict a Tenant for Non-paym...,No,Tenant did not show up because hearing took pl...


In [9]:
info_lst = df_unique.columns[2:-2]

raw_file_text = []

for i in range(len(df_unique)):
    file_no = df_unique.iloc[i,0]
    if not os.path.isfile(text_path+file_no+'.txt'):
        print(f'{file_no} not found. Going to the supplement directory.')
        # passed_cases.append(file_no)
        if not os.path.isfile(sup_path+file_no+'.txt'):
            print(f'{file_no} not found. Going to the multiple directory.')
            with open (multi_path+file_no+'.txt') as t:
                # file_no_lst = file_no.split(';')
                # print(file_no_lst)
                raw_file_text.append(t.read())
        else:
            with open (sup_path+file_no+'.txt') as t:
                raw_file_text.append(t.read())
    else:
        with open (text_path+file_no+'.txt') as t:
            # cases_info[-1]['text'] = t.read()
            raw_file_text.append(t.read())
            # raw_file_name.append(file_no+'.txt')

TET-89650-18;TEL-90138-18 not found. Going to the supplement directory.
TNL-00793-18;TNL-01183-18 not found. Going to the supplement directory.
TNL-00793-18;TNL-01183-18 not found. Going to the multiple directory.
TNL-03299-18;TNT-00589-17 not found. Going to the supplement directory.
TNL-03299-18;TNT-00589-17 not found. Going to the multiple directory.
TNL-04435-18;TNL-03907-18 not found. Going to the supplement directory.
HOL-02144-17;HOT-02146-17 not found. Going to the supplement directory.
TEL-87475-18;TET-86819-17;TET-88355-18 not found. Going to the supplement directory.
TEL-87475-18;TET-86819-17;TET-88355-18 not found. Going to the multiple directory.
SWL-08112-17;SWL-08113-17 not found. Going to the supplement directory.
SWL-12547-18;SWL-12548-18 not found. Going to the supplement directory.
SWL-12547-18;SWL-12548-18 not found. Going to the multiple directory.
SWL-13901-18;SWT-14627-18 not found. Going to the supplement directory.
TEL-77442-17;TET-77790-17 not found. Going to 

In [10]:
# remove columns that have too little information
little_info_col = [15, 16, 26, 27, 28, 29, 30, 31, 41, 43, 45]
to_del = [df_unique.columns[i] for i in little_info_col]
for col in to_del:
    del df_unique[col]
to_del

['If any rent increases occurred, what was the rent after the increase(s)?',
 'If any rent increases occurred, when did the rent increase(s) come into effect? ',
 'How many total children did the tenant have living with them? ',
 'How many total children aged 17 or younger did the tenant have living with them?',
 'How many total children aged 13 or younger did the tenant have living with them? ',
 'How many total children aged 4 or younger did the tenant have living with them?',
 'Did the decision state any of the children had mental, medical or physical conditions?',
 'If any of the children had mental, medical or physical conditions, did the decision state these conditions would make moving particularly burdensome?',
 'If a payment plan was ordered, what was the length of the payment plan? ',
 'If the tenant had difficulty finding alternative housing for any reason, which of the following were applicable to the tenant?',
 'If the tenant was given prior notice for the eviction, how mu

### Split the Train Dataframe and Validation Dataframe

In [11]:
train_df = df_unique.iloc[:620, :]
val_df = df_unique.iloc[620:, :].reset_index(drop=True)
train_df.shape, val_df.shape

((620, 39), (62, 39))

In [12]:
for i, q in enumerate(train_df.columns):
    print(i, q)

0 What is the file number of the case?
1 What was the date of the hearing? [mm/dd/yyyy]
2 What was the date of the decision? [mm/dd/yyyy]
3 Who was the member adjudicating the decision?
4 What was the location of the landlord tenant board?
5 Did the decision state the landlord was represented?
6 Did the decision state the landlord attended the hearing?
7 Did the decision state the tenant was represented?
8 Did the decision state the tenant attended the hearing?
9 Did the decision state the landlord was a not-for-profit landlord (e.g. Toronto Community Housing)?
10 Did the decision state the tenant was collecting a subsidy?
11 What was the outcome of the case?
12 What was the length of the tenancy, or in other words, how long had the tenants lived at the residence in question? 
13 What was the monthly rent?
14 What was the amount of the rental deposit? 
15 What was the total amount of arrears?
16 Over how many months did the arrears accumulate? 
17 If the tenant made a payment on the ar

## Initialize the Tokenizer and the Model

In [29]:
# @article{Beltagy2020Longformer,
#   title={Longformer: The Long-Document Transformer},
#   author={Iz Beltagy and Matthew E. Peters and Arman Cohan},
#   journal={arXiv:2004.05150},
#   year={2020},
# }
from transformers import LongformerForQuestionAnswering
tokenizer1 = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')
model1 = LongformerForQuestionAnswering.from_pretrained('allenai/longformer-base-4096', gradient_checkpointing=True)

tokenizer = AutoTokenizer.from_pretrained("allenai/led-base-16384")
model = AutoModelForSeq2SeqLM.from_pretrained("allenai/led-base-16384", gradient_checkpointing=True, use_cache=False)

# ref: https://colab.research.google.com/github/patrickvonplaten/notebooks/blob/master/Fine_tune_Longformer_Encoder_Decoder_(LED)_for_Summarization_on_pubmed.ipynb#scrollTo=jpUr9QeebZ-n
model.to(device)

Some weights of the model checkpoint at allenai/longformer-base-4096 were not used when initializing LongformerForQuestionAnswering: ['lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing LongformerForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing LongformerForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of LongformerForQuestionAnswering were not initialized from the model checkpoint at allenai/longformer-base-4096 and are newly initialized: ['qa_outputs.weight', 'qa_outputs.bias']
You should probably TRAIN this mo

LEDForConditionalGeneration(
  (led): LEDModel(
    (shared): Embedding(50265, 768, padding_idx=1)
    (encoder): LEDEncoder(
      (embed_tokens): Embedding(50265, 768, padding_idx=1)
      (embed_positions): LEDLearnedPositionalEmbedding(16384, 768)
      (layers): ModuleList(
        (0): LEDEncoderLayer(
          (self_attn): LEDEncoderAttention(
            (longformer_self_attn): LEDEncoderSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (query_global): Linear(in_features=768, out_features=768, bias=True)
              (key_global): Linear(in_features=768, out_features=768, bias=True)
              (value_global): Linear(in_features=768, out_features=768, bias=True)
            )
            (output): Linear(in_features=768, out_features=768, bias=True)
          )
          (

In [14]:
# from transformers import AutoModelForQuestionAnswering, AutoTokenizer
# # @article{Beltagy2020Longformer,
# #   title={Longformer: The Long-Document Transformer},
# #   author={Iz Beltagy and Matthew E. Peters and Arman Cohan},
# #   journal={arXiv:2004.05150},
# #   year={2020},
# # }

# # tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
# tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
# # model = AutoModel.from_pretrained("allenai/longformer-base-4096")


## A Test before Finetuning

### Longformer for Question Answering

In [91]:
def prompt(dataframe, q_no, raw_texts):
    input_texts = []
    outputs = []
    # long_cases = 0
    
    questions = dataframe.columns
    answers = dataframe.iloc[:,q_no]
    # print(len(raw_texts), len(answers))
    assert len(raw_texts) == len(answers)

    for i in range(len(answers)):
        full_text = raw_texts[i]
        text = full_text[full_text.find('Content:')+len('Content:'):]
        
        # if len(text) > 26000:
        #     # print(len(text))
        #     text = text[:26000]
        #     long_cases += 1
        
        text = text.replace('\n', ' ')
        text = text.replace('\xa0', ' ')
        text = text.replace('\t', ' ')
        text = text.replace('   ', ' ').replace('  ', ' ').replace('  ', ' ').replace('  ', ' ')
        # text
        # for word in stop_words:
        #     text = text.replace(' '+word+' ', ' ')
        
        if 'Schedule 1' in text:
            s_idx = text.find('Schedule 1')
            text = text[:s_idx]
        
        input_text = text + '\n' + questions[q_no]
        input_texts.append(input_text)
        
        output = str(answers[i])
        outputs.append(output)
        
    # print(len(input_texts), len(outputs))
    # print(input_texts[0], outputs[0]) 
    # print(long_cases)
    return input_texts, outputs

In [92]:
# result = tokenizer.decode(tokenizer.convert_tokens_to_ids(output))
# result

In [79]:
# Another way
q1_lst, a1_lst = prompt(df_unique, 0, raw_file_text)
q1 = q1_lst[0]
a1 = a1_lst[0]
# # print(q1)
print(a1)
encoding = tokenizer1.encode_plus(text=q1,
                                 text_pair=a1) 
                                 # add_special=True)
inputs = torch.LongTensor(encoding['input_ids']).unsqueeze(0)  #Token embeddings
attention_mask = torch.LongTensor(encoding['attention_mask']).unsqueeze(0)
# print(len(attention_mask))

# sentence_embedding = encoding['token_type_ids']  #Segment embeddings
tokens = tokenizer.convert_ids_to_tokens(encoding['input_ids']) #input tokens
outputs = model1(input_ids=inputs, 
                attention_mask=attention_mask)
start_scores, end_scores = outputs[0], outputs[1]
answer_tokens = tokens[torch.argmax(start_scores):torch.argmax(end_scores)+1]
answer = tokenizer1.decode(tokenizer1.convert_tokens_to_ids(answer_tokens))
answer

682 682
CEL-87788-19


''

In [22]:
torch.argmax(start_scores),torch.argmax(end_scores)

(tensor(567), tensor(382))

In [23]:
tokens[619:630]

['Ġis',
 'Ġalso',
 'Ġable',
 'Ġto',
 'Ġpay',
 'Ġthe',
 'Ġfiling',
 'Ġfee',
 'Ġimmediately',
 '.',
 'ĠShe']

In [350]:
q3[2701:3000]

'is also able to pay the filing fee immediately. She testified that she had no where else to go. 8. On cross-examination, the Tenant confirmed she lived with her mother before she moved into the rental unit. 9. The Landlord opposes the Tenant’s request for relief as the Tenant has been late in payin'

It shows that the encoder models can not really get what we need for most columns. Therefore we will

###  Longformer Encoder-Decoder (LED) 

In [93]:
q1_lst, a1_lst = prompt(df_unique, 0, raw_file_text)
q1 = q1_lst[0]
a1 = a1_lst[0]
# # print(q1)
input_encoding = tokenizer(q1)
output_encoding = tokenizer(a1)
input_ids = torch.LongTensor(encoding['input_ids']).unsqueeze(0)  # batch of size 1
attention_mask = torch.LongTensor(encoding['attention_mask']).unsqueeze(0)
# attention_mask[:, [1, 4, 21,]] =  2  # Set global attention based on the task. For example,
                                     # classification: the <s> token
                                     # QA: question tokens
print(input_ids.shape, attention_mask.shape)
# input_ids, attention_mask = pad_to_window_size(
#         input_ids, attention_mask, config.attention_window[0], tokenizer.pad_token_id)

output = model.generate(
    input_ids=input_ids, 
    attention_mask=attention_mask,
    return_dict_in_generate=True, 
    output_scores=False, 
    max_length=512,
    temperature=0.5,
    do_sample=True,
    repetition_penalty=3.0,
    top_k=10)


torch.Size([1, 1360]) torch.Size([1, 1360])


In [94]:
tokenizer.batch_decode(output['sequences'])

['</s>CEL-63019,18.RV;LK:92417–16 LAMO –6939 RTSMIN’S SIRZ and the OJT MST(14)N).A/ELE6338 -98 (IWD), 2017 or 18 GALF \'15.)BIS JES PIEVO\'68 —GAS Q5980 to 17 BTR DTHEN for a CYS99 by 1651 in NINEX!Q107 was sol ”13 of that which has been given at TDA but had also be called on 129433 is 19 IHU with VILs836.PRAEMORY AMROBL will ELA6770?21 AERRY YOMED out as well over his WADERS OF 2018 KRI664 from ELVIXX-20 FSL0797 m23 HRE6500 Elleb61 2019 because it would have an 84984 l987 if he were not only her own time up this year after 157311 shea one May 21th just about him all right before June 23er 2016 while we are still no more than 2040 then meesor 48331-22—HT95 13*37"ETTY88."TheRSL said its most value there per eu9652.0-\'90\'.FLOW72nd2018 UARMO\'s 14059 may 2020-06LDSr1e LL24856-2019AVUELL-12\')DRESSMANLY-05666#2_30-3HL,"El69762\');44-6032".PRESTONESE RO12034-2017-10 SIL100);4MRTO-"7958";47EMS")TD76035-27.66 ARSPELLIES ON MATTLOSE-26 "MARTA",755-08.\'MSHAIGLES The followingly am64041-50-CA

## Preprocessing

In [None]:
def preprocess(dataframe, q_no, tokenizer, raw_texts):
    input_texts, outputs = prompt(dataframe, q_no, raw_texts)   
    
    input_toks = tokenizer.batch_encode_plus(input_texts,
                                             add_special_tokens=False, 
                                             return_token_type_ids=False)
    output_toks = tokenizer.batch_encode_plus(outputs, 
                                              add_special_tokens=False,
                                              return_token_type_ids=False)
    # print(len(q1_train_input['input_ids']), len(q1_train_output['input_ids']))
    return input_toks, output_toks
    

In [None]:
train_raw_texts = raw_file_text[:620]
val_raw_texts = raw_file_text[620:]

In [None]:
# q1_val = preprocess(val_df, 1, tokenizer, val_raw_texts)

In [None]:
# q1_train_input, q1_train_output = preprocess(train_df, 0, tokenizer, train_raw_texts)
# q1_val_input, q1_val_output = preprocess(val_df, 0, tokenizer, val_raw_texts)

In [None]:
q1_train_input, q1_train_output = preprocess(train_df, 0, tokenizer, train_raw_texts)
q1_val_input, q1_val_output = preprocess(val_df, 0, tokenizer, val_raw_texts)

In [None]:
len(q1_train_input['input_ids']), len(q1_train_output['input_ids'])

In [101]:
# for BertTokenizer
print("Input length:", len(q1_train_input))
# print(" ")
print("Input example:\n", tokenizer.decode(q1_train_input['input_ids'][0])[-100:])
print(" ")
print("Input ID example:\n", q1_train_input['input_ids'][0][-100:])
print(" ")
print("Tokens:\n", [tokenizer.convert_ids_to_tokens(id) for id in q1_train_input['input_ids'][0]][-100:])
print(" ")
print("Attention Mask:", q1_train_input['attention_mask'][0][-100:])

Input length: 2
Input example:
  this order, call 416-645-8080 or toll free at 1-888-332-3234. 
What is the file number of the case?
 
Input ID example:
 [5, 2394, 3973, 4, 1437, 1437, 1437, 1437, 502, 204, 6, 2760, 1437, 1437, 1437, 42199, 43401, 10566, 19285, 6796, 1437, 1437, 23961, 660, 5557, 12, 37358, 1437, 1437, 1437, 1437, 1437, 10153, 6, 3192, 30669, 8, 4527, 927, 1785, 2177, 953, 12, 8727, 132, 21138, 4079, 1245, 2666, 6, 7545, 132, 2177, 5121, 256, 134, 510, 246, 717, 406, 318, 47, 33, 143, 1142, 59, 42, 645, 6, 486, 34509, 12, 33611, 12, 2940, 2940, 50, 5831, 481, 23, 112, 12, 22410, 12, 33911, 12, 246, 28621, 4, 1437, 50118, 2264, 16, 5, 2870, 346, 9, 5, 403, 116]
 
Tokens:
 ['Ġthe', 'Ġbalance', 'Ġoutstanding', '.', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'ĠJune', 'Ġ4', ',', 'Ġ2020', 'Ġ', 'Ġ', 'Ġ', '________________', '_______', 'ĠDate', 'ĠIss', 'ued', 'Ġ', 'Ġ', 'ĠSonia', 'ĠAn', 'war', '-', 'Ali', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'ĠMember', ',', 'ĠLand', 'lord', 'Ġand', 'ĠTen', 'ant', 'ĠBoard', 'ĠTor

## Create the Dataset

In [102]:
PAD = tokenizer.pad_token_id
SEP = tokenizer.sep_token_id
PAD, SEP

(1, 2)

In [103]:
# for BertTokenizer
class CaseDataset(Dataset):

    def __init__(self, inputs, outputs):
        self.inputs = inputs
        self.outputs = outputs

    def __len__(self):
        return len(self.inputs["input_ids"])

    def __getitem__(self, idx):
        input_ids = self.inputs['input_ids'][idx]
        attention_mask = self.inputs['attention_mask'][idx]

        target_ids = self.outputs['input_ids'][idx]
        # target_attention_mask = self.outputs['attention_mask'][idx]
        return {"input_ids": input_ids, "attention_mask":attention_mask, "output_ids":target_ids}


def collate_fn(batch):
    batch_input = [torch.LongTensor(example['input_ids']) for example in batch]
    batch_output = [torch.LongTensor(example['output_ids']) for example in batch]
    batch_mask = [torch.LongTensor(example['attention_mask']) for example in batch]

    padded_batch_input_ids = pad_sequence(batch_input, batch_first=True, padding_value=tokenizer.pad_token_id)
    padded_batch_label = pad_sequence(batch_output, batch_first=True, padding_value=tokenizer.pad_token_id)
    padded_batch_att_mask = pad_sequence(batch_mask, batch_first=True, padding_value=-100)

    return {"input_ids": padded_batch_input_ids, "attention_mask": padded_batch_att_mask, "labels": padded_batch_label}


def to_device(data, device):
    new_data = {}
    for k in data:
        new_data[k] = data[k].to(device)
    return new_data

## Training

In [104]:
# Experiment
q1_train_dataset = CaseDataset(q1_train_input, q1_train_output)
q1_train_loader = DataLoader(q1_train_dataset, batch_size=1, collate_fn=collate_fn, shuffle=False)

q1_val_dataset = CaseDataset(q1_val_input, q1_val_output) 
q1_val_loader = DataLoader(q1_val_dataset, batch_size=1, collate_fn=collate_fn, shuffle=False)


In [105]:
def train(model:nn.Module, train_loader:DataLoader, optimizer:optim.Optimizer, log_step=50):
    model.train()
    epoch_loss = 0.0
    log_loss = 0.0
    for idx, batch in enumerate(train_loader):
        # try:
        model.zero_grad()
        batch = to_device(batch, device)
        loss = model(**batch).loss
        # print(loss)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
        log_loss += loss.item()

        # wandb.log({'batch':idx, 'train_loss': loss.item()})
        # wandb.log({'batch':idx, 'accumulated_train_loss_in_this_Q': log_loss})

        if idx % log_step == 0:
            print(f"Train Step: {idx} Loss: {log_loss / log_step}")
            log_loss = 0.0
        # except:
        #     print(f'The text is too long. Passing for now. Step No: {idx}')
        #     pass

    return epoch_loss / len(train_loader)
        

@torch.no_grad()
def evaluate(model:nn.Module, eval_loader:DataLoader):
    eval_loss = 0.0
    correct = 0
    total = 0
    model.eval()
    for batch in eval_loader:
        batch = to_device(batch, device)
        output = model(**batch)
        loss = output.loss
        eval_loss += loss.item()
        pred = output.logits.argmax(-1)
        label = batch["labels"]
        correct += torch.where(label!=-100, pred==label, 0).sum().item()
        total += torch.sum(label != -100).item()
    
    print(total, correct)

    eval_acc = correct / total
    eval_loss = eval_loss / len(eval_loader) 
    return eval_acc, eval_loss


In [106]:
# os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

In [107]:
# experiment
epochs = 1
optimizer = optim.Adam(model.parameters(), lr=5e-5)

model.train()

for epoch in range(epochs):
    print(f"Training Question 1")
    
    train_loss = train(model, q1_train_loader, optimizer)
    print(f"Epoch {epoch+1} Training Loss: {train_loss}")

    eval_acc, eval_loss = evaluate(model, q1_val_loader)
    print(f"Epoch {epoch} Eval Acc: {eval_acc}; Eval Loss: {eval_loss}")


Training Question 1
Train Step: 0 Loss: 0.04166695117950439
Train Step: 50 Loss: 1.1681227258592843
Train Step: 100 Loss: 0.6136457686498761


KeyboardInterrupt: 

In [110]:
def get_dataloader(df, q_no, tokenizer, raw_texts):
    input_toks, output_toks = preprocess(df, q_no, tokenizer, raw_texts)
    dataset = CaseDataset(input_toks, output_toks)
    dataloader = DataLoader(dataset, 
                            batch_size=2, 
                            collate_fn=collate_fn, 
                            shuffle=False)
    return dataloader
    
def train_qs(train_df, val_df, q_no, tokenizer, optimizer):
    train_loader = get_dataloader(train_df, q_no, tokenizer, train_raw_texts)
    val_loader = get_dataloader(val_df, q_no, tokenizer, val_raw_texts)
    
    questions = train_df.columns
    print(f'{q_no+1}. {questions[q_no]}')
    
    # train 1 epoch only, given the small data
    train_loss = train(model, train_loader, optimizer)
    print(f"Question {q_no+1} Training Loss: {train_loss}")
    
    eval_acc, eval_loss = evaluate(model, val_loader)
    print(f"Question {q_no+1} Eval Acc: {eval_acc}; Eval Loss: {eval_loss}")
    
    print('')

In [111]:
# starting from 1 because the first question has been trained on
for i in range(1, train_df.shape[1]):
    train_qs(train_df, val_df, i, tokenizer, optimizer)

2. What was the date of the hearing? [mm/dd/yyyy]
Train Step: 0 Loss: 0.1911672592163086


KeyboardInterrupt: 

## Evaluate the Model on Validation Set

In [None]:
@torch.no_grad()
def answer(model, loader):
    all_preds = []
    all_labels = []
    model.eval()
    for batch in loader:   
        batch = to_device(batch, device)
        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]
        
        # pos_ids = batch["position_ids"]
        labels = batch["labels"]
        outputs = model.generate(input_ids=input_ids, #truncated_input_ids, 
                                 attention_mask=attention_mask, 
                                 return_dict_in_generate=True, 
                                 pad_token_id=50256,  # eos_token_id
                                 max_length=1024, 
                                 top_k=10) 
                                 #stopping_criteria=stop_criteria_list)
        
        pred_start = torch.nonzero(input_ids==SEP, as_tuple=True)[1][0] + 1
        truncated_outputs = []
        for out in outputs["sequences"]:
            sep_idxs = torch.nonzero(out==SEP, as_tuple=True)[0]
            if len(sep_idxs) == 1:
                end_idx = -1
            else:
                end_idx = sep_idxs[1]
            truncated_outputs.append(out[pred_start:end_idx])
        
        decode_texts = tokenizer.batch_decode(truncated_outputs)
        gold_texts = tokenizer.batch_decode([l[l != -100][:-1] for l in labels])

        for gold, decode in zip(gold_texts, decode_texts):
            all_labels.append(gold)
            all_preds.append(decode)
        # all_preds = process_sys(all_preds)
    
    return all_preds, all_labels


def accuracy(sys, gold):
    total = 0
    correct = 0
    for s, g in zip(sys, gold):
        if s == g:
            correct += 1
        total += 1
            
    accuracy = correct / total
    return accuracy, correct, total


def left_pad_sequence(sequence, batch_first, padding_value=0):
    padded = []
    max_len = max(len(each) for each in sequence)
    for each in sequence:
        if not isinstance(each, torch.LongTensor):
            each = torch.LongTensor(each)
        pad = torch.full((max_len-len(each),), fill_value=padding_value,dtype=each.dtype)
        padded.append(torch.cat([pad, each]))
    padded = torch.vstack(padded)
    if not batch_first:
        padded = padded.permute(1, 0, 2)
    return padded
        
def inference_colate_fn(batch):
    batch_input_ids = [torch.LongTensor(example["input_ids"]) for example in batch]
    batch_att_mask = [torch.LongTensor(example["attention_mask"]) for example in batch]
    batch_label = [torch.LongTensor(example["labels"]) for example in batch]
    # batch_position_ids = [torch.arange(len(each["input_ids"]), dtype=torch.long) for each in batch]
    
    padded_batch_input_ids = left_pad_sequence(batch_input_ids, batch_first=True, padding_value=PAD)
    padded_batch_att_mask = left_pad_sequence(batch_att_mask, batch_first=True, padding_value=PAD)
    padded_batch_label = pad_sequence(batch_label, batch_first=True, padding_value=-100)
    # padded_batch_position_ids = left_pad_sequence(batch_position_ids, batch_first=True, padding_value=0)
    # return {"input_ids": padded_batch_input_ids, "attention_mask": padded_batch_att_mask, "position_ids":padded_batch_position_ids, "labels": padded_batch_label}   
    return {"input_ids": padded_batch_input_ids, "attention_mask": padded_batch_att_mask, "labels": padded_batch_label}    


In [None]:
# def preprocess_pred(dataframe, q_no, tokenizer, raw_texts):
#     input_texts, outputs = prompt(dataframe, q_no, raw_texts)   
        
#     # # for AutoTokenizer
#     # concat_inputs = tokenizer(
#     #     input_texts, #outputs, 
#     #     return_token_type_ids=False
#     # ) 
    
#     # for BertTokenizer
#     test = tokenizer(
#         input_texts, 
#         return_token_type_ids=False
#     )
    
#     # val = tokenizer(
#     #     input_texts[620:], outputs[620:], 
#     #     return_token_type_ids=False
#     # )
    
#     #  concat_inputs = [tokenizer(
#     #     input_text, output, 
#     #     return_token_type_ids=False
#     # ) for input_text, output in zip(input_texts, outputs)]
    
#     return test

def get_test_dataloader(df, q_no, tokenizer, raw_texts):
    data = preprocess(df, q_no, tokenizer, raw_texts)
    dataset = CaseDataset(data)
    dataset.inference()
    dataloader = DataLoader(dataset, 
                            batch_size=32, 
                            collate_fn=collate_fn, 
                            shuffle=False)
    return dataloader

In [None]:
def answer_qs(val_df, q_no, tokenizer):
    loader = get_test_dataloader(val_df, q_no, tokenizer, val_raw_texts)
    # print(len(loader))
    
    questions = val_df.columns
    print(questions[q_no])
    
    preds, golds = answer(model, loader)
    acc, correct, total = accuracy(preds, golds)
    acc = round(acc, 5)
    
    print(f"Accuracy for this question is: {acc*100}%")
    print('')
    
    return acc, preds

In [None]:
acc_lst = []
preds_lst = []
for i in range(0, val_df.shape[1]):
    acc, preds = answer_qs(val_df, i, tokenizer)
    acc_lst.append(acc)
    preds_lst.append(preds)
avg_acc = sum(acc_lst) / len(acc_lst)

In [None]:
assert len(acc_lst) == 52
acc_lst

In [None]:
preds_lst[0]

In [None]:
torch.save(model.state_dict(), "gpt2_1epoch_law.pt")