# Pre-Processing to Match Annotated Data
- Ultimate goal: Match everything as closely as possible (even if it doesn't always make sense to)

## Imports

In [122]:
import os
import pandas as pd
import time
import numpy as np
from collections import deque

# Gold Data (Annotated by Partner)

## Renaming `gold_data` columns

In [123]:
import pandas as pd
gold_data = pd.read_csv("data/gold_labels_with_files.csv") # with partner annotations
# gold_data = pd.read_csv('data/allard_labels_with_text.csv') # allard annotations

# making new names for the columns in gold_data

new_names = {
    'Timestamp': 'timestamp',
    'Email Address': 'email_address',
    'What is the file number of the case?': 'file_number_gold',
    'What was the date of the hearing? [mm/dd/yyyy]': 'hearing_date',
    'What was the date of the decision? [mm/dd/yyyy]': 'decision_date',
    'Who was the member adjudicating the decision?': 'adjudicating_member',
    'What was the location of the landlord tenant board?': 'ltb_location',
    'Did the decision state the landlord was represented?': 'landlord_represented',
    'Did the decision state the landlord attended the hearing?': 'landlord_attended_hearing',
    'Did the decision state the tenant was represented?': 'tenant_represented',
    'Did the decision state the tenant attended the hearing?': 'tenant_attended_hearing',
    'Did the decision state the landlord was a not-for-profit landlord (e.g. Toronto Community Housing)?': 'landlord_nonprofit',
    'Did the decision state the tenant was collecting a subsidy?': 'tenant_collecting_subsidy',
    'What was the outcome of the case?': 'case_outcome',
    'What was the length of the tenancy, or in other words, how long had the tenants lived at the residence in question? ': 'tenancy_length',
    'What was the monthly rent?': 'monthly_rent',
    'What was the amount of the rental deposit? ': 'rental_deposit',
    'If any rent increases occurred, what was the rent after the increase(s)?': 'rent_after_increase',
    'If any rent increases occurred, when did the rent increase(s) come into effect? ': 'rent_increase_effect_date',
    'What was the total amount of arrears?': 'total_arrears',
    'Over how many months did the arrears accumulate? ': 'arrears_duration',
    'If the tenant made a payment on the arrears after the eviction notice was served and/or prior to the hearing, what was the amount of the payment? ': 'arrears_payment_amount',
    'Did the decision mention a history of arrears by the tenant separate from the arrears in the current claim (more than one period of arrears, recurrently coming in and out of arrears, arrears with previous landlord, etc.)?': 'tenant_arrears_history_mentioned',
    'If the tenant had a history of arrears, did the decision mention a history of the tenant making payments on those arrears (separate from any payments made in response to the present eviction notice/hearing)?': 'tenant_arrears_payment_history_mentioned',
    'How frequently were rent payments made late?': 'rent_payments_late_frequency',
    'Did the member find the tenant had or seemed to have the ability to pay rent, but chose not do so?': 'tenant_ability_to_pay_rent',
    'What were the specific mental, medical, or physical conditions of the tenant, if any? ': 'tenant_conditions',
    'Did the decision state that the tenant had children living with them?': 'tenant_children_present',
    'How many total children did the tenant have living with them? ': 'total_children',
    'How many total children aged 17 or younger did the tenant have living with them?': 'children_17_or_younger',
    'How many total children aged 13 or younger did the tenant have living with them? ': 'children_13_or_younger',
    'How many total children aged 4 or younger did the tenant have living with them?': 'children_4_or_younger',
    'Did the decision state any of the children had mental, medical or physical conditions?': 'children_conditions_mentioned',
    'If yes to the previous question, did the decision state these conditions would make moving particularly burdensome?': 'conditions_making_moving_burdensome',
    'Was the tenant employed at the time of the hearing?': 'tenant_employed',
    'If the tenant was not employed, did the decision state the tenant was receiving any form of government assistance (e.g. OW, childcare benefits, ODSP, OSAP)?': 'tenant_government_assistance',
    'If the tenant was employed, did the decision state any doubts about the stability of employment e.g. lack of guaranteed hours, contract work, etc.?': 'employment_stability_doubts',
    'Did the member find the tenant had sufficient income to pay rent?': 'sufficient_income_to_pay_rent',
    'What was the total income of the tenant’s household? ': 'total_household_income',
    'Did the decision mention the tenant lost their job leading up to or during the period of the hearing?': 'tenant_job_loss_mentioned',
    'Did the decision mention any other extenuating circumstances experienced by the tenant leading up to or during the period of the claim (e.g. hospitalization, death in the family, etc.)?': 'tenant_extenuating_circumstances',
    'Did the tenant propose a payment plan?': 'tenant_proposed_payment_plan',
    'If the tenant did propose a payment plan, did the member accept the proposed payment plan?': 'accepted_proposed_payment_plan',
    'If a payment plan was ordered, what was the length of the payment plan? ': 'payment_plan_length',
    'Did the decision mention the tenant’s difficulty finding alternative housing for any reason e.g.physical limitations, reliance on social assistance, etc.?': 'tenant_difficulty_finding_housing',
    'If yes to the previous question, which of the following were applicable to the tenant?': 'applicable_difficulty_reasons',
    'Did the decision state the tenant was given prior notice for the eviction?': 'tenant_prior_notice_given',
    'If the tenant was given prior notice for the eviction, how much notice was given?': 'prior_notice_duration',
    'Did the decisions state postponement would result in the tenant accruing additional arrears?': 'postponement_additional_arrears',
    'Which other specific applications of the landlord or the tenant were mentioned?': 'mentioned_applications',
    'Did the decision mention the validity of an N4 eviction notice?': 'validity_of_N4_notice_mentioned',
    'Were there detail(s) in the decision not captured by this questionnaire that should be included?': 'additional_details_in_decision',
    'Exec Review': 'executive_review',
    'Review Status': 'review_status'
}

gold_data = gold_data.rename(columns = new_names)
# sorting by file_number -- so that ordering of the new data annotations can match this and be more easily compared
gold_data = gold_data.sort_values(by = ['file_number_gold'], ascending = True).reset_index(drop = True)
gold_data.columns

Index(['file_number_gold', 'file_number_gold_cleaned', 'raw_file_text',
       'raw_file_name', 'timestamp', 'email_address', 'hearing_date',
       'decision_date', 'adjudicating_member', 'ltb_location',
       'landlord_represented', 'landlord_attended_hearing',
       'tenant_represented', 'tenant_attended_hearing', 'landlord_nonprofit',
       'tenant_collecting_subsidy', 'case_outcome', 'tenancy_length',
       'monthly_rent', 'rental_deposit', 'rent_after_increase',
       'rent_increase_effect_date', 'total_arrears', 'arrears_duration',
       'arrears_payment_amount', 'tenant_arrears_history_mentioned',
       'tenant_arrears_payment_history_mentioned',
       'rent_payments_late_frequency', 'tenant_ability_to_pay_rent',
       'tenant_conditions', 'tenant_children_present', 'total_children',
       'children_17_or_younger', 'children_13_or_younger',
       'children_4_or_younger', 'children_conditions_mentioned',
       'conditions_making_moving_burdensome', 'tenant_employed',

In [124]:
from dateutil.parser import parse

def convert_to_datetime(date_str):
    # Parse date using dateutil.parser.parse
    dt = parse(date_str)
    
    # Format date with strftime in the format 'MM/DD/YYYY'
    return dt.strftime('%m/%d/%Y')

for col in gold_data.columns:
    # strip all whitespace from the beginning and end of the string
    if type(gold_data[col][0]) == str:
        gold_data[col] = gold_data[col].str.strip()
        # gold_data[col].apply(lambda x: x.replace(" ", ";").replace(";;",)

gold_data['file_number'] = gold_data['file_number_gold_cleaned']
gold_data['file_number'] = gold_data['file_number'].apply(lambda x: x.strip().replace(" ", ";").replace(";;", ";"))
gold_data['hearing_date'] = gold_data['hearing_date'].apply(lambda x: convert_to_datetime(x))
gold_data['decision_date'] = gold_data['decision_date'].apply(lambda x: convert_to_datetime(x))
gold_data['hearing_date']

0      07/05/2018
1      05/19/2017
2      06/30/2017
3      07/07/2017
4      08/15/2017
          ...    
674    11/20/2018
675    11/23/2018
676    12/04/2018
677    11/02/2018
678    09/03/2019
Name: hearing_date, Length: 679, dtype: object

In [125]:
gold_data['file_number']

0         SWL-17348-18
1         TEL-79722-17
2         TEL-80773-17
3      TEL-81359-17-AM
4         TEL-81405-17
            ...       
674       TSL-99691-18
675       TSL-99824-18
676       TSL-99900-18
677       TSL-99965-18
678    TST-06337-19-IN
Name: file_number, Length: 679, dtype: object

# Silver Data
- Only 678 of 702 case files match

In [126]:
# # formatted_cases_path = "/Users/kmaurinjones/Desktop/School/UBC/UBC_Coursework/capstone/Allard_A_Capstone/scraping/45k_formatted_cases/"
# folder_path = "./raw_case_files/"

# # def create_master_dictionary(directory):
# master_dict = {}
# master_dict['raw_file_name'] = []
# master_dict['raw_file_text'] = []

# # Iterate over .txt files in the folder
# for file_name in os.listdir(folder_path):
#     file_path = os.path.join(folder_path, file_name)
    
#     # Check if the file is a .txt file
#     if os.path.isfile(file_path) and file_name.endswith('.txt'):
        
#         # Read the contents of the .txt file
#         with open(file_path, 'r') as file:
#             contents = file.read()
            
#             # Append the contents to the list in the master_dict
#             master_dict['raw_file_name'].append(file_name)
#             master_dict['raw_file_text'].append(contents)
# # master_dict

In [127]:
# for key, value in master_dict.items():
#     print(key, len(value))

# silver_data = pd.DataFrame.from_dict(master_dict)
# print(silver_data.shape)
# silver_data.head()

In [128]:
# matched_files = []
# print(len(silver_data))
# for silver_fn in silver_data.raw_file_name.unique().tolist():
#     if silver_fn[:-4] in gold_data.file_number.unique().tolist():
#         # print(silver_fn)
#         matched_files.append(silver_fn)

# silver_data = silver_data[silver_data.raw_file_name.isin(matched_files)]
# print(len(silver_data))
# silver_data = silver_data.sort_values(by = ['raw_file_name'], ascending = True).reset_index(drop = True)
# silver_data

## Creating `silver_data` df from `gold_data` raw text

In [129]:
silver_data = gold_data.copy()
silver_data = silver_data.drop(columns = [col for col in silver_data.columns if col not in ['raw_file_name', 'raw_file_text']])
silver_data

Unnamed: 0,raw_file_text,raw_file_name
0,Metadata:\nDate:\t2018-07-06\nFile number:\t\n...,SWL-17348-18.txt
1,Metadata:\nDate:\t2017-05-26\nFile number:\t\n...,TEL-79722-17.txt
2,Metadata:\nDate:\t2017-07-05\nFile number:\t\n...,TEL-80773-17.txt
3,Metadata:\nDate:\t2017-07-18\nFile number:\t\n...,TEL-81359-17-AM.txt
4,Metadata:\nDate:\t2017-08-17\nFile number:\t\n...,TEL-81405-17.txt
...,...,...
674,Metadata:\nDate:\t2018-11-23\nFile number:\t\n...,TSL-99691-18.txt
675,Metadata:\nDate:\t2018-11-29\nFile number:\t\n...,TSL-99824-18.txt
676,Metadata:\nDate:\t2018-12-12\nFile number:\t\n...,TSL-99900-18.txt
677,Metadata:\nDate:\t2018-11-20\nFile number:\t\n...,TSL-99965-18.txt


In [130]:
gold_data = gold_data.rename(columns = {'case number': 'file_number',
                                        'board_location': 'ltb_location'})
gold_data.columns.tolist()

['file_number_gold',
 'file_number_gold_cleaned',
 'raw_file_text',
 'raw_file_name',
 'timestamp',
 'email_address',
 'hearing_date',
 'decision_date',
 'adjudicating_member',
 'ltb_location',
 'landlord_represented',
 'landlord_attended_hearing',
 'tenant_represented',
 'tenant_attended_hearing',
 'landlord_nonprofit',
 'tenant_collecting_subsidy',
 'case_outcome',
 'tenancy_length',
 'monthly_rent',
 'rental_deposit',
 'rent_after_increase',
 'rent_increase_effect_date',
 'total_arrears',
 'arrears_duration',
 'arrears_payment_amount',
 'tenant_arrears_history_mentioned',
 'tenant_arrears_payment_history_mentioned',
 'rent_payments_late_frequency',
 'tenant_ability_to_pay_rent',
 'tenant_conditions',
 'tenant_children_present',
 'total_children',
 'children_17_or_younger',
 'children_13_or_younger',
 'children_4_or_younger',
 'children_conditions_mentioned',
 'conditions_making_moving_burdensome',
 'tenant_employed',
 'tenant_government_assistance',
 'employment_stability_doubts',
 

# ALL CASE FILES

In [131]:
# all_cases_master = pd.read_csv("large_files/44k_cases_pproc_filenums.csv")
# all_cases_master

In [132]:
# all_cases_master['file_numbers_from_file_name'] = all_cases_master.raw_file_name.apply(lambda x: x.split(".txt")[0])
# all_cases_master

# `get_nulls()`
- for checking df after each addition to it

In [133]:
def get_nulls(df, col, return_index = False):
    # returns a list of the indices of null values in a column of a dataframe
    null_rows = silver_data[silver_data[col].isnull()] # df of all rows with null ltb_location
    nulls_inds = null_rows.index.tolist()

    if return_index:
        return nulls_inds
    else:
        return null_rows
    
get_nulls(silver_data, 'raw_file_text', return_index = False)

Unnamed: 0,raw_file_text,raw_file_name


# General Cleaning
- Removing unnecessary lines (blank)
- Removing unnecessary characters (extra whitespaces and underscores)
- Separating metadata and content

In [134]:
import re

def general_cleaning(raw_file_str: str):
    # gets rid of tabs, non-breaking spaces, leading/trailing whitespace, removes empty lines, and "\xa0"
    generally_cleaned_list = [line.replace("\t", " ").replace("\xa0", "").strip() for line in raw_file_str.split('\n') if line.strip() != '']
    return generally_cleaned_list

def remove_whitespace_and_underscores(string):
    # Remove consecutive whitespace
    string = re.sub(r'\s+', ' ', string)

    # Remove more than three consecutive underscores
    string = re.sub(r'_+', '', string)

    return string.strip()

def separate_file_sections(text_list):
    metadata_list = []
    content_list = []

    is_metadata = True
    is_content = False

    for line in text_list:
        if line.strip() == 'Metadata:':
            is_metadata = True
            is_content = False
        elif line.strip() == 'Content:':
            is_metadata = False
            is_content = True
        elif is_metadata:
            metadata_list.append(remove_whitespace_and_underscores(line))
        elif is_content:
            content_list.append(remove_whitespace_and_underscores(line))

    return metadata_list, content_list

def merge_numerical_entries(strings_list):
    """
    Turns something like
        [..., '3.',
        'The tenant took occupancy of the rental unit in or about the beginning of December 2016.', ...]
    into
        [..., '3. The tenant took occupancy of the rental unit in or about the beginning of December 2016.', ...]
    
    """
    for i in range(len(strings_list) - 2, -1, -1):
        if re.fullmatch(r'\d+\.', strings_list[i]):
            strings_list[i] += ' ' + strings_list[i + 1]
            del strings_list[i + 1]
    return strings_list

def move_trailing_numbers(strings_list):
    """
    Turns something like
        [..., 'Credibility of the Parties 4.',
        'The Landlord said about two to three months ago he ...', ...]
    into
        [..., 'Credibility of the Parties',
        '4. The Landlord said about two to three months ago he...', ...]
    
    """
    for i in range(len(strings_list) - 1, -1, -1):
        match = re.search(r'\s+(\d{2}\.)$', strings_list[i])
        if match:
            number = match.group(1)
            strings_list[i] = re.sub(r'\s+\d{1,2}\.$', '', strings_list[i])
            strings_list[i + 1] = number + ' ' + strings_list[i + 1]
    return strings_list

import re

def remove_end_tag_and_restructure(metadata_list: list):

    cleaned_str = " ".join(metadata_list)

    # this doesn't add any meaning to the case details we need to extract, and instead just adds noise to the extraction process + adds extra unnecessary tokens
    if cleaned_str.find("If you have any questions about this order") > (len(cleaned_str) - 500):
        cleaned_str = cleaned_str[: cleaned_str.find("If you have any questions about this order")].strip() # ending tag removed
    
    # otherwise just do everything else
    cleaned_str = cleaned_str.replace(". ", ".\n")
    # cleaned_str = cleaned_str.replace(". ", ".\n") # deprecated by regex approach
    # trimmed_list = [line.strip() for line in re.split(r'(?<!\d)\. ', cleaned_str) if line.strip() != ''] # deprecated by regex approach
    cleaned_str = re.sub(r'(?<!\d)\. ', "\n", cleaned_str)
    trimmed_list = [line.strip() for line in cleaned_str.split('\n') if line.strip() != '']
    trimmed_list = merge_numerical_entries(trimmed_list)
    trimmed_list = move_trailing_numbers(trimmed_list)
    return trimmed_list

file_name = "CEL-74519-18.txt"
# row of this particular case
case_file_ind = silver_data.loc[silver_data['raw_file_name'] == file_name].index.tolist()[0]
test_text = silver_data.loc[206, "raw_file_text"]#.item()

metadata, content = separate_file_sections(general_cleaning(test_text))
remove_end_tag_and_restructure(content)

["Order under Section 69 Residential Tenancies Act, 2006 File Number: SWL-16867-18 KC (the 'Landlord') applied for an order to terminate the tenancy and evict JO and JP (the 'Tenants') because the Tenants did not pay the rent that the Tenants owe (the ‘L1 Application’).",
 'The Landlord also applied for an order to terminate the tenancy and evict the Tenants because they have been persistently late in paying their rent.',
 'The Landlord also claimed compensation for each day the Tenants remained in the unit after the termination date (the ‘L2 Application’).',
 'These applications were heard in [CITY] on July 3, 2018.',
 'Only the Landlord attended the hearing.',
 'As of 1:30 p.m., the Tenants were not present or represented at the hearing although properly served with notice of this hearing by the Board.',
 'Determinations and Reasons: The L1 Application 1.',
 'The Tenants have not paid the total rent they were required to pay for the period from November 1, 2017 to July 31, 2018.',
 '

## Updating CSV with Cleaned File, Metadata, and Case Contents
- Adding column for cleaned full file
- Adding column for metadata
- Adding column for case contents

In [135]:
silver_data

Unnamed: 0,raw_file_text,raw_file_name
0,Metadata:\nDate:\t2018-07-06\nFile number:\t\n...,SWL-17348-18.txt
1,Metadata:\nDate:\t2017-05-26\nFile number:\t\n...,TEL-79722-17.txt
2,Metadata:\nDate:\t2017-07-05\nFile number:\t\n...,TEL-80773-17.txt
3,Metadata:\nDate:\t2017-07-18\nFile number:\t\n...,TEL-81359-17-AM.txt
4,Metadata:\nDate:\t2017-08-17\nFile number:\t\n...,TEL-81405-17.txt
...,...,...
674,Metadata:\nDate:\t2018-11-23\nFile number:\t\n...,TSL-99691-18.txt
675,Metadata:\nDate:\t2018-11-29\nFile number:\t\n...,TSL-99824-18.txt
676,Metadata:\nDate:\t2018-12-12\nFile number:\t\n...,TSL-99900-18.txt
677,Metadata:\nDate:\t2018-11-20\nFile number:\t\n...,TSL-99965-18.txt


In [136]:
import time
import numpy as np
from collections import deque

start_time = time.time()

# Initialize a deque to store the latest 100 iteration times
time_deque = deque(maxlen = 500)

cases_contents = []
cases_metadata = []
full_cleaned = []

raw_files = silver_data['raw_file_text'].tolist()
for index, raw_file in enumerate(raw_files):
    iteration_start_time = time.time()
    better_file = general_cleaning(raw_file)
    try:
        metadata_list, content_list = separate_file_sections(better_file)
        full_cleaned.append(better_file)
        # cases_metadata.append(remove_end_tag_and_restructure(metadata_list)) # removing a bit more text if possible
        cases_metadata.append(metadata_list) # removing a bit more text if possible
        cases_contents.append(remove_end_tag_and_restructure(content_list))

        # Save the end time of this iteration and push it into the deque
        iteration_end_time = time.time()
        time_deque.append(iteration_end_time - iteration_start_time)

        # progress tracker
        average_time_per_file = np.mean(time_deque)
        files_left = len(raw_files) - (index + 1)
        estimated_time_left = files_left * average_time_per_file

        print(f"Files processed: {index + 1} of {len(raw_files)}, Estimated time remaining: {time.strftime('%H:%M:%S', time.gmtime(estimated_time_left))}", end='\r')
    except Exception as any_error:
        print(f"{any_error} with file at Df row: ", index)

silver_data['full_cleaned'] = full_cleaned
silver_data['metadata'] = cases_metadata
silver_data['content'] = cases_contents
silver_data.head()

Files processed: 679 of 679, Estimated time remaining: 00:00:00

Unnamed: 0,raw_file_text,raw_file_name,full_cleaned,metadata,content
0,Metadata:\nDate:\t2018-07-06\nFile number:\t\n...,SWL-17348-18.txt,"[Metadata:, Date: 2018-07-06, File number:, SW...","[Date: 2018-07-06, File number:, SWL-17348-18,...",[Order under Section 69 Residential Tenancies ...
1,Metadata:\nDate:\t2017-05-26\nFile number:\t\n...,TEL-79722-17.txt,"[Metadata:, Date: 2017-05-26, File number:, TE...","[Date: 2017-05-26, File number:, TEL-79722-17,...",[Order under Section 69 Residential Tenancies ...
2,Metadata:\nDate:\t2017-07-05\nFile number:\t\n...,TEL-80773-17.txt,"[Metadata:, Date: 2017-07-05, File number:, TE...","[Date: 2017-07-05, File number:, TEL-80773-17,...",[Order under Section 69 Residential Tenancies ...
3,Metadata:\nDate:\t2017-07-18\nFile number:\t\n...,TEL-81359-17-AM.txt,"[Metadata:, Date: 2017-07-18, File number:, TE...","[Date: 2017-07-18, File number:, TEL-81359-17-...",[Order under Section 69 Residential Tenancies ...
4,Metadata:\nDate:\t2017-08-17\nFile number:\t\n...,TEL-81405-17.txt,"[Metadata:, Date: 2017-08-17, File number:, TE...","[Date: 2017-08-17, File number:, TEL-81405-17,...",[Order under Section 69 Residential Tenancies ...


# Case Citation and File Number
- Extracting case citation from metadata
- Extracting file number from all data (it's not as consistently formatted so the approach has to be more broad)

In [137]:
import re

def get_case_citation(metadata_list):
    for line in metadata_list:
        if "Citation:" in line:
            citation_start = line.find("Citation: ")
            # print(citation_start)
            citation_end = line.find("LTB)") + 4
            # print(citation_end)
            return line[citation_start : citation_end].replace("Citation: ", "").strip()
        elif "Référence: " in line:
            citation_start = line.find("Référence: ")
            # print(citation_start)
            citation_end = line.find("LTB)") + 4
            # print(citation_end)
            return line[citation_start : citation_end].replace("Référence: ", "").strip()
    return None

def get_file_number(metadata_list):
    # metadata_str = "".join(get_case_citation(metadata_list))
    metadata_str = " ".join(metadata_list)

    if "Citation: " in metadata_str:
        file_nums = metadata_str[metadata_str.find("File number: ") + len("File number: ") : metadata_str.find("Citation:")].strip()
    elif "Référence: " in metadata_str:
        file_nums = metadata_str[metadata_str.find("Numéro de dossier: ") + len("Numéro de dossier: ") : metadata_str.find("Référence")].strip()

    if len(file_nums) == 0:
        return None
    
    file_nums = file_nums.replace(";", " ")
    
    # eliminates duplicates or instances of something like 'TNL-10004-18 TNL-10004-18' being one list item
    file_num = list(set(file_nums.split()))
    file_num = ";".join(file_num)
    file_num = re.sub(r'[^\w\s]$', '', file_num)

    # removing duplicate file numbers
    if ";" in file_num:
        file_num = list(set(file_num.split(";"))) 
        file_num = [re.sub(r'[\(\)]', '', num) for num in file_num] # removing any trailing punctuation from each file number
        file_num = ";".join(file_num)

    # removes any parentheses from the file number
    file_num = re.sub(r'[\(\)]', '', file_num)

    return file_num

# test_row = 149
# test_metadata = silver_data.loc[test_row, "metadata"]# + silver_data.loc[test_row, "content"]
# print(get_file_number(test_metadata))
# silver_data.loc[test_row, "metadata"]

In [138]:
import time
import numpy as np
from collections import deque

start_time = time.time()

# Initialize a deque to store the latest 500 iteration times
time_deque = deque(maxlen = 500)

for index, row in enumerate(silver_data.itertuples()):

    # Save the start time of this iteration
    iteration_start_time = time.time()

    # adding to 'case_citation' and 'file_number' columns
    try:
        metadata_list, content_list = separate_file_sections(general_cleaning(silver_data.loc[row.Index, "raw_file_text"]))
        silver_data.at[row.Index, 'case_citation'] = get_case_citation(metadata_list)
        silver_data.at[row.Index, 'file_number'] = get_file_number(metadata_list)
        
    except Exception as any_error:
        print(f"{any_error} with file at Df row: ", row.Index)

    # Save the end time of this iteration and push it into the deque
    iteration_end_time = time.time()
    time_deque.append(iteration_end_time - iteration_start_time)

    # progress tracker
    average_time_per_row = np.mean(time_deque)
    rows_left = len(silver_data) - (index + 1)
    estimated_time_left = rows_left * average_time_per_row

    print(f"Files processed: {index + 1} of {len(silver_data)}, Estimated time remaining: {time.strftime('%H:%M:%S', time.gmtime(estimated_time_left))}", end='\r')

silver_data.head()

Files processed: 679 of 679, Estimated time remaining: 00:00:00

Unnamed: 0,raw_file_text,raw_file_name,full_cleaned,metadata,content,case_citation,file_number
0,Metadata:\nDate:\t2018-07-06\nFile number:\t\n...,SWL-17348-18.txt,"[Metadata:, Date: 2018-07-06, File number:, SW...","[Date: 2018-07-06, File number:, SWL-17348-18,...",[Order under Section 69 Residential Tenancies ...,"SWL-17348-18 (Re), 2018 CanLII 88643 (ON LTB)",SWL-17348-18
1,Metadata:\nDate:\t2017-05-26\nFile number:\t\n...,TEL-79722-17.txt,"[Metadata:, Date: 2017-05-26, File number:, TE...","[Date: 2017-05-26, File number:, TEL-79722-17,...",[Order under Section 69 Residential Tenancies ...,"TEL-79722-17 (Re), 2017 CanLII 48856 (ON LTB)",TEL-79722-17
2,Metadata:\nDate:\t2017-07-05\nFile number:\t\n...,TEL-80773-17.txt,"[Metadata:, Date: 2017-07-05, File number:, TE...","[Date: 2017-07-05, File number:, TEL-80773-17,...",[Order under Section 69 Residential Tenancies ...,"TEL-80773-17 (Re), 2017 CanLII 60498 (ON LTB)",TEL-80773-17
3,Metadata:\nDate:\t2017-07-18\nFile number:\t\n...,TEL-81359-17-AM.txt,"[Metadata:, Date: 2017-07-18, File number:, TE...","[Date: 2017-07-18, File number:, TEL-81359-17-...",[Order under Section 69 Residential Tenancies ...,"TEL-81359-17-AM (Re), 2017 CanLII 60052 (ON LTB)",TEL-81359-17-AM
4,Metadata:\nDate:\t2017-08-17\nFile number:\t\n...,TEL-81405-17.txt,"[Metadata:, Date: 2017-08-17, File number:, TE...","[Date: 2017-08-17, File number:, TEL-81405-17,...",[Order under Section 69 Residential Tenancies ...,"TEL-81405-17 (Re), 2017 CanLII 60203 (ON LTB)",TEL-81405-17


In [139]:
from evaluation import *
# wrongs = evaluate(silver_data, gold_data, "file_number", return_inaccurate = True, metric = "jaro_winkler")
# extracted_successfully = 0
# for row in wrongs.index:
#     if str(wrongs.loc[row, wrongs.columns[1]]) in str(wrongs.loc[row, wrongs.columns[0]]):
#         extracted_successfully += 1

gold_data['file_number'] = gold_data['file_number_gold']
        
# print(f"\nExtracted {extracted_successfully} out of {len(silver_data)} also file numbers correctly. Total accuracy could be {round((extracted_successfully / len(silver_data)), 2) + 0.916}%")
evaluate(silver_data, gold_data, "file_number", return_inaccurate = True, metric = "jaro_winkler")

Unnamed: 0,silver_file_number,gold_file_number
5,TNL-01183-18;TNL-00793-18,TNL-00793-18 TNL-01183-18
14,TNL-03769-18-RV2,TNL-03769-18
59,CEL-71370-17-RV,CEL-71370-17
62,CEL-72209-17;CET-73173-18,CEL-72209-17; CET-73173-18
74,CEL-75759-18;CET-73038-18;CEL-73640-18,CEL-73640-18
...,...,...
622,TSL-93207-18-RV;TST-94747-18;TSL-00082-18,TSL-93207-18-RV; TST-94747-18; TSL-00082-18
639,TSL-95431-18-RV;TSL-95431-18,TSL-95431-18-RV
643,TSL-95667-18-RV,TSL-95667-18
663,TSL-98384-18,TSL-98384-18-AM


In [140]:
from evaluation import *

print(len(silver_data))
# test = evaluate(silver_data, gold_data, "file_number", return_inaccurate = True, metric = "jaro_winkler", print_score = True)#.head(60)
evaluate(silver_data, gold_data, "file_number", return_inaccurate = True, metric = "jaro_winkler", print_score = True)#.head(60)
# test.loc[0, 'silver_file_number'] == test.loc[0, 'gold_file_number']
# test.loc[0, 'silver_file_number']
# test.loc[0, 'gold_file_number']

679
0.9970544918998527


Unnamed: 0,silver_file_number,gold_file_number
5,TNL-01183-18;TNL-00793-18,TNL-00793-18 TNL-01183-18
14,TNL-03769-18-RV2,TNL-03769-18
59,CEL-71370-17-RV,CEL-71370-17
62,CEL-72209-17;CET-73173-18,CEL-72209-17; CET-73173-18
74,CEL-75759-18;CET-73038-18;CEL-73640-18,CEL-73640-18
...,...,...
622,TSL-93207-18-RV;TST-94747-18;TSL-00082-18,TSL-93207-18-RV; TST-94747-18; TSL-00082-18
639,TSL-95431-18-RV;TSL-95431-18,TSL-95431-18-RV
643,TSL-95667-18-RV,TSL-95667-18
663,TSL-98384-18,TSL-98384-18-AM


In [141]:
test_row = 67
test_metadata = silver_data.loc[test_row, "metadata"]# + silver_data.loc[test_row, "content"]
print(get_file_number(test_metadata))
silver_data.loc[test_row, "metadata"]

CEL-72966-18


['Date: 2018-02-27',
 'File number:',
 'CEL-72966-18',
 'CEL-72966-18',
 'Citation: CEL-72966-18 (Re), 2018 CanLII 41856 (ON LTB), <https://canlii.ca/t/hs05q>, retrieved on 2023-05-16 https://canlii.ca/t/hs05q']

In [142]:
def check_df(df):
    """
    Checks df for null values, number of unique values in each column, and data types
    """

    print(f"Df Size: {df.shape[0]} rows, {df.shape[1]} columns")
    print("-" * 30)

    print("\n" + "Checking for null values...")
    print("-" * 25)
    print(df.isnull().sum())

    print("\n" + "Checking data types...")
    print("-" * 25)
    for col in df.columns:
        col_type = str(type(df.loc[0, col])).replace("<class '", "").replace("'>", "")
        print(f"{col}: {col_type}")

check_df(silver_data)

Df Size: 679 rows, 7 columns
------------------------------

Checking for null values...
-------------------------
raw_file_text    0
raw_file_name    0
full_cleaned     0
metadata         0
content          0
case_citation    0
file_number      0
dtype: int64

Checking data types...
-------------------------
raw_file_text: str
raw_file_name: str
full_cleaned: list
metadata: list
content: list
case_citation: str
file_number: str


In [143]:
asdf

NameError: name 'asdf' is not defined

In [None]:
## can't eval because it wasn't actually annotated in the gold data
# evaluate(silver_data, gold_data, "case_citation", return_inaccurate = True, metric = "jaro_winkler")

# Language Detection

In [None]:
# !pip install langdetect
from langdetect import detect

def is_mostly_french(text, threshold):
    try:
        detected_language = detect(text)
        if detected_language == 'fr':
            return True
        else:
            return False
    except:
        return False

def is_french(text, threshold):
    try:
        detected_language = detect(text)
        if detected_language == 'fr':
            return True
        language_probabilities = detect_langs(text)
        for lang in language_probabilities:
            if lang.lang == 'fr' and lang.prob > threshold:
                return True
        return False
    except:
        return False

is_french(silver_data.loc[109, "raw_file_text"], 0.7)

False

## Updating CSV with Language

In [None]:
# import time
# import numpy as np
# from collections import deque

start_time = time.time()

# Initialize a deque to store the latest 500 iteration times
time_deque = deque(maxlen = 500)

for index, row in enumerate(silver_data.itertuples()):

    # Save the start time of this iteration
    iteration_start_time = time.time()

    # adding to 'case_citation' and 'file_number' columns
    try:
        # adding to 'language' column
        if is_french(silver_data.loc[row.Index, "raw_file_text"], 0.7) == True:
            silver_data.at[row.Index, 'language'] = "French"
        else:
            silver_data.at[row.Index, 'language'] = "English"

    except Exception as any_error:
        print(f"{any_error} with file at Df row: ", row.Index)

    # Save the end time of this iteration and push it into the deque
    iteration_end_time = time.time()
    time_deque.append(iteration_end_time - iteration_start_time)

    # progress tracker
    average_time_per_row = np.mean(time_deque)
    rows_left = len(silver_data) - (index + 1)
    estimated_time_left = rows_left * average_time_per_row

    print(f"Files processed: {index + 1} of {len(silver_data)}, Estimated time remaining: {time.strftime('%H:%M:%S', time.gmtime(estimated_time_left))}", end='\r')

silver_data.head()

Files processed: 679 of 679, Estimated time remaining: 00:00:00

Unnamed: 0,raw_file_text,raw_file_name,full_cleaned,metadata,content,case_citation,file_number,language
0,Metadata:\nDate:\t2018-07-06\nFile number:\t\n...,SWL-17348-18.txt,"[Metadata:, Date: 2018-07-06, File number:, SW...","[Date: 2018-07-06, File number:, SWL-17348-18,...",[Order under Section 69 Residential Tenancies ...,"SWL-17348-18 (Re), 2018 CanLII 88643 (ON LTB)",SWL-17348-18,English
1,Metadata:\nDate:\t2017-05-26\nFile number:\t\n...,TEL-79722-17.txt,"[Metadata:, Date: 2017-05-26, File number:, TE...","[Date: 2017-05-26, File number:, TEL-79722-17,...",[Order under Section 69 Residential Tenancies ...,"TEL-79722-17 (Re), 2017 CanLII 48856 (ON LTB)",TEL-79722-17,English
2,Metadata:\nDate:\t2017-07-05\nFile number:\t\n...,TEL-80773-17.txt,"[Metadata:, Date: 2017-07-05, File number:, TE...","[Date: 2017-07-05, File number:, TEL-80773-17,...",[Order under Section 69 Residential Tenancies ...,"TEL-80773-17 (Re), 2017 CanLII 60498 (ON LTB)",TEL-80773-17,English
3,Metadata:\nDate:\t2017-07-18\nFile number:\t\n...,TEL-81359-17-AM.txt,"[Metadata:, Date: 2017-07-18, File number:, TE...","[Date: 2017-07-18, File number:, TEL-81359-17-...",[Order under Section 69 Residential Tenancies ...,"TEL-81359-17-AM (Re), 2017 CanLII 60052 (ON LTB)",TEL-81359-17-AM,English
4,Metadata:\nDate:\t2017-08-17\nFile number:\t\n...,TEL-81405-17.txt,"[Metadata:, Date: 2017-08-17, File number:, TE...","[Date: 2017-08-17, File number:, TEL-81405-17,...",[Order under Section 69 Residential Tenancies ...,"TEL-81405-17 (Re), 2017 CanLII 60203 (ON LTB)",TEL-81405-17,English


# Year
- not meaningful really but nice for the corpus

In [None]:
import re

def get_year_from_file_number(file_number):

    if isinstance(file_number, list):
        file_number = " ".join(file_number)

    if ";" not in file_number:
        # print("UES")
        file_number = [tok for tok in file_number.split("-") if (len(tok) == 2 and tok.isdigit())]
        return "20" + file_number[0]
    else:
        file_numbers = file_number.split(";")
        file_numbers = [tok for tok in file_numbers if (len(tok) == 2 and tok.isdigit())]
        return "20" + file_number[0]
    # elif file_number.isinstance(list):

print(get_year_from_file_number("TEL-81359-17-AM")) # Outputs: "17"
print(get_year_from_file_number("TEL-81405-17")) # Outputs: "17"
print(get_year_from_file_number(silver_data.loc[5, 'file_number'])) # Outputs: "17"
silver_data.loc[5, 'file_number']

2017
2017
20T


'TNL-00793-18;TNL-01183-18'

## Updating CSV with Year

In [None]:
# import time
# import numpy as np
# from collections import deque

start_time = time.time()

# Initialize a deque to store the latest 500 iteration times
time_deque = deque(maxlen = 500)

for index, row in enumerate(silver_data.itertuples()):

    # Save the start time of this iteration
    iteration_start_time = time.time()

    # adding to 'case_citation' and 'file_number' columns
    try:
        silver_data.at[row.Index, 'year'] = get_year_from_file_number(silver_data.at[row.Index, 'file_number'])
    except Exception as any_error:
        print(f"{any_error} with file at Df row: ", row.Index)

    # Save the end time of this iteration and push it into the deque
    iteration_end_time = time.time()
    time_deque.append(iteration_end_time - iteration_start_time)

    # progress tracker
    average_time_per_row = np.mean(time_deque)
    rows_left = len(silver_data) - (index + 1)
    estimated_time_left = rows_left * average_time_per_row

    print(f"Files processed: {index + 1} of {len(silver_data)}, Estimated time remaining: {time.strftime('%H:%M:%S', time.gmtime(estimated_time_left))}", end='\r')
    
    # if index == 6:
    #     break

silver_data.head()

Files processed: 679 of 679, Estimated time remaining: 00:00:00

Unnamed: 0,raw_file_text,raw_file_name,full_cleaned,metadata,content,case_citation,file_number,language,year
0,Metadata:\nDate:\t2018-07-06\nFile number:\t\n...,SWL-17348-18.txt,"[Metadata:, Date: 2018-07-06, File number:, SW...","[Date: 2018-07-06, File number:, SWL-17348-18,...",[Order under Section 69 Residential Tenancies ...,"SWL-17348-18 (Re), 2018 CanLII 88643 (ON LTB)",SWL-17348-18,English,2018
1,Metadata:\nDate:\t2017-05-26\nFile number:\t\n...,TEL-79722-17.txt,"[Metadata:, Date: 2017-05-26, File number:, TE...","[Date: 2017-05-26, File number:, TEL-79722-17,...",[Order under Section 69 Residential Tenancies ...,"TEL-79722-17 (Re), 2017 CanLII 48856 (ON LTB)",TEL-79722-17,English,2017
2,Metadata:\nDate:\t2017-07-05\nFile number:\t\n...,TEL-80773-17.txt,"[Metadata:, Date: 2017-07-05, File number:, TE...","[Date: 2017-07-05, File number:, TEL-80773-17,...",[Order under Section 69 Residential Tenancies ...,"TEL-80773-17 (Re), 2017 CanLII 60498 (ON LTB)",TEL-80773-17,English,2017
3,Metadata:\nDate:\t2017-07-18\nFile number:\t\n...,TEL-81359-17-AM.txt,"[Metadata:, Date: 2017-07-18, File number:, TE...","[Date: 2017-07-18, File number:, TEL-81359-17-...",[Order under Section 69 Residential Tenancies ...,"TEL-81359-17-AM (Re), 2017 CanLII 60052 (ON LTB)",TEL-81359-17-AM,English,2017
4,Metadata:\nDate:\t2017-08-17\nFile number:\t\n...,TEL-81405-17.txt,"[Metadata:, Date: 2017-08-17, File number:, TE...","[Date: 2017-08-17, File number:, TEL-81405-17,...",[Order under Section 69 Residential Tenancies ...,"TEL-81405-17 (Re), 2017 CanLII 60203 (ON LTB)",TEL-81405-17,English,2017


In [None]:
check_df(silver_data)

Df Size: 679 rows, 9 columns
------------------------------

Checking for null values...
-------------------------
raw_file_text    0
raw_file_name    0
full_cleaned     0
metadata         0
content          0
case_citation    0
file_number      0
language         0
year             0
dtype: int64

Checking data types...
-------------------------
raw_file_text: str
raw_file_name: str
full_cleaned: list
metadata: list
content: list
case_citation: str
file_number: str
language: str
year: str


# LTB Location
- not 100% effective but close to. Fallback on other methods (transformers, etc) if need be

## Method 1 - Rule-Based

In [None]:
def find_all_positions(text, keyword):
    positions = []
    start = 0
    while True:
        index = text.find(keyword, start)
        if index == -1:
            break
        positions.append(index)
        start = index + 1
    return positions

In [None]:
# backup for other methods, using " ON " + postal code as a marker
def find_postal_code_index(text):
    match = re.search(r' ON [A-Z]\d[A-Z] ?\d[A-Z]\d', text)
    if match:
        return match.start()
    else:
        return None

def extract_loc_rule(text_list):

    content_str = " ".join(text_list)
    # print(content_str)

    # "hear" is a good marker for the location sentence ("heard in/on", "hearing", etc)
    if "hear" in content_str:
        hear_inds = find_all_positions(content_str, "hear") # list of indices where "hear" appears in string
        # print(hear_inds)

        for hear_ind in hear_inds:

            hear_substr = content_str[hear_ind - 50 : hear_ind + 50]
            possible_sentences = hear_substr.split(". ")
            # print(possible_sentences)
            
            hear_sent = [sent for i, sent in enumerate(possible_sentences) if "hear" in sent][0]
            # print(hear_sent)

            if len(hear_sent.split(" on ")) != 2:
                # print("TEST")
                # return None
                pass # go to next "hear" location in string
            else:
                
                location_sent, date = hear_sent.split(" on ") # should only split into 2 parts
                # print(location_sent)

                if " in " in location_sent:

                    # print(location_sent)
                    location = location_sent.split(" in ")[1].strip() # location name should be last tokens of string after token containing " in " (city name could be multiple tokens, so need to get all tokens after " in " token, not just last one)

                    # print(location)
                    location = location.split(" ")[-1] # location name should be last tokens of string after token containing "hear" (city name could be multiple tokens, so need to get all tokens after "hear" token, not just last one)
                    # print("YESSS")
                    return location.strip()
                
                else:
                    pass

    # use " ON " + postal code as a marker
    if " ON " in content_str:
        # print("|UESSS")
        # print(find_postal_code_index(content_str))
        postal_ind = find_postal_code_index(content_str)
        content_str_subsection = (content_str[postal_ind - 50 : postal_ind + 50])
        location = content_str_subsection.split(" ON ")[0].split()[-1]
        return location.strip()
    
    # if absolutely nothing works, return None and we'll use a transformer or something more nuanced
    return None

test_num = 10
extract_loc_rule(silver_data.loc[test_num, 'content'])
# " ".join(silver_data.loc[56, 'content'])
# silver_data.loc[test_num, 'content']

'Toronto'

## Method 2 - SpaCy

In [None]:
gold_data.columns

Index(['file_number_gold', 'file_number_gold_cleaned', 'raw_file_text',
       'raw_file_name', 'timestamp', 'email_address', 'hearing_date',
       'decision_date', 'adjudicating_member', 'ltb_location',
       'landlord_represented', 'landlord_attended_hearing',
       'tenant_represented', 'tenant_attended_hearing', 'landlord_nonprofit',
       'tenant_collecting_subsidy', 'case_outcome', 'tenancy_length',
       'monthly_rent', 'rental_deposit', 'rent_after_increase',
       'rent_increase_effect_date', 'total_arrears', 'arrears_duration',
       'arrears_payment_amount', 'tenant_arrears_history_mentioned',
       'tenant_arrears_payment_history_mentioned',
       'rent_payments_late_frequency', 'tenant_ability_to_pay_rent',
       'tenant_conditions', 'tenant_children_present', 'total_children',
       'children_17_or_younger', 'children_13_or_younger',
       'children_4_or_younger', 'children_conditions_mentioned',
       'conditions_making_moving_burdensome', 'tenant_employed',

In [None]:
all_locations = list(set(gold_data['ltb_location'].unique().tolist())) # list of all unique locations in the annotated data
all_locations = list(set([loc.strip() for loc in all_locations]))
all_locations

import spacy

# Load the English language model in spaCy
nlp = spacy.load('en_core_web_sm')

def extract_location_spacy(string_list, model = nlp, other_locations = all_locations):

    string = " ".join(string_list)

    # uses a spacy model + its vocabulary to extract and return the location if possible
    doc = model(string)
    for ent in doc.ents:
        if ent.label_ == 'GPE':
            return ent.text

    # otherwise looks through the list of all locations in the annotated data and returns the first one that appears in the string -- for example, "Hamilton"
    for tok in string.split():
        if tok in other_locations:
            return tok

    # if all else fails, return None -- use a transformer or something later idk
    return None

all_locations

2023-06-06 13:44:55.962550: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


['Peterborough',
 'Review completed without hearing',
 'Ottawa',
 'Mississauga',
 'Cobourg',
 'Brantford',
 'Stratford',
 'Waterloo',
 'Burlington',
 'Bracebridge',
 'Not stated, hearing in Windsor',
 'Newmarket',
 'Lindsay',
 'Whitby',
 'Orangeville',
 'Woodstock',
 'Sudbury',
 'Barrie',
 'Toronto',
 'Hamilton',
 'Kingston',
 'by telephone',
 'Not stated',
 'London',
 'Windsor',
 'Thunder Bay',
 'Belleville']

In [None]:
start_time = time.time()

# Initialize a deque to store the latest 100 iteration times
time_deque = deque(maxlen = 500)

for index, row in enumerate(silver_data.itertuples()):

    # Save the start time of this iteration
    iteration_start_time = time.time()

    try:
        ### rule-based extremely quick -- pretty effective but imperfect
        location = extract_loc_rule(silver_data.loc[row.Index, 'content'])#.title() # returns the string in title case

        if not location[0].isalnum(): # something like "[CITY]" -- use spacy method
            location = extract_location_spacy(silver_data.loc[row.Index, 'content'])

        if not location[0].isupper():
            # I know this isn't a great rule in general but this seems to be consistent/reliable across all cases.
            # City names are all capitalized. Otherwise it finds "it", "heard", and more as locations
            location = extract_location_spacy(silver_data.loc[row.Index, 'content'])

        if location == None: # rule-based returns None
            location = extract_location_spacy(silver_data.loc[row.Index, 'content'])

        # use the found location
        silver_data.at[row.Index, 'ltb_location'] = location.title() # Title casing for consistency

    except Exception as any_error:
        print(f"{any_error} with file at Df row: ", row.Index)

    # Save the end time of this iteration and push it into the deque
    iteration_end_time = time.time()
    time_deque.append(iteration_end_time - iteration_start_time)

    # progress tracker
    average_time_per_row = np.mean(time_deque)
    rows_left = len(silver_data) - (index + 1)
    estimated_time_left = rows_left * average_time_per_row

    print("Files processed: ", index + 1, "of", len(silver_data),
          "Estimated time remaining: ", time.strftime('%H:%M:%S', time.gmtime(estimated_time_left)), end='\r')

silver_data.head()

'NoneType' object is not subscriptable with file at Df row:  469
Files processed:  679 of 679 Estimated time remaining:  00:00:00

Unnamed: 0,raw_file_text,raw_file_name,full_cleaned,metadata,content,case_citation,file_number,language,year,ltb_location
0,Metadata:\nDate:\t2018-07-06\nFile number:\t\n...,SWL-17348-18.txt,"[Metadata:, Date: 2018-07-06, File number:, SW...","[Date: 2018-07-06, File number:, SWL-17348-18,...",[Order under Section 69 Residential Tenancies ...,"SWL-17348-18 (Re), 2018 CanLII 88643 (ON LTB)",SWL-17348-18,English,2018,London
1,Metadata:\nDate:\t2017-05-26\nFile number:\t\n...,TEL-79722-17.txt,"[Metadata:, Date: 2017-05-26, File number:, TE...","[Date: 2017-05-26, File number:, TEL-79722-17,...",[Order under Section 69 Residential Tenancies ...,"TEL-79722-17 (Re), 2017 CanLII 48856 (ON LTB)",TEL-79722-17,English,2017,Toronto
2,Metadata:\nDate:\t2017-07-05\nFile number:\t\n...,TEL-80773-17.txt,"[Metadata:, Date: 2017-07-05, File number:, TE...","[Date: 2017-07-05, File number:, TEL-80773-17,...",[Order under Section 69 Residential Tenancies ...,"TEL-80773-17 (Re), 2017 CanLII 60498 (ON LTB)",TEL-80773-17,English,2017,Whitby
3,Metadata:\nDate:\t2017-07-18\nFile number:\t\n...,TEL-81359-17-AM.txt,"[Metadata:, Date: 2017-07-18, File number:, TE...","[Date: 2017-07-18, File number:, TEL-81359-17-...",[Order under Section 69 Residential Tenancies ...,"TEL-81359-17-AM (Re), 2017 CanLII 60052 (ON LTB)",TEL-81359-17-AM,English,2017,Toronto
4,Metadata:\nDate:\t2017-08-17\nFile number:\t\n...,TEL-81405-17.txt,"[Metadata:, Date: 2017-08-17, File number:, TE...","[Date: 2017-08-17, File number:, TEL-81405-17,...",[Order under Section 69 Residential Tenancies ...,"TEL-81405-17 (Re), 2017 CanLII 60203 (ON LTB)",TEL-81405-17,English,2017,Lindsay


In [None]:
silver_data.ltb_location.value_counts()

Toronto         371
Mississauga      78
Whitby           64
Newmarket        33
Belleville       16
Peterborough     15
Barrie           12
London           11
Ottawa           11
Lindsay          10
Windsor           8
Burlington        5
Duty Counsel      5
Hamilton          5
Waterloo          5
Stratford         3
Cobourg           3
Sudbury           2
Kingston          2
Nc                1
Dd                1
Orangeville       1
Woodstock         1
Dg                1
Elgin             1
D.C.              1
N.F.              1
Kw                1
Ac                1
Lg                1
Gl                1
Sarnia            1
Goderich          1
Tc                1
Brantford         1
Bay               1
Bracebridge       1
J.H.S.            1
Name: ltb_location, dtype: int64

In [None]:
get_nulls(silver_data, 'ltb_location') # this one is problematic for other columns too. The formatting seems to be completely different

Unnamed: 0,raw_file_text,raw_file_name,full_cleaned,metadata,content,case_citation,file_number,language,year,ltb_location
469,Metadata:\nDate:\t2018-12-17\nFile number:\t\n...,TNL-05489-18.txt,"[Metadata:, Date: 2018-12-17, File number:, 65...","[Date: 2018-12-17, File number:, 659/18;, TNL-...","[CITATION: Capreit 2 Limited Partnership v., R...",Cit,TNL-05489-18;659/18,English,20T,


In [None]:
evaluate(silver_data, gold_data, "ltb_location", return_inaccurate = True, metric = "jaro_winkler").head(60)

Unnamed: 0,silver_ltb_location,gold_ltb_location
23,Newmarket,Toronto
24,Toronto,Whitby
31,Barrie,Mississauga
32,Elgin,Mississauga
60,Barrie,Mississauga
107,Toronto,Mississauga
111,Barrie,Mississauga
114,Toronto,Mississauga
132,Ottawa,by telephone
136,Ottawa,by telephone


In [None]:
silver_data.loc[454, 'content']

["Order under Section 69 Residential Tenancies Act, 2006 File Number: TNL-03260-18 RP (the 'Landlord') applied for an order to terminate the tenancy and evict RM (the 'Tenant') because the Tenant did not pay the rent that the Tenant owes.",
 'This application was scheduled to be heard in Toronto on April 17, 2018.',
 'Both parties attended on that day.',
 'The Tenant raised a number of issues pursuant to s.82 of the Residential Tenancies Act, 2006 (the ‘Act’).',
 'The matter was adjourned in order to give the Tenant an opportunity to provide disclosure of all the issues he intended to raise, and to present his evidence to the Landlord.',
 'An Interim Order, TNL-03260-18, was issued, and the Tenant was also ordered to pay all rent in full, as of May 1, 2018, and going forward, until all the issues between the parties were resolved.',
 'The hearing reconvened on May 29, 2018.',
 'The Landlord, (DM), and their witness, KP, attended the reconvened hearing.',
 'The Landlord testified that t

# Decision Date
- uses entire case file (metadata + content)
- Eg.: "This application was heard in Toronto on October 16, 2019." should return "October 16, 2019" (then date is converted into the same format as the other dates in the df)
- Rule-based may be too simple for this so may need to use ML

In [None]:
from dateutil.parser import parse

def convert_to_datetime(date_str):
    # Parse date using dateutil.parser.parse
    dt = parse(date_str)
    
    # Format date with strftime in the format 'MM/DD/YYYY'
    return dt.strftime('%m/%d/%Y')

In [None]:
import spacy

nlp = spacy.load("en_core_web_sm") # loading this outside of the function saves ~2s per function call

def find_dates_in_list(string_list, model=nlp):
    extracted_dates = []
    for string in string_list:
        doc = nlp(string)
        for entity in doc.ents:
            if entity.label_ == "DATE":
                extracted_dates.append(entity.text)

    pattern = r"(?i)(\b\w+ \d{1,2}, \d{4}\b)"
    valid_dates = re.findall(pattern, ", ".join(extracted_dates))
    return list(set(valid_dates))

string_list = [
    "This application was heard in Toronto on October 16, 2019.",
    "The case was heard on June 12, 2020.",
    "The hearing was conducted on January 5, 2021.",
    "The meeting was adjourned, and a new date was set for November 30, 2022.",
    "No hearing was held in this matter.",
    "The matter was discussed on September 14, 2021, and a decision was made."
]
result = find_dates_in_list(string_list)
print(result)

['November 30, 2022', 'September 14, 2021', 'January 5, 2021', 'June 12, 2020', 'October 16, 2019']


In [None]:
def get_hearing_date(string_list, prox = 20):
    case_tokens = " ".join(string_list).split() # case file merged into one string
    match_found = False # assume there's no match found
    first_prox = prox

    while not match_found:
        possible_strings_with_dates = [] # we'll look for dates in these strings
        # print(prox)
        for tok in case_tokens:
            if "issue" in tok.lower():
                hear_ind = case_tokens.index(tok)
                near_toks = case_tokens[hear_ind - (prox // 2) : hear_ind + prox]
                near_text = " ".join(near_toks)
                possible_strings_with_dates.append(near_text) # finds all sections within a "prox" range of the word "hear" and adds them to a list
        
        # print(possible_strings_with_dates)

        best_matches = list(set(find_dates_in_list(possible_strings_with_dates))) # finds all dates in the possible strings and returns a list of unique dates
        # print(best_matches)
        if len(best_matches) == 1 or prox <= 0:
            match_found = True
            break
        else:
            prox -= (first_prox // 4) # decrement by 25% of the first_prox each time
    
    if len(best_matches) > 0:
        return convert_to_datetime(best_matches[0])
    
    # if none of the above works, look for "Date: " and get that date lol (it's also the decision date -- I checked the annotated data)
    for line in string_list:
        if "Date: " in line:
            return convert_to_datetime(line.split("Date: ")[1]) # convert to the same format as other dates in the df

case_num = 156
full_case_list = silver_data.loc[case_num, 'metadata'] + silver_data.loc[case_num, 'content']
full_case_list[:15]
print(get_hearing_date(silver_data.loc[case_num, 'metadata'] + silver_data.loc[case_num, 'content'], prox = 20))
(silver_data.loc[case_num, 'metadata'] + silver_data.loc[case_num, 'content'])[:15]

05/02/2018


['Date: 2018-05-30',
 'File number:',
 'HOL-02735-18',
 'HOL-02735-18',
 'Citation: HOL-02735-18 (Re), 2018 CanLII 111824 (ON LTB), <https://canlii.ca/t/hw7tg>, retrieved on 2023-05-16 https://canlii.ca/t/hw7tg',
 "Order under Section 69 Residential Tenancies Act, 2006 File Number: HOL-02735-18 J.R (the 'Landlord') applied for an order to terminate the tenancy and evict J.B (the 'Tenant') because the Tenant did not pay the rent that the Tenant owes.",
 'This application was heard in Toronto on May 28, 2018.',
 'The Landlord and the Tenant attended the hearing.',
 'The Tenant consulted with Tenant Duty Counsel prior to the hearing.',
 'Determinations: 1.',
 'The Tenant has not paid the total rent the Tenant was required to pay for the period from February 1, 2018 to May 31, 2018.',
 'Because of the arrears, the Landlord served a Notice of Termination effective April 24, 2018.',
 '2. The Tenant is in possession of the rental unit.',
 '3. The monthly rent as of the date of the hearing is 

## Updating CSV with Decision Date

In [None]:
# about 13 per second

start_time = time.time()

# Initialize a deque to store the latest 100 iteration times
time_deque = deque(maxlen = 500)

for index, row in enumerate(silver_data.itertuples()):

    # Save the start time of this iteration
    iteration_start_time = time.time()

    try:
        full_case_list = silver_data.loc[row.Index, 'metadata'] + silver_data.loc[row.Index, 'content']
        silver_data.at[row.Index, 'decision_date'] = get_hearing_date(full_case_list, prox = 20)

    except Exception as any_error:
        print(f"{any_error} with file at Df row: ", row.Index)

    # Save the end time of this iteration and push it into the deque
    iteration_end_time = time.time()
    time_deque.append(iteration_end_time - iteration_start_time)

    # progress tracker
    average_time_per_row = np.mean(time_deque)
    rows_left = len(silver_data) - (index + 1)
    estimated_time_left = rows_left * average_time_per_row

    print("Files processed: ", index + 1, "of", len(silver_data),
          "Estimated time remaining: ", time.strftime('%H:%M:%S', time.gmtime(estimated_time_left)), end = '\r')

silver_data.head()

Files processed:  679 of 679 Estimated time remaining:  00:00:00

Unnamed: 0,raw_file_text,raw_file_name,full_cleaned,metadata,content,case_citation,file_number,language,year,ltb_location,decision_date
0,Metadata:\nDate:\t2018-07-06\nFile number:\t\n...,SWL-17348-18.txt,"[Metadata:, Date: 2018-07-06, File number:, SW...","[Date: 2018-07-06, File number:, SWL-17348-18,...",[Order under Section 69 Residential Tenancies ...,"SWL-17348-18 (Re), 2018 CanLII 88643 (ON LTB)",SWL-17348-18,English,2018,London,07/06/2018
1,Metadata:\nDate:\t2017-05-26\nFile number:\t\n...,TEL-79722-17.txt,"[Metadata:, Date: 2017-05-26, File number:, TE...","[Date: 2017-05-26, File number:, TEL-79722-17,...",[Order under Section 69 Residential Tenancies ...,"TEL-79722-17 (Re), 2017 CanLII 48856 (ON LTB)",TEL-79722-17,English,2017,Toronto,05/26/2017
2,Metadata:\nDate:\t2017-07-05\nFile number:\t\n...,TEL-80773-17.txt,"[Metadata:, Date: 2017-07-05, File number:, TE...","[Date: 2017-07-05, File number:, TEL-80773-17,...",[Order under Section 69 Residential Tenancies ...,"TEL-80773-17 (Re), 2017 CanLII 60498 (ON LTB)",TEL-80773-17,English,2017,Whitby,07/05/2017
3,Metadata:\nDate:\t2017-07-18\nFile number:\t\n...,TEL-81359-17-AM.txt,"[Metadata:, Date: 2017-07-18, File number:, TE...","[Date: 2017-07-18, File number:, TEL-81359-17-...",[Order under Section 69 Residential Tenancies ...,"TEL-81359-17-AM (Re), 2017 CanLII 60052 (ON LTB)",TEL-81359-17-AM,English,2017,Toronto,07/18/2017
4,Metadata:\nDate:\t2017-08-17\nFile number:\t\n...,TEL-81405-17.txt,"[Metadata:, Date: 2017-08-17, File number:, TE...","[Date: 2017-08-17, File number:, TEL-81405-17,...",[Order under Section 69 Residential Tenancies ...,"TEL-81405-17 (Re), 2017 CanLII 60203 (ON LTB)",TEL-81405-17,English,2017,Lindsay,08/17/2017


In [None]:
get_nulls(silver_data, 'decision_date')

Unnamed: 0,raw_file_text,raw_file_name,full_cleaned,metadata,content,case_citation,file_number,language,year,ltb_location,decision_date


In [None]:
evaluate(silver_data, gold_data, "decision_date", return_inaccurate = True, metric = "jaro_winkler")#.head(60)

Unnamed: 0,silver_decision_date,gold_decision_date
7,03/29/2018,04/24/2018
10,03/26/2018,10/26/0208
24,12/04/2017,01/19/2018
25,06/24/2019,11/19/2019
30,01/10/2017,02/03/2017
...,...,...
654,08/31/2018,11/01/2018
655,12/06/2018,11/06/2018
660,09/10/2018,12/04/2018
663,10/12/2018,10/09/2018


In [None]:
wrongs = evaluate(silver_data, gold_data, "decision_date", return_inaccurate = True, metric = "accuracy").index#.head(60)
# gold_data.loc[wrongs, ["hearing_date", "decision_date"]].head(20)
evaluate(silver_data, gold_data, "decision_date", return_inaccurate = True, metric = "accuracy")#.head(60)

Unnamed: 0,silver_decision_date,gold_decision_date
7,03/29/2018,04/24/2018
10,03/26/2018,10/26/0208
24,12/04/2017,01/19/2018
25,06/24/2019,11/19/2019
30,01/10/2017,02/03/2017
...,...,...
654,08/31/2018,11/01/2018
655,12/06/2018,11/06/2018
660,09/10/2018,12/04/2018
663,10/12/2018,10/09/2018


In [None]:
import spacy

nlp = spacy.load("en_core_web_sm") # loading this outside of the function saves ~2s per function call

def find_dates_in_string(string, model=nlp):
    extracted_dates = []
    # for string in string_list:
    doc = nlp(string)
    for entity in doc.ents:
        if entity.label_ == "DATE":
            extracted_dates.append(entity.text)
    
    return list(set(extracted_dates))
    # pattern = r"(?i)(\b\w+ \d{1,2}, \d{4}\b)"
    # valid_dates = re.findall(pattern, ", ".join(extracted_dates))
    # return valid_dates

from datetime import datetime

def order_dates(dates_list):
    sorted_dates = sorted(dates_list, key=lambda x: datetime.strptime(x, '%m/%d/%Y'))
    return sorted_dates

In [None]:
# def get_hearing_date()
test_case_list = silver_data.loc[wrongs[0], 'content']
test_case_str = " ".join(test_case_list)
hdate_candidates = [line for line in test_case_str.lower().split(". ") if ("application" in line.lower() and "hear" in line.lower())]

order_dates([convert_to_datetime(date) for date in find_dates_in_list(hdate_candidates)])

['03/26/2018']

In [None]:
silver_data.loc[wrongs[0], 'metadata']

['Date: 2018-03-29',
 'File number:',
 'TNL-01582-18-AM',
 'TNL-01582-18-AM',
 'Citation: TNL-01582-18-AM (Re), 2018 CanLII 113954 (ON LTB), <https://canlii.ca/t/hwbhh>, retrieved on 2023-05-16 https://canlii.ca/t/hwbhh']

# Hearing Date

In [None]:
import re
from datetime import datetime

def clean_and_convert_date(date_str):
    """
    Gets rid of artifacts found within strings containing dates and converts them to a consistent format.

    """
    
    # Use regex to find date in the format 'Month DD, YYYY'
    match = re.search(r'([a-zA-Z]+ \d{1,2}, \d{4})', date_str)
    if match:
        date_str = match.group(1)

        # Parse date with strptime in the format 'Month DD, YYYY'
        dt = datetime.strptime(date_str, '%B %d, %Y')

        # Format date with strftime in the format 'MM/DD/YYYY'
        return dt.strftime('%m/%d/%Y')

    return None

# Test the function
date_str = "September 3, 2019 f"
new_date_str = clean_and_convert_date(date_str)
print(new_date_str)

09/03/2019


In [None]:
def extract_date_rule(text_list):

    content_str = " ".join(text_list)
    # print(content_str)

    if "date issued" in content_str.lower():
        DI_inds = find_all_positions(content_str.lower(), "date issued")
        for DI_ind in DI_inds:
            DI_substr = content_str[DI_ind - 50 : DI_ind + 50].lower()
            # print(DI_ind)
            # print(DI_substr)

            if len(DI_substr.split("date issued")) != 2:
                # should only contain 2
                pass

            # DI_sent = DI_substr.split("date issued")#.strip()
            # print(DI_substr)

            # regex pattern to find any date within the DI_sent substring
            date_pattern = r"(?i)(january|february|march|april|may|june|july|august|september|october|november|december) [0-9]{1,2}, [0-9]{4}"

            match = re.search(date_pattern, DI_substr)
            if match:
                # print(match)
                return match.group(0)
            
    elif "hear" in content_str:

        hear_inds = find_all_positions(content_str, "hear") # list of indices where "hear" appears in string
        # print(hear_inds)

        for hear_ind in hear_inds:

            hear_substr = content_str[hear_ind - 50 : hear_ind + 50]
            possible_sentences = hear_substr.split(". ")
            # print(possible_sentences)
            
            hear_sent = [sent for i, sent in enumerate(possible_sentences) if "hear" in sent][0]
            # print(hear_sent)

            if len(hear_sent.split(" on ")) != 2:
                # return None
                pass # go to next "hear" location in string
            else:
                location_sent, date = hear_sent.split(" on ") # should only split into 2 parts
                # print(location_sent)

                return date.strip()

    # if all else fails, return None
    return None

print(extract_date_rule(silver_data.loc[2, 'content']))
# print(clean_and_convert_date(extract_date_rule(silver_data.loc[56, 'content']))) # converts it into desired format
# silver_data.loc[56, 'content']

july 5, 2017


In [None]:
def extract_date_2(case_metadata: list):
    for line in case_metadata:
        if "Date: " in line:
            break
    
    # return the date in the format they gave us the data in - this might make things more useful for them idk
    return line.replace("Date: ", "")
    # return line.replace("Date: ", "")

print(extract_date_rule(silver_data.loc[0, 'content']))
print(extract_date_2(silver_data.loc[0, 'metadata']))
convert_to_datetime(extract_date_rule(silver_data.loc[0, 'content']))
convert_to_datetime(extract_date_2(silver_data.loc[0, 'metadata']))

july 6, 2018
2018-07-06


'07/06/2018'

## Updating CSV with Hearing Date

In [None]:
import time
import numpy as np
from collections import deque

start_time = time.time()

# Initialize a deque to store the latest 100 iteration times
time_deque = deque(maxlen=100)

for index, row in enumerate(silver_data.itertuples()):

    # Save the start time of this iteration
    iteration_start_time = time.time()

    try:
        # rule-based method #1 -- uses 'content'
        found_date = extract_date_rule(silver_data.loc[row.Index, 'content'])

        if found_date == None: # rule-based method #2 -- uses 'metadata'
            found_date = extract_date_2(silver_data.loc[row.Index, 'metadata'])

        # normalize the format
        # formatted_date = clean_and_convert_date(found_date)
        formatted_date = convert_to_datetime(found_date)

        # if still None, try one more time to convert it
        silver_data.at[row.Index, 'hearing_date'] = formatted_date
            
    except Exception as any_error:
        print(f"{any_error} with file at Df row: ", row.Index)

    # Save the end time of this iteration and push it into the deque
    iteration_end_time = time.time()
    time_deque.append(iteration_end_time - iteration_start_time)

    # progress tracker
    average_time_per_row = np.mean(time_deque)
    rows_left = len(silver_data) - (index + 1)
    estimated_time_left = rows_left * average_time_per_row

    print("Files processed: ", index + 1, "of", len(silver_data),
          "Estimated time remaining: ", time.strftime('%H:%M:%S', time.gmtime(estimated_time_left)), end='\r')

silver_data.head()

String does not contain a date: t with file at Df row:  46900:00
Files processed:  679 of 679 Estimated time remaining:  00:00:00

Unnamed: 0,raw_file_text,raw_file_name,full_cleaned,metadata,content,case_citation,file_number,language,year,ltb_location,decision_date,hearing_date
0,Metadata:\nDate:\t2018-07-06\nFile number:\t\n...,SWL-17348-18.txt,"[Metadata:, Date: 2018-07-06, File number:, SW...","[Date: 2018-07-06, File number:, SWL-17348-18,...",[Order under Section 69 Residential Tenancies ...,"SWL-17348-18 (Re), 2018 CanLII 88643 (ON LTB)",SWL-17348-18,English,2018,London,07/06/2018,07/06/2018
1,Metadata:\nDate:\t2017-05-26\nFile number:\t\n...,TEL-79722-17.txt,"[Metadata:, Date: 2017-05-26, File number:, TE...","[Date: 2017-05-26, File number:, TEL-79722-17,...",[Order under Section 69 Residential Tenancies ...,"TEL-79722-17 (Re), 2017 CanLII 48856 (ON LTB)",TEL-79722-17,English,2017,Toronto,05/26/2017,06/07/2017
2,Metadata:\nDate:\t2017-07-05\nFile number:\t\n...,TEL-80773-17.txt,"[Metadata:, Date: 2017-07-05, File number:, TE...","[Date: 2017-07-05, File number:, TEL-80773-17,...",[Order under Section 69 Residential Tenancies ...,"TEL-80773-17 (Re), 2017 CanLII 60498 (ON LTB)",TEL-80773-17,English,2017,Whitby,07/05/2017,07/05/2017
3,Metadata:\nDate:\t2017-07-18\nFile number:\t\n...,TEL-81359-17-AM.txt,"[Metadata:, Date: 2017-07-18, File number:, TE...","[Date: 2017-07-18, File number:, TEL-81359-17-...",[Order under Section 69 Residential Tenancies ...,"TEL-81359-17-AM (Re), 2017 CanLII 60052 (ON LTB)",TEL-81359-17-AM,English,2017,Toronto,07/18/2017,08/06/2017
4,Metadata:\nDate:\t2017-08-17\nFile number:\t\n...,TEL-81405-17.txt,"[Metadata:, Date: 2017-08-17, File number:, TE...","[Date: 2017-08-17, File number:, TEL-81405-17,...",[Order under Section 69 Residential Tenancies ...,"TEL-81405-17 (Re), 2017 CanLII 60203 (ON LTB)",TEL-81405-17,English,2017,Lindsay,08/17/2017,08/17/2017


In [None]:
check_df(silver_data)

Df Size: 679 rows, 12 columns
------------------------------

Checking for null values...
-------------------------
raw_file_text    0
raw_file_name    0
full_cleaned     0
metadata         0
content          0
case_citation    0
file_number      0
language         0
year             0
ltb_location     1
decision_date    0
hearing_date     1
dtype: int64

Checking data types...
-------------------------
raw_file_text: str
raw_file_name: str
full_cleaned: list
metadata: list
content: list
case_citation: str
file_number: str
language: str
year: str
ltb_location: str
decision_date: str
hearing_date: str


In [None]:
get_nulls(silver_data, 'hearing_date')

Unnamed: 0,raw_file_text,raw_file_name,full_cleaned,metadata,content,case_citation,file_number,language,year,ltb_location,decision_date,hearing_date
469,Metadata:\nDate:\t2018-12-17\nFile number:\t\n...,TNL-05489-18.txt,"[Metadata:, Date: 2018-12-17, File number:, 65...","[Date: 2018-12-17, File number:, 659/18;, TNL-...","[CITATION: Capreit 2 Limited Partnership v., R...",Cit,TNL-05489-18;659/18,English,20T,,07/13/2018,


In [None]:
evaluate(silver_data, gold_data, "hearing_date", return_inaccurate = True, metric = "jaro_winkler")#.head(60)

Unnamed: 0,silver_hearing_date,gold_hearing_date
0,07/06/2018,07/05/2018
1,06/07/2017,05/19/2017
2,07/05/2017,06/30/2017
3,08/06/2017,07/07/2017
4,08/17/2017,08/15/2017
...,...,...
674,11/23/2018,11/20/2018
675,12/11/2018,11/23/2018
676,12/12/2018,12/04/2018
677,11/20/2018,11/02/2018


In [None]:
print(convert_to_datetime(extract_date_2(silver_data.loc[454, 'metadata'])))
silver_data.loc[454, 'metadata']

05/31/2018


['Date: 2018-05-31',
 'File number:',
 'TNL-03260-18',
 'TNL-03260-18',
 'Citation: TNL-03260-18 (Re), 2018 CanLII 113821 (ON LTB), <https://canlii.ca/t/hwbj0>, retrieved on 2023-05-16 https://canlii.ca/t/hwbj0']

# Case CanLII URL
- In the metadata, there's a hyperlink that seems to be a shortened URL to the case
- Eg.:  *'Citation: NOL-10723-12 (Re), 2013 CanLII 5182 (ON LTB), <https://canlii.ca/t/fw1m8>, retrieved on 2023-05-17'*

In [None]:
silver_data.loc[0, 'metadata']

['Date: 2018-07-06',
 'File number:',
 'SWL-17348-18',
 'SWL-17348-18',
 'Citation: SWL-17348-18 (Re), 2018 CanLII 88643 (ON LTB), <https://canlii.ca/t/hv7qd>, retrieved on 2023-05-16 https://canlii.ca/t/hv7qd']

In [None]:
import re

def get_url_from_citation_string(text: str):
    """
    Returns URL to case file given a list of strings of metadata from a case file.
    String must begin with "Citation: " and URL must be within angle brackets.

    Parameters
    ----------
    text : str
        A string of metadata from a case file.

    Returns
    -------
    str
        A string of the URL to the case file.
    """

    pattern = r"<(.*?)>"
    matches = re.findall(pattern, text)
    return matches[0]

def get_url_from_metadata(case_metadata: list):
    """
    Extract URL to case file from a list of strings of metadata from a case file.

    Parameters
    ----------
    case_metadata : list
        A list of strings of metadata from a case file.

    Returns
    -------
    str
        A string of the URL to the case file.
    """

    for line in case_metadata:
        if ("Citation:" or "Référence:") in line:
            return get_url_from_citation_string(line)
        
    return None
        
get_url_from_metadata(silver_data.loc[1, 'metadata'])

'https://canlii.ca/t/h539n'

## Updating CSV with URLs

In [None]:
import time
import numpy as np
from collections import deque

start_time = time.time()

# Initialize a deque to store the latest 100 iteration times
time_deque = deque(maxlen = 100)

for index, row in enumerate(silver_data.itertuples()):

    # Save the start time of this iteration
    iteration_start_time = time.time()

    try:
        silver_data.at[row.Index, 'url'] = get_url_from_metadata(silver_data.loc[row.Index, 'metadata'])
    except Exception as any_error:
        print(f"{any_error} with file at Df row: ", row.Index)

    # Save the end time of this iteration and push it into the deque
    iteration_end_time = time.time()
    time_deque.append(iteration_end_time - iteration_start_time)

    # progress tracker
    average_time_per_row = np.mean(time_deque)
    rows_left = len(silver_data) - (index + 1)
    estimated_time_left = rows_left * average_time_per_row

    print("Files processed: ", index + 1, "of", len(silver_data),
          "Estimated time remaining: ", time.strftime('%H:%M:%S', time.gmtime(estimated_time_left)), end='\r')

silver_data.head()

Files processed:  679 of 679 Estimated time remaining:  00:00:00

Unnamed: 0,raw_file_text,raw_file_name,full_cleaned,metadata,content,case_citation,file_number,language,year,ltb_location,decision_date,hearing_date,url
0,Metadata:\nDate:\t2018-07-06\nFile number:\t\n...,SWL-17348-18.txt,"[Metadata:, Date: 2018-07-06, File number:, SW...","[Date: 2018-07-06, File number:, SWL-17348-18,...",[Order under Section 69 Residential Tenancies ...,"SWL-17348-18 (Re), 2018 CanLII 88643 (ON LTB)",SWL-17348-18,English,2018,London,07/06/2018,07/06/2018,https://canlii.ca/t/hv7qd
1,Metadata:\nDate:\t2017-05-26\nFile number:\t\n...,TEL-79722-17.txt,"[Metadata:, Date: 2017-05-26, File number:, TE...","[Date: 2017-05-26, File number:, TEL-79722-17,...",[Order under Section 69 Residential Tenancies ...,"TEL-79722-17 (Re), 2017 CanLII 48856 (ON LTB)",TEL-79722-17,English,2017,Toronto,05/26/2017,06/07/2017,https://canlii.ca/t/h539n
2,Metadata:\nDate:\t2017-07-05\nFile number:\t\n...,TEL-80773-17.txt,"[Metadata:, Date: 2017-07-05, File number:, TE...","[Date: 2017-07-05, File number:, TEL-80773-17,...",[Order under Section 69 Residential Tenancies ...,"TEL-80773-17 (Re), 2017 CanLII 60498 (ON LTB)",TEL-80773-17,English,2017,Whitby,07/05/2017,07/05/2017,https://canlii.ca/t/h5z39
3,Metadata:\nDate:\t2017-07-18\nFile number:\t\n...,TEL-81359-17-AM.txt,"[Metadata:, Date: 2017-07-18, File number:, TE...","[Date: 2017-07-18, File number:, TEL-81359-17-...",[Order under Section 69 Residential Tenancies ...,"TEL-81359-17-AM (Re), 2017 CanLII 60052 (ON LTB)",TEL-81359-17-AM,English,2017,Toronto,07/18/2017,08/06/2017,https://canlii.ca/t/h5z3r
4,Metadata:\nDate:\t2017-08-17\nFile number:\t\n...,TEL-81405-17.txt,"[Metadata:, Date: 2017-08-17, File number:, TE...","[Date: 2017-08-17, File number:, TEL-81405-17,...",[Order under Section 69 Residential Tenancies ...,"TEL-81405-17 (Re), 2017 CanLII 60203 (ON LTB)",TEL-81405-17,English,2017,Lindsay,08/17/2017,08/17/2017,https://canlii.ca/t/h5z3s


In [None]:
check_df(silver_data)

Df Size: 679 rows, 13 columns
------------------------------

Checking for null values...
-------------------------
raw_file_text    0
raw_file_name    0
full_cleaned     0
metadata         0
content          0
case_citation    0
file_number      0
language         0
year             0
ltb_location     1
decision_date    0
hearing_date     1
url              0
dtype: int64

Checking data types...
-------------------------
raw_file_text: str
raw_file_name: str
full_cleaned: list
metadata: list
content: list
case_citation: str
file_number: str
language: str
year: str
ltb_location: str
decision_date: str
hearing_date: str
url: str


In [None]:
silver_data.loc[2, 'metadata']
silver_data.loc[150, 'content'][-15:]

['7. Interest on the rent deposit is owing to the Tenant for the period from April 28, 2017 to April 30, 2018 8.',
 "I have considered all of the disclosed circumstances in accordance with subsection 83(2) of the Residential Tenancies Act, 2006 (the 'Act'), and find that it would be unfair to grant relief from eviction pursuant to subsection 83(1) of the Act.",
 'Specifically, the Landlord testified that she has been extremely financially prejudiced by the Tenant’s failure to pay rent.',
 'The Landlord is a student and relies on this income to pay for her tuition, living expenses and mortgage.',
 'As well, I am mindful that this has not been a long term tenancy, such that this spate of late payment could be seen as an anomaly in an otherwise extensive and unblemished tenure.',
 'I am also mindful that the Tenant is in significant arrears of rent.',
 'It is ordered that: 1.',
 'The tenancy between the Landlord and the Tenant is terminated, as of April 30, 2018.',
 'The Tenant must move 

# Adjudicating Member
- Typically in format like:
    ```[...,
    'Date',
    'Issued Gerald',
    'Taylor',
    'Member,'
    ...]```
- This seems to be formatted pretty consistently but not sure if it may present in different ways

In [None]:
silver_data.loc[2, 'metadata']
silver_data.loc[130, 'content'][-15:]

['If the Tenant does not make full payment in accordance with this paragraph and by the appropriate deadline, then the Landlord may file this order with the Court Enforcement Office (Sheriff) so that the eviction may be enforced.',
 '9. The Tenant may make a motion to the Board under subsection 74(11) of the Act to set aside this part of the order if she pays the amount required under that subsection on or after January 3, 2018 but before the Sheriff gives vacant possession to the Landlord.',
 'The Tenant is only entitled to make this motion once during the period of the tenancy agreement with the Landlord.',
 'THE APPLICATION FOR PERSISTENT LATE PAYMENT',
 '10. If the Tenant voids the eviction order above pursuant to paragraph 8 or 9 then: (1) The Tenant shall pay to the Landlord new rent on time and in full for the period January 1, 2018 to December 31, 2018.',
 '(2) If the Tenant fails to make any one of the payments in accordance with paragraph 10(1) of this order, the Landlord may

In [None]:
import re

def get_adj_member(list_of_strings: list):
    text = " ".join(list_of_strings)
    pattern = r"Date Issued(.*?)Member"
    matches = re.findall(pattern, text, re.DOTALL)
    # extracted_text = [match.strip() for match in matches]
    extracted_text = list(set(match.strip() for match in matches))
    if len(extracted_text) > 0:
        return ", ".join(extracted_text) # returns a list of matches and sometimes there's more than one match so we just take the first one -- there are never two
    elif "date issued" in text.lower():
        DI_inds = find_all_positions(text.lower(), "date issued")
        for DI_ind in DI_inds:
            DI_substr = text[DI_ind - 50 : DI_ind + 50].lower()
            if len(DI_substr.split("date issued")) != 2:
                # should only contain 2
                pass

            DI_sent = DI_substr.split("date issued")[1].strip()
            if ", " in DI_sent: # there should be a comma be just in case, it doesn't hurt to have this (and this to try the "hear" method after iterating over all of these if none work)
                DI_sent = DI_sent.split(", ")[0]

            if "member" in DI_sent:
                DI_sent = DI_sent.replace("member", "")

            return DI_sent.title().strip()

test = silver_data.loc[453, 'content']#[-15:]

get_adj_member(test)

'Nancy Morris'

### Updating CSV with Adjudicating Member

In [None]:
# import time
# import numpy as np
# from collections import deque

start_time = time.time()

# Initialize a deque to store the latest 100 iteration times
time_deque = deque(maxlen = 500)

for index, row in enumerate(silver_data.itertuples()):

    # Save the start time of this iteration
    iteration_start_time = time.time()

    try:
        silver_data.at[row.Index, 'adjudicating_member'] = get_adj_member(silver_data.loc[row.Index, 'content']).replace("Vice Chair", "").replace("Vice-Chair", "").strip()
    except Exception as any_error:
        print(f"{any_error} with file at Df row: ", row.Index)

    # Save the end time of this iteration and push it into the deque
    iteration_end_time = time.time()
    time_deque.append(iteration_end_time - iteration_start_time)

    # progress tracker
    average_time_per_row = np.mean(time_deque)
    rows_left = len(silver_data) - (index + 1)
    estimated_time_left = rows_left * average_time_per_row

    print("Files processed: ", index + 1, "of", len(silver_data),
          "Estimated time remaining: ", time.strftime('%H:%M:%S', time.gmtime(estimated_time_left)), end = '\r')

silver_data.head()

'NoneType' object has no attribute 'replace' with file at Df row:  469
'NoneType' object has no attribute 'replace' with file at Df row:  655
'NoneType' object has no attribute 'replace' with file at Df row:  663
Files processed:  679 of 679 Estimated time remaining:  00:00:00

Unnamed: 0,raw_file_text,raw_file_name,full_cleaned,metadata,content,case_citation,file_number,language,year,ltb_location,decision_date,hearing_date,url,adjudicating_member
0,Metadata:\nDate:\t2018-07-06\nFile number:\t\n...,SWL-17348-18.txt,"[Metadata:, Date: 2018-07-06, File number:, SW...","[Date: 2018-07-06, File number:, SWL-17348-18,...",[Order under Section 69 Residential Tenancies ...,"SWL-17348-18 (Re), 2018 CanLII 88643 (ON LTB)",SWL-17348-18,English,2018,London,07/06/2018,07/06/2018,https://canlii.ca/t/hv7qd,Kevin Lundy
1,Metadata:\nDate:\t2017-05-26\nFile number:\t\n...,TEL-79722-17.txt,"[Metadata:, Date: 2017-05-26, File number:, TE...","[Date: 2017-05-26, File number:, TEL-79722-17,...",[Order under Section 69 Residential Tenancies ...,"TEL-79722-17 (Re), 2017 CanLII 48856 (ON LTB)",TEL-79722-17,English,2017,Toronto,05/26/2017,06/07/2017,https://canlii.ca/t/h539n,Laura Hartslief
2,Metadata:\nDate:\t2017-07-05\nFile number:\t\n...,TEL-80773-17.txt,"[Metadata:, Date: 2017-07-05, File number:, TE...","[Date: 2017-07-05, File number:, TEL-80773-17,...",[Order under Section 69 Residential Tenancies ...,"TEL-80773-17 (Re), 2017 CanLII 60498 (ON LTB)",TEL-80773-17,English,2017,Whitby,07/05/2017,07/05/2017,https://canlii.ca/t/h5z39,Ruth Carey
3,Metadata:\nDate:\t2017-07-18\nFile number:\t\n...,TEL-81359-17-AM.txt,"[Metadata:, Date: 2017-07-18, File number:, TE...","[Date: 2017-07-18, File number:, TEL-81359-17-...",[Order under Section 69 Residential Tenancies ...,"TEL-81359-17-AM (Re), 2017 CanLII 60052 (ON LTB)",TEL-81359-17-AM,English,2017,Toronto,07/18/2017,08/06/2017,https://canlii.ca/t/h5z3r,Shelby Whittick
4,Metadata:\nDate:\t2017-08-17\nFile number:\t\n...,TEL-81405-17.txt,"[Metadata:, Date: 2017-08-17, File number:, TE...","[Date: 2017-08-17, File number:, TEL-81405-17,...",[Order under Section 69 Residential Tenancies ...,"TEL-81405-17 (Re), 2017 CanLII 60203 (ON LTB)",TEL-81405-17,English,2017,Lindsay,08/17/2017,08/17/2017,https://canlii.ca/t/h5z3s,Laura Hartslief


In [None]:
check_df(silver_data)

Df Size: 679 rows, 14 columns
------------------------------

Checking for null values...
-------------------------
raw_file_text          0
raw_file_name          0
full_cleaned           0
metadata               0
content                0
case_citation          0
file_number            0
language               0
year                   0
ltb_location           1
decision_date          0
hearing_date           1
url                    0
adjudicating_member    3
dtype: int64

Checking data types...
-------------------------
raw_file_text: str
raw_file_name: str
full_cleaned: list
metadata: list
content: list
case_citation: str
file_number: str
language: str
year: str
ltb_location: str
decision_date: str
hearing_date: str
url: str
adjudicating_member: str


In [None]:
nulls_adj = get_nulls(silver_data, 'adjudicating_member', return_index = False)
nulls_inds = nulls_adj.index.tolist()
nulls_adj

Unnamed: 0,raw_file_text,raw_file_name,full_cleaned,metadata,content,case_citation,file_number,language,year,ltb_location,decision_date,hearing_date,url,adjudicating_member
469,Metadata:\nDate:\t2018-12-17\nFile number:\t\n...,TNL-05489-18.txt,"[Metadata:, Date: 2018-12-17, File number:, 65...","[Date: 2018-12-17, File number:, 659/18;, TNL-...","[CITATION: Capreit 2 Limited Partnership v., R...",Cit,TNL-05489-18;659/18,English,20T,,07/13/2018,,https://canlii.ca/t/hwpvq,
655,Metadata:\nDate:\t2018-11-06\nFile number:\t\n...,TSL-97498-18-RV2-AM.txt,"[Metadata:, Date: 2018-11-06, File number:, TS...","[Date: 2018-11-06, File number:, TSL-97498-18-...",[Amended Order Order under Sections 21.1 and 2...,"TSL-97498-18-RV2-AM (Re), 2018 CanLII 141664 (...",TSL-97498-18-RV2-AM,English,2018,Toronto,12/06/2018,11/01/2018,https://canlii.ca/t/j0fjl,
663,Metadata:\nDate:\t2018-10-09\nFile number:\t\n...,TSL-98384-18-AM.txt,"[Metadata:, Date: 2018-10-09, File number:, TS...","[Date: 2018-10-09, File number:, TSL-98384-18,...",[Amended Order Order under Section 69 Resident...,"TSL-98384-18-AM (Re), 2018 CanLII 120875 (ON LTB)",TSL-98384-18,English,2018,Toronto,10/12/2018,09/24/2018,https://canlii.ca/t/hwmcl,


In [None]:
evaluate(silver_data, gold_data, "adjudicating_member", return_inaccurate = True, metric = "jaro_winkler").head(60)

Unnamed: 0,silver_adjudicating_member,gold_adjudicating_member
14,Sylvia Watson,Nancy Morris
24,Ruth Carey,Laura Hartslief
59,Karen Wallace,Sandra Macchione
75,Alex Brkic,Avril Cardoso
87,Karen Wallace,Avril Cardoso
90,Renée Lang,Renee Lang
109,Ibi Olabode,Alex Brkic
133,Sylvie Charron,Emily Crocco
156,Renée Lang,Renee Lang
160,Sylvia Watson,Harry Cho


In [None]:
silver_data.file_number.value_counts()

TEL-91060-18-RV                                                                 2
HOL-03461-18-RV                                                                 2
TEL-95255-18-AM                                                                 2
SWL-17348-18                                                                    1
TNL-03847-18                                                                    1
                                                                               ..
TEL-04297-19                                                                    1
TEL-04353-19                                                                    1
TEL-04558-19                                                                    1
TEL-04599-19                                                                    1
TST-07083-19-IN;TST-06337-19-IN;TST-07023-19;TSL-07607-19-IN;TSL-07961-19-IN    1
Name: file_number, Length: 676, dtype: int64

In [None]:
silver_data

Unnamed: 0,raw_file_text,raw_file_name,full_cleaned,metadata,content,case_citation,file_number,language,year,ltb_location,decision_date,hearing_date,url,adjudicating_member
0,Metadata:\nDate:\t2018-07-06\nFile number:\t\n...,SWL-17348-18.txt,"[Metadata:, Date: 2018-07-06, File number:, SW...","[Date: 2018-07-06, File number:, SWL-17348-18,...",[Order under Section 69 Residential Tenancies ...,"SWL-17348-18 (Re), 2018 CanLII 88643 (ON LTB)",SWL-17348-18,English,2018,London,07/06/2018,07/06/2018,https://canlii.ca/t/hv7qd,Kevin Lundy
1,Metadata:\nDate:\t2017-05-26\nFile number:\t\n...,TEL-79722-17.txt,"[Metadata:, Date: 2017-05-26, File number:, TE...","[Date: 2017-05-26, File number:, TEL-79722-17,...",[Order under Section 69 Residential Tenancies ...,"TEL-79722-17 (Re), 2017 CanLII 48856 (ON LTB)",TEL-79722-17,English,2017,Toronto,05/26/2017,06/07/2017,https://canlii.ca/t/h539n,Laura Hartslief
2,Metadata:\nDate:\t2017-07-05\nFile number:\t\n...,TEL-80773-17.txt,"[Metadata:, Date: 2017-07-05, File number:, TE...","[Date: 2017-07-05, File number:, TEL-80773-17,...",[Order under Section 69 Residential Tenancies ...,"TEL-80773-17 (Re), 2017 CanLII 60498 (ON LTB)",TEL-80773-17,English,2017,Whitby,07/05/2017,07/05/2017,https://canlii.ca/t/h5z39,Ruth Carey
3,Metadata:\nDate:\t2017-07-18\nFile number:\t\n...,TEL-81359-17-AM.txt,"[Metadata:, Date: 2017-07-18, File number:, TE...","[Date: 2017-07-18, File number:, TEL-81359-17-...",[Order under Section 69 Residential Tenancies ...,"TEL-81359-17-AM (Re), 2017 CanLII 60052 (ON LTB)",TEL-81359-17-AM,English,2017,Toronto,07/18/2017,08/06/2017,https://canlii.ca/t/h5z3r,Shelby Whittick
4,Metadata:\nDate:\t2017-08-17\nFile number:\t\n...,TEL-81405-17.txt,"[Metadata:, Date: 2017-08-17, File number:, TE...","[Date: 2017-08-17, File number:, TEL-81405-17,...",[Order under Section 69 Residential Tenancies ...,"TEL-81405-17 (Re), 2017 CanLII 60203 (ON LTB)",TEL-81405-17,English,2017,Lindsay,08/17/2017,08/17/2017,https://canlii.ca/t/h5z3s,Laura Hartslief
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
674,Metadata:\nDate:\t2018-11-23\nFile number:\t\n...,TSL-99691-18.txt,"[Metadata:, Date: 2018-11-23, File number:, TS...","[Date: 2018-11-23, File number:, TSL-99691-18,...",[Order under Section 69 Residential Tenancies ...,"TSL-99691-18 (Re), 2018 CanLII 141675 (ON LTB)",TSL-99691-18,English,2018,Toronto,11/23/2018,11/23/2018,https://canlii.ca/t/j0fk1,David Lee
675,Metadata:\nDate:\t2018-11-29\nFile number:\t\n...,TSL-99824-18.txt,"[Metadata:, Date: 2018-11-29, File number:, TS...","[Date: 2018-11-29, File number:, TSL-99824-18,...",[Order under Section 69 Residential Tenancies ...,"TSL-99824-18 (Re), 2018 CanLII 141673 (ON LTB)",TSL-99824-18,English,2018,Toronto,11/29/2018,12/11/2018,https://canlii.ca/t/j0fk2,Renée Lang
676,Metadata:\nDate:\t2018-12-12\nFile number:\t\n...,TSL-99900-18.txt,"[Metadata:, Date: 2018-12-12, File number:, TS...","[Date: 2018-12-12, File number:, TSL-99900-18,...",[Order under Section 69 Residential Tenancies ...,"TSL-99900-18 (Re), 2018 CanLII 140403 (ON LTB)",TSL-99900-18,English,2018,Toronto,12/12/2018,12/12/2018,https://canlii.ca/t/hzzb6,David Mungovan
677,Metadata:\nDate:\t2018-11-20\nFile number:\t\n...,TSL-99965-18.txt,"[Metadata:, Date: 2018-11-20, File number:, TS...","[Date: 2018-11-20, File number:, TSL-99965-18,...",[Order under Section 69 Residential Tenancies ...,"TSL-99965-18 (Re), 2018 CanLII 141672 (ON LTB)",TSL-99965-18,English,2018,Toronto,11/20/2018,11/20/2018,https://canlii.ca/t/j0fk5,David Lee


In [None]:
# silver_data.to_csv(f"pproc_{len(silver_data)}_files_1.csv", index = False)

In [None]:
# silver_data.loc[:10, ].to_csv(f"sample_corpus.csv", index = False)

# Rule-Based Case OutCome Extraction
- Extracting SPAN is SOLVED
- Classifying the span is NOT SOLVED - need to fix the new labels generated by Allard_1

In [None]:
# gold_data.columns # looking for case outcome column name
len(gold_data['case_outcome'].unique()) # only 47 unique outcomes across 679 cases - that means it can probably be summarized effectively
gold_data['case_outcome'].unique()

array(['No relief', 'Payment plan', 'Postponement of eviction',
       'No relief, but arrears calculated based on the lawful monthly rent ($1400), despite the landlord claiming it to be $1500',
       'Order under review reversed in part to correct serious error',
       'Eviction refused; Tenant pays arrears only',
       'Extended termination date',
       'Tenants already vacated rental unit. Tenants pay full amount owed.',
       'Other: Standard eviction order but if arrears are paid, tenant can stay as long as all future payments for 1 year are paid on time (subject to s. 78)',
       'Postponement of eviction but conditioned on the fact that all rent is paid on time for one year period',
       'Ordered payment of arrears, no eviction',
       'Conditional order to preserve tenancy', 'Conditional relief',
       'The effect of the decision is essentially to postpone eviction, but the member hasn\'t said so outright. If the tenant fails to make the payment by the specified date,

In [None]:
reclassified_outcomes = {
    'no_relief': [
        'No relief',
        'No relief, but arrears calculated based on the lawful monthly rent ($1400), despite the landlord claiming it to be $1500',
        'Tenants already vacated rental unit. Tenants pay full amount owed.',
        'Ordered payment of arrears, no eviction',
        'Conditional order to preserve tenancy',
        'Conditional relief',
        'Payment of damages (not a rent dispute)',
        'Request to review is denied',
        'No relief; rent abatement due to maintenance repairs',
        'Application dismissed',
        'Relief not to evict. Tenant has paid all arrears and there was a material error on the landlord\'s part.',
        'Notice of termination was found to be invalid so Landlord requested consent of Board to withdraw application of non-payment of rent which was granted.',
        'Application Dismissed',
        'No eviction granted, payment plan merely involved paying rent on time',
        'Tenant ordered to pay adjusted arrears cost',
        'Order to pay rent on time',
        'Tenant must pay arrears with EI lump-sum check or else he will be evicted',
        'landlord was not seeking eviction but seeking to terminate tenancy because of persistent late payment; relief granted (not terminating tenancy) if tenant can pay in time and in full for the next 11 months',
        'No relief, because tenant can pay off the arrears with the help of social assistance',
        'No relief because tenants could pay the balance',
        'No relief because tenant can pay the balance'
    ],
    'no_payment': [
        'Payment plan',
        'Eviction refused; Tenant pays arrears only',
        'Payment and eviction',
        'Eviction order set aside',
        'Pay on time order'
    ],
    'other': [
        'Postponement of eviction',
        'Order under review reversed in part to correct serious error',
        'Extended termination date',
        'Other: Standard eviction order but if arrears are paid, tenant can stay as long as all future payments for 1 year are paid on time (subject to s. 78)',
        'Postponement of eviction but conditioned on the fact that all rent is paid on time for one year period',
        'Probation',
        'Relief from eviction subject to conditions',
        'Conditional order',
        'Conditional Order',
        'Landlord\'s Application for Eviction was dismissed',
        'Abatement of Rent',
        'Voiding of order',
        'Tenant forced out, landlord wants space. All rent was paid.',
        'Tenant shall pay rent on time from February 2020 - January 2021. Additionally, tenant must pay application cost + $20 NSF charges incurred by the Landlord',
        'Full relief',
        'Interim order'
    ]
}

for new_class, old_classes in reclassified_outcomes.items():
    for old_class in old_classes:
        gold_data.loc[gold_data['case_outcome'] == old_class, 'new_case_outcome'] = new_class

gold_data['new_case_outcome'].value_counts()

no_relief     442
other         155
no_payment     74
Name: new_case_outcome, dtype: int64

In [None]:
import re
import itertools

def find_all_positions(text: str, keyword: str):
    """
    Finds all positions of a keyword in a given text.

    This function searches for a keyword in a given text and returns a list of positions where the keyword is found.

    Parameters
    ----------
    text : str
        The text to search within.
    keyword : str
        The keyword to find in the text.

    Returns
    -------
    list
        A list of integers representing the positions of the keyword in the text.

    Examples
    --------
    >>> find_all_positions("This is an example sentence.", "example")
    [11]
    """
    positions = []
    start = 0
    while True:
        index = text.find(keyword, start)
        if index == -1:
            break
        positions.append(index)
        start = index + 1
    return positions

def get_outcome_span(text: str, return_truncated: bool = True):
    """
    Extracts the outcome span from a given text using different methods.

    This function extracts the outcome span from a given text using multiple methods. It first attempts to find
    the span between occurrences of the phrases "accordance with" and "ordered". If that method fails, it then
    tries to find the span after the phrase "it is ordered". If that also fails, it looks for the span after the
    phrase "find". The function returns the extracted outcome span as a cleaned string.

    Parameters
    ----------
    text : str
        The text from which to extract the outcome span.

    Returns
    -------
    str or None
        The extracted outcome span as a cleaned string, or None if no span is found.

    Examples
    --------
    >>> get_outcome_span(unstructured_case_file)
    "In accordance with the order, it is ordered that the defendant pays a fine."
    """

    ############### FIRST METHOD ################

    for keyword in ['in accordance with', 'grant', 'relief', 'fair']: # these all seem common but none seem to exist in 100% of cases

        if keyword in text:

            # find all occurrences of 'in accordance with' and 'ordered'
            accordance_with_indices = [m.end() for m in re.finditer(keyword, text)]
            ordered_indices = [m.start() for m in re.finditer("ordered", text)]

            # generate all possible pairs of indices
            index_pairs = list(itertools.product(accordance_with_indices, ordered_indices))

            # filter pairs where 'accordance with' index is less than 'ordered' index
            index_pairs = [(i, j) for (i, j) in index_pairs if i < j]
            if index_pairs:
                # find the pair with the shortest distance between indices
                min_distance_pair = min(index_pairs, key = lambda x: x[1] - x[0])
                try:
                    best_subset = text[min_distance_pair[0] - 300 : min_distance_pair[1] + 400].strip()
                except IndexError:
                    best_subset = text[min_distance_pair[0] - 600 : min_distance_pair[1]].strip()

                best_subset = best_subset.split(". ")

                if best_subset:

                    sent_id = [idx for idx, i in enumerate(best_subset) if keyword in i.lower()][0]

                    clean_outcome = best_subset[sent_id]

                    # return None
                    # print("METHOD 1")

                    # return JUST the (presumably) most relevant outcome span (after cleaning it up a bit)
                    if return_truncated:
                        clean_outcome = re.sub(r'\[\d+\]', '', clean_outcome)
                        clean_outcome = re.sub(r'^\d+\.\s*', '', clean_outcome).strip() # removes numbers from the start of the string such as "16. " from start of string

                        if ")" in clean_outcome[:10] and "(" not in clean_outcome[:10]:
                            clean_outcome = clean_outcome.split(")")[1].strip()
                        return clean_outcome

                    # return all case file text until the end of the outcome span
                    else:
                        return text[: text.find(clean_outcome) + len(clean_outcome)]

                else:
                    continue # to next match of all matches of the keyword

    ################ SECOND METHOD ################

    keyword = "it is ordered"
    if keyword in text.lower():
        matches = find_all_positions(text.lower(), keyword)

        for match in matches:
            try: # match + 400 chars
                clean_outcome = ". ".join(text[match - 400 : match + 400].split(". ")[1:-1]) 
            except IndexError: # match idx until end of string (+ 400 is sometimes out of range)
                clean_outcome = ". ".join(text[match - 600 :].split(". ")[1:-1])

            # return None
            # print("METHOD 2")
            if clean_outcome:

                if return_truncated:
                    clean_outcome = re.sub(r'\[\d+\]', '', clean_outcome)
                    clean_outcome = re.sub(r'^\d+\.\s*', '', clean_outcome).strip() # removes numbers from the start of the string such as "16. " from start of string

                    if ")" in clean_outcome[:10] and "(" not in clean_outcome[:10]:
                        clean_outcome = clean_outcome.split(")")[1].strip()
                    return clean_outcome

                # return all case file text until the end of the outcome span
                else:
                    return text[: text.find(clean_outcome) + len(clean_outcome)]
                
            # if this somehow returns nothing, continue to the next match in the "it is ordered" matches
            else:
                continue # to next match of all matches of the keyword

    ############### THIRD METHOD ################

    keyword = " find " # spaces to prevent "finding" or other derivations from being included -- specifically looking for statements like "I find that..."
    if keyword in text.lower():
        matches = find_all_positions(text.lower(), keyword)
        for match in matches:

            try: # match + 400 chars
                clean_outcome = ". ".join(text[match - 400 : match + 400].split(". ")[1:-1]) 
            except IndexError: # match idx until end of string (+ 400 is sometimes out of range)
                clean_outcome = ". ".join(text[match - 600 :].split(". ")[1:-1])

            # return None

            # if the outcome doesn't end with a period, add one. it looks nicer :)
            # print("METHOD 3")
            if clean_outcome:
                
                if return_truncated:
                    clean_outcome = re.sub(r'\[\d+\]', '', clean_outcome)
                    clean_outcome = re.sub(r'^\d+\.\s*', '', clean_outcome).strip() # removes numbers from the start of the string such as "16. " from start of string

                    if ")" in clean_outcome[:10] and "(" not in clean_outcome[:10]:
                        clean_outcome = clean_outcome.split(")")[1].strip()
                    return clean_outcome
                else:
                    return text[: text.find(clean_outcome) + len(clean_outcome)]
                
            # if this somehow returns nothing, continue to the next match in the " find " matches
            else:
                continue # to next match of all matches of the keyword

    # if absolutely nothing works, return none and try Longformer or something idk
    return None

test_row = 50
# for test_row in range(20, 40):
test_str = " ".join(silver_data.loc[test_row, 'content'])
print(silver_data.loc[test_row, 'url'])
cospan = get_outcome_span(test_str, return_truncated = True)
if cospan:
    print(f"ROW: {test_row}")
    print(f"TOKENS: {len(cospan.split())}")
    print()
    print(cospan)

https://canlii.ca/t/h5xx1
ROW: 50
TOKENS: 47

I have considered all of the disclosed circumstances in accordance with subsection 83(2) of the Act, and find that it would not be unfair to grant relief from eviction subject to the condition(s) set out in this order pursuant to subsection 83(1)(a) and 204(1) of the Act


Updating CSV with Case Outcome Span

In [None]:
asdf

NameError: name 'asdf' is not defined

In [None]:
import time
import numpy as np
from collections import deque

start_time = time.time()

# Initialize a deque to store the latest 100 iteration times
time_deque = deque(maxlen = 500)

for index, row in enumerate(silver_data.itertuples()):

    # Save the start time of this iteration
    iteration_start_time = time.time()

    try:
        pass
        content_str = " ".join(silver_data.at[row.Index, 'content']) # doing df.loc[] broke everything for some reason (maybe a memory thing? idk really but this worked)
        silver_data.at[row.Index, 'outcome_span'] = get_outcome_span(content_str, return_truncated = True)
    except Exception as any_error:
        print(f"{any_error} with file at Df row: ", row.Index)

    # Save the end time of this iteration and push it into the deque
    iteration_end_time = time.time()
    time_deque.append(iteration_end_time - iteration_start_time)

    # progress tracker
    average_time_per_row = np.mean(time_deque)
    rows_left = len(silver_data) - (index + 1)
    estimated_time_left = rows_left * average_time_per_row

    print("Files processed: ", index + 1, "of", len(silver_data),
          "Estimated time remaining: ", time.strftime('%H:%M:%S', time.gmtime(estimated_time_left)), end = '\r')
    # break

silver_data.head(10)

Files processed:  679 of 679 Estimated time remaining:  00:00:00

Unnamed: 0,raw_file_text,raw_file_name,full_cleaned,metadata,content,case_citation,file_number,language,year,ltb_location,decision_date,hearing_date,url,adjudicating_member,outcome_span
0,Metadata:\nDate:\t2018-07-06\nFile number:\t\n...,SWL-17348-18.txt,"[Metadata:, Date: 2018-07-06, File number:, SW...","[Date: 2018-07-06, File number:, SWL-17348-18,...",[Order under Section 69 Residential Tenancies ...,"SWL-17348-18 (Re), 2018 CanLII 88643 (ON LTB)",SWL-17348-18,English,2018,London,07/06/2018,07/06/2018,https://canlii.ca/t/hv7qd,Kevin Lundy,I have considered all of the disclosed circums...
1,Metadata:\nDate:\t2017-05-26\nFile number:\t\n...,TEL-79722-17.txt,"[Metadata:, Date: 2017-05-26, File number:, TE...","[Date: 2017-05-26, File number:, TEL-79722-17,...",[Order under Section 69 Residential Tenancies ...,"TEL-79722-17 (Re), 2017 CanLII 48856 (ON LTB)",TEL-79722-17,English,2017,Toronto,05/26/2017,06/07/2017,https://canlii.ca/t/h539n,Laura Hartslief,I have considered all of the disclosed circums...
2,Metadata:\nDate:\t2017-07-05\nFile number:\t\n...,TEL-80773-17.txt,"[Metadata:, Date: 2017-07-05, File number:, TE...","[Date: 2017-07-05, File number:, TEL-80773-17,...",[Order under Section 69 Residential Tenancies ...,"TEL-80773-17 (Re), 2017 CanLII 60498 (ON LTB)",TEL-80773-17,English,2017,Whitby,07/05/2017,07/05/2017,https://canlii.ca/t/h5z39,Ruth Carey,I have considered all of the disclosed circums...
3,Metadata:\nDate:\t2017-07-18\nFile number:\t\n...,TEL-81359-17-AM.txt,"[Metadata:, Date: 2017-07-18, File number:, TE...","[Date: 2017-07-18, File number:, TEL-81359-17-...",[Order under Section 69 Residential Tenancies ...,"TEL-81359-17-AM (Re), 2017 CanLII 60052 (ON LTB)",TEL-81359-17-AM,English,2017,Toronto,07/18/2017,08/06/2017,https://canlii.ca/t/h5z3r,Shelby Whittick,I have considered all of the disclosed circums...
4,Metadata:\nDate:\t2017-08-17\nFile number:\t\n...,TEL-81405-17.txt,"[Metadata:, Date: 2017-08-17, File number:, TE...","[Date: 2017-08-17, File number:, TEL-81405-17,...",[Order under Section 69 Residential Tenancies ...,"TEL-81405-17 (Re), 2017 CanLII 60203 (ON LTB)",TEL-81405-17,English,2017,Lindsay,08/17/2017,08/17/2017,https://canlii.ca/t/h5z3s,Laura Hartslief,I have considered all of the disclosed circums...
5,Metadata:\nDate:\t2018-01-26\nFile number:\t\n...,TNL-00793-18 TNL-01183-18.txt,"[Metadata:, Date: 2018-01-26, File number:, TN...","[Date: 2018-01-26, File number:, TNL-00793-18;...",[Order under Section 69 Residential Tenancies ...,"TNL-00793-18 (Re), 2018 CanLII 42597 (ON LTB)",TNL-00793-18;TNL-01183-18,English,20T,Toronto,01/26/2018,01/26/2018,https://canlii.ca/t/hs0d2,Sylvia Watson,I have also considered all of the disclosed ci...
6,Metadata:\nDate:\t2018-01-22\nFile number:\t\n...,TNL-00822-18.txt,"[Metadata:, Date: 2018-01-22, File number:, TN...","[Date: 2018-01-22, File number:, TNL-00822-18,...",[Order under Section 69 Residential Tenancies ...,"TNL-00822-18 (Re), 2018 CanLII 42600 (ON LTB)",TNL-00822-18,English,2018,Toronto,01/22/2018,03/01/2018,https://canlii.ca/t/hs0d4,Neil Kaufman,No circumstances were disclosed to me to suppo...
7,Metadata:\nDate:\t2018-03-29\nFile number:\t\n...,TNL-01582-18-AM.txt,"[Metadata:, Date: 2018-03-29, File number:, TN...","[Date: 2018-03-29, File number:, TNL-01582-18-...",[Amended Order Order under Section 69 Resident...,"TNL-01582-18-AM (Re), 2018 CanLII 113954 (ON LTB)",TNL-01582-18-AM,English,2018,Newmarket,03/29/2018,03/29/2018,https://canlii.ca/t/hwbhh,Harry Cho,I have considered all of the disclosed circums...
8,Metadata:\nDate:\t2018-03-29\nFile number:\t\n...,TNL-01839-18.txt,"[Metadata:, Date: 2018-03-29, File number:, TN...","[Date: 2018-03-29, File number:, TNL-01839-18,...",[Order under Section 69 Residential Tenancies ...,"TNL-01839-18 (Re), 2018 CanLII 48234 (ON LTB)",TNL-01839-18,English,2018,Newmarket,03/29/2018,03/29/2018,https://canlii.ca/t/hs8lg,Harry Cho,I have considered all of the disclosed circums...
9,Metadata:\nDate:\t2018-04-03\nFile number:\t\n...,TNL-02075-18.txt,"[Metadata:, Date: 2018-04-03, File number:, TN...","[Date: 2018-04-03, File number:, TNL-02075-18,...",[Order under Section 69 Residential Tenancies ...,"TNL-02075-18 (Re), 2018 CanLII 113854 (ON LTB)",TNL-02075-18,English,2018,Newmarket,04/03/2018,04/15/2018,https://canlii.ca/t/hwbhj,Sylvia Watson,I have considered all of the disclosed circums...


In [None]:
asdf


NameError: name 'asdf' is not defined

# Testing Metadata Extraction Model
- report to be written in based on data from WANDB
- wandb report link: https://wandb.ai/kmaurinjones/huggingface/reports/FLAN-T5-Metadata-Extractor-Model--Vmlldzo0NDk4MDEx

### General Cleaning functions (used with model)

In [None]:
import re

def general_cleaning(raw_file_str: str):
    # gets rid of tabs, non-breaking spaces, leading/trailing whitespace, removes empty lines, and "\xa0"
    generally_cleaned_str = [line.replace("\t", " ").replace("\xa0", "").strip() for line in raw_file_str.split('\n') if line.strip() != '']
    return generally_cleaned_str

def remove_whitespace_and_underscores(string):
    # Remove consecutive whitespace
    string = re.sub(r'\s+', ' ', string)

    # Remove more than three consecutive underscores
    string = re.sub(r'_+', '', string)

    return string.strip()

In [None]:
import transformers
# from transformers import AutoTokenizer
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer

In [None]:
model_name = "metadata_extractor_flant5_small"

# folder where the model files are located -- unzip before running
model_dir = f"/Users/kmaurinjones/Desktop/School/UBC/UBC_Coursework/capstone/Allard_A_Capstone/models/metadata_extractor/{model_name}"

tokenizer = AutoTokenizer.from_pretrained(model_dir)
model = AutoModelForSeq2SeqLM.from_pretrained(model_dir)

In [None]:
def extract_metadata_t5(raw_case_file_text: str, model = model, tokenizer = tokenizer):
    # do general case file cleaning
    clean_file_list = general_cleaning(raw_case_file_text)
    clean_file_str = " ".join([line for line in clean_file_list if ("metadata:" or "content:") not in line.lower()])
    print(clean_file_str)
    
    # run model on cleaned case file text
    inputs = ["extract metadata boundary:" + clean_file_str] # PREFIX = "extract metadata boundary:"

    # print("INPUT:", inputs)
    inputs = tokenizer(inputs, max_length = 256, truncation = True, return_tensors = "pt")
    output = model.generate(**inputs, num_beams = 8, do_sample = True, min_length = 1, max_length = 128)
    decoded_output = tokenizer.batch_decode(output, skip_special_tokens = True)[0]
    # print("OUTPUT:", decoded_output)

    return decoded_output
    
    # return metadata and content as lists

In [None]:
row_num = 3
# print(silver_data.loc[row_num, 'raw_file_text'])
[line for line in general_cleaning(silver_data.loc[row_num, 'raw_file_text']) if ("metadata:" or "content:") not in line.lower()]

['Date: 2017-07-18',
 'File number:',
 'TEL-81359-17-AM',
 'TEL-81359-17-AM',
 'Citation: TEL-81359-17-AM (Re), 2017 CanLII 60052 (ON LTB), <https://canlii.ca/t/h5z3r>, retrieved on 2023-05-16 https://canlii.ca/t/h5z3r',
 'Content:',
 'Order under Section 69',
 'Residential Tenancies Act, 2006',
 'And section 21.1 of the Statutory Powers',
 'Procedure Act',
 'File Number: TEL-81359-17-AM',
 'F.I.',
 "(the 'Landlord') applied for an order to terminate the tenancy and evict A.G.",
 "and T.B. (the 'Tenants') because the Tenants did not pay the rent that the",
 'Tenants owe. This is an L1 application.',
 'The',
 'Landlord also applied for an order to terminate the tenancy and evict the',
 'Tenants because they have been persistently late in paying their rent. This',
 'is an L2 application.',
 'This',
 'application was heard in Toronto on July 7, 2017.',
 'The Landlord and the second-named Tenant attended the hearing.',
 'This amended order is issued to correct clerical errors in the',
 'or

In [None]:
test_case_file = silver_data.loc[row_num, 'raw_file_text']
print(f"GOAL: {' '.join(silver_data.loc[row_num, 'metadata'])}")
print()
print(f"MODEL: {extract_metadata_t5(raw_case_file_text = test_case_file)}")

GOAL: Date: 2017-07-18 File number: TEL-81359-17-AM TEL-81359-17-AM Citation: TEL-81359-17-AM (Re), 2017 CanLII 60052 (ON LTB), <https://canlii.ca/t/h5z3r>, retrieved on 2023-05-16 https://canlii.ca/t/h5z3r

Date: 2017-07-18 File number: TEL-81359-17-AM TEL-81359-17-AM Citation: TEL-81359-17-AM (Re), 2017 CanLII 60052 (ON LTB), <https://canlii.ca/t/h5z3r>, retrieved on 2023-05-16 https://canlii.ca/t/h5z3r Content: Order under Section 69 Residential Tenancies Act, 2006 And section 21.1 of the Statutory Powers Procedure Act File Number: TEL-81359-17-AM F.I. (the 'Landlord') applied for an order to terminate the tenancy and evict A.G. and T.B. (the 'Tenants') because the Tenants did not pay the rent that the Tenants owe. This is an L1 application. The Landlord also applied for an order to terminate the tenancy and evict the Tenants because they have been persistently late in paying their rent. This is an L2 application. This application was heard in Toronto on July 7, 2017. The Landlord a