# LTB Location Extraction

In [1]:
import pandas as pd
import ast

def read_csv_with_lists(file_path):
    # Read the CSV file
    df = pd.read_csv(file_path)
    
    # Iterate over each column
    for column in df.columns:
        # Check if the column contains strings that represent lists
        if df[column].dtype == object:
            try:
                # Convert the strings to lists using ast.literal_eval
                df[column] = df[column].apply(ast.literal_eval)
            except (ValueError, SyntaxError):
                # Skip the column if it cannot be converted to a list
                pass
    
    return df

In [2]:
partner_gold_df = read_csv_with_lists('data/gold_labels_with_files.csv')
partner_gold_df['file_number'] = partner_gold_df['file_number_gold_cleaned']#.columns
partner_gold_df.head(2)

Unnamed: 0,What is the file number of the case?,file_number_gold_cleaned,raw_file_text,raw_file_name,Timestamp,Email Address,What was the date of the hearing? [mm/dd/yyyy],What was the date of the decision? [mm/dd/yyyy],Who was the member adjudicating the decision?,What was the location of the landlord tenant board?,...,"If yes to the previous question, which of the following were applicable to the tenant?",Did the decision state the tenant was given prior notice for the eviction?,"If the tenant was given prior notice for the eviction, how much notice was given?",Did the decisions state postponement would result in the tenant accruing additional arrears?,Which other specific applications of the landlord or the tenant were mentioned?,Did the decision mention the validity of an N4 eviction notice?,Were there detail(s) in the decision not captured by this questionnaire that should be included?,Exec Review,Review Status,file_number
0,CEL-62600-16,CEL-62600-16,Metadata:\nDate:\t2017-01-18\nFile number:\t\n...,CEL-62600-16.txt,1/31/2021 22:26:19,jac.huang@mail.utoronto.ca,1/5/2017,1/18/2017,Avril Cardoso,Mississauga,...,,Yes,Not stated,No,L1: Application to Evict a Tenant for Non-paym...,No,,,,CEL-62600-16
1,CEL-62852-16,CEL-62852-16,Metadata:\nDate:\t2017-01-09\nFile number:\t\n...,CEL-62852-16.txt,1/31/2021 22:35:03,jac.huang@mail.utoronto.ca,12/14/2016,1/9/2017,Tiisetso Russell,Mississauga,...,"reliance on social assistance, children's scho...",Yes,Not stated,No,No other specific applications were mentioned,No,,,,CEL-62852-16


In [3]:
gold_df = read_csv_with_lists('data/outcome_extraction_testing.csv')
print(gold_df.value_counts('new_case_outcome', normalize = True))
gold_df#.head()

new_case_outcome
No relief            0.639881
Relief               0.349702
Conditional Order    0.010417
dtype: float64


Unnamed: 0,raw_file_text,raw_file_name,full_cleaned,metadata,content,case_citation,file_number,language,year,ltb_location,decision_date,hearing_date,url,adjudicating_member,new_case_outcome
0,Metadata:\nDate:\t2017-01-18\nFile number:\t\n...,CEL-62600-16.txt,"[Metadata:, Date: 2017-01-18, File number:, CE...","[Date: 2017-01-18, File number:, CEL-62600-16,...",[Arrears Worksheet File Number: CEL-62600-16 T...,"CEL-62600-16 (Re), 2017 CanLII 9545 (ON LTB)",CEL-62600-16,English,2016,Mississauga,01/18/2017,01/30/2017,https://canlii.ca/t/gxq6n,Avril Cardoso,No relief
1,Metadata:\nDate:\t2017-01-09\nFile number:\t\n...,CEL-62852-16.txt,"[Metadata:, Date: 2017-01-09, File number:, CE...","[Date: 2017-01-09, File number:, CEL-62852-16,...",[Arrears Worksheet File Number: CEL-62852-16 T...,"CEL-62852-16 (Re), 2017 CanLII 9535 (ON LTB)",CEL-62852-16,English,2016,Mississauga,01/09/2017,01/09/2017,https://canlii.ca/t/gxq6r,Tiisetso Russell,Relief
2,Metadata:\nDate:\t2017-01-09\nFile number:\t\n...,CEL-63024-16.txt,"[Metadata:, Date: 2017-01-09, File number:, CE...","[Date: 2017-01-09, File number:, CEL-63024-16,...",[Arrears Worksheet File Number: CEL-63024-16 T...,"CEL-63024-16 (Re), 2017 CanLII 9543 (ON LTB)",CEL-63024-16,English,2016,Mississauga,01/09/2017,01/09/2017,https://canlii.ca/t/gxq6s,Tiisetso Russell,Relief
3,Metadata:\nDate:\t2017-01-20\nFile number:\t\n...,CEL-63056-16.txt,"[Metadata:, Date: 2017-01-20, File number:, CE...","[Date: 2017-01-20, File number:, CEL-63056-16,...",[Arrears Worksheet File Number: CEL-63056-16 T...,"CEL-63056-16 (Re), 2017 CanLII 9537 (ON LTB)",CEL-63056-16,English,2016,Mississauga,01/09/2017,01/09/2017,https://canlii.ca/t/gxq6t,Tiisetso Russell,No relief
4,Metadata:\nDate:\t2017-02-03\nFile number:\t\n...,CEL-63193-16.txt,"[Metadata:, Date: 2017-02-03, File number:, CE...","[Date: 2017-02-03, File number:, CEL-63193-16,...",[Arrears Worksheet File Number: CEL-63193-16 T...,"CEL-63193-16 (Re), 2017 CanLII 30828 (ON LTB)",CEL-63193-16,English,2016,Mississauga,01/10/2017,02/03/2017,https://canlii.ca/t/h3w7b,Karen Wallace,No relief
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
667,Metadata:\nDate:\t2018-12-13\nFile number:\t\n...,TSL-98918-18-RV.txt,"[Metadata:, Date: 2018-12-13, File number:, TS...","[Date: 2018-12-13, File number:, TSL-98918-18-...",[Order under Section 21.2 of the Statutory Pow...,"TSL-98918-18-RV (Re), 2018 CanLII 141679 (ON LTB)",TSL-98918-18-RV,English,2018,Toronto,11/08/2018,12/13/2018,https://canlii.ca/t/j0fjv,Nancy Henderson,No relief
668,Metadata:\nDate:\t2018-11-23\nFile number:\t\n...,TSL-99691-18.txt,"[Metadata:, Date: 2018-11-23, File number:, TS...","[Date: 2018-11-23, File number:, TSL-99691-18,...",[Order under Section 69 Residential Tenancies ...,"TSL-99691-18 (Re), 2018 CanLII 141675 (ON LTB)",TSL-99691-18,English,2018,Toronto,11/23/2018,11/23/2018,https://canlii.ca/t/j0fk1,David Lee,No relief
669,Metadata:\nDate:\t2018-11-29\nFile number:\t\n...,TSL-99824-18.txt,"[Metadata:, Date: 2018-11-29, File number:, TS...","[Date: 2018-11-29, File number:, TSL-99824-18,...",[Order under Section 69 Residential Tenancies ...,"TSL-99824-18 (Re), 2018 CanLII 141673 (ON LTB)",TSL-99824-18,English,2018,Toronto,11/29/2018,12/11/2018,https://canlii.ca/t/j0fk2,Renée Lang,No relief
670,Metadata:\nDate:\t2018-12-12\nFile number:\t\n...,TSL-99900-18.txt,"[Metadata:, Date: 2018-12-12, File number:, TS...","[Date: 2018-12-12, File number:, TSL-99900-18,...",[Order under Section 69 Residential Tenancies ...,"TSL-99900-18 (Re), 2018 CanLII 140403 (ON LTB)",TSL-99900-18,English,2018,Toronto,12/12/2018,12/12/2018,https://canlii.ca/t/hzzb6,David Mungovan,No relief


In [4]:
# def remove_schedule(case_content_str: str):
#     """
#     Removes the "schedule 1" section from a given string and returns the modified string.

#     Args:
#         case_content_str (str): The input string from which the "schedule 1" section will be removed.

#     Returns:
#         str: The modified string with the "schedule 1" section removed.

#     Raises:
#         TypeError: If `case_content_str` is not a string.

#     Examples:
#         >>> remove_schedule("This is the case content. Schedule 1 contains additional details.")
#         "This is the case content."
#     """

#     if not isinstance(case_content_str, str):
#         raise TypeError("`case_content_str` must be a string variable")

#     # including the "1" because it seems to always be present, even if there is only 1 schedule in the case
#     if "schedule 1" in case_content_str.lower():
#         start_idx = case_content_str.lower().find("schedule 1")
#         case_content_str = case_content_str[:start_idx].strip()

#     # return it whether the schedule is present or not -- better for the pipeline
#     return case_content_str.strip()

# print(remove_schedule("This is a ScHedule  test."))
# print(remove_schedule("This is a ScHedule 1 test."))

Functions

In [5]:
import re

def get_postal_code(text: str):
    """
    Finds a postal code in the format "L4Z2G5" within the given text.

    Args:
        text (str): The input text to search for a postal code.

    Returns:
        str: The postal code found in the text. Returns an empty string if no postal code is found.

    Examples:
        >>> find_postal_code("This is a sample text with a postal code L4Z2G5.")
        "L4Z2G5"
    """

    pattern = r"\b[A-Za-z]\d[A-Za-z]\d[A-Za-z]\d\b"
    match = re.search(pattern, text)

    if match:
        return match.group()
    else:
        return None

def find_closest_subset(text: str, keywords: list):
    """
    Finds a subset of the given text where a date and any of the given keywords appear with the smallest distance between them,
    but only if the subset appears before the word "determination" in the lowercase text and does not contain the word "member".

    Args:
        text (str): The input text to search for the subset.
        keywords (list): The list of keywords to search for.

    Returns:
        tuple: A tuple containing the subset of the text where the date and keyword appear with the smallest distance between them,
               and the corresponding keyword. Returns an empty string and None if no match is found or if the subset appears after "determination"
               or contains the word "member".

    Examples:
        >>> find_closest_subset("The event will take place on April 23, 2018. The application was heard on April 25, 2018.", ["heard", "event"])
        ("The event will take place on April 23, 2018.", "event")

    """

    pattern = r"\b[A-Z][a-z]+ \d{1,2}, \d{4}\b"
    date_matches = re.findall(pattern, text)
    keyword_positions = [(m.start(), m.end(), keyword) for keyword in keywords for m in re.finditer(keyword, text)]

    if not date_matches or not keyword_positions:
        return "", None

    smallest_distance = float('inf')
    best_subset = ""
    best_keyword = None
    
    for date in date_matches:
        for start, end, keyword in keyword_positions:
            distance = abs(start - text.find(date))
            subset = text[min(start, text.find(date)): max(end, text.find(date))]

            if distance < smallest_distance and text.lower().find(best_subset.lower()) < (text.lower().find("determination") or text.lower().find("it is determinatined that")) and ("member" or "with the request to review") not in subset.lower():
                smallest_distance = distance
                best_subset = subset
                best_keyword = keyword

    if text.lower().find(best_subset.lower()) >= text.lower().find("determination") or "member" in best_subset.lower():
        return "", None

    return best_subset, best_keyword


def get_ltb_location_by_postal_code(case_content_str: str):
    """
    Helps to extract the location information from the given case content string using postal code lookup.

    Args:
        case_content_str (str): The case content string to extract the location from.

    Returns:
        str or None: Subset of text from the passed case string wherein the location appears near the postal code.

    Examples:
        >>> get_ltb_location_by_postal_code("The application was heard at L4Z 2G5.")
        "Mississauga"
    """

    # if there isn't a postal code, return None right away
    if not get_postal_code(case_content_str):
        return None

    pc_idx = case_content_str.find(get_postal_code(case_content_str))
    subset = case_content_str[pc_idx - 30 : pc_idx]

    if "ON" in subset:
        # subset = case_content_str[pc_idx - 30 : pc_idx].split("ON")[:-1]
        subset = subset.split("ON")[:-1]
    elif "Ontario" in subset:
        # subset = case_content_str[pc_idx - 30 : pc_idx].split("Ontario")[:-1]
        subset = subset.split("Ontario")[:-1]

    subset = " ".join(subset)
    # print(subset)
    
    if "floor" in subset.lower():
        floor_idx = subset.lower().find("floor")
        # print(floor_idx)
        subset = subset[floor_idx + len("floor") :].strip()
    
    return subset

def get_ltb_location(case_content_str: str):
    """
    Extracts the location information from the given case content string.

    Args:
        case_content_str (str): The case content string to extract the location from.

    Returns:
        str or None: The extracted location information if found, otherwise None.

    Examples:
        >>> get_ltb_location("The application was heard in Newmarket.")
        "Newmarket"
    """

    keywords = ["application was heard", "applications were heard", "was heard", "were heard together",
                "was held", "set to be heard",
                # "heard by telephone", "heard by teleconference", "heard via teleconference",
                "heard by", "heard by", "heard via",
                "motion were heard", "motion was heard", "came before the board in",
                "was then heard in", "were then heard in"]

    subset, keyword = find_closest_subset(text = case_content_str, keywords = keywords)

    if subset:
        subset = subset.replace(keyword, "")
        subset = subset.split()
        subset = [tok for tok in subset if tok not in ['in', 'on', 'via', 'together', 'by']]
        subset = " ".join(subset).strip()
        subset = subset.replace("With The Request To Review", "")

    if subset: # sometimes the hearing location is redacted and replaced with [CITY]
        if str(subset) != "[CITY]":
            return subset.title().replace("And Avenue, Unit 2 ", "").strip()

    # otherwise, go by postal code
    subset = get_ltb_location_by_postal_code(case_content_str = case_content_str)
    if subset:
        return subset.title().replace("And Avenue, Unit 2 ", "").strip()
    else:
        return None

Testing against gold_df data

In [6]:
row = 434
print(row)

case_content = " ".join(gold_df.loc[row, 'content'])
print(gold_df.loc[row, 'url'])
# print(case_content)
print(gold_df.loc[row, 'ltb_location'])
print()
# print(get_postal_code(case_content))

get_ltb_location(case_content_str = case_content)

434
https://canlii.ca/t/hwbhn
Toronto



'With The Request To Review'

In [7]:
import jellyfish

In [8]:
total = 0
nopes = []

for row in gold_df.index:
    # print(row)
    case_content = " ".join(gold_df.loc[row, 'content'])
    # print(gold_df.loc[row, 'url'])
    
    gold_ltb = str(gold_df.loc[row, 'ltb_location']).strip()
    silv_ltb = str(get_ltb_location(case_content_str = case_content)).strip()

    if gold_ltb == silv_ltb: # accuracy
    # if jellyfish.jaro_winkler_similarity(gold_ltb, silv_ltb) > 0.5: # jw sim
        total += 1
    else:
        nopes.append(row)
        # print(gold_df.loc[row, 'url'])
        # print(row)
        # print(gold_ltb)
        # print(silv_ltb)
        # print()

print(total / len(gold_df))

print(len(nopes))
# for id, nope in enumerate(nopes):
#     print(gold_df.loc[nope, 'url'])
#     print(nope)
#     print(gold_df.loc[nope, 'ltb_location'])
#     print()
#     if id == 10:
#         break

0.9523809523809523
32
