# Adjudicating Member Extraction

In [1]:
import ast
import re

import pandas as pd

def read_csv_with_lists(file_path):
    """
    Read a CSV file and turn text columns holding Python literals (e.g. lists) into real objects.

    Every text column is passed through `ast.literal_eval`. The conversion is
    all-or-nothing per column: if any string in the column is not a valid Python
    literal, the whole column is left exactly as `pd.read_csv` produced it.
    Missing cells (NaN) are passed through untouched, so a list column with a few
    empty cells is still converted.

    Args:
        file_path: Path (or anything accepted by `pd.read_csv`) of the CSV file.

    Returns:
        pd.DataFrame: The loaded frame with convertible text columns replaced by
        their evaluated Python objects.
    """
    df = pd.read_csv(file_path)

    def _parse_cell(value):
        # read_csv represents an empty cell as a non-string missing value, which
        # literal_eval rejects with ValueError; keep such cells as they are
        return ast.literal_eval(value) if isinstance(value, str) else value

    for column in df.columns:
        series = df[column]

        # text columns are `object` in older pandas and may use a dedicated string dtype in newer versions
        is_text = pd.api.types.is_object_dtype(series) or pd.api.types.is_string_dtype(series)
        if not is_text:
            continue

        try:
            converted = series.apply(_parse_cell)
        except (ValueError, SyntaxError):
            # at least one string is ordinary text rather than a literal: leave the column alone
            continue

        # only assign once the whole column converted, so a column is never half-converted
        df[column] = converted

    return df

In [2]:
# Load the partner gold labels and copy the cleaned file number into a `file_number` column.
partner_gold_df = read_csv_with_lists('data/gold_labels_with_files.csv')
partner_gold_df = partner_gold_df.assign(file_number = partner_gold_df['file_number_gold_cleaned'])
partner_gold_df.head(2)

Unnamed: 0,What is the file number of the case?,file_number_gold_cleaned,raw_file_text,raw_file_name,Timestamp,Email Address,What was the date of the hearing? [mm/dd/yyyy],What was the date of the decision? [mm/dd/yyyy],Who was the member adjudicating the decision?,What was the location of the landlord tenant board?,...,"If yes to the previous question, which of the following were applicable to the tenant?",Did the decision state the tenant was given prior notice for the eviction?,"If the tenant was given prior notice for the eviction, how much notice was given?",Did the decisions state postponement would result in the tenant accruing additional arrears?,Which other specific applications of the landlord or the tenant were mentioned?,Did the decision mention the validity of an N4 eviction notice?,Were there detail(s) in the decision not captured by this questionnaire that should be included?,Exec Review,Review Status,file_number
0,CEL-62600-16,CEL-62600-16,Metadata:\nDate:\t2017-01-18\nFile number:\t\n...,CEL-62600-16.txt,1/31/2021 22:26:19,jac.huang@mail.utoronto.ca,1/5/2017,1/18/2017,Avril Cardoso,Mississauga,...,,Yes,Not stated,No,L1: Application to Evict a Tenant for Non-paym...,No,,,,CEL-62600-16
1,CEL-62852-16,CEL-62852-16,Metadata:\nDate:\t2017-01-09\nFile number:\t\n...,CEL-62852-16.txt,1/31/2021 22:35:03,jac.huang@mail.utoronto.ca,12/14/2016,1/9/2017,Tiisetso Russell,Mississauga,...,"reliance on social assistance, children's scho...",Yes,Not stated,No,No other specific applications were mentioned,No,,,,CEL-62852-16


In [3]:
# Load the outcome-extraction test set and show the share of each outcome label.
gold_df = read_csv_with_lists('data/outcome_extraction_testing.csv')
outcome_shares = gold_df.value_counts('new_case_outcome', normalize = True)
print(outcome_shares)
gold_df

new_case_outcome
No relief            0.639881
Relief               0.349702
Conditional Order    0.010417
dtype: float64


Unnamed: 0,raw_file_text,raw_file_name,full_cleaned,metadata,content,case_citation,file_number,language,year,ltb_location,decision_date,hearing_date,url,adjudicating_member,new_case_outcome
0,Metadata:\nDate:\t2017-01-18\nFile number:\t\n...,CEL-62600-16.txt,"[Metadata:, Date: 2017-01-18, File number:, CE...","[Date: 2017-01-18, File number:, CEL-62600-16,...",[Arrears Worksheet File Number: CEL-62600-16 T...,"CEL-62600-16 (Re), 2017 CanLII 9545 (ON LTB)",CEL-62600-16,English,2016,Mississauga,01/18/2017,01/30/2017,https://canlii.ca/t/gxq6n,Avril Cardoso,No relief
1,Metadata:\nDate:\t2017-01-09\nFile number:\t\n...,CEL-62852-16.txt,"[Metadata:, Date: 2017-01-09, File number:, CE...","[Date: 2017-01-09, File number:, CEL-62852-16,...",[Arrears Worksheet File Number: CEL-62852-16 T...,"CEL-62852-16 (Re), 2017 CanLII 9535 (ON LTB)",CEL-62852-16,English,2016,Mississauga,01/09/2017,01/09/2017,https://canlii.ca/t/gxq6r,Tiisetso Russell,Relief
2,Metadata:\nDate:\t2017-01-09\nFile number:\t\n...,CEL-63024-16.txt,"[Metadata:, Date: 2017-01-09, File number:, CE...","[Date: 2017-01-09, File number:, CEL-63024-16,...",[Arrears Worksheet File Number: CEL-63024-16 T...,"CEL-63024-16 (Re), 2017 CanLII 9543 (ON LTB)",CEL-63024-16,English,2016,Mississauga,01/09/2017,01/09/2017,https://canlii.ca/t/gxq6s,Tiisetso Russell,Relief
3,Metadata:\nDate:\t2017-01-20\nFile number:\t\n...,CEL-63056-16.txt,"[Metadata:, Date: 2017-01-20, File number:, CE...","[Date: 2017-01-20, File number:, CEL-63056-16,...",[Arrears Worksheet File Number: CEL-63056-16 T...,"CEL-63056-16 (Re), 2017 CanLII 9537 (ON LTB)",CEL-63056-16,English,2016,Mississauga,01/09/2017,01/09/2017,https://canlii.ca/t/gxq6t,Tiisetso Russell,No relief
4,Metadata:\nDate:\t2017-02-03\nFile number:\t\n...,CEL-63193-16.txt,"[Metadata:, Date: 2017-02-03, File number:, CE...","[Date: 2017-02-03, File number:, CEL-63193-16,...",[Arrears Worksheet File Number: CEL-63193-16 T...,"CEL-63193-16 (Re), 2017 CanLII 30828 (ON LTB)",CEL-63193-16,English,2016,Mississauga,01/10/2017,02/03/2017,https://canlii.ca/t/h3w7b,Karen Wallace,No relief
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
667,Metadata:\nDate:\t2018-12-13\nFile number:\t\n...,TSL-98918-18-RV.txt,"[Metadata:, Date: 2018-12-13, File number:, TS...","[Date: 2018-12-13, File number:, TSL-98918-18-...",[Order under Section 21.2 of the Statutory Pow...,"TSL-98918-18-RV (Re), 2018 CanLII 141679 (ON LTB)",TSL-98918-18-RV,English,2018,Toronto,11/08/2018,12/13/2018,https://canlii.ca/t/j0fjv,Nancy Henderson,No relief
668,Metadata:\nDate:\t2018-11-23\nFile number:\t\n...,TSL-99691-18.txt,"[Metadata:, Date: 2018-11-23, File number:, TS...","[Date: 2018-11-23, File number:, TSL-99691-18,...",[Order under Section 69 Residential Tenancies ...,"TSL-99691-18 (Re), 2018 CanLII 141675 (ON LTB)",TSL-99691-18,English,2018,Toronto,11/23/2018,11/23/2018,https://canlii.ca/t/j0fk1,David Lee,No relief
669,Metadata:\nDate:\t2018-11-29\nFile number:\t\n...,TSL-99824-18.txt,"[Metadata:, Date: 2018-11-29, File number:, TS...","[Date: 2018-11-29, File number:, TSL-99824-18,...",[Order under Section 69 Residential Tenancies ...,"TSL-99824-18 (Re), 2018 CanLII 141673 (ON LTB)",TSL-99824-18,English,2018,Toronto,11/29/2018,12/11/2018,https://canlii.ca/t/j0fk2,Renée Lang,No relief
670,Metadata:\nDate:\t2018-12-12\nFile number:\t\n...,TSL-99900-18.txt,"[Metadata:, Date: 2018-12-12, File number:, TS...","[Date: 2018-12-12, File number:, TSL-99900-18,...",[Order under Section 69 Residential Tenancies ...,"TSL-99900-18 (Re), 2018 CanLII 140403 (ON LTB)",TSL-99900-18,English,2018,Toronto,12/12/2018,12/12/2018,https://canlii.ca/t/hzzb6,David Mungovan,No relief


In [18]:
def find_all_positions(text: str, keyword: str):
    """
    Collect the start index of every occurrence of `keyword` in `text`.

    The search resumes one character after each hit, so overlapping occurrences
    are all reported. The comparison is case-sensitive.

    Parameters
    ----------
    text : str
        The text to scan.
    keyword : str
        The substring to look for.

    Returns
    -------
    list
        Start indices of the occurrences, in ascending order (empty if there are none).

    Examples
    --------
    >>> find_all_positions("This is an example sentence.", "example")
    [11]
    >>> find_all_positions("aaa", "aa")
    [0, 1]
    """
    hits = []
    hit = text.find(keyword)
    while hit != -1:
        hits.append(hit)
        # step forward by one character only, so overlapping matches are kept
        hit = text.find(keyword, hit + 1)
    return hits

In [75]:
def remove_schedule(case_content_str: str):
    """
    Cuts a case text off at its first "schedule 1" marker and returns the trimmed remainder.

    The marker is matched case-insensitively. Everything from the first match to
    the end of the string is dropped. If no marker is present the input is
    returned unchanged apart from stripping leading/trailing whitespace.

    Args:
        case_content_str (str): The input string from which the "schedule 1" section will be removed.

    Returns:
        str: The text before the first "schedule 1" marker, stripped of surrounding whitespace.

    Raises:
        TypeError: If `case_content_str` is not a string.

    Examples:
        >>> remove_schedule("This is the case content. Schedule 1 contains additional details.")
        'This is the case content.'
    """

    if not isinstance(case_content_str, str):
        raise TypeError("`case_content_str` must be a string variable")

    # Search the original string rather than a lowercased copy: lowercasing can change the
    # length of some non-ASCII text, which would make an index taken from the copy point at
    # the wrong place in the original.
    # The "1" is part of the marker because it seems to always be present, even if there is
    # only one schedule in the case.
    marker = re.search(r"schedule 1", case_content_str, flags=re.IGNORECASE | re.ASCII)
    if marker is not None:
        case_content_str = case_content_str[:marker.start()]

    # return it whether the schedule is present or not -- better for the pipeline
    return case_content_str.strip()

# Quick check: only the second sample contains the "schedule 1" marker.
for sample_text in ("This is a ScHedule  test.", "This is a ScHedule 1 test."):
    print(remove_schedule(sample_text))

This is a ScHedule  test.
This is a


In [8]:
# Join the content lines of one decision into a single string for inspection.
row = 50
case_content = " ".join(gold_df.at[row, 'content'])
case_content

"File Numbers: CEL-73963-18-RV CEL-76175-18-SA Order under Section 21.2 of the Statutory Powers Procedure Act and the Residential Tenancies Act, 2006 Review and Motion Order IS (the 'Landlord') applied for an order to terminate the tenancy and evict ZSP and BH (the 'Tenants') because the Tenants did not pay the rent that the Tenants owe (L1 application). The Landlord’s L1 application was resolved by order CEL-73963-18 issued on April 13, 2018. The Tenants requested a review of that order. On May 22, 2018 interim order CEL-73963-18-RV-IN was issued staying the order issued on April 13, 2018. The Landlord also filed an L4 application for an order to terminate the tenancy and evict the Tenants and for an order to have the Tenants pay the rent they owe because they failed to meet a condition specified in prior order CEL-73963-18 issued on April 13, 2018. The Landlord's L4 application was resolved ex parte (without a hearing) by order CEL-76175-18, issued on May 24, 2018. The Tenants filed 

In [203]:
# NOTE(review): superseded first draft of `get_adjudicating_member`, kept commented out for reference only.
# It looks at a single keyword occurrence. It cannot simply be uncommented: every `kw_idx = ...`
# assignment below is itself commented out although `kw_idx` is used when slicing `subset`, and
# `keyword` is never assigned when none of the three keywords is present in the text.
# def get_adjudicating_member(case_content_str: str):
#     keyword_1 = "date issued" # this is the most reliable one
#     keyword_2 = "date of reasons" # first fallback
#     keyword_3 = "date order issued" # second fallback

#     # find which is best for the case (in order of best option to worst option)
#     if keyword_1 in case_content_str.lower():
#         keyword = keyword_1
#         # kw_idx = case_content_str.lower().find(keyword_1)
    
#     elif keyword_2 in case_content_str.lower():
#         keyword = keyword_2
#         # kw_idx = case_content_str.lower().find(keyword_2)

#     elif keyword_3 in case_content_str.lower():
#         keyword = keyword_3
#         # kw_idx = case_content_str.lower().find(keyword_3)

#     # if nothing is found, better to return nothing than to return something clearly incorrect
#     if not keyword:
#         return "nan"
    
#     # getting index of whichever keyword was found first
#     # kw_idx = case_content_str.lower().find(keyword)

#     # removing schedule if there is one
#     case_content_str = remove_schedule(case_content_str = case_content_str)

#     subset = case_content_str[kw_idx + len(keyword): kw_idx + 100] # subsetting to an arbitrary distance after the keyword location
#     subset = subset.split(", ")[0].strip()

#     # removing "member" if applicable
#     if "member" in subset.lower():
#         memb_idx = subset.lower().find("member")
#         subset = subset[: memb_idx].strip()

#     # removing "vice chair" if applicable
#     if "vice chair" in subset.lower():
#         memb_idx = subset.lower().find("vice chair")
#         subset = subset[: memb_idx].strip()

#     # removing "vice-chair" if applicable
#     if "vice-chair" in subset.lower():
#         memb_idx = subset.lower().find("vice-chair")
#         subset = subset[: memb_idx].strip()

#     return subset

In [236]:
def _strip_member_title(name_fragment: str) -> str:
    """Cut `name_fragment` at the first "member" / "vice chair" / "vice-chair" title and trim whitespace."""
    title = re.search(r"member|vice[ -]chair", name_fragment, flags=re.IGNORECASE | re.ASCII)
    if title is not None:
        name_fragment = name_fragment[:title.start()]
    return name_fragment.strip()


def get_adjudicating_member(case_content_str: str):
    """
    Retrieves the adjudicating member(s) mentioned in the given case content string.

    Args:
        case_content_str (str): The input string containing the case content.

    Returns:
        str: The adjudicating member name(s). Several distinct names are joined with ", " in
        order of first appearance. Returns "nan" if none of the keywords occurs in the text,
        and an empty string if a keyword is present but no name follows it.

    Examples:
        >>> get_adjudicating_member("January 18, 2017 Date Issued Avril Cardoso Member, Landlord and Tenant Board")
        'Avril Cardoso'
        >>> get_adjudicating_member("This text has no signature block.")
        'nan'

    Notes:
        The keywords are tried in the following order: "date issued", "date of reasons", and
        "date order issued"; only the first keyword that occurs in the text is used.
        For every occurrence of that keyword, the text that follows it is cut at the first ", "
        and at the first "member" / "vice chair" / "vice-chair" title. What is left is taken
        as the member name. Keyword and titles are matched case-insensitively.

    Raises:
        TypeError: If `case_content_str` is not a string.

    """

    if not isinstance(case_content_str, str):
        raise TypeError("`case_content_str` must be a string variable")

    # ordered from most reliable to least reliable
    keywords = ("date issued", "date of reasons", "date order issued")

    # arbitrary look-ahead distance; measured from the START of the keyword, so at most
    # `window - len(keyword)` characters after the keyword are examined
    window = 100

    # Search the original string rather than a lowercased copy, so the indices are always
    # valid for slicing the original. The zero-width lookahead reports every occurrence,
    # overlapping ones included.
    keyword = None
    kw_idxs = []
    for candidate in keywords:
        pattern = "(?=" + re.escape(candidate) + ")"
        kw_idxs = [hit.start() for hit in re.finditer(pattern, case_content_str, flags=re.IGNORECASE | re.ASCII)]
        if kw_idxs:
            keyword = candidate
            break

    # if nothing is found, better to return nothing than to return something clearly incorrect
    if keyword is None:
        return "nan"

    adj_membs = []
    for kw_idx in kw_idxs:
        subset = case_content_str[kw_idx + len(keyword): kw_idx + window]
        subset = subset.split(", ")[0].strip()
        adj_membs.append(_strip_member_title(subset))

    # drop empty hits and duplicates while keeping first-appearance order (deterministic output)
    return ", ".join(dict.fromkeys(memb for memb in adj_membs if memb != ""))

# Spot-check one decision: print its link, the gold label, and the extracted ("silver") member.
row = 63
case_content = case_content_str = " ".join(gold_df.loc[row, 'content'])
print(gold_df.loc[row, 'url'])
print(f"Gold ADJMEM:\t{gold_df.loc[row, 'adjudicating_member']}")

found_memb = get_adjudicating_member(case_content_str = case_content_str)
print(f"Silver ADJMEM:\t{found_memb}")

https://canlii.ca/t/hv7kx
Gold ADJMEM:	Alex Brkic
Silver ADJMEM:	Alex Brkic


Preliminary scoring

In [233]:
# Exact-match scoring: collect the index labels whose extracted member differs from the gold label.
inaccs = []

for row in gold_df.index:
    case_content = case_content_str = " ".join(gold_df.loc[row, 'content'])
    gold_mem = gold_df.loc[row, 'adjudicating_member']
    silv_mem = get_adjudicating_member(case_content_str = case_content_str)

    if gold_mem != silv_mem:
        inaccs.append(row)

# every row is either a match or listed in `inaccs`
total_found = len(gold_df.index) - len(inaccs)

print(total_found / len(gold_df))
print(total_found, len(gold_df))
inaccs

0.9910714285714286
666 672


[361, 459, 559, 627, 649, 657]

Solid initial score: 666 of 672 decisions (about 99.1%) match the gold label exactly. Next, I'll do some error analysis on the six mismatched rows to see how to improve things.

In [198]:
# Inspect a single decision by index label: link, gold member, then the extracted member.
row = 482
case_content = case_content_str = " ".join(gold_df.loc[row, 'content'])
gold_mem = gold_df.loc[row, 'adjudicating_member']
print(gold_df.loc[row, 'url'])
print(f"Gold ADJMEM:\t{gold_mem}")
silv_mem = get_adjudicating_member(case_content_str = case_content_str)
print(silv_mem)


https://canlii.ca/t/hwbld
Gold ADJMEM:	Sylvia Watson
Sylvia Watson


In [115]:
# NOTE(review): unexplained scratch arithmetic -- 5 * 45000 divided by 24 and then by 365.
# What the constants stand for is not stated anywhere in this notebook; TODO document or remove.
5 * 45000 / 24 / 365

25.684931506849313

In [157]:
import re

def find_hyphens(raw_str: str, num_hyphens: int):
    """
    Finds the index of the first run of exactly `num_hyphens` consecutive hyphens in the given string.

    A run only counts if it is not part of a longer run: the characters directly before and
    after it must not be hyphens. What surrounds the run otherwise (spaces, letters, the start
    or end of the string) does not matter.

    Args:
        raw_str (str): The input string to search for hyphens.
        num_hyphens (int): The exact number of hyphens to search for.

    Returns:
        int: The index of the first run of exactly `num_hyphens` hyphens in `raw_str`,
        or -1 if there is no such run.

    Raises:
        ValueError: If `num_hyphens` is less than 1.

    Examples:
        >>> find_hyphens("This is a test --- with hyphens -- and more hyphens ---", 3)
        15
        >>> find_hyphens("----", 3)
        -1
    """

    if num_hyphens < 1:
        raise ValueError("`num_hyphens` must be 1 or greater.")

    # Lookarounds (rather than \b) enforce "exactly n": \b needs a word character next to the
    # run, so it can never match a run that is surrounded by spaces.
    pattern = r"(?<!-)-{" + str(num_hyphens) + r"}(?!-)"
    match = re.search(pattern, raw_str)
    if match:
        return match.start()
    return -1


In [168]:
def general_cleaning(raw_file_str: str):
    r"""
    Split a raw file string into cleaned, non-empty lines.

    The input is split on '\n'. Lines that contain only whitespace are dropped. In each
    remaining line every tab is replaced by a single space, every "\xa0" (non-breaking space)
    is deleted outright (not replaced by a space), and leading/trailing whitespace is stripped.

    Parameters
    ----------
    raw_file_str : str
        The raw file content as a string, where different lines are separated by '\n'.

    Returns
    -------
    list
        The cleaned lines, in their original order.

    Examples
    --------
    >>> general_cleaning("  First line \t \n \xa0 \nSecond line \n   Third line\t")
    ['First line', 'Second line', 'Third line']
    """
    cleaned_lines = []
    for raw_line in raw_file_str.split('\n'):
        # whitespace-only lines (including ones made only of "\xa0") are skipped
        if raw_line.strip() == '':
            continue
        without_tabs = raw_line.replace("\t", " ")
        without_nbsp = without_tabs.replace("\xa0", "")
        cleaned_lines.append(without_nbsp.strip())
    return cleaned_lines

def remove_whitespace_and_underscores(string):
    """
    Deletes every underscore and collapses each run of whitespace into a single space.

    Underscores are removed regardless of how many appear in a row (a single underscore is
    removed too). Leading and trailing whitespace is stripped from the result.

    Parameters
    ----------
    string : str
        The input string to be processed.

    Returns
    -------
    str
        The processed string, free of underscores and of consecutive whitespace.

    Examples
    --------
    >>> remove_whitespace_and_underscores("Hello    world___")
    'Hello world'

    >>> remove_whitespace_and_underscores("Hello ___ world")
    'Hello world'

    >>> remove_whitespace_and_underscores("   This    string_has___many____underscores  ")
    'This stringhasmanyunderscores'
    """
    # Delete underscores first: removing them after the whitespace pass could leave two
    # spaces side by side (e.g. "a ___ b").
    string = re.sub(r'_+', '', string)

    # Collapse every run of whitespace into one space
    string = re.sub(r'\s+', ' ', string)

    return string.strip()

In [190]:
# Row 361: look at the raw text right after the underscore rule that ends a line.
UNDERSCORE_RULE = "______________\n"

row = 361
raw_str = gold_df.loc[row, "raw_file_text"].replace("\xa0", "").strip()
print(gold_df.loc[row, 'url']) # the printed excerpt for this decision shows no member name
idx = raw_str.lower().find(UNDERSCORE_RULE)
subset = raw_str[idx + len(UNDERSCORE_RULE): idx + 150]
print(subset)
subset_list = general_cleaning(subset)
good = " ".join(map(remove_whitespace_and_underscores, subset_list))
# good
# raw_str

https://canlii.ca/t/hw9hk
Date Issued                        
                                         Member,
Landlord and Tenant Board

Toronto East-RO
2275 Mi


In [53]:
# Exploration of where the "Schedule 1" section sits in the content lines.
# NOTE(review): the bare expressions before the loop are evaluated and discarded -- a notebook
# cell only displays its final expression, and this cell ends with a `for` loop.
gold_df.loc[459, 'content']
# gold_df.loc[649, 'content'] # "Date Order"
gold_df.loc[657, 'content'] # "Date Order"

find_all_positions(text = " ".join(gold_df.loc[459, 'content']).lower(), keyword = "schedule")
gold_df.loc[459, 'content']

# for row in [459, 649, 657]:
for row in gold_df.index:
    # the same content is kept both as a list of lines and as one joined string
    case_content_list = gold_df.loc[row, 'content']
    case_content_str = " ".join(gold_df.loc[row, 'content'])
    # if "date issued" not in case_content.lower():
    #     print(row)
    #     print(gold_df.loc[row, 'adjudicating_member']) # gold member label
    #     print(gold_df.loc[row, 'url']) # link to the decision
    if "schedule 1" in case_content_str.lower():
        print(row)
        # indices of the lines whose first word is "schedule";
        # `split()[0]` raises IndexError if a line is empty or whitespace-only
        line_num = [line_idx for line_idx, line in enumerate(case_content_list) if line.lower().split()[0] == "schedule"]
        if line_num:
            line_num = line_num[0] # to get the int, not a list of an int
            print(f"Found in line: {line_num}")
            print(case_content_list[line_num])
            print(f"Length: {len(case_content_list)}")
            print(gold_df.loc[row, 'url']) # link to the decision, for manual inspection
        print()
    # stop once the index label exceeds 10 -- keeps this exploratory output short
    if row > 10:
        break

0
Found in line: 82
Schedule 1 SUMMARY OF CALCULATIONS File Number: CEL-62600-16 Reasons for amount owing Period Amount Arrears: (up to the termination date in the Notice of Termination) September 1, 2016 to November 5, 2016 $3,809.32 Less the amount the Tenant paid to the Landlord -$5,280.00 Plus compensation: (from the day after the termination date in the Notice to the date of the order) November 6, 2016 to January 18, 2017 $4,281.64 Less the rent deposit: -$1,760.00 Less the interest owing on the rent deposit: October 1, 2015 to November 5, 2016 -$38.75 Amount owing to the Landlord on the order date:(total of previous boxes) $1,012.21 Additional costs the Tenant must pay to the Landlord: $170.00 Plus daily compensation owing for each day of occupation starting January 19, 2017: $57.86 (per day) Total the Tenant must pay the Landlord as the tenancy is terminated: $1,182.21, + $57.86 per day starting January 19, 2017
Length: 83
https://canlii.ca/t/gxq6n

3
Found in line: 33
Schedule 

In [55]:
# Sanity check: the index returned by str.find is the position of the match's first character.
sentence = "I went to the beach today"
print(sentence.find("beach"))
sentence[14]

14


'b'