# Hearing & Decision Dates Extraction

In [1]:
import pandas as pd
import ast

def read_csv_with_lists(file_path):
    # Read the CSV file
    df = pd.read_csv(file_path)
    
    # Iterate over each column
    for column in df.columns:
        # Check if the column contains strings that represent lists
        if df[column].dtype == object:
            try:
                # Convert the strings to lists using ast.literal_eval
                df[column] = df[column].apply(ast.literal_eval)
            except (ValueError, SyntaxError):
                # Skip the column if it cannot be converted to a list
                pass
    
    return df

In [2]:
partner_gold_df = read_csv_with_lists('data/gold_labels_with_files.csv')
partner_gold_df['file_number'] = partner_gold_df['file_number_gold_cleaned']#.columns
partner_gold_df.head(2)

Unnamed: 0,What is the file number of the case?,file_number_gold_cleaned,raw_file_text,raw_file_name,Timestamp,Email Address,What was the date of the hearing? [mm/dd/yyyy],What was the date of the decision? [mm/dd/yyyy],Who was the member adjudicating the decision?,What was the location of the landlord tenant board?,...,"If yes to the previous question, which of the following were applicable to the tenant?",Did the decision state the tenant was given prior notice for the eviction?,"If the tenant was given prior notice for the eviction, how much notice was given?",Did the decisions state postponement would result in the tenant accruing additional arrears?,Which other specific applications of the landlord or the tenant were mentioned?,Did the decision mention the validity of an N4 eviction notice?,Were there detail(s) in the decision not captured by this questionnaire that should be included?,Exec Review,Review Status,file_number
0,CEL-62600-16,CEL-62600-16,Metadata:\nDate:\t2017-01-18\nFile number:\t\n...,CEL-62600-16.txt,1/31/2021 22:26:19,jac.huang@mail.utoronto.ca,1/5/2017,1/18/2017,Avril Cardoso,Mississauga,...,,Yes,Not stated,No,L1: Application to Evict a Tenant for Non-paym...,No,,,,CEL-62600-16
1,CEL-62852-16,CEL-62852-16,Metadata:\nDate:\t2017-01-09\nFile number:\t\n...,CEL-62852-16.txt,1/31/2021 22:35:03,jac.huang@mail.utoronto.ca,12/14/2016,1/9/2017,Tiisetso Russell,Mississauga,...,"reliance on social assistance, children's scho...",Yes,Not stated,No,No other specific applications were mentioned,No,,,,CEL-62852-16


In [3]:
gold_df = read_csv_with_lists('data/outcome_extraction_testing.csv')
print(gold_df.value_counts('new_case_outcome', normalize = True))
# gold_df['new_hearing_date'] = partner_gold_df
gold_df.head(1)

new_case_outcome
No relief            0.639881
Relief               0.349702
Conditional Order    0.010417
dtype: float64


Unnamed: 0,raw_file_text,raw_file_name,full_cleaned,metadata,content,case_citation,file_number,language,year,ltb_location,decision_date,hearing_date,url,adjudicating_member,new_case_outcome
0,Metadata:\nDate:\t2017-01-18\nFile number:\t\n...,CEL-62600-16.txt,"[Metadata:, Date: 2017-01-18, File number:, CE...","[Date: 2017-01-18, File number:, CEL-62600-16,...",[Arrears Worksheet File Number: CEL-62600-16 T...,"CEL-62600-16 (Re), 2017 CanLII 9545 (ON LTB)",CEL-62600-16,English,2016,Mississauga,01/18/2017,01/30/2017,https://canlii.ca/t/gxq6n,Avril Cardoso,No relief


In [4]:
# Assuming "raw_file_name" is a column in both partner_gold_df and gold_df

# Perform inner join based on "raw_file_name" column
merged_df = gold_df.merge(partner_gold_df[["raw_file_name", "What was the date of the hearing? [mm/dd/yyyy]"]], on = "raw_file_name", how = "inner")

# Copy the values from column A of partner_gold_df to a new column in gold_df
gold_df["new_hearing_date"] = merged_df["What was the date of the hearing? [mm/dd/yyyy]"]
gold_df.head(1)

Unnamed: 0,raw_file_text,raw_file_name,full_cleaned,metadata,content,case_citation,file_number,language,year,ltb_location,decision_date,hearing_date,url,adjudicating_member,new_case_outcome,new_hearing_date
0,Metadata:\nDate:\t2017-01-18\nFile number:\t\n...,CEL-62600-16.txt,"[Metadata:, Date: 2017-01-18, File number:, CE...","[Date: 2017-01-18, File number:, CEL-62600-16,...",[Arrears Worksheet File Number: CEL-62600-16 T...,"CEL-62600-16 (Re), 2017 CanLII 9545 (ON LTB)",CEL-62600-16,English,2016,Mississauga,01/18/2017,01/30/2017,https://canlii.ca/t/gxq6n,Avril Cardoso,No relief,1/5/2017


In [7]:
# Assuming "file_number" is a column in both partner_gold_df and gold_df

# Perform inner join based on "file_number" column
merged_df = gold_df.merge(partner_gold_df[["raw_file_name", "What was the date of the decision? [mm/dd/yyyy]"]], on = "raw_file_name", how = "inner")

# Copy the values from column A of partner_gold_df to a new column in gold_df
gold_df["new_decision_date"] = merged_df["What was the date of the decision? [mm/dd/yyyy]"]
gold_df.head(10)

Unnamed: 0,raw_file_text,raw_file_name,full_cleaned,metadata,content,case_citation,file_number,language,year,ltb_location,decision_date,hearing_date,url,adjudicating_member,new_case_outcome,new_hearing_date,new_decision_date
0,Metadata:\nDate:\t2017-01-18\nFile number:\t\n...,CEL-62600-16.txt,"[Metadata:, Date: 2017-01-18, File number:, CE...","[Date: 2017-01-18, File number:, CEL-62600-16,...",[Arrears Worksheet File Number: CEL-62600-16 T...,"CEL-62600-16 (Re), 2017 CanLII 9545 (ON LTB)",CEL-62600-16,English,2016,Mississauga,01/18/2017,01/30/2017,https://canlii.ca/t/gxq6n,Avril Cardoso,No relief,1/5/2017,1/18/2017
1,Metadata:\nDate:\t2017-01-09\nFile number:\t\n...,CEL-62852-16.txt,"[Metadata:, Date: 2017-01-09, File number:, CE...","[Date: 2017-01-09, File number:, CEL-62852-16,...",[Arrears Worksheet File Number: CEL-62852-16 T...,"CEL-62852-16 (Re), 2017 CanLII 9535 (ON LTB)",CEL-62852-16,English,2016,Mississauga,01/09/2017,01/09/2017,https://canlii.ca/t/gxq6r,Tiisetso Russell,Relief,12/14/2016,1/9/2017
2,Metadata:\nDate:\t2017-01-09\nFile number:\t\n...,CEL-63024-16.txt,"[Metadata:, Date: 2017-01-09, File number:, CE...","[Date: 2017-01-09, File number:, CEL-63024-16,...",[Arrears Worksheet File Number: CEL-63024-16 T...,"CEL-63024-16 (Re), 2017 CanLII 9543 (ON LTB)",CEL-63024-16,English,2016,Mississauga,01/09/2017,01/09/2017,https://canlii.ca/t/gxq6s,Tiisetso Russell,Relief,1/4/2017,1/9/2017
3,Metadata:\nDate:\t2017-01-20\nFile number:\t\n...,CEL-63056-16.txt,"[Metadata:, Date: 2017-01-20, File number:, CE...","[Date: 2017-01-20, File number:, CEL-63056-16,...",[Arrears Worksheet File Number: CEL-63056-16 T...,"CEL-63056-16 (Re), 2017 CanLII 9537 (ON LTB)",CEL-63056-16,English,2016,Mississauga,01/09/2017,01/09/2017,https://canlii.ca/t/gxq6t,Tiisetso Russell,No relief,1/4/2017,1/9/2017
4,Metadata:\nDate:\t2017-02-03\nFile number:\t\n...,CEL-63193-16.txt,"[Metadata:, Date: 2017-02-03, File number:, CE...","[Date: 2017-02-03, File number:, CEL-63193-16,...",[Arrears Worksheet File Number: CEL-63193-16 T...,"CEL-63193-16 (Re), 2017 CanLII 30828 (ON LTB)",CEL-63193-16,English,2016,Mississauga,01/10/2017,02/03/2017,https://canlii.ca/t/h3w7b,Karen Wallace,No relief,2/2/2017,2/3/2017
5,Metadata:\nDate:\t2017-02-03\nFile number:\t\n...,CEL-63559-17.txt,"[Metadata:, Date: 2017-02-03, File number:, CE...","[Date: 2017-02-03, File number:, CEL-63559-17,...",[Arrears Worksheet File Number: CEL-63559-17 T...,"CEL-63559-17 (Re), 2017 CanLII 30833 (ON LTB)",CEL-63559-17,English,2017,Barrie,02/03/2017,02/03/2017,https://canlii.ca/t/h3w7c,Avril Cardoso,Relief,1/27/2017,2/3/2017
6,Metadata:\nDate:\t2017-03-13\nFile number:\t\n...,CEL-63931-17.txt,"[Metadata:, Date: 2017-03-13, File number:, CE...","[Date: 2017-03-13, File number:, CEL-63931-17,...",[Arrears Worksheet File Number: CEL-63931-17 T...,"CEL-63931-17 (Re), 2017 CanLII 28582 (ON LTB)",CEL-63931-17,English,2017,Elgin,03/13/2017,03/13/2017,https://canlii.ca/t/h3r34,Avril Cardoso,Relief,2/28/2017,3/13/2017
7,Metadata:\nDate:\t2017-03-27\nFile number:\t\n...,CEL-63965-17-RV.txt,"[Metadata:, Date: 2017-03-27, File number:, CE...","[Date: 2017-03-27, File number:, CEL-63965-17-...",[Order under Section 21.2 of the Statutory Pow...,"CEL-63965-17-RV (Re), 2017 CanLII 28692 (ON LTB)",CEL-63965-17-RV,English,2017,Mississauga,02/10/2017,03/27/2017,https://canlii.ca/t/h3r35,Karen Wallace,No relief,3/16/2017,3/27/2017
8,Metadata:\nDate:\t2017-04-28\nFile number:\t\n...,CEL-64051-17.txt,"[Metadata:, Date: 2017-04-28, File number:, CE...","[Date: 2017-04-28, File number:, CEL-64051-17,...",[Order under Sections 69 and 35 Residential Te...,"CEL-64051-17 (Re), 2017 CanLII 28803 (ON LTB)",CEL-64051-17,English,2017,Mississauga,03/28/2017,03/28/2017,https://canlii.ca/t/h3r36,Avril Cardoso,Relief,3/16/2017,3/28/2017
9,Metadata:\nDate:\t2017-04-13\nFile number:\t\n...,CEL-64556-17-RV.txt,"[Metadata:, Date: 2017-04-13, File number:, CE...","[Date: 2017-04-13, File number:, CEL-64556-17-...",[Arrears Worksheet File Number: CEL-64556-17-R...,"CEL-64556-17-RV (Re), 2017 CanLII 28662 (ON LTB)",CEL-64556-17-RV,English,2017,Mississauga,03/16/2017,04/13/2017,https://canlii.ca/t/h3r3c,Karen Wallace,No relief,4/11/2017,4/13/2017


In [6]:
asdf

NameError: name 'asdf' is not defined

In [None]:
from datetime import datetime

def convert_date_format(date_str):

    # this makes things smoother for the pipeline
    if not date_str:
        return None

    try:
        # Parse the date
        date = datetime.strptime(date_str, "%B %d, %Y")
    except ValueError:
        # If parsing fails, try parsing with abbreviated month name
        date = datetime.strptime(date_str, "%b %d, %Y")

    # Convert to new format
    month = date.strftime("%m").lstrip('0')
    day = date.strftime("%d").lstrip('0')
    year = date.strftime("%Y")

    new_format_date = f"{month}/{day}/{year}"

    return new_format_date

# Usage
print(convert_date_format("February 13, 2018"))  # Returns: 2/13/2018
print(convert_date_format("Aug 1, 2018"))  # Returns: 8/1/2018

2/13/2018
8/1/2018


# Hearing Date

In [None]:
import re

def find_date(text: str):
    """
    Finds a date in the format "Month Day, Year" within the given text.

    Args:
        text (str): The input text to search for a date.

    Returns:
        str: The date found in the text. Returns an empty string if no date is found.

    Examples:
        >>> find_date("The event will take place on April 23, 2018.")
        "April 23, 2018"
    """

    pattern = r"\b[A-Z][a-z]+ \d{1,2}, \d{4}\b"
    match = re.search(pattern, text)

    if match:
        return match.group()
    else:
        return ""

def get_hearing_date(case_content_str: str):
    """
    Extracts the hearing date from the given case content string.

    Args:
        case_content_str (str): The case content string to extract the hearing date from.

    Returns:
        str or None: The extracted hearing date in the format "Month Day, Year" if found, otherwise None.

    Examples:
        >>> get_hearing_date("The application was heard on April 23, 2018. It is determined that...")
        "April 23, 2018"
    """

    for keyword in ["determinations:", "it is determined"]:
        if keyword in case_content_str.lower():
            kw_idx = case_content_str.find(keyword)
            break
        else:
            kw_idx = -1

    subset = case_content_str[case_content_str.lower().find("application") - 10: kw_idx].strip()
    date = find_date(subset)

    if date:
        return convert_date_format(find_date(date).strip())
        
    # otherwise return None
    return None

row = 430 # 432 - hdate is March 22, 2018, decdate is May 1, 2018
print(row)

case_content = " ".join(gold_df.loc[row, 'content'])
print(gold_df.loc[row, 'url'])
print(gold_df.loc[row, 'file_number'])
# print(case_content)
print(gold_df.loc[row, 'new_hearing_date'])
# print(gold_df.loc[row, 'new_decision_date'])
print()
# print(case_content)

get_hearing_date(case_content_str = case_content)

430
https://canlii.ca/t/hwbhj
TNL-02075-18
3/28/2018



'3/28/2018'

# Decision Date

In [None]:
import re
from dateutil.parser import parse
import spacy
nlp = spacy.load("en_core_web_sm")

def find_date(text: str):
    """
    Finds a date in the format "Month Day, Year" within the given text.

    Args:
        text (str): The input text to search for a date.

    Returns:
        str: The date found in the text. Returns an empty string if no date is found.

    Examples:
        >>> find_date("The event will take place on April 23, 2018.")
        "April 23, 2018"
    """

    pattern = r"\b[A-Z][a-z]+ \d{1,2}, \d{4}\b"
    match = re.search(pattern, text)

    if match:
        return match.group()
    else:
        return ""

def extract_date(text, nlp = nlp):
    """
    Extracts a date from a string of text using spaCy's entity recognition.

    Args:
        text (str): The text to extract the date from.

    Returns:
        str: The extracted date string, or an empty string if no date is found.

    Examples:
        >>> extract_date("The event will take place on April 23, 2018.")
        "April 23, 2018"
    """

    doc = nlp(text)

    for entity in doc.ents:
        if entity.label_ == "DATE":
            return entity.text

    return ""

def convert_date(date_str):
    """
    Parses a date string in any format and converts it to the format "Month Day, Year".

    Args:
        date_str (str): The date string to parse.

    Returns:
        str: The parsed date string in the format "Month Day, Year", or an empty string if parsing fails.

    Examples:
        >>> convert_date("2022-05-31")
        "May 31, 2022"

        >>> convert_date("05/31/2018")
        "May 31, 2018"
    """

    try:
        parsed_date = parse(date_str)
        formatted_date = parsed_date.strftime("%B %d, %Y")
        return formatted_date
    except ValueError:
        return ""

def get_decision_date(case_content_str: str):
    """
    Extracts the decision date from the given case content string.

    Args:
        case_content_str (str): The case content string to extract the decision date from.

    Returns:
        str or None: The extracted decision date in the format "Month Day, Year" if found, otherwise None.

    Examples:
        >>> get_decision_date("The date order issued on April 23, 2018 states...")
        "April 23, 2018"
    """

    # intentionally searches these in this order. Any amendment would be the most recent date
    for keyword in ['date order amended', 'date issued', 'date order issued']: 
        if keyword in case_content_str.lower():
            # print("OPTION 1")
            di_idx = case_content_str.lower().find(keyword)
            subset = case_content_str[di_idx - 30 : di_idx].strip().split(". ")[-1]
            return convert_date_format(find_date(subset).strip())
    
    else:
        # print("OPTION 2")
        if "date" in case_content_str.lower()[: 500]:
            subset = case_content_str[: 500]
            date_idx = case_content_str.lower().find('date')
            subset = case_content_str[date_idx + len('date') : date_idx + len('date') + 50].strip()
            subset = extract_date(subset).strip()
            return convert_date_format(find_date(subset).strip())
    
    # otherwise return None
    return None

def get_decision_date2(case_mdata_str: str):
    """
    Extracts the decision date from the given case metadata string.

    Args:
        case_mdata_str (str): The case metadata string containing the date in the format "YYYY-MM-DD".

    Returns:
        str: The extracted decision date in the format "MM/DD/YYYY", with leading zeros removed (lstrip() method)

    Examples:
        >>> get_decision_date2("Date: 2022-05-01 ....")
        "5/1/2022"
    """
    date = case_mdata_str.split()[1]
    split = date.split("-")
    data = split[1].lstrip('0') + "-" + split[2].lstrip('0') + "-" + split[0].lstrip('0')
    return data.replace("-", "/")

row = 469 # 432 - hdate is March 22, 2018, decdate is May 1, 2018
# print(row)

case_content = " ".join(gold_df.loc[row, 'content'])
# print(gold_df.loc[row, 'url'])
# print(gold_df.loc[row, 'file_number'])
# print(case_content)
print(gold_df.loc[row, 'new_hearing_date'])
print(gold_df.loc[row, 'new_decision_date'])
# print()
# print(case_content)

print(get_hearing_date(case_content_str = case_content))
print(get_decision_date(case_content_str = case_content))

2023-06-13 23:29:01.835491: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


7/23/2018
7/26/2018
7/23/2018
7/26/2018


Checking to make sure *something* can actually be extracted from most files -- there are also some files (very few but it does happen) that just don't have a hearing or decision date (ex: TNL-05489-18)

In [None]:
h_dates = 0
dec_dates = 0
bad_hdates = []
bad_decdates = []

for row in gold_df.index:
    # print(row)

    case_content = " ".join(gold_df.loc[row, 'content'])
    case_mdata = " ".join(gold_df.loc[row, 'metadata'])
    # print(gold_df.loc[row, 'url'])
    # print(gold_df.loc[row, 'file_number'])
    # print(case_content)
    # print(gold_df.loc[row, 'new_hearing_date'])
    # print(gold_df.loc[row, 'new_decision_date'])
    # print()
    # print(case_content)

    h_date = get_hearing_date(case_content_str = case_content)
    # dec_date = get_decision_date2(case_mdata_str = case_content)
    dec_date = get_decision_date(case_content_str = case_content)

    if h_date:
        h_dates += 1
    else:
        bad_hdates.append(row)

    if dec_date:
        dec_dates += 1
    else:
        bad_decdates.append(row)

print(h_dates / len(gold_df))
print(dec_dates / len(gold_df))
print(bad_hdates)
print(bad_decdates)

0.9985119047619048
0.9985119047619048
[459]
[459]


In [None]:
row = 0 # 459 is a mess, as always
print(row)

case_content = " ".join(gold_df.loc[row, 'content'])
print(gold_df.loc[row, 'url'])
print(gold_df.loc[row, 'file_number'])
# print(case_content)
print(gold_df.loc[row, 'new_hearing_date'])
print(gold_df.loc[row, 'new_decision_date'])
print()
# print(case_content)

print(get_hearing_date(case_content_str = case_content))
print(get_decision_date(case_content_str = case_content))

0
https://canlii.ca/t/gxq6n
CEL-62600-16
1/5/2017
1/18/2017

1/5/2017
1/18/2017


In [None]:
# import jellyfish

In [None]:
total_hears = 0
total_decs = 0
nopes = []

for row in gold_df.index:
    # print(row)
    case_content = " ".join(gold_df.loc[row, 'content'])
    # print(gold_df.loc[row, 'url'])
    
    gold_hear = gold_df.loc[row, 'new_hearing_date']
    gold_dec = gold_df.loc[row, 'new_decision_date']
    silv_hear = get_hearing_date(case_content_str = case_content)
    # silv_hear = convert_date_format(silv_hear)
    silv_dec = get_decision_date(case_content_str = case_content)
    # silv_dec = convert_date_format(silv_dec)

    # print(gold_hear)
    # print(gold_dec)
    # print(silv_hear)
    # print(silv_dec)
    # print()

    if (gold_hear and silv_hear):
        if gold_hear == silv_hear: # accuracy
            total_hears += 1
        else:
            nopes.append(row)
    if (gold_dec and silv_dec):
        if gold_dec == silv_dec: # accuracy
            total_decs += 1
        else:
            nopes.append(row)
    else:
        nopes.append(row)

    # if row == 5:
        # break

print(f"Hearing dates accuracy:  {total_hears / len(gold_df)}")
print(f"Decision dates accuracy: {total_decs / len(gold_df)}")

print(len(nopes))
print(nopes[:10])
for id, nope in enumerate(nopes):
    print(gold_df.loc[nope, 'url'])
    print(nope)
    print(gold_df.loc[nope, 'new_hearing_date'])
    print(gold_df.loc[nope, 'new_decision_date'])
    print(get_hearing_date(case_content_str = case_content))
    print(get_decision_date(case_content_str = case_content))
    print()
    if id == 5:
        break

Hearing dates accuracy:  0.8005952380952381
Decision dates accuracy: 0.9270833333333334
182
[4, 7, 9, 11, 16, 17, 19, 19, 20, 20]
https://canlii.ca/t/h3w7b
4
2/2/2017
2/3/2017
11/2/2018
11/20/2018

https://canlii.ca/t/h3r35
7
3/16/2017
3/27/2017
11/2/2018
11/20/2018

https://canlii.ca/t/h3r3c
9
4/11/2017
4/13/2017
11/2/2018
11/20/2018

https://canlii.ca/t/h52xx
11
5/15/2017
5/16/2017
11/2/2018
11/20/2018

https://canlii.ca/t/h52z6
16
5/9/2017
5/12/2017
11/2/2018
11/20/2018

https://canlii.ca/t/h59dh
17
6/16/2017
6/19/2017
11/2/2018
11/20/2018



# `get_decision_date2()`

In [None]:
def get_decision_date2(case_mdata_str: str):
    """
    Extracts the decision date from the given case metadata string.

    Args:
        case_mdata_str (str): The case metadata string containing the date in the format "YYYY-MM-DD".

    Returns:
        str: The extracted decision date in the format "MM/DD/YYYY", with leading zeros removed (lstrip() method)

    Examples:
        >>> get_decision_date2("Date: 2022-05-01 ....")
        "5/1/2022"
    """
    date = case_mdata_str.split()[1]
    split = date.split("-")
    data = split[1].lstrip('0') + "-" + split[2].lstrip('0') + "-" + split[0].lstrip('0')
    return data.replace("-", "/")

row = 460 # 459 is a mess, as always
print(row)

case_content = " ".join(gold_df.loc[row, 'content'])
case_mdata = " ".join(gold_df.loc[row, 'metadata'])
print(case_mdata)
print(gold_df.loc[row, 'url'])
print(gold_df.loc[row, 'file_number'])
# print(case_content)
print(gold_df.loc[row, 'new_hearing_date'])
print(gold_df.loc[row, 'new_decision_date'])
print()
# print(case_content)
print(get_hearing_date(case_content_str = case_content))
# print(get_decision_date(case_content_str = case_content))
print(get_decision_date2(case_mdata_str = case_mdata))

460
Date: 2018-07-11 File number: TNL-05535-18 TNL-05535-18 Citation: TNL-05535-18 (Re), 2018 CanLII 113837 (ON LTB), <https://canlii.ca/t/hwbk4>, retrieved on 2023-05-16 https://canlii.ca/t/hwbk4
https://canlii.ca/t/hwbk4
TNL-05535-18
7/9/2018
7/11/2018

7/9/2018
7/11/2018


# Trying Something New

In [None]:
# def find_keyword_span(keyword, string):
#     # Convert the string into a list of words
#     words = string.split()

#     # Calculate the number of tenths (10ths) in the string
#     num_tenths = len(words) // 10

#     # Find the index of the keyword within the words list
#     keyword_index = words.index(keyword)

#     # Calculate the percentile range of the keyword
#     percentile_range = (keyword_index // num_tenths) * 10

#     # Return the percentile range
#     return percentile_range

# def convert_date_back(date_string):
#     # Split the date string into month, day, and year
#     month, day, year = date_string.split('/')

#     # Define a dictionary to map month numbers to month names
#     month_names = {
#         '1': 'January',
#         '2': 'February',
#         '3': 'March',
#         '4': 'April',
#         '5': 'May',
#         '6': 'June',
#         '7': 'July',
#         '8': 'August',
#         '9': 'September',
#         '10': 'October',
#         '11': 'November',
#         '12': 'December'
#     }

#     # Get the month name from the dictionary
#     month_name = month_names[month]

#     # Create the converted date string
#     converted_date = f"{month_name} {day}, {year}"

#     return converted_date

In [None]:
# def find_keyword_span(keyword, string):

#     kw_idx = string.find(keyword)

#     span = string[kw_idx - 100 : kw_idx + 100]
    
#     return span.strip()

# for row in gold_df.index:
#     case_content = " ".join(gold_df.loc[row, 'content'])
#     case_mdata = " ".join(gold_df.loc[row, 'metadata'])
#     # print(gold_df.loc[row, 'url'])
#     full_case_str = case_mdata + " " + case_content

#     hdate = gold_df.loc[row, 'new_hearing_date']
#     hdate_origin = convert_date_back(hdate)
#     decdate = gold_df.loc[row, 'new_decision_date']
#     decdate_origin = convert_date_back(decdate)

#     gold_df.loc[row, 'hearing_date_origin'] = hdate_origin
#     gold_df.loc[row, 'decision_date_origin'] = decdate_origin

#     gold_df.loc[row, 'hearing_date_span'] = find_keyword_span(keyword = hdate_origin, string = full_case_str)
#     gold_df.loc[row, 'decision_date_span'] = find_keyword_span(keyword = decdate_origin, string = full_case_str)

# gold_df.head(2)

Unnamed: 0,raw_file_text,raw_file_name,full_cleaned,metadata,content,case_citation,file_number,language,year,ltb_location,...,hearing_date,url,adjudicating_member,new_case_outcome,new_hearing_date,new_decision_date,hearing_date_origin,decision_date_origin,hearing_date_span,decision_date_span
0,Metadata:\nDate:\t2017-01-18\nFile number:\t\n...,CEL-62600-16.txt,"[Metadata:, Date: 2017-01-18, File number:, CE...","[Date: 2017-01-18, File number:, CEL-62600-16,...",[Arrears Worksheet File Number: CEL-62600-16 T...,"CEL-62600-16 (Re), 2017 CanLII 9545 (ON LTB)",CEL-62600-16,English,2016,Mississauga,...,01/30/2017,https://canlii.ca/t/gxq6n,Avril Cardoso,No relief,1/5/2017,1/18/2017,"January 5, 2017","January 18, 2017",nt remained in the unit after the termination ...,"ll pay to the Landlord $1,012.21*, which repre..."
1,Metadata:\nDate:\t2017-01-09\nFile number:\t\n...,CEL-62852-16.txt,"[Metadata:, Date: 2017-01-09, File number:, CE...","[Date: 2017-01-09, File number:, CEL-62852-16,...",[Arrears Worksheet File Number: CEL-62852-16 T...,"CEL-62852-16 (Re), 2017 CanLII 9535 (ON LTB)",CEL-62852-16,English,2016,Mississauga,...,01/09/2017,https://canlii.ca/t/gxq6r,Tiisetso Russell,Relief,12/14/2016,1/9/2017,"December 14, 2016","January 9, 2017",the Tenant did not pay the rent that the Tenan...,"t Date November 18, 2016 (Day after terminatio..."
