In [109]:
import pandas as pd
import ast
import re

def _parse_literal_cell(value):
    """Parse one CSV cell with ``ast.literal_eval``; pass non-strings through.

    Missing cells are not ``str`` instances (they arrive as NaN/NA). Handing
    them to ``ast.literal_eval`` raises ``ValueError``, which would otherwise
    block conversion of the whole column, so they are returned unchanged.
    """
    if isinstance(value, str):
        return ast.literal_eval(value)
    return value


def read_csv_with_lists(file_path):
    """Read a CSV file and decode text columns that hold Python literals.

    Every text column is tried with ``ast.literal_eval`` cell by cell. A column
    is replaced only if *all* of its string cells parse; otherwise the column
    is left exactly as ``pandas.read_csv`` produced it. Missing cells never
    prevent a column from being converted and are kept as missing values.

    Parameters
    ----------
    file_path : str, path object or file-like object
        Anything accepted by ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The loaded frame, with literal-encoded columns (e.g. ``"['a', 'b']"``)
        converted to the corresponding Python objects.
    """
    df = pd.read_csv(file_path)

    for column in df.columns:
        series = df[column]
        # Text columns are `object` dtype on older pandas and a dedicated
        # string dtype on newer versions; accept both.
        is_text = (
            pd.api.types.is_object_dtype(series)
            or pd.api.types.is_string_dtype(series)
        )
        if not is_text:
            continue
        try:
            df[column] = series.apply(_parse_literal_cell)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            # At least one cell is ordinary text rather than a literal:
            # keep the whole column untouched (all-or-nothing per column).
            continue

    return df

In [104]:
# Load the gold dataset from CSV; read_csv_with_lists also decodes columns
# whose cells are Python list literals.
gold_csv_path = 'data/outcome_extraction_testing.csv'
gold_df = read_csv_with_lists(gold_csv_path)
gold_df.head()

Unnamed: 0,raw_file_text,raw_file_name,full_cleaned,metadata,content,case_citation,file_number,language,year,ltb_location,decision_date,hearing_date,url,adjudicating_member,new_case_outcome
0,Metadata:\nDate:\t2017-01-18\nFile number:\t\n...,CEL-62600-16.txt,"[Metadata:, Date: 2017-01-18, File number:, CE...","[Date: 2017-01-18, File number:, CEL-62600-16,...",[Arrears Worksheet File Number: CEL-62600-16 T...,"CEL-62600-16 (Re), 2017 CanLII 9545 (ON LTB)",CEL-62600-16,English,2016,Mississauga,01/18/2017,01/30/2017,https://canlii.ca/t/gxq6n,Avril Cardoso,No relief
1,Metadata:\nDate:\t2017-01-09\nFile number:\t\n...,CEL-62852-16.txt,"[Metadata:, Date: 2017-01-09, File number:, CE...","[Date: 2017-01-09, File number:, CEL-62852-16,...",[Arrears Worksheet File Number: CEL-62852-16 T...,"CEL-62852-16 (Re), 2017 CanLII 9535 (ON LTB)",CEL-62852-16,English,2016,Mississauga,01/09/2017,01/09/2017,https://canlii.ca/t/gxq6r,Tiisetso Russell,Relief
2,Metadata:\nDate:\t2017-01-09\nFile number:\t\n...,CEL-63024-16.txt,"[Metadata:, Date: 2017-01-09, File number:, CE...","[Date: 2017-01-09, File number:, CEL-63024-16,...",[Arrears Worksheet File Number: CEL-63024-16 T...,"CEL-63024-16 (Re), 2017 CanLII 9543 (ON LTB)",CEL-63024-16,English,2016,Mississauga,01/09/2017,01/09/2017,https://canlii.ca/t/gxq6s,Tiisetso Russell,Relief
3,Metadata:\nDate:\t2017-01-20\nFile number:\t\n...,CEL-63056-16.txt,"[Metadata:, Date: 2017-01-20, File number:, CE...","[Date: 2017-01-20, File number:, CEL-63056-16,...",[Arrears Worksheet File Number: CEL-63056-16 T...,"CEL-63056-16 (Re), 2017 CanLII 9537 (ON LTB)",CEL-63056-16,English,2016,Mississauga,01/09/2017,01/09/2017,https://canlii.ca/t/gxq6t,Tiisetso Russell,No relief
4,Metadata:\nDate:\t2017-02-03\nFile number:\t\n...,CEL-63193-16.txt,"[Metadata:, Date: 2017-02-03, File number:, CE...","[Date: 2017-02-03, File number:, CEL-63193-16,...",[Arrears Worksheet File Number: CEL-63193-16 T...,"CEL-63193-16 (Re), 2017 CanLII 30828 (ON LTB)",CEL-63193-16,English,2016,Mississauga,01/10/2017,02/03/2017,https://canlii.ca/t/h3w7b,Karen Wallace,No relief


In [105]:
# Candidate marker phrase to look for in each decision's text.
# Other phrases tried during exploration: "based on", "considered".
keyword = "accordance with"


def count_cases_with_keyword(df, keyword, text_column='content'):
    """Count the rows of ``df`` whose text contains ``keyword``, ignoring case.

    Each cell of ``text_column`` is joined with single spaces before the
    search, so it should be an iterable of strings (a list, as loaded here).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding one case per row.
    keyword : str
        Phrase to search for; matched case-insensitively.
    text_column : str, default 'content'
        Column holding the text fragments of each case.

    Returns
    -------
    int
        Number of rows in which the phrase occurs at least once.
    """
    # The haystack is lowercased below, so the needle must be lowercased too;
    # otherwise a keyword containing capitals could never match.
    needle = keyword.lower()
    return sum(needle in " ".join(parts).lower() for parts in df[text_column])


found_total = count_cases_with_keyword(gold_df, keyword)
case_total = len(gold_df.index)
# Guard the ratio so an empty frame reports 0.0 instead of raising ZeroDivisionError.
hit_rate = found_total / case_total if case_total else 0.0

print(f"{hit_rate}; {found_total} / {case_total}")

0.9419642857142857; 633 / 672


In [107]:
def find_all_positions(text, keyword):
    """Return the start index of every occurrence of ``keyword`` in ``text``.

    Occurrences may overlap: after each hit the search resumes one character
    further on, not after the end of the match. Returns an empty list when
    the keyword does not occur.
    """
    hits = []
    cursor = text.find(keyword)
    while cursor != -1:
        hits.append(cursor)
        # Step a single character forward so overlapping matches are kept.
        cursor = text.find(keyword, cursor + 1)
    return hits

In [112]:
start_boundary = "accordance with"
end_boundary = "ordered that"

# Placeholder stored when a start boundary exists but no end boundary follows it.
NO_OUTCOME_PLACEHOLDER = "NEED OTHER METHOD"
# Number of characters of context kept before each start boundary.
LOOKBACK_CHARS = 100


def extract_outcome_text(case_text, start_positions, end_boundary, lookback=LOOKBACK_CHARS):
    """Extract the passage between a start position and the next end boundary.

    For each candidate start position, in order, the text from ``lookback``
    characters before it up to the end of the case is searched for
    ``end_boundary`` (case-sensitive). The first candidate that is followed by
    an end boundary wins.

    The returned passage is trimmed at both ends: the leading fragment up to
    the first ". " and the trailing fragment after the last ". " before the
    end boundary are dropped, a leading paragraph number such as "16. " is
    removed, and surrounding whitespace is stripped.

    Parameters
    ----------
    case_text : str
        Full text of one case.
    start_positions : iterable of int
        Character offsets of the start boundary within ``case_text``.
    end_boundary : str
        Phrase that terminates the passage.
    lookback : int, default LOOKBACK_CHARS
        Characters of context to include before each start position.

    Returns
    -------
    str or None
        The extracted passage (possibly empty), or ``None`` when no start
        position is followed by ``end_boundary``.
    """
    for pos in start_positions:
        # Clamp at 0: a negative slice start would count from the END of the
        # text and yield a wrong window when the boundary is near the start.
        window = case_text[max(0, pos - lookback):]
        # Discard everything up to and including the first ". " of the window;
        # if the window contains no ". " at all this leaves an empty string.
        near_text = window.partition(". ")[2]
        end_pos = near_text.find(end_boundary)
        if end_pos == -1:
            continue
        # Drop the trailing fragment that leads into the end boundary.
        outcome = near_text[:end_pos].rpartition(". ")[0]
        # Remove a leading paragraph number such as "16. ".
        return re.sub(r'^\d+\.\s*', '', outcome).strip()
    return None


found_total = 0
max_outcome_len = 0
total_outcome_len = 0
outcome_texts = []

for content in gold_df['content']:
    case_text = " ".join(content)
    # NOTE(review): offsets are located in the lowercased text but applied to
    # the original text. str.lower() can change the length of some non-ASCII
    # characters, so this assumes the two stay aligned - confirm for this corpus.
    start_positions = find_all_positions(case_text.lower(), start_boundary.lower())

    if not start_positions:
        # No start boundary at all: the row is left missing (NaN).
        outcome_texts.append(float("nan"))
        continue

    outcome = extract_outcome_text(case_text, start_positions, end_boundary)
    if outcome is None:
        outcome_texts.append(NO_OUTCOME_PLACEHOLDER)
        continue

    outcome_texts.append(outcome)
    found_total += 1
    # Length is counted in space-separated tokens.
    outcome_len = len(outcome.split(' '))
    max_outcome_len = max(max_outcome_len, outcome_len)
    total_outcome_len += outcome_len

# Assign the whole column at once so re-running the cell never leaves stale
# values behind from an earlier run.
gold_df['outcome_text'] = outcome_texts

case_total = len(gold_df.index)
# Both ratios are taken over ALL cases (not only those with an extracted
# outcome); an empty frame reports 0.0 instead of raising ZeroDivisionError.
print(found_total / case_total if case_total else 0.0)
print(max_outcome_len)
print(total_outcome_len / case_total if case_total else 0.0)

0.9211309523809523
4235
221.10267857142858


In [113]:
# Preview the extracted outcome passages for the first ten cases.
gold_df['outcome_text'].head(10).tolist()

["The N5 Notice has a termination date of November 17, 2016 and alleges that the conduct of the Tenant substantially interfered with the reasonable enjoyment of the residential complex and the Tenant has wilfully or negligently damaged the rental unit or residential complex. 5. The N5 Notice sets out the dates, times and specific allegations against the Tenant. 6. Because this is a first N5 Notice, the Tenant has an opportunity to void the notice in accordance with section 64(3) of the Act by correcting the identified problems within seven days of the date the notice was served. In this case, the seven day period begins the day following service of the N5 Notice, from October 28th to November 3rd, 2016. The Landlord's Legal Representative confirmed that the first N5 Notice was not voided because the conduct has not stopped since the notice was served and the Tenant has not paid $500.00 required to replace or replace the damaged property. Window screens 7. The Tenant negligently caused 