# Pre-Processing to Match Annotated Data
- Ultimate goal: Match everything as closely as possible (even if it doesn't always make sense to)

## Imports

In [1]:
import os
import pandas as pd
import time
import numpy as np
from collections import deque

# Gold Data (Annotated by Partner)

## Renaming `gold_data` columns

In [2]:
gold_data = pd.read_csv("data/gold_labels_with_files.csv")

# making new names for the columns in gold_data

new_names = {
    'Timestamp': 'timestamp',
    'Email Address': 'email_address',
    'What is the file number of the case?': 'file_number_gold',
    'What was the date of the hearing? [mm/dd/yyyy]': 'hearing_date',
    'What was the date of the decision? [mm/dd/yyyy]': 'decision_date',
    'Who was the member adjudicating the decision?': 'adjudicating_member',
    'What was the location of the landlord tenant board?': 'ltb_location',
    'Did the decision state the landlord was represented?': 'landlord_represented',
    'Did the decision state the landlord attended the hearing?': 'landlord_attended_hearing',
    'Did the decision state the tenant was represented?': 'tenant_represented',
    'Did the decision state the tenant attended the hearing?': 'tenant_attended_hearing',
    'Did the decision state the landlord was a not-for-profit landlord (e.g. Toronto Community Housing)?': 'landlord_nonprofit',
    'Did the decision state the tenant was collecting a subsidy?': 'tenant_collecting_subsidy',
    'What was the outcome of the case?': 'case_outcome',
    'What was the length of the tenancy, or in other words, how long had the tenants lived at the residence in question? ': 'tenancy_length',
    'What was the monthly rent?': 'monthly_rent',
    'What was the amount of the rental deposit? ': 'rental_deposit',
    'If any rent increases occurred, what was the rent after the increase(s)?': 'rent_after_increase',
    'If any rent increases occurred, when did the rent increase(s) come into effect? ': 'rent_increase_effect_date',
    'What was the total amount of arrears?': 'total_arrears',
    'Over how many months did the arrears accumulate? ': 'arrears_duration',
    'If the tenant made a payment on the arrears after the eviction notice was served and/or prior to the hearing, what was the amount of the payment? ': 'arrears_payment_amount',
    'Did the decision mention a history of arrears by the tenant separate from the arrears in the current claim (more than one period of arrears, recurrently coming in and out of arrears, arrears with previous landlord, etc.)?': 'tenant_arrears_history_mentioned',
    'If the tenant had a history of arrears, did the decision mention a history of the tenant making payments on those arrears (separate from any payments made in response to the present eviction notice/hearing)?': 'tenant_arrears_payment_history_mentioned',
    'How frequently were rent payments made late?': 'rent_payments_late_frequency',
    'Did the member find the tenant had or seemed to have the ability to pay rent, but chose not do so?': 'tenant_ability_to_pay_rent',
    'What were the specific mental, medical, or physical conditions of the tenant, if any? ': 'tenant_conditions',
    'Did the decision state that the tenant had children living with them?': 'tenant_children_present',
    'How many total children did the tenant have living with them? ': 'total_children',
    'How many total children aged 17 or younger did the tenant have living with them?': 'children_17_or_younger',
    'How many total children aged 13 or younger did the tenant have living with them? ': 'children_13_or_younger',
    'How many total children aged 4 or younger did the tenant have living with them?': 'children_4_or_younger',
    'Did the decision state any of the children had mental, medical or physical conditions?': 'children_conditions_mentioned',
    'If yes to the previous question, did the decision state these conditions would make moving particularly burdensome?': 'conditions_making_moving_burdensome',
    'Was the tenant employed at the time of the hearing?': 'tenant_employed',
    'If the tenant was not employed, did the decision state the tenant was receiving any form of government assistance (e.g. OW, childcare benefits, ODSP, OSAP)?': 'tenant_government_assistance',
    'If the tenant was employed, did the decision state any doubts about the stability of employment e.g. lack of guaranteed hours, contract work, etc.?': 'employment_stability_doubts',
    'Did the member find the tenant had sufficient income to pay rent?': 'sufficient_income_to_pay_rent',
    'What was the total income of the tenant’s household? ': 'total_household_income',
    'Did the decision mention the tenant lost their job leading up to or during the period of the hearing?': 'tenant_job_loss_mentioned',
    'Did the decision mention any other extenuating circumstances experienced by the tenant leading up to or during the period of the claim (e.g. hospitalization, death in the family, etc.)?': 'tenant_extenuating_circumstances',
    'Did the tenant propose a payment plan?': 'tenant_proposed_payment_plan',
    'If the tenant did propose a payment plan, did the member accept the proposed payment plan?': 'accepted_proposed_payment_plan',
    'If a payment plan was ordered, what was the length of the payment plan? ': 'payment_plan_length',
    'Did the decision mention the tenant’s difficulty finding alternative housing for any reason e.g.physical limitations, reliance on social assistance, etc.?': 'tenant_difficulty_finding_housing',
    'If yes to the previous question, which of the following were applicable to the tenant?': 'applicable_difficulty_reasons',
    'Did the decision state the tenant was given prior notice for the eviction?': 'tenant_prior_notice_given',
    'If the tenant was given prior notice for the eviction, how much notice was given?': 'prior_notice_duration',
    'Did the decisions state postponement would result in the tenant accruing additional arrears?': 'postponement_additional_arrears',
    'Which other specific applications of the landlord or the tenant were mentioned?': 'mentioned_applications',
    'Did the decision mention the validity of an N4 eviction notice?': 'validity_of_N4_notice_mentioned',
    'Were there detail(s) in the decision not captured by this questionnaire that should be included?': 'additional_details_in_decision',
    'Exec Review': 'executive_review',
    'Review Status': 'review_status'
}

gold_data = gold_data.rename(columns = new_names)
# sorting by file_number -- so that ordering of the new data annotations can match this and be more easily compared
gold_data = gold_data.sort_values(by = ['file_number_gold'], ascending = True).reset_index(drop = True)
gold_data.columns

Index(['file_number_gold', 'file_number_gold_cleaned', 'raw_file_text',
       'raw_file_name', 'timestamp', 'email_address', 'hearing_date',
       'decision_date', 'adjudicating_member', 'ltb_location',
       'landlord_represented', 'landlord_attended_hearing',
       'tenant_represented', 'tenant_attended_hearing', 'landlord_nonprofit',
       'tenant_collecting_subsidy', 'case_outcome', 'tenancy_length',
       'monthly_rent', 'rental_deposit', 'rent_after_increase',
       'rent_increase_effect_date', 'total_arrears', 'arrears_duration',
       'arrears_payment_amount', 'tenant_arrears_history_mentioned',
       'tenant_arrears_payment_history_mentioned',
       'rent_payments_late_frequency', 'tenant_ability_to_pay_rent',
       'tenant_conditions', 'tenant_children_present', 'total_children',
       'children_17_or_younger', 'children_13_or_younger',
       'children_4_or_younger', 'children_conditions_mentioned',
       'conditions_making_moving_burdensome', 'tenant_employed',

In [268]:
import re

def general_cleaning(raw_file_str: str):
    # gets rid of tabs, non-breaking spaces, leading/trailing whitespace, removes empty lines, and "\xa0"
    generally_cleaned_str = [line.replace("\t", " ").replace("\xa0", "").strip() for line in raw_file_str.split('\n') if line.strip() != '']
    return generally_cleaned_str

def remove_whitespace_and_underscores(string):
    # Remove consecutive whitespace
    string = re.sub(r'\s+', ' ', string)

    # Remove more than three consecutive underscores
    string = re.sub(r'_+', '', string)

    return string.strip()

def separate_file_sections(text_list):
    metadata_list = []
    content_list = []

    is_metadata = True
    is_content = False

    for line in text_list:
        if line.strip() == 'Metadata:':
            is_metadata = True
            is_content = False
        elif line.strip() == 'Content:':
            is_metadata = False
            is_content = True
        elif is_metadata:
            metadata_list.append(remove_whitespace_and_underscores(line))
        elif is_content:
            content_list.append(remove_whitespace_and_underscores(line))

    return metadata_list, content_list

def merge_numerical_entries(strings_list):
    """
    Turns something like
        [..., '3.',
        'The tenant took occupancy of the rental unit in or about the beginning of December 2016.', ...]
    into
        [..., '3. The tenant took occupancy of the rental unit in or about the beginning of December 2016.', ...]
    
    """
    for i in range(len(strings_list) - 2, -1, -1):
        if re.fullmatch(r'\d+\.', strings_list[i]):
            strings_list[i] += ' ' + strings_list[i + 1]
            del strings_list[i + 1]
    return strings_list

def move_trailing_numbers(strings_list):
    """
    Turns something like
        [..., 'Credibility of the Parties 4.',
        'The Landlord said about two to three months ago he ...', ...]
    into
        [..., 'Credibility of the Parties',
        '4. The Landlord said about two to three months ago he...', ...]
    
    """
    for i in range(len(strings_list) - 1, -1, -1):
        match = re.search(r'\s+(\d{2}\.)$', strings_list[i])
        if match:
            number = match.group(1)
            strings_list[i] = re.sub(r'\s+\d{1,2}\.$', '', strings_list[i])
            strings_list[i + 1] = number + ' ' + strings_list[i + 1]
    return strings_list

import re

def remove_end_tag_and_restructure(metadata_list: list):

    cleaned_str = " ".join(metadata_list)

    # this doesn't add any meaning to the case details we need to extract, and instead just adds noise to the extraction process + adds extra unnecessary tokens
    if cleaned_str.find("If you have any questions about this order") > (len(cleaned_str) - 500):
        cleaned_str = cleaned_str[: cleaned_str.find("If you have any questions about this order")].strip() # ending tag removed
    
    # otherwise just do everything else
    cleaned_str = cleaned_str.replace(". ", ".\n")
    # cleaned_str = cleaned_str.replace(". ", ".\n") # deprecated by regex approach
    # trimmed_list = [line.strip() for line in re.split(r'(?<!\d)\. ', cleaned_str) if line.strip() != ''] # deprecated by regex approach
    cleaned_str = re.sub(r'(?<!\d)\. ', "\n", cleaned_str)
    trimmed_list = [line.strip() for line in cleaned_str.split('\n') if line.strip() != '']
    trimmed_list = merge_numerical_entries(trimmed_list)
    trimmed_list = move_trailing_numbers(trimmed_list)
    return trimmed_list

# file_name = "CEL-74519-18.txt"
# # row of this particular case
# case_file_ind = silver_data.loc[silver_data['raw_file_name'] == file_name].index.tolist()[0]
# test_text = silver_data.loc[206, "raw_file_text"]#.item()

# metadata, content = separate_file_sections(general_cleaning(test_text))
# remove_end_tag_and_restructure(content)

In [269]:
import time
import numpy as np
from collections import deque

start_time = time.time()

# Initialize a deque to store the latest 100 iteration times
time_deque = deque(maxlen = 500)

cases_contents = []
cases_metadata = []
full_cleaned = []

raw_files = gold_data['raw_file_text'].tolist()
for index, raw_file in enumerate(raw_files):
    iteration_start_time = time.time()
    better_file = general_cleaning(raw_file)
    try:
        metadata_list, content_list = separate_file_sections(better_file)
        full_cleaned.append(better_file)
        # cases_metadata.append(remove_end_tag_and_restructure(metadata_list)) # removing a bit more text if possible
        cases_metadata.append(metadata_list) # removing a bit more text if possible
        cases_contents.append(remove_end_tag_and_restructure(content_list))

        # Save the end time of this iteration and push it into the deque
        iteration_end_time = time.time()
        time_deque.append(iteration_end_time - iteration_start_time)

        # progress tracker
        average_time_per_file = np.mean(time_deque)
        files_left = len(raw_files) - (index + 1)
        estimated_time_left = files_left * average_time_per_file

        print(f"Files processed: {index + 1} of {len(raw_files)}, Estimated time remaining: {time.strftime('%H:%M:%S', time.gmtime(estimated_time_left))}", end='\r')
    except Exception as any_error:
        print(f"{any_error} with file at Df row: ", index)

gold_data['full_cleaned'] = full_cleaned
gold_data['metadata'] = cases_metadata
gold_data['content'] = cases_contents
gold_data.head()

Files processed: 679 of 679, Estimated time remaining: 00:00:00

Unnamed: 0,file_number_gold,file_number,raw_file_text,raw_file_name,timestamp,email_address,hearing_date,decision_date,adjudicating_member,ltb_location,...,prior_notice_duration,postponement_additional_arrears,mentioned_applications,validity_of_N4_notice_mentioned,additional_details_in_decision,executive_review,review_status,full_cleaned,metadata,content
0,SWL-17348-18,SWL-17348-18,Metadata:\nDate:\t2018-07-06\nFile number:\t\n...,SWL-17348-18.txt,11/26/2020 13:40:11,johnnymetzger6@gmail.com,07/05/2018,07/06/2018,Kevin Lundy,London,...,Not stated,No,L1: Application to Evict a Tenant for Non-paym...,No,,AW,Complete,"[Metadata:, Date: 2018-07-06, File number:, SW...","[Date: 2018-07-06, File number:, SWL-17348-18,...",[Order under Section 69 Residential Tenancies ...
1,TEL-79722-17,TEL-79722-17,Metadata:\nDate:\t2017-05-26\nFile number:\t\n...,TEL-79722-17.txt,1/31/2021 19:55:04,griffin.murphy@mail.utoronto.ca,05/19/2017,05/26/2017,Laura Hartslief,Toronto,...,Not stated,Yes,L1: Application to Evict a Tenant for Non-paym...,No,"Tenant’s conduct (racial slurs, aggressive beh...",,,"[Metadata:, Date: 2017-05-26, File number:, TE...","[Date: 2017-05-26, File number:, TEL-79722-17,...",[Order under Section 69 Residential Tenancies ...
2,TEL-80773-17,TEL-80773-17,Metadata:\nDate:\t2017-07-05\nFile number:\t\n...,TEL-80773-17.txt,1/31/2021 21:08:55,griffin.murphy@mail.utoronto.ca,06/30/2017,07/05/2017,Ruth Carey,Whitby,...,Not stated,Yes,No other specific applications were mentioned,No,,,,"[Metadata:, Date: 2017-07-05, File number:, TE...","[Date: 2017-07-05, File number:, TEL-80773-17,...",[Order under Section 69 Residential Tenancies ...
3,TEL-81359-17-AM,TEL-81359-17-AM,Metadata:\nDate:\t2017-07-18\nFile number:\t\n...,TEL-81359-17-AM.txt,1/31/2021 21:21:33,griffin.murphy@mail.utoronto.ca,07/07/2017,07/18/2017,Shelby Whittick,Toronto,...,Not stated,Yes,L1: Application to Evict a Tenant for Non-paym...,No,,,,"[Metadata:, Date: 2017-07-18, File number:, TE...","[Date: 2017-07-18, File number:, TEL-81359-17-...",[Order under Section 69 Residential Tenancies ...
4,TEL-81405-17,TEL-81405-17,Metadata:\nDate:\t2017-08-17\nFile number:\t\n...,TEL-81405-17.txt,1/31/2021 21:32:22,griffin.murphy@mail.utoronto.ca,08/15/2017,08/17/2017,Laura Hartslief,Lindsay,...,Not stated,Yes,No other specific applications were mentioned,No,,,,"[Metadata:, Date: 2017-08-17, File number:, TE...","[Date: 2017-08-17, File number:, TEL-81405-17,...",[Order under Section 69 Residential Tenancies ...


In [13]:
gold_data.loc[0, 'raw_file_text'][700:850]

'nant remained in the unit after the termination\ndate (the ‘L2 Application’).\n\xa0\nThese applications were heard in [CITY] on\nJuly 5, 2018.\xa0 Only the Land'

In [49]:
# import spacy

# nlp = spacy.load("en_core_web_sm") # loading this outside of the function saves ~2s per function call

# def find_dates_in_string(string, model=nlp):
#     extracted_dates = []
#     # for string in string_list:
#     doc = nlp(string)
#     for entity in doc.ents:
#         if entity.label_ == "DATE":
#             extracted_dates.append(entity.text)
    
#     return list(set(extracted_dates))
#     # pattern = r"(?i)(\b\w+ \d{1,2}, \d{4}\b)"
#     # valid_dates = re.findall(pattern, ", ".join(extracted_dates))
#     # return valid_dates

from datetime import datetime

def order_dates(dates_list):
    sorted_dates = sorted(dates_list, key=lambda x: datetime.strptime(x, '%m/%d/%Y'))
    return sorted_dates

from dateutil.parser import parse

def convert_to_datetime(date_str):
    # Parse date using dateutil.parser.parse
    dt = parse(date_str)
    
    # Format date with strftime in the format 'MM/DD/YYYY'
    return dt.strftime('%m/%d/%Y')

gold_data['hearing_date'] = gold_data['hearing_date'].apply(lambda x: convert_to_datetime(x))
gold_data['decision_date'] = gold_data['decision_date'].apply(lambda x: convert_to_datetime(x))
gold_data.loc[0, 'hearing_date']

2023-05-30 14:30:19.145979: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


'07/05/2018'

In [114]:
from datetime import datetime

def convert_date_to_words(date_str):
    date = datetime.strptime(date_str, '%m/%d/%Y')
    formatted_date = date.strftime('%B %d, %Y')
    return formatted_date

convert_date_to_words(gold_data.loc[3, 'hearing_date'])

'July 07, 2017'

In [53]:
import re

def extract_dates_regex(string):
    date_pattern = r'\b\w+ \d{1,2}, \d{4}\b'
    dates = re.findall(date_pattern, string)
    return list(set(dates))

In [32]:
from collections import Counter

def count_words(string):
    words = string.split()
    word_counts = Counter(words)
    return word_counts
    # return dict(word_counts)

def sort_dict_by_values(dictionary):
    sorted_dict = dict(sorted(dictionary.items(), key=lambda item: item[1]))
    return sorted_dict

In [67]:
import re

def remove_punctuation(word):
    cleaned_word = re.sub(r'^\W+|\W+$', '', word)
    return cleaned_word

In [249]:
import math

def cosine_similarity(dict1, dict2):
    common_words = set(dict1.keys()).intersection(set(dict2.keys()))
    common_words = set(dict1).intersection(set(dict2))

    dot_product = sum(dict1[word] * dict2[word] for word in common_words)
    magnitude1 = math.sqrt(sum(dict1[word]**2 for word in dict1))
    magnitude2 = math.sqrt(sum(dict2[word]**2 for word in dict2))

    # if somehow this ever happens, return 0 instead of throwing an error
    if magnitude1 == 0 or magnitude2 == 0:
        return 0

    similarity = dot_product / (magnitude1 * magnitude2)
    return similarity

# Example usage
case_file1 = "This is the first case file."
case_file2 = "This is the seconds case file."

words_counts1 = count_words(case_file1.lower())#.most_common(20)
words_counts2 = count_words(case_file2.lower())#.most_common(20)

similarity = cosine_similarity(words_counts1, words_counts2)
print(f"Similarity: {similarity}")


Similarity: 0.8333333333333335


# DONT DELETE THIS

In [261]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# You need to download the set of stop words the first time
nltk.download('punkt')
nltk.download('stopwords')

stop_words = set(stopwords.words('english'))
def remove_stopwords(text, stopwords = stopwords):
    word_tokens = word_tokenize(text)

    filtered_text = [word for word in word_tokens if word.casefold() not in stop_words]
    return " ".join(filtered_text)

text = "This is a sample sentence, showing off the stop words filtration."

print(remove_stopwords(text))

sample sentence , showing stop words filtration .


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/kmaurinjones/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/kmaurinjones/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [280]:
test_row = 0
test_case_file_str = " ".join((gold_data.loc[test_row, 'metadata'] + gold_data.loc[test_row, 'content']))#.lower()
# test_case_file_str = " ".join(gold_data.loc[test_row, 'full_cleaned'])#.lower()
print(len(test_case_file_str))
print(len(remove_stopwords(test_case_file_str)))

6435
4625


In [349]:
import numpy as np

def get_word_dist(case_file_str: str, date_str: str, proximity: int = 100, num_words = 30):

    # hearing_date_words = convert_date_to_words(date_str) # retrieved dates from regex expression
    hdate_idx = case_file_str.find(date_str) # finds date in case file, gets its starting index
    if hdate_idx == -1:
        return None
    else:
        hdate_subset = case_file_str[hdate_idx - proximity : hdate_idx + proximity] # gets case file text within 200 (before and after) hearing date mention
        words_counts = count_words(hdate_subset.lower()).most_common(num_words)
        cleaned_word_counts = [(remove_punctuation(word), count) for word, count in words_counts]
        return dict(cleaned_word_counts)

def get_hearing_date_cssim(case_file_str: str):
    found_dates = extract_dates_regex(case_file_str)

    # print(f"GOLD DATE: {gold_data.loc[test_row, 'hearing_date']}\n")
    # print(found_dates)

    best_similarity = -np.inf # setting similarity to nothing to start
    best_date_candidate = None

    for date in found_dates:
        word_dist = get_word_dist(case_file_str = case_file_str, date_str = date, proximity = 100)
        csim = cosine_similarity(word_dist, dict(master_hdate_dist))
        if csim > best_similarity:
            best_similarity = csim
            best_date_candidate = date

    return best_date_candidate

test_row = 21
test_case_file_str_1 = " ".join(gold_data.loc[test_row, 'full_cleaned'])
print(convert_to_datetime(get_hearing_date_cssim(test_case_file_str_1)))
print()
test_case_file_str_2 = gold_data.loc[test_row, 'raw_file_text']
print(convert_to_datetime(get_hearing_date_cssim(test_case_file_str_2)))
print()

03/03/2020

03/03/2020



In [283]:
# test_row = 10
master_hdate_dist = Counter()
for row in gold_data.index:
    # test_case_file_str = gold_data.loc[row, 'raw_file_text'].lower()
    test_case_file_str = " ".join(gold_data.loc[row, 'full_cleaned']).lower()
    # test_case_file_str = remove_stopwords(test_case_file_str)
    # hdate = gold_data.loc[row, 'decision_date']
    hdate = gold_data.loc[row, 'hearing_date']
    hdate_words = convert_date_to_words(hdate).lower()
    for hdate_word in [hdate_words, hdate_words.replace(" 0", " ")]: # sometimes they put the 0 in front of the day, sometimes not -- very annoying but need this to be robust
        if hdate_word in test_case_file_str: # try with and without "0" padding
            # print(hdate_word)
            dist = get_word_dist(case_file_str = test_case_file_str, date_str = hdate_word, proximity = 100, num_words = 30)
            # print(dist)
            for word, count in dist.items():
                if word in master_hdate_dist:
                    master_hdate_dist[word] += count
                else:
                    master_hdate_dist[word] = count

master_hdate_dist = master_hdate_dist.most_common(30)
master_hdate_dist

[('the', 5158),
 ('in', 1315),
 ('on', 1185),
 ('tenant', 1185),
 ('was', 1103),
 ('heard', 1036),
 ('application', 924),
 ('this', 896),
 ('attended', 893),
 ('and', 892),
 ('hearing', 813),
 ('2018', 703),
 ('landlord’s', 618),
 ('landlord', 586),
 ('representative', 570),
 ('toronto', 543),
 ('tenants', 519),
 ('rent', 472),
 ('that', 430),
 ('not', 419),
 ('pay', 374),
 ('did', 365),
 ('only', 290),
 ('date', 261),
 ('termination', 258),
 ('determinations', 252),
 ('after', 246),
 ('unit', 245),
 ('of', 245),
 ('owes', 235)]

In [215]:
gold_data.loc[0, 'raw_file_text']
convert_date_to_words(gold_data.loc[0, 'hearing_date'])

'July 05, 2018'

In [239]:
test_row = 10
test_case_file_str = gold_data.loc[test_row, 'raw_file_text']
get_hearing_date_cssim(test_case_file_str)

GOLD DATE: 03/22/2018



'March 22, 2018'

In [351]:
# about 13 per second
import time

new_data = {}
new_data['hearing_date_raw_text'] = []
new_data['hearing_date_cleaned_text'] = []

start_time = time.time()

# Initialize a deque to store the latest 100 iteration times
time_deque = deque(maxlen = 500)

for index, row in enumerate(gold_data.itertuples()):

    # Save the start time of this iteration
    iteration_start_time = time.time()

    try:
        test_case_file_str = gold_data.loc[row.Index, 'raw_file_text']
        found_date = get_hearing_date_cssim(test_case_file_str)
        if found_date:
            found_date = convert_to_datetime(found_date)

        new_data['hearing_date_raw_text'].append(found_date) # this way it'll either convert it or add the null value

        test_case_file_str = " ".join(gold_data.loc[row.Index, 'full_cleaned'])
        found_date = get_hearing_date_cssim(test_case_file_str)
        if found_date:
            found_date = convert_to_datetime(found_date)

        new_data['hearing_date_cleaned_text'].append(found_date)

    except Exception as any_error:
        print(f"{any_error} with file at Df row: ", row.Index)

    # Save the end time of this iteration and push it into the deque
    iteration_end_time = time.time()
    time_deque.append(iteration_end_time - iteration_start_time)

    # progress tracker
    average_time_per_row = np.mean(time_deque)
    rows_left = len(gold_data) - (index + 1)
    estimated_time_left = rows_left * average_time_per_row

    print("Files processed: ", index + 1, "of", len(gold_data),
          "Estimated time remaining: ", time.strftime('%H:%M:%S', time.gmtime(estimated_time_left)), end = '\r')

new_df = pd.DataFrame(new_data)
new_df

Files processed:  679 of 679 Estimated time remaining:  00:00:00

Unnamed: 0,hearing_date_raw_text,hearing_date_cleaned_text
0,07/07/2018,07/17/2018
1,06/06/2017,06/06/2017
2,07/16/2017,05/29/2017
3,07/07/2017,07/07/2017
4,08/15/2017,08/15/2017
...,...,...
674,11/20/2018,11/20/2018
675,11/23/2018,12/10/2018
676,12/04/2018,12/04/2018
677,11/02/2018,11/02/2018


In [357]:
from sklearn.metrics import accuracy_score

print(accuracy_score(gold_data['hearing_date'], new_df['hearing_date_raw_text']))
print(accuracy_score(gold_data['hearing_date'], new_df['hearing_date_cleaned_text']))
for gold, pred in zip(gold_data['hearing_date'], new_df['hearing_date_cleaned_text']):
    if gold != pred:
        print(gold, pred)

0.5625920471281296
0.7422680412371134
07/05/2018 07/17/2018
05/19/2017 06/06/2017
06/30/2017 05/29/2017
01/19/2018 02/02/2018
06/01/2018 03/23/2018
08/28/2019 08/31/2018
07/25/2018 08/11/2018
01/22/2020 04/16/2019
01/17/2018 12/01/2017
01/05/2017 01/29/2017
02/02/2017 02/14/2017
03/07/2017 04/21/2017
06/16/2017 05/05/2017
05/15/2017 08/03/2017
07/11/2017 08/06/2017
07/21/2017 10/22/2016
11/27/2017 09/18/2017
10/24/2017 11/06/2017
01/25/2018 11/02/2017
01/23/2018 12/19/2017
02/27/2018 04/29/2017
02/08/2018 11/30/2017
02/08/2018 04/01/2018
06/15/2018 03/07/2018
03/15/2018 01/17/2018
04/27/2018 05/14/2018
03/01/2018 03/16/2018
06/26/2018 07/01/2018
07/24/2018 06/22/2018
07/27/2018 06/20/2018
11/02/2018 12/08/2018
09/11/2018 10/05/2018
09/21/2018 10/08/2018
01/18/2019 08/19/2011
01/23/2019 09/07/2018
03/12/2019 11/21/2018
11/15/2016 01/20/2017
02/01/2017 02/18/2017
12/12/2017 08/01/2017
05/16/2018 05/18/2018
06/20/2017 07/03/2017
06/23/2017 06/28/2017
02/14/2018 01/02/2018
05/30/2018 02/26

# Silver Data
- Only 678 of 702 case files match

## Creating `silver_data` df from `gold_data` raw text

In [5]:
silver_data = gold_data.copy()
silver_data = silver_data.drop(columns = [col for col in silver_data.columns if col not in ['raw_file_name', 'raw_file_text']])
silver_data

Unnamed: 0,raw_file_text,raw_file_name
0,Metadata:\nDate:\t2018-07-06\nFile number:\t\n...,SWL-17348-18.txt
1,Metadata:\nDate:\t2017-05-26\nFile number:\t\n...,TEL-79722-17.txt
2,Metadata:\nDate:\t2017-07-05\nFile number:\t\n...,TEL-80773-17.txt
3,Metadata:\nDate:\t2017-07-18\nFile number:\t\n...,TEL-81359-17-AM.txt
4,Metadata:\nDate:\t2017-08-17\nFile number:\t\n...,TEL-81405-17.txt
...,...,...
674,Metadata:\nDate:\t2018-11-23\nFile number:\t\n...,TSL-99691-18.txt
675,Metadata:\nDate:\t2018-11-29\nFile number:\t\n...,TSL-99824-18.txt
676,Metadata:\nDate:\t2018-12-12\nFile number:\t\n...,TSL-99900-18.txt
677,Metadata:\nDate:\t2018-11-20\nFile number:\t\n...,TSL-99965-18.txt


In [6]:
gold_data = gold_data.rename(columns = {'file_number_gold_cleaned': 'file_number',
                                        'board_location': 'ltb_location'})
gold_data.columns.tolist()

['file_number_gold',
 'file_number',
 'raw_file_text',
 'raw_file_name',
 'timestamp',
 'email_address',
 'hearing_date',
 'decision_date',
 'adjudicating_member',
 'ltb_location',
 'landlord_represented',
 'landlord_attended_hearing',
 'tenant_represented',
 'tenant_attended_hearing',
 'landlord_nonprofit',
 'tenant_collecting_subsidy',
 'case_outcome',
 'tenancy_length',
 'monthly_rent',
 'rental_deposit',
 'rent_after_increase',
 'rent_increase_effect_date',
 'total_arrears',
 'arrears_duration',
 'arrears_payment_amount',
 'tenant_arrears_history_mentioned',
 'tenant_arrears_payment_history_mentioned',
 'rent_payments_late_frequency',
 'tenant_ability_to_pay_rent',
 'tenant_conditions',
 'tenant_children_present',
 'total_children',
 'children_17_or_younger',
 'children_13_or_younger',
 'children_4_or_younger',
 'children_conditions_mentioned',
 'conditions_making_moving_burdensome',
 'tenant_employed',
 'tenant_government_assistance',
 'employment_stability_doubts',
 'sufficient_i

# `get_nulls()`
- for checking df after each addition to it

In [7]:
def get_nulls(df, col, return_index = False):
    # returns a list of the indices of null values in a column of a dataframe
    null_rows = silver_data[silver_data[col].isnull()] # df of all rows with null ltb_location
    nulls_inds = null_rows.index.tolist()

    if return_index:
        return nulls_inds
    else:
        return null_rows
    
get_nulls(silver_data, 'raw_file_text', return_index = False)

Unnamed: 0,raw_file_text,raw_file_name


# Get `hearing_date`