# Finetuning Flant5 model to find metadata section of case file

## Tasks:
1. Get raw case file text
2. Do General cleaning of case file (remove encoding errors, extra lines, etc)
3. Remove "Metadata:" and "Content:" markers
    - only from training data
4. Add "metadata start" and "metadata end" markers to clearly indicate metadata boundaries
    - not sure if this is necessary
5. Create csv of training data -- pairs of (full case file, correctly identified metadata section)
6. Train model on these pairs
    - When feeding data to model, preprocess accordingly, one step of which should be to clip the entire length of the case file if it exceeds the max length of the model, because, since the metadata is always first, it shouldn't affect the model's ability to identify the metadata section
    - Then with the identified metadata section, compare that to the actual metadata section and calculate accuracy between two sections (pred vs actual)

In [1]:
import os
import pandas as pd
import time
import numpy as np
from collections import deque

# Loading in Raw Data
- assume something like raw text or a txt file will be loading into the system

In [2]:
# # formatted_cases_path = "/Users/kmaurinjones/Desktop/School/UBC/UBC_Coursework/capstone/Allard_A_Capstone/scraping/45k_formatted_cases/"
# folder_path = "./raw_case_files/"

# # def create_master_dictionary(directory):
# master_dict = {}
# master_dict['raw_file_name'] = []
# master_dict['raw_file_text'] = []

# # Iterate over .txt files in the folder
# for file_name in os.listdir(folder_path):
#     file_path = os.path.join(folder_path, file_name)
    
#     # Check if the file is a .txt file
#     if os.path.isfile(file_path) and file_name.endswith('.txt'):
        
#         # Read the contents of the .txt file
#         with open(file_path, 'r') as file:
#             contents = file.read()
            
#             # Append the contents to the list in the master_dict
#             master_dict['raw_file_name'].append(file_name)
#             master_dict['raw_file_text'].append(contents)
# # master_dict

In [4]:
master_raw_data = pd.read_csv("large_files/44k_cases_pproc_filenums.csv")
master_raw_data.head()

Unnamed: 0,year,raw_file_name,raw_file_text,full_cleaned,metadata,content,language,case_citation,file_number
0,2013,NOL-10723-12.txt,Metadata:\nDate:\t2013-01-08\nFile number:\t\n...,"['Metadata:', 'Date: 2013-01-08', 'File number...","['Date: 2013-01-08', 'File number:', 'NOL-1072...","['Order under Section 69', 'Residential Tenanc...",English,"NOL-10723-12 (Re), 2013 CanLII 5182 (ON LTB)",NOL-10723-12
1,2013,TNL-43964-13.txt,Metadata:\nDate:\t2013-05-02\nFile number:\t\n...,"['Metadata:', 'Date: 2013-05-02', 'File number...","['Date: 2013-05-02', 'File number:', 'TNL-4396...","[""Order under section 69 Residential Tenancies...",English,"TNL-43964-13 (Re), 2013 CanLII 36866 (ON LTB)",TNL-43964-13
2,2013,TNL-45470-13.txt,Metadata:\nDate:\t2013-06-17\nFile number:\t\n...,"['Metadata:', 'Date: 2013-06-17', 'File number...","['Date: 2013-06-17', 'File number:', 'TNL-4547...","[""Order under Section 69 Residential Tenancies...",English,"TNL-45470-13 (Re), 2013 CanLII 44492 (ON LTB)",TNL-45470-13
3,2013,TEL-33159-13; \n TET-33272-13.txt,Metadata:\nDate:\t2013-02-25\nFile number:\t\n...,"['Metadata:', 'Date: 2013-02-25', 'File number...","['Date: 2013-02-25', 'File number:', 'TEL-3315...",['Order under sections 31 and 69 Residential T...,English,"TEL-33159-13 (Re), 2013 CanLII 50418 (ON LTB)",TEL-33159-13; TET-33272-13
4,2013,TNL-39747-12.txt,Metadata:\nDate:\t2013-02-07\nFile number:\t\n...,"['Metadata:', 'Date: 2013-02-07', 'File number...","['Date: 2013-02-07', 'File number:', 'TNL-3974...","[""Order under Section 68 Residential Tenancies...",English,"TNL-39747-12 (Re), 2013 CanLII 10834 (ON LTB)",TNL-39747-12


In [5]:
master_raw_data = master_raw_data.drop(columns = [col for col in master_raw_data.columns if col not in ["raw_file_name", "raw_file_text"]])
master_raw_data

Unnamed: 0,raw_file_name,raw_file_text
0,NOL-10723-12.txt,Metadata:\nDate:\t2013-01-08\nFile number:\t\n...
1,TNL-43964-13.txt,Metadata:\nDate:\t2013-05-02\nFile number:\t\n...
2,TNL-45470-13.txt,Metadata:\nDate:\t2013-06-17\nFile number:\t\n...
3,TEL-33159-13; \n TET-33272-13.txt,Metadata:\nDate:\t2013-02-25\nFile number:\t\n...
4,TNL-39747-12.txt,Metadata:\nDate:\t2013-02-07\nFile number:\t\n...
...,...,...
43693,TSL-72336-16.txt,Metadata:\nDate:\t2016-05-06\nFile number:\t\n...
43694,CET-61080-16; \n 1204.txt,Metadata:\nDate:\t2016-11-21\nFile number:\t\n...
43695,TET-68734-16.txt,Metadata:\nDate:\t2016-08-24\nFile number:\t\n...
43696,SWT-92676-16-RV.txt,Metadata:\nDate:\t2016-10-31\nFile number:\t\n...


# General Cleaning
- Take in raw case file text (long string) and:
    - remove encoding errors
    - remove extra lines
    - remove extra spaces
    - remove extra tabs
    - remove extra carriage returns

In [7]:
import re

def general_cleaning(raw_file_str: str):
    # gets rid of tabs, non-breaking spaces, leading/trailing whitespace, removes empty lines, and "\xa0"
    generally_cleaned_str = [line.replace("\t", " ").replace("\xa0", "").strip() for line in raw_file_str.split('\n') if line.strip() != '']
    return generally_cleaned_str

def remove_whitespace_and_underscores(string):
    # Remove consecutive whitespace
    string = re.sub(r'\s+', ' ', string)

    # Remove more than three consecutive underscores
    string = re.sub(r'_+', '', string)

    return string.strip()

def separate_file_sections(text_list):
    metadata_list = []
    content_list = []

    is_metadata = True
    is_content = False

    for line in text_list:
        if line.strip() == 'Metadata:':
            is_metadata = True
            is_content = False
        elif line.strip() == 'Content:':
            is_metadata = False
            is_content = True
        elif is_metadata:
            metadata_list.append(remove_whitespace_and_underscores(line))
        elif is_content:
            content_list.append(remove_whitespace_and_underscores(line))

    return metadata_list, content_list

In [13]:
def merge_numerical_entries(strings_list):
    """
    Turns something like
        [..., '3.',
        'The tenant took occupancy of the rental unit in or about the beginning of December 2016.', ...]
    into
        [..., '3. The tenant took occupancy of the rental unit in or about the beginning of December 2016.', ...]
    
    """
    for i in range(len(strings_list) - 2, -1, -1):
        if re.fullmatch(r'\d+\.', strings_list[i]):
            strings_list[i] += ' ' + strings_list[i + 1]
            del strings_list[i + 1]
    return strings_list

def move_trailing_numbers(strings_list):
    """
    Turns something like
        [..., 'Credibility of the Parties 4.',
        'The Landlord said about two to three months ago he ...', ...]
    into
        [..., 'Credibility of the Parties',
        '4. The Landlord said about two to three months ago he...', ...]
    
    """
    for i in range(len(strings_list) - 1, -1, -1):
        match = re.search(r'\s+(\d{2}\.)$', strings_list[i])
        if match:
            number = match.group(1)
            strings_list[i] = re.sub(r'\s+\d{1,2}\.$', '', strings_list[i])
            strings_list[i + 1] = number + ' ' + strings_list[i + 1]
    return strings_list

def remove_end_tag_and_restructure(metadata_list: list):

    cleaned_str = " ".join(metadata_list)

    # this doesn't add any meaning to the case details we need to extract, and instead just adds noise to the extraction process + adds extra unnecessary tokens
    if cleaned_str.find("If you have any questions about this order") > (len(cleaned_str) - 500):
        cleaned_str = cleaned_str[: cleaned_str.find("If you have any questions about this order")].strip() # ending tag removed
    
    # otherwise just do everything else
    cleaned_str = cleaned_str.replace(". ", ".\n")
    cleaned_str = re.sub(r'(?<!\d)\. ', "\n", cleaned_str)
    trimmed_list = [line.strip() for line in cleaned_str.split('\n') if line.strip() != '']
    # trimmed_list = merge_numerical_entries(trimmed_list)
    # trimmed_list = move_trailing_numbers(trimmed_list)
    return trimmed_list

file_name = "CEL-74519-18.txt"
# row of this particular case
case_file_ind = master_raw_data.loc[master_raw_data['raw_file_name'] == file_name].index.tolist()[0]
test_text = master_raw_data.loc[206, "raw_file_text"]#.item()

metadata, content = separate_file_sections(general_cleaning(test_text))
remove_end_tag_and_restructure(content)

["Order under Section 78(11) Residential Tenancies Act, 2006 File Number: EAL-28249-12-SA PPMI (the 'Landlord') applied for an order to terminate the tenancy and evict JP and SO (the 'Tenants') because they failed to meet a condition specified in order EAL-22059-12 issued on March 27, 2012.",
 "The Landlord's application was resolved by order EAL-28249-12, issued on November 28, 2012.",
 'The Tenants filed a motion to set aside order EAL-28249-12.',
 'This motion was heard in Ottawa on January 3, 2013.',
 'The Landlord’s Legal Representative, AK and the Tenant SO attended the hearing.',
 'The parties mutually agreed to resolve all matters at issue in the application and requested an order on consent.',
 'I was satisfied that the parties understood the consequences of the joint submission.',
 'At the hearing, the parties agreed: 1.',
 'The set aside motion should be granted.',
 '2.',
 'The amount outstanding to January 31, 2012, inclusive of rent arrears and costs is $149.68.',
 'On con

In [14]:
import time
import numpy as np
from collections import deque

start_time = time.time()

# Initialize a deque to store the latest 100 iteration times
time_deque = deque(maxlen = 500)

cases_contents = []
cases_metadata = []
full_cleaned = []

raw_files = master_raw_data['raw_file_text'].tolist()
for index, raw_file in enumerate(raw_files):
    iteration_start_time = time.time()
    better_file = general_cleaning(raw_file)
    try:
        metadata_list, content_list = separate_file_sections(better_file)
        full_cleaned.append(better_file)
        cases_metadata.append(metadata_list) # removing a bit more text if possible

        # cases_contents.append(content_list)
        cases_contents.append(remove_end_tag_and_restructure(content_list))

        # Save the end time of this iteration and push it into the deque
        iteration_end_time = time.time()
        time_deque.append(iteration_end_time - iteration_start_time)

        # progress tracker
        average_time_per_file = np.mean(time_deque)
        files_left = len(raw_files) - (index + 1)
        estimated_time_left = files_left * average_time_per_file

        print(f"Files processed: {index + 1} of {len(raw_files)}, Estimated time remaining: {time.strftime('%H:%M:%S', time.gmtime(estimated_time_left))}", end='\r')
    except Exception as any_error:
        print(f"{any_error} with file at Df row: ", index)

master_raw_data['full_cleaned'] = full_cleaned
master_raw_data['metadata'] = cases_metadata
master_raw_data['content'] = cases_contents
master_raw_data.head()

Files processed: 43698 of 43698, Estimated time remaining: 00:00:00

Unnamed: 0,raw_file_name,raw_file_text,full_cleaned,metadata,content
0,NOL-10723-12.txt,Metadata:\nDate:\t2013-01-08\nFile number:\t\n...,"[Metadata:, Date: 2013-01-08, File number:, NO...","[Date: 2013-01-08, File number:, NOL-10723-12,...",[Order under Section 69 Residential Tenancies ...
1,TNL-43964-13.txt,Metadata:\nDate:\t2013-05-02\nFile number:\t\n...,"[Metadata:, Date: 2013-05-02, File number:, TN...","[Date: 2013-05-02, File number:, TNL-43964-13,...",[Order under section 69 Residential Tenancies ...
2,TNL-45470-13.txt,Metadata:\nDate:\t2013-06-17\nFile number:\t\n...,"[Metadata:, Date: 2013-06-17, File number:, TN...","[Date: 2013-06-17, File number:, TNL-45470-13,...",[Order under Section 69 Residential Tenancies ...
3,TEL-33159-13; \n TET-33272-13.txt,Metadata:\nDate:\t2013-02-25\nFile number:\t\n...,"[Metadata:, Date: 2013-02-25, File number:, TE...","[Date: 2013-02-25, File number:, TEL-33159-13;...",[Order under sections 31 and 69 Residential Te...
4,TNL-39747-12.txt,Metadata:\nDate:\t2013-02-07\nFile number:\t\n...,"[Metadata:, Date: 2013-02-07, File number:, TN...","[Date: 2013-02-07, File number:, TNL-39747-12,...",[Order under Section 68 Residential Tenancies ...


In [16]:
# master_raw_data.loc[0, "content"]

# Creating Training Data

In [17]:
import time
import numpy as np
from collections import deque

start_time = time.time()

# Initialize a deque to store the latest 500 iteration times
time_deque = deque(maxlen = 500)

for index, row in enumerate(master_raw_data.itertuples()):

    # Save the start time of this iteration
    iteration_start_time = time.time()

    # adding to 'case_citation' and 'file_number' columns
    try:
        # metadata_list, content_list = separate_file_sections(general_cleaning(master_raw_data.loc[row.Index, "raw_file_text"]))
        # master_raw_data.at[row.Index, 'case_citation'] = get_case_citation(metadata_list)
        # master_raw_data.at[row.Index, 'file_number'] = get_file_number(metadata_list)
        master_raw_data.at[row.Index, 'clean_str_metadata'] = " ".join(master_raw_data.loc[row.Index, "metadata"]).strip()
        master_raw_data.at[row.Index, 'clean_str_content'] = " ".join(master_raw_data.loc[row.Index, "content"]).strip()
        master_raw_data.at[row.Index, 'clean_str_full_file'] = master_raw_data.at[row.Index, 'clean_str_metadata'].strip() + " " + master_raw_data.at[row.Index, 'clean_str_content'].strip()
        
    except Exception as any_error:
        print(f"{any_error} with file at Df row: ", row.Index)

    # Save the end time of this iteration and push it into the deque
    iteration_end_time = time.time()
    time_deque.append(iteration_end_time - iteration_start_time)

    # progress tracker
    average_time_per_row = np.mean(time_deque)
    rows_left = len(master_raw_data) - (index + 1)
    estimated_time_left = rows_left * average_time_per_row

    print(f"Files processed: {index + 1} of {len(master_raw_data)}, Estimated time remaining: {time.strftime('%H:%M:%S', time.gmtime(estimated_time_left))}", end='\r')

master_raw_data.head()

Files processed: 43698 of 43698, Estimated time remaining: 00:00:00

Unnamed: 0,raw_file_name,raw_file_text,full_cleaned,metadata,content,clean_str_metadata,clean_str_content,clean_str_full_file
0,NOL-10723-12.txt,Metadata:\nDate:\t2013-01-08\nFile number:\t\n...,"[Metadata:, Date: 2013-01-08, File number:, NO...","[Date: 2013-01-08, File number:, NOL-10723-12,...",[Order under Section 69 Residential Tenancies ...,Date: 2013-01-08 File number: NOL-10723-12 Cit...,Order under Section 69 Residential Tenancies A...,Date: 2013-01-08 File number: NOL-10723-12 Cit...
1,TNL-43964-13.txt,Metadata:\nDate:\t2013-05-02\nFile number:\t\n...,"[Metadata:, Date: 2013-05-02, File number:, TN...","[Date: 2013-05-02, File number:, TNL-43964-13,...",[Order under section 69 Residential Tenancies ...,Date: 2013-05-02 File number: TNL-43964-13 Cit...,Order under section 69 Residential Tenancies A...,Date: 2013-05-02 File number: TNL-43964-13 Cit...
2,TNL-45470-13.txt,Metadata:\nDate:\t2013-06-17\nFile number:\t\n...,"[Metadata:, Date: 2013-06-17, File number:, TN...","[Date: 2013-06-17, File number:, TNL-45470-13,...",[Order under Section 69 Residential Tenancies ...,Date: 2013-06-17 File number: TNL-45470-13 Cit...,Order under Section 69 Residential Tenancies A...,Date: 2013-06-17 File number: TNL-45470-13 Cit...
3,TEL-33159-13; \n TET-33272-13.txt,Metadata:\nDate:\t2013-02-25\nFile number:\t\n...,"[Metadata:, Date: 2013-02-25, File number:, TE...","[Date: 2013-02-25, File number:, TEL-33159-13;...",[Order under sections 31 and 69 Residential Te...,Date: 2013-02-25 File number: TEL-33159-13; TE...,Order under sections 31 and 69 Residential Ten...,Date: 2013-02-25 File number: TEL-33159-13; TE...
4,TNL-39747-12.txt,Metadata:\nDate:\t2013-02-07\nFile number:\t\n...,"[Metadata:, Date: 2013-02-07, File number:, TN...","[Date: 2013-02-07, File number:, TNL-39747-12,...",[Order under Section 68 Residential Tenancies ...,Date: 2013-02-07 File number: TNL-39747-12 Cit...,Order under Section 68 Residential Tenancies A...,Date: 2013-02-07 File number: TNL-39747-12 Cit...


In [18]:
# writing smaller df to csv for easier loading in Colab

full_training_df = master_raw_data.drop(columns=['raw_file_text', 'metadata', 'content', 'full_cleaned'])
full_training_df.head()

Unnamed: 0,raw_file_name,clean_str_metadata,clean_str_content,clean_str_full_file
0,NOL-10723-12.txt,Date: 2013-01-08 File number: NOL-10723-12 Cit...,Order under Section 69 Residential Tenancies A...,Date: 2013-01-08 File number: NOL-10723-12 Cit...
1,TNL-43964-13.txt,Date: 2013-05-02 File number: TNL-43964-13 Cit...,Order under section 69 Residential Tenancies A...,Date: 2013-05-02 File number: TNL-43964-13 Cit...
2,TNL-45470-13.txt,Date: 2013-06-17 File number: TNL-45470-13 Cit...,Order under Section 69 Residential Tenancies A...,Date: 2013-06-17 File number: TNL-45470-13 Cit...
3,TEL-33159-13; \n TET-33272-13.txt,Date: 2013-02-25 File number: TEL-33159-13; TE...,Order under sections 31 and 69 Residential Ten...,Date: 2013-02-25 File number: TEL-33159-13; TE...
4,TNL-39747-12.txt,Date: 2013-02-07 File number: TNL-39747-12 Cit...,Order under Section 68 Residential Tenancies A...,Date: 2013-02-07 File number: TNL-39747-12 Cit...


In [23]:
train_split = 0.75
dev_split = 0.10
test_split = 0.15

train_rows = int(len(full_training_df) * train_split)
dev_rows = int(len(full_training_df) * dev_split)
test_rows = int(len(full_training_df) * test_split)

train_df = full_training_df.iloc[:train_rows, :]
dev_df = full_training_df.iloc[train_rows:train_rows + dev_rows, :]
test_df = full_training_df.iloc[train_rows + dev_rows:, :]

# making sure all rows are represented across the 3 sets
assert len(train_df) + len(dev_df) + len(test_df) == len(full_training_df)

In [24]:
train_df.to_csv("large_files/metadata_detection_train.csv", index = False)
dev_df.to_csv("large_files/metadata_detection_dev.csv", index = False)
test_df.to_csv("large_files/metadata_detection_test.csv", index = False)