In [None]:
import re
import os
from collections import defaultdict, Counter
import pandas as pd

In [None]:
def rule_based_parse_BCJ(path = None, doc = None):
    '''Given file path (text file) of negligence cases, finds static 
    information within the case (information that can be pattern matched)
    Expects a B.C.J. case format (British Columbia Judgments)
    
    The following fields are currently implemented:
    - Case Title
    - Judge Name
    - Registry
    - Year
    - Decision Length (in paragraphs)
    - Damages
    - Multiple Defendants
    - Plaintiff Wins
    
    Arguments: doc (String): The case in text format following the form used in the DOCX to TXT notebook
    Returns: case_parsed_data (list) of case_dict (Dictionary): List of Dictionaries with rule based parsable fields filled in
    '''
    if path:
        with open(path, encoding='utf-8') as document:
            document_data = document.read()
        document_data = document_data.split('End of Document\n') # Always split on 'End of Document\n'
    

    case_parsed_data = []
    for i in range(len(document_data)):
        case_dict = dict() 
        case = document_data[i]
        case = case.strip() # Make sure to strip!
        if len(case) == 0: # Skip empty lines
            continue
        
        lines = case.split('\n')
        if len(lines) < 2:
            print(case)
        case_title = lines[0]
        case_type = lines[1]

        if 'R. v.' in case_title or '(Re)' in case_title: # Skip crown cases, Skip (Re) cases
            continue
            
        # Skip client/solicitor cases (not same as plaintiff/defendant)
        regex_client_solicitor = re.search(r'(Between.*([C|c]lient[s]?).*([S|s]olicitor[s]?|[L|l]awyer[s]?))', case)
        if regex_client_solicitor:
            continue
        
        regex_solicitor_client = re.search(r'(Between.*([L|l]awyer[s]?|[S|s]olicitor[s]?).*([C|c]lient[s]?))', case)
        if regex_solicitor_client:
            continue
            
        # In some rare cases we have 'IN THE MATTER OF ..' (rather than 'Between ...') .. but it is following by the normal
        # plaintiff/defendant dynamic. Only skip cases if there is no mention of the following terms
        # (Can be cleaned up in future)
        key_words = ['appellant', 'respondent', 'claimant', 'petitioner', 'plaintiff', 'defendant',
        'appellants', 'respondents', 'claimants', 'petitioners', 'plaintiffs', 'defendants']
        regex_in_matter_of = re.search(r'IN THE MATTER OF .*\n\([0-9]+ paras.\)', case)
        if regex_in_matter_of:
            remove = True
            for key in key_words:
                if key in regex_in_matter_of.group(0).lower().strip():
                    remove = False
                    
            if remove:
                continue

        if 'British Columbia Judgments' in case_type: # Make sure we're dealing with a B.C.J. case
        
            # Fields that can be found via pattern matching
            if re.search('contributory negligence', case, re.IGNORECASE):
                contributory_negligence_raised = True
            else:
                contributory_negligence_raised = False
            case_number = re.search(r'\/P([0-9]+)\.txt', path).group(1)
            decision_len = re.search(r'\(([0-9]+) paras\.?\)', case) # e.g.) (100 paras.)
            registry = re.search(r'(Registry|Registries): ?([A-Za-z0-9 ]+)', case) # e.g.) Registry: Vancouver
            written_decision = True if int(decision_len.group(1)) > 1 else False
            if registry:
                registry = registry.group(2).strip()
            else:
                registry = re.search(r'([A-Za-z ]+) Registry No.', case) # Alt form e.g.) Vancouver Registory No. XXX
                if registry:
                    registry = registry.group(1).strip()
                else:
                    registry = re.search(r'([A-Za-z ]+) No. S[0-9]*', case)
                    if registry:
                        registry = registry.group(1).strip()
                    else:
                        print('WARNING: Registry could not be found (This shouldn\'t occur!)')
            # Fields that are always in the same place
            judge_name = lines[4].strip()
            case_title = lines[0].strip()
            # Extract year from case_title (in case we want to make visualizations, etc.)
            year = re.search(r'20[0-2][0-9]', case_title) # Limit regex to be from 2000 to 2029
            if year:
                year = year.group(0)
            else:
                # Rare case: Sometimes the title is too long. Rely on Heard date.
                year = re.search(r'Heard:.* ([2][0][0-2][0-9])', case)
                if year:
                    year = year.group(1)
                else:
                    print('WARNING: Year not found')
            case_dict['case_number'] = '%s of %s'%(i+1+((int(case_number)-1)*50), case_number)
            case_dict['case_title'] = case_title
            case_dict['year'] = year
            case_dict['registry'] = registry
            case_dict['judge'] = judge_name
            case_dict['decision_length'] = decision_len.group(1)
            case_dict['multiple_defendants'] = rule_based_multiple_defendants_parse(case)
            case_dict['contributory_negligence_raised'] = contributory_negligence_raised
            case_dict['written_decision'] = written_decision
            
            # TODO: Improve plaintiff_wins to take one case at a time.
            plaintiff_list = plaintiff_wins(path)
            if case_title in plaintiff_list:
                case_dict['plaintiff_wins'] = plaintiff_list[case_title]
            else:
                case_dict['plaintiff_wins'] = "NA"
                
            case_dict['damages'] = rule_based_damage_extraction(case)
                
        # don't add empty dictionaries (non BCJ cases) to list
        if case_dict != dict(): 
            case_parsed_data.append(case_dict)
    return case_parsed_data

In [None]:
def rule_based_multiple_defendants_parse(doc):
    ''' Work in progress. Subject to minor changes to Regex patterns.
    
    TODO:
        - Clarify solicitor/client cases with Lachlan
        - Clarify cases that say "IN MATTER OF ...", currently returning 'UNK' for these
    
    -----
    
    Given a case. Uses regex/pattern-matching to determine whether we have multiple defendants.
    For the most part the logic relies on whether the langauge used implies plurality or not.
    
    Arguments: doc (String): The case in text format following the form used in the DOCX to TXT notebook
    Returns: response (String, 'Y', 'N', or 'UNK')
    
    '''
    
    # Cases with (Re) in the title always have one person involved
    # May drop these cases depending on advice from Lachlan.
    if '(Re)' in doc.split('\n')[0]:
        return 'N'
    
    # Case 1)
    # Traditional/most common. Of form "Between A, B, C, Plaintiff(s), X, Y, Z Defendant(s)"
    # Can successfully cover ~98% of data
    regex_between_plaintiff_claimant = re.search(r'(Between.*([P|p]laintiff[s]?|[C|c]laimant[s]?|[A|a]ppellant[s]?|[P|p]etitioner[s]?).*([D|d]efendant[s]?|[R|r]espondent[s]?).*\n)', doc)
    
    # Match found
    if regex_between_plaintiff_claimant:
        text = regex_between_plaintiff_claimant.group(0).lower()
        if 'defendants' in text or 'respondents' in text:
            return 'Y'
        elif 'defendant' in text or 'respondent' in text:
            return 'N'
    
    # If not found, try other less common cases
    else:
        # Case 2)
        # Sometimes it does not mention the name of the second item. (Defendent/Respondent)
        # We can estimate if there are multiple based on the number of "," in the line (Covers all cases in initial data)
        regex_missing_defendent = re.search(r'(Between.*([P|p]laintiff[s]?|[C|c]laimant[s]?|[A|a]ppellant[s]?|[P|p]etitioner[s]?).*\n)', doc)
        if regex_missing_defendent:
            text = regex_missing_defendent.group(0).lower()
            if len(text.split(',')) > 5:
                return 'Y'
            else:
                return 'N'
            
        else:
            
            # Case 3A) solicitor-client
            # Some cases have a solicitor (lawyer) and a client
            # Currently assuming the second item is the defendant
            regex_solicitor_client = re.search(r'(Between.*([S|s]olicitor[s]?).*([C|c]lient[s]?))', doc)
            if regex_solicitor_client:
                text = regex_solicitor_client.group(0).lower()
                if 'clients' in text:
                    return 'Y'
                else:
                    return 'N'
            else:
                # Case 3B) client - solicitor
                regex_client_solicitor = re.search(r'(Between.*([C|c]lient[s]?).*([S|s]olicitor[s]?))', doc)
                if regex_client_solicitor:
                    text = regex_client_solicitor.group(0).lower()
                    if 'solicitors' in text:
                        return 'Y'
                    else:
                        return 'N'
                else:
                    return 'UNK'
        
        

In [None]:
def plaintiff_wins(path):
    '''This function will search the cases and returns a dictionary
    with case names as keys and boolean for value, True if the plaintiff
    wins the case and False if plaintiff looses'''
#     list_of_files = os.listdir(path)
    plaintiff_dict = {}

    with open(path,'r') as f:
        
        contents = f.read() 
        cases = contents.split("End of Document\n")
        for line in cases:
            lines = line.strip().split("\n")
            name = lines[0]
            # regex search for keyword HELD in cases, which determines if case was allowed or dismissed
            HELD = re.search(r'HELD.+', line)
            if HELD:
                matched = HELD.group(0)
                if "allowed" in matched or "favour" in matched or "awarded" in matched or "granted" in matched:
                    plaintiff_dict[name] = True
                if "dismissed" in matched:
                    plaintiff_dict[name] = False
            else:
                if line:
                    awarded =  re.search(r'award(.+)?.+?(plaintiff(.+)?)?', lines[-2])
                    #regex searches for pattern of plaintiff/defendant/applicant....entitled/have...costs
                    entiteled = re.search(r'(plaintiff|defendant.?|applicant)(.+)?(entitle(.)?(.+)?|have).+?cost(.+)?', lines[-2])
                    #regex searches for pattern of successful...(case)
                    successful = re.search(r'successful(.+)?.+?', lines[-2])
                    #regex searches for dismiss....
                    dismiss = re.search(r'(dismiss(.+)?.+)|(adjourned.+?)', lines[-2])
                    costs = re.search(r'costs.+?(award(.+)?|cause).+?', lines[-2])

                    if dismiss and "not dismissed" not in lines[-2]:
                        plaintiff_dict[name] = False
                    elif awarded:
                        plaintiff_dict[name] = True
                    elif entiteled:
                        plaintiff_dict[name] = True
                    elif successful:
                        plaintiff_dict[name] = True
                    elif costs:
                        plaintiff_dict[name] = True
                    else:
                        plaintiff_dict[name] = "OpenCase"

        
    return plaintiff_dict

In [None]:
def paragraph_tokenize(doc):
    ''' String of Entire Document and returns list of lists of paragraphs in document
    ---------
    Input: doc (str) - string of single legal case
    Return: docs_split(list) - list of lists of numbrered paragraphs per document'''
    
    doc_data = []
    lines = doc.split('\n')
    if not 'British Columbia Judgments' in lines[1]:
        return
    doc_data.append(lines[0])
    decision_length = re.search(r'\(([0-9|,]+) paras?\.?\)', doc).group(1)

    # split paragraphs on newline, paragraph number, two spaces
    pattern = r'[\W|\w]?(?=\n[0-9]{1,%s}[\xa0]{2})'%len(decision_length)
    paras_split = re.split(pattern, doc)

    paras = []
    for para in paras_split:   
        # make sure the paragraph starts with the correct characters
        para_start = re.match(r'^\n([0-9]{1,%s})[\xa0]{2}'%len(decision_length), para)
        if para_start:
            paras.append(para)
    doc_data.extend(paras)
    return doc_data

In [164]:
#regex_damages = r'[\w|-]* ?(?:damage|loss|capacity|cost).+?\$? ?[0-9][0-9|,|.]+[0-9]'
#regex_damages = r'(?:[\w|-]* ?){0,3}(?:damage|loss|capacity|cost).+?\$? ?[0-9][0-9|,|.]+[0-9]'
#regex_in_trust = r'(?:in-?trust|award).*?\$? ?[0-9][0-9|,|.]+[0-9]'

# Rule based dmg extraction REGEX patterns
regex_damages = r'(?![and])(?:[\w|-]* ?){0,2} ?(?:damage|loss|capacity|cost).+?\$? ?[0-9][0-9|,|.]+[0-9]'
regex_damages_2 = r'[^:] \$? ?[0-9][0-9|,|.]+[0-9] (?:for|representing)?[ \w\-+]+damages?'
regex_damages_3 = r'[^:] \$? ?[0-9][0-9|,|.]+[0-9] (?:for|representing)?[ \w\-+]+damages?(?:(?:for|representing)?.*?[;.\n])'
regex_future_care_loss = r'(?:future|past|in[-| ]?trust|award).*?(?:loss|costs?|income|care)?.*?\$? ?[0-9][0-9|,|.]+[0-9]'
regex_for_cost_of = r'\$? ?[0-9][0-9|,|.]+[0-9][\w ]*? cost .*?\.'

# Keywords to look in match for categorization
general_damage_keywords = [('general',), ('future', 'income', 'loss'), ('future', 'income'), ('future', 'wage', 'loss'), ('future', 'earning'), ('!past', 'earning', 'capacity'), ('future', 'capacity'), ('future', 'earning'), ('!past', 'loss', 'opportunity'), ('!past', 'loss', 'housekeep'), ('ei', 'benefit')]
special_damage_keywords = [('special',), ('trust',), ('past', 'income', 'loss'), ('past', 'wage'), ('past', 'earning'), ('past', 'income'), ('earning', 'capacity')]
aggravated_damage_keywords = [('aggravated',)]
non_pecuniary_damage_keywords = [('non', 'pecuniary')]
punitive_damage_keywords = [('punitive',)]
future_care_damage_keywords = [('future', 'care'), ('future', 'cost')]

def rule_based_damage_extraction(doc, min_score = 0.9, max_match_len_split = 10):
    '''Helper functino for rule_based_parse_BCJ
    
    Given a case, attempts to extract damages using regex patterns
    
    Arguments: doc (String): The case in text format following the form used in the DOCX to TXT notebook
    min_score (float): The minimum paragraph score to consider having a valid $ number
                       Paragraph has score 1 if its the last paragraph
                       Paragraph has score 0 if its the first paragraph
    max_match_len_split (int): The max amount of items that can appear in a regex match after splitting (no. words)
    
    Returns: damages (Dict): Contains any found damages
    
    '''
    damages = defaultdict(float)
    repetition_detection = defaultdict(set) # try to stem the repeated values
    no_paras = re.search(r'\(([0-9|,]+) paras?\.?\)', doc).group(1) # Get number of paragraphs
    pattern = r'([.]?)(?=\n[0-9]{1,%s}[\xa0|\s| ]{2})'%len(no_paras) # Used to split into paras
    paras_split = re.split(pattern, doc)
    money_patt = r'\$[0-9|,]+' # Used to get all paragraphs with a money amount
    scored_paras = [] # Score paragraphs based on where they appear in the document
                      # Score of 0.0 would be the first paragraph. Score of 1.0 would be the last paragraph
        
    for i, paragraph in enumerate(paras_split):
        if re.search(money_patt, paragraph):
            scored_paras.append((i / len(paras_split), paragraph)) # (score, paragraph). Score formula: i/no_paras
            
    scored_paras = sorted(scored_paras, key=lambda x:x[0])[::-1] # Store from last paragraph to first
    if len(scored_paras) == 0:
        return None
    if scored_paras[0][0] < min_score: #If highest scored paragraph is less than minimum score.
        return None
    
    patterns = [regex_damages, regex_damages_2, regex_damages_3, regex_future_care_loss, regex_for_cost_of]
    banned_words = ['seek', 'claim', 'propose', 'range', ' v. '] # Skip paragraphs containing these
    counter_words = ['summary', 'dismissed'] # Unless these are mentioned. 
                                             # example) "Special damage is $5k. But claims for aggravated are 'dismissed'" 
    
    # Get money mounts from the text
    total = None
    matches = []
    summary_matches = []
    for i, scored_para in enumerate(scored_paras):
        text = scored_para[1]
        score = scored_para[0]
        
        if score > min_score:
            if any(item.startswith('summary') for item in text.lower().split()[:4]) or any(item.startswith('conclusion') for item in text.lower().split()[:4]):
                text_matches = get_matching_text(patterns, text, max_match_len_split)
                for t_m in text_matches:
                    summary_matches.append((score, t_m))
            elif i+1 < len(scored_paras) and (any(item.startswith('summary') for item in scored_paras[i+1][1].lower().split()[-4:]) or any(item.startswith('conclusion') for item in scored_paras[i+1][1].lower().split()[-4:])):
                text_matches = get_matching_text(patterns, text, max_match_len_split)
                for t_m in text_matches:
                    summary_matches.append((score, t_m))
            else:
                skip = False # Skip paras with banned words
                for banned_word in banned_words: 
                    if banned_word in text:
                        skip = True       
                for counter_word in counter_words:
                    if counter_word in text:
                        skip = False
                if skip:
                    continue

                text_matches = get_matching_text(patterns, text, max_match_len_split)
                for t_m in text_matches:
                    matches.append((score, t_m))
        
    # Only keep matches from the summary if a summary was found. If not keep all matches.
    if len(summary_matches) > 0: 
        matches = summary_matches

    # Extract $ value. Determine correct column
    regex_number_extraction = r' ?[0-9][0-9|,|.]+[0-9]'
    for score, match in matches:
        skip = False # Banned words should not appear in final matches
        for banned_word in banned_words: 
            if banned_word in match:    
                skip = True
                break
        if skip:
            continue
        
        amount = re.findall(regex_number_extraction, match, re.IGNORECASE)
        extracted_value = clean_money_amount(amount)
        if extracted_value is None: # Make sure we are able to extract a value
            continue
            
        value_mapped = False # If we mapped the value into a damage category - stop trying to map into other categories
        value_mapped = assign_damage_to_category(extracted_value, general_damage_keywords, match, score, matches, 'General', damages, repetition_detection, repetition_key = ('general',))
        if not value_mapped:
            value_mapped = assign_damage_to_category(extracted_value, special_damage_keywords, match, score, matches, 'Special', damages, repetition_detection, repetition_key = ('special',))
        if not value_mapped:
            value_mapped = assign_damage_to_category(extracted_value, non_pecuniary_damage_keywords, match, score, matches, 'Non-pecuniary', damages, repetition_detection, repetition_key = ('non','pecuniary'))
        if not value_mapped:
            value_mapped = assign_damage_to_category(extracted_value, aggravated_damage_keywords, match, score, matches, 'Aggravated', damages, repetition_detection, repetition_key = ('aggravated',))
        if not value_mapped:
            value_mapped = assign_damage_to_category(extracted_value, aggravated_damage_keywords, match, score, matches, 'Punitive', damages, repetition_detection, repetition_key = ('punitive',))
        if not value_mapped:
            value_mapped = assign_damage_to_category(extracted_value, future_care_damage_keywords, match, score, matches, 'Future Care', damages, repetition_detection) 
        if not value_mapped: # Last attempt: Only use "total amounts" if nothing else was found
            total_keywords = [('total',), ('sum',), ('award',)]
            for keywords in total_keywords:
                if match_contains_words(match.lower(), keywords):
                    if is_best_score(score, matches, keywords):
                        if extracted_value not in repetition_detection[('total',)]:
                            damages['Pecuniary Total'] = damages['Special'] + damages['General'] + damages['Punitive'] + damages['Aggravated'] + damages['Future Care']
                            damages['Total'] = damages['Pecuniary Total'] + damages['Non-pecuniary']
                            if damages['Total'] == 0:
                                total = extracted_value
                                repetition_detection[('total',)].add(extracted_value)
                        
    damages['Pecuniary Total'] = damages['Special'] + damages['General'] + damages['Punitive'] + damages['Aggravated'] + damages['Future Care']
    damages['Total'] = damages['Pecuniary Total'] + damages['Non-pecuniary']
    
    if damages['Total'] == 0 and total is not None: # Only use the "total" if we couldnt find anything else!
        damages['Total'] = total
        damages['General'] = total
        
    columns = ['Total', 'Pecuniary Total', 'Non-pecuniary', 'Special', 'General', 'Punitive', 'Aggravated', 'Future Care']
    for c in columns:
        damages[c] = None if damages[c] == 0 else damages[c]
    
    return damages

def assign_damage_to_category(damage, damage_keywords, match, match_score, matches, damage_type, damage_dict, repetition_dict, repetition_key = None):
    '''Helper function for rule based damage extraction.
    
    Adds damage to dictionary based on given parameters so long as it is the
    highest scoring match & doesn't appear in the repetition dictionary
    
    Argumets:
    damage (float) - The damage amount in the match
    damage_keywords (list) - Keywords that may appear in match
    match (string) - The match string itself
    matches (list) - All matches. Used to determine if we found the best match
    damage_dict (dict) - Dictionary storing all damages
                       - Will be modified in place
    repetition_dict (dict) - Dictionary storing repeated values
                           - Will be modified in place
    (Optional) repetition_key (Tuple) - If not none, will use this key to store repetitions. Else will use matching keyword
    
    Returns:
    value_belongs (Boolean) - True if the value belongs in the given keyword category. False otherwise
    '''
    match = match.lower()
    value_belongs = False
    
    for keywords in damage_keywords:
        if match_contains_words(match, keywords):
            value_belongs = True
            if is_best_score(match_score, matches, keywords):
                if damage not in repetition_dict[repetition_key if repetition_key else keywords]:
                    damage_dict[damage_type] += damage
                    repetition_dict[repetition_key if repetition_key else keywords].add(damage)
            break
    
    return value_belongs

def clean_money_amount(money_regex_match):
    '''Helper function for rule based damage extraction.
    
    Arguments:
    money_regex_match (Regex.findall object) - Match of $ amount
    
    Returns:
    None if a bad match
    extracted_value (float) - The money amount in float form
    '''
    # If our regex contains more than 1 or 0 money values. We cannot use the match.
    if len(money_regex_match) > 1:
        return None
    if len(money_regex_match) == 0:
        print('Error: No Money in match!', match)
        return None
    extracted_value = None
    amount = money_regex_match[0].replace(',', '')
    # Deals with money at end of sentence. example) ... for '5,000.00.' -> '5000.00'
    if amount[-1] == '.': 
        amount = amount[:-1]
    if 'million' in amount:
        amount = str(float(re.findall('[0-9|\.]+', amount)[0])*10e6)
    # Deals with a rare typo in some cases. example) 50.000.00 -> 50000.00
    if amount.count('.') > 1: 
        dot_count = amount.count('.')
        changes_made = 0
        new_amount = ''
        for letter in amount:
            if letter == '.' and changes_made != dot_count-1:
                changes_made += 1
            else:
                new_amount += letter
        amount = new_amount
    extracted_value = float(amount)
    return extracted_value

def get_matching_text(patterns, text, max_match_len_split):
    '''Helper function for rule based damage extraction.
    
    Given a set of regex; pulls out all matching text
    
    Arguments:
    patterns (list) - List of regex patterns in string format
    text (string) - Text to search for matches in
    
    Returns:
    matches (list) - List containing all matches in text format
    '''

    matches = []
    for pattern in patterns:
        for match in re.findall(pattern, text, re.IGNORECASE):
            if 'and' not in match:
                if len(match.split()) <= max_match_len_split:
                    matches.append(match)
                    
    return matches

def is_best_score(score, matches, keywords):
    '''Helper function for rule based damage extraction.
    
    Given a set of regex matches, determine if the score is the highest score out of all matches for the given keywords
    
    Arguments:
    score (float) - The score of the item you're inspecting
    matches (list) - List of matches where each element is of form (score, match text)
    keywords (tuple) - All words that should appear in the match
    
    Returns: True or False
    
    '''
    best_score = score
    
    for score, match in matches:
        if all(word in match.lower() for word in keywords):
            if score > best_score:
                return False
            
    return True

def match_contains_words(match, words):
    '''Helper function for rule based damage extraction.
    
    Given some text. Find if the words are all present in the text.
    If word begins with '!' the word cannot appear in the text. Can handle mix/matching of both types.
    
    Arguments:
    match (String) - The text to look for words in
    words (list) - List of words to check for. If word begins with ! (i.e. '!past'), then the word cannot appear in it
    
    Returns:
    True if all words are present (or not present if using !)
    False otherwise
    
    '''
    pos_words = []
    neg_words = []
    for word in words:
        if word.startswith('!'):
            neg_words.append(word[1:])
        else:
            pos_words.append(word)
            
    if all(word in match for word in pos_words):
        if all(word not in match for word in neg_words):
            return True
        
    return False

In [None]:
cases = rule_based_parse_BCJ('../data/Lexis Cases txt/P1.txt')
cases

In [None]:
path= '../data/Lexis Cases txt/'
list_of_files = os.listdir(path)
print(list_of_files)all_cases_parsed =[]

for file in list_of_files:
    if file != ".DS_Store" and file != '.ipynb_checkpoints':
        all_cases_parsed.extend(rule_based_parse_BCJ(path + file))

In [None]:
all_cases_parsed =[]

for file in list_of_files:
    if file != ".DS_Store" and file != '.ipynb_checkpoints':
        all_cases_parsed.extend(rule_based_parse_BCJ(path + file))
    

In [None]:
all_cases_parsed[-2]

In [None]:
def rule_based_convert_cases_to_DF(cases):
    '''Given a list of parsed cases returns a dataframe'''
    lists = defaultdict(list)    
    for case in cases:
        lists['Case Number'].append(case['case_number'])
        lists['Case Name'].append(case['case_title'])
        lists['Year'].append(case['year'])
        lists['Total Damage'].append(case['damages']['Total'] if case['damages'] != None else None)
        lists['Total Pecuniary'].append(case['damages']['Pecuniary Total'] if case['damages'] != None else None)
        lists['Non Pecuniary'].append(case['damages']['Non-pecuniary'] if case['damages'] != None else None)
        lists['General'].append(case['damages']['General'] if case['damages'] != None else None)
        lists['Special'].append(case['damages']['Special'] if case['damages'] != None else None)
        lists['Punitive'].append(case['damages']['Punitive'] if case['damages'] != None else None)
        lists['Aggravated'].append(case['damages']['Aggravated'] if case['damages'] != None else None)
        lists['Future Care'].append(case['damages']['Future Care'] if case['damages'] != None else None)
        lists['Judge Name'].append(case['judge'])
        lists['Decision Length'].append(case['decision_length'])
        lists['Multiple defendants?'].append(case['multiple_defendants'])
#         lists['File'].append(case['filename'])
        lists['Plaintiff Wins?'].append(case['plaintiff_wins'])
        lists['Contributory Negligence Raised'].append(case['contributory_negligence_raised'])
        lists['Contributory Negligence Successful'].append(case['contributory_negligence_successful'])
        lists['Percent Reduction'].append(case['percent_reduction'])
        lists['Written Decision?'].append(case['written_decision'])
        lists['Registry'].append(case['registry'])
    df = pd.DataFrame()
    for key in lists.keys():
        df[key] = lists[key]
    return df

In [None]:
def evaluate(dev_data, gold_data, subset=None):
    # keep track of wrong % reductions
    case_titles_incorrect = set()
    print('#### Evaluation ####')
    # Use case name as 'primary key'
    dev_case_names = list(dev_data['Case Name'])
    gold_case_names = list(gold_data['Case Name'])
    # Filter data to only use overlapping items
    gold_data = gold_data[gold_data['Case Name'].isin(dev_case_names)]
    dev_data = dev_data[dev_data['Case Name'].isin(gold_case_names)]
    # Mapping from our variable names to Lachlan's column names
    column_mapping = {'Decision Length': 'Decision Length: paragraphs)',
                      'Total Damage': '$ Damages total before contributory negligence',
                      'Non Pecuniary': '$ Non-Pecuniary Damages', 
                      'Total Pecuniary': '$ Pecuniary Damages Total',
                      'Special': '$ Special damages Pecuniary (ie. any expenses already incurred)',
                      'Future Care': 'Future Care Costs (General Damages)',
                      'General': '$ General Damages',
                      'Punitive': '$ Punitive Damages',
                      'Aggravated': '$Aggravated Damages',
                      'Contributory Negligence Raised': 'Contributory Negligence Raised?',
                     'Contributory Negligence Successful':'Contributory Negligence Successful?',
                     'Percent Reduction':'% Reduction as a result of contributory negligence'
                     }
    dev_data.rename(columns = column_mapping, inplace = True)
    if subset is None: # Use all columns if no subset specified
        subset = dev_data.columns
    for column in dev_data.columns:
        if column in gold_data.columns:
            if column in subset:
                empty_correct = 0
                non_empty_correct = 0
                total_empty = 0
                total_non_empty = 0
                for case_name in list(dev_data['Case Name']):
                    dev_value = list(dev_data[dev_data['Case Name'] == case_name][column])[0]
                    gold_value = list(gold_data[gold_data['Case Name'] == case_name][column])[0]
                    # Convert string to float if possible
                    try:
                        gold_value = float(gold_value)
                    except:
                        pass
                    try:
                        dev_value = float(dev_value)
                    except:
                        pass
                    # Set values to 'None' if they're a NaN float value
                    dev_value = None if isinstance(dev_value, float) and math.isnan(dev_value) else dev_value
                    gold_value = None if isinstance(gold_value, float) and math.isnan(gold_value) else gold_value
                    # Lowercase values if they're a string
                    dev_value = dev_value.lower().strip() if isinstance(dev_value, str) else dev_value
                    gold_value = gold_value.lower().strip() if isinstance(gold_value, str) else gold_value
                    if gold_value is None:
                        total_empty += 1
                        if dev_value is None:
                            empty_correct += 1
                    else:
                        total_non_empty += 1
                        if isinstance(dev_value, float) and isinstance(gold_value, float):
                            if math.isclose(dev_value, gold_value, abs_tol=1): # Tolerance within 1
                                non_empty_correct += 1
                        elif dev_value == gold_value:
                            non_empty_correct += 1
                     # trying to trouble shoot % reduction issues       
                    if column == '% Reduction as a result of contributory negligence' and gold_value != dev_value:
                        case_titles_incorrect.add(case_name)
                        print(case_name)
                        print(dev_data[dev_data['Case Name'] == case_name]['Case Number'])
                        print('gold:', gold_value)
                        print('dev:', dev_value)
                        print('======')
                print('-------')
                print('COLUMN:', column)
                if total_empty != 0:
                    print('Empty field accuracy:', empty_correct / total_empty * 100, '%', empty_correct, '/', total_empty)
                if total_non_empty != 0:
                    print('Filled field accuracy:', non_empty_correct / total_non_empty * 100, '%', non_empty_correct, '/', total_non_empty)
                print('Overall accuracy:', (empty_correct+non_empty_correct) / (total_non_empty+total_empty) * 100, '%', (empty_correct+non_empty_correct), '/', (total_non_empty+total_empty))
    return case_titles_incorrect
    # for testing:
    #return dev_data, gold_data

In [None]:
test_df = rule_based_convert_cases_to_DF(all_cases_parsed)
test_df.head()

In [None]:
import numpy as np
gold = pd.read_excel('../data/Case Annotation.xlsx', header=2)
gold['Contributory Negligence Successful?'].replace(to_replace = ['Y', 'y'], value = True, inplace = True)
gold['Contributory Negligence Successful?'].replace(to_replace = ['N', 'n'], value = False, inplace = True)
gold['Contributory Negligence Successful?'].replace(to_replace = [np.nan], value = False, inplace = True)
gold['Contributory Negligence Raised?'].replace(to_replace = ['Y', 'y'], value = True, inplace = True)
gold['Contributory Negligence Raised?'].replace(to_replace = ['N', 'n'], value = False, inplace = True)
gold['Contributory Negligence Raised?'].replace(to_replace = [np.nan], value = False, inplace = True)
gold.head()

In [None]:
gold[gold['Case Name'] == 'Gill v. A&P Fruit Growers Ltd., [2009] B.C.J. No. 593']

In [None]:
test_df[test_df['Case Name'] == 'Paskall v. Scheithauer, [2012] B.C.J. No. 2601']

In [None]:
import math
case_titles_incorrect = evaluate(test_df, gold)

In [None]:
def paragraph_tokenize(case):
    ''' String of Entire Document and returns list of lists of paragraphs in document
    ---------
    Input: case (str) - string of single legal case
    Return: docs_split(list) - list of of numbrered paragraphs in the document where the first item is the case_title'''
    
    case_data = []
    lines = case.split('\n')
    if not 'British Columbia Judgments' in lines[1]:
        return
    case_data.append(lines[0])
    decision_length = re.search(r'\(([0-9|,]+) paras?\.?\)', case).group(1)

    # split paragraphs on newline, paragraph number, two spaces
    pattern = r'.?(?=\n[0-9]{1,%s}[\xa0]{2})'%len(decision_length)
    paras_split = re.split(pattern, case)

    paras = []
    for para in paras_split:   
        # make sure the paragraph starts with the correct characters
        para_start = re.match(r'^\n([0-9]{1,%s})[\xa0]{2}'%len(decision_length), para)
        if para_start:
            paras.append(para)
    case_data.extend(paras)
    return case_data

In [None]:
def summary_tokenize(case):
    ''' String of Entire Document and returns the document summary and HELD section.
    ---------
    Input: doc (str) - string of single legal case
    Return: summary - summary and HELD section of case (str)'''
    
    lines = case.split('\n')
    if not 'British Columbia Judgments' in lines[1]:
        return
    
    # split paragraphs on newline, paragraph number, two spaces
    summary = re.search(r'\([0-9]{1,3} paras\.\)\ncase summary\n((.*\n+?)+)(?=HELD|(Statutes, Regulations and Rules Cited:)|(Counsel\n))', case, re.IGNORECASE)
    if summary:
        summary = summary.group(1)
    else:
        return None

    return summary

In [None]:
def get_context_and_float(value, text, context_length = 6, plaintiff_name = 'Plaintiff', defendant_name = 'Defendant'):
    '''Given a string value found in a body of text, 
    return a its context of length context_length, and its float equivalent.
    -----------------
    Arguments:
    value - percent match found in text
    text - string value where matches were extracted from, eg paragraph or summary (str)
    context_length - the length of context around each quantity to return
    Rerturn:
    value_context - string of context around value (str)
    extracted_value - string quantity value extracted to its float equivalent
    '''
    
    
    # get context for monetary/percent values 
    
    context = ''
    amount = re.findall(r'[0-9]+[0-9|,]*(?:\.[0-9]+)?', value)
    extracted_value = clean_money_amount(amount) #use helper function to get float of dollar/percent value
    if not extracted_value:
        print('cant convert string, %s'%value)
        return context, None
    # get indices of last instance of value in text - tokenize like this for values of type 'per cent and percent'
    start_idx = text.rfind(value)
    if start_idx == -1:
        print('ERROR: value not in text')
    end_idx = start_idx + len(value)
    tokens = text[:start_idx].split() + [value] + text[end_idx:].split()
#     if 'percent' in value:
#         value = amount[0]+'%'
#         text = text.replace(value, amount[0]+'%')
#     elif 'per cent' in value:
#         value = amount[0]+'%'
#         text = text.replace(value, amount[0]+'%')
#     tokens = text.split()
    loc = [i for i, token in enumerate(tokens) if value in token] 
    # if the quantity is in the text, choose context of last mention of value
    if len(loc) > 0:
        loc = loc[-1] 
        if loc - context_length >= 0 and loc + context_length < len(tokens):
            context = " ".join(tokens[loc - context_length:loc + context_length + 1])
        elif loc - context_length < 0 and loc + context_length < len(tokens):
            beg = abs(loc -context_length)
            context = " ".join(tokens[loc-context_length + beg:loc + context_length + 1])
        elif loc - context_length > 0 and loc + context_length > len(tokens): 
            context = " ".join(tokens[loc - context_length:len(tokens)])


    return context.lower(), extracted_value

In [None]:
def rule_based_parse_BCJ(path):
    '''Given file path (text file) of negligence cases, finds static 
    information within the case (information that can be pattern matched)
    Expects a B.C.J. case format (British Columbia Judgments)
    
    The following fields are currently implemented:
    - Case Title
    - Judge Name
    - Registry
    - Year
    - Decision Length (in paragraphs)
    - Damages
    - Multiple Defendants
    - Plaintiff Wins
    
    Arguments: doc (String): The case in text format following the form used in the DOCX to TXT notebook
    Returns: case_parsed_data (list) of case_dict (Dictionary): List of Dictionaries with rule based parsable fields filled in
    '''
    if path:
        with open(path, encoding='utf-8') as document:
            document_data = document.read()
        document_data = document_data.split('End of Document\n') # Always split on 'End of Document\n'
    

    case_parsed_data = []
    for i in range(len(document_data)):
        case_dict = dict() 
        case = document_data[i]
        case = case.strip() # Make sure to strip!
        if len(case) == 0: # Skip empty lines
            continue
        
        lines = case.split('\n')
        if len(lines) < 2:
            print(case)
        case_title = lines[0]
        case_type = lines[1]

        if 'R. v.' in case_title or '(Re)' in case_title: # Skip crown cases, Skip (Re) cases
            continue
            
        # Skip client/solicitor cases (not same as plaintiff/defendant)
        regex_client_solicitor = re.search(r'(Between.*([C|c]lient[s]?).*([S|s]olicitor[s]?|[L|l]awyer[s]?))', case)
        if regex_client_solicitor:
            continue
        
        regex_solicitor_client = re.search(r'(Between.*([L|l]awyer[s]?|[S|s]olicitor[s]?).*([C|c]lient[s]?))', case)
        if regex_solicitor_client:
            continue
            
        # In some rare cases we have 'IN THE MATTER OF ..' (rather than 'Between ...') .. but it is following by the normal
        # plaintiff/defendant dynamic. Only skip cases if there is no mention of the following terms
        # (Can be cleaned up in future)
        key_words = ['appellant', 'respondent', 'claimant', 'petitioner', 'plaintiff', 'defendant',
        'appellants', 'respondents', 'claimants', 'petitioners', 'plaintiffs', 'defendants']
        regex_in_matter_of = re.search(r'IN THE MATTER OF .*\n\([0-9]+ paras.\)', case)
        if regex_in_matter_of:
            remove = True
            for key in key_words:
                if key in regex_in_matter_of.group(0).lower().strip():
                    remove = False
                    
            if remove:
                continue

        if 'British Columbia Judgments' in case_type: # Make sure we're dealing with a B.C.J. case
        
            # Fields that can be found via pattern matching
            if re.search('contributory negligence', case, re.IGNORECASE):
                contributory_negligence_raised = True
            else:
                contributory_negligence_raised = False
            case_number = re.search(r'\/P([0-9]+)\.txt', path).group(1)
            decision_len = re.search(r'\(([0-9]+) paras\.?\)', case) # e.g.) (100 paras.)
            registry = re.search(r'(Registry|Registries): ?([A-Za-z0-9 ]+)', case) # e.g.) Registry: Vancouver
            written_decision = True if int(decision_len.group(1)) > 1 else False
            if registry:
                registry = registry.group(2).strip()
            else:
                registry = re.search(r'([A-Za-z ]+) Registry No.', case) # Alt form e.g.) Vancouver Registory No. XXX
                if registry:
                    registry = registry.group(1).strip()
                else:
                    registry = re.search(r'([A-Za-z ]+) No. S[0-9]*', case)
                    if registry:
                        registry = registry.group(1).strip()
                    else:
                        print('WARNING: Registry could not be found (This shouldn\'t occur!)')
            # Fields that are always in the same place
            judge_name = lines[4].strip()
            case_title = lines[0].strip()
            # Extract year from case_title (in case we want to make visualizations, etc.)
            year = re.search(r'20[0-2][0-9]', case_title) # Limit regex to be from 2000 to 2029
            if year:
                year = year.group(0)
            else:
                # Rare case: Sometimes the title is too long. Rely on Heard date.
                year = re.search(r'Heard:.* ([2][0][0-2][0-9])', case)
                if year:
                    year = year.group(1)
                else:
                    print('WARNING: Year not found')
            case_dict['case_number'] = '%s of %s'%(i+1+((int(case_number)-1)*50), case_number)
            case_dict['case_title'] = case_title
            case_dict['year'] = year
            case_dict['registry'] = registry
            case_dict['judge'] = judge_name
            case_dict['decision_length'] = decision_len.group(1)
            case_dict['multiple_defendants'] = rule_based_multiple_defendants_parse(case)
            case_dict['contributory_negligence_raised'] = contributory_negligence_raised
            case_dict['written_decision'] = written_decision
            
            # TODO: Improve plaintiff_wins to take one case at a time.
            plaintiff_list = plaintiff_wins(path)
            if case_title in plaintiff_list:
                case_dict['plaintiff_wins'] = plaintiff_list[case_title]
            else:
                case_dict['plaintiff_wins'] = "NA"
                
            case_dict['damages'] = rule_based_damage_extraction(case)
            percent_reduction, contributory_negligence_successful = get_percent_reduction_and_contributory_negligence_success(case_dict, case)
            case_dict['percent_reduction'] = percent_reduction
            case_dict['contributory_negligence_successful'] = contributory_negligence_successful
        # don't add empty dictionaries (non BCJ cases) to list
        if case_dict != dict(): 
            case_parsed_data.append(case_dict)
    return case_parsed_data

In [None]:
def conditions_for_extracted_value(context, extracted_value, keywords, plaintiff_split, defendant_split, entities):
    ''' Given the context surrounding an extracted value, keywords relevant to contributory negligence, 
    a list of the Plaintiffs names, a list of the defendants names, and a combined list of entities related to either the Plaintiff or Defendant:
    Return the modifed extracted value
    ------------
    Arugments:
    context: (str)
    extracted_value: (float) found in context
    keywords, plaintiff_split, defendant_split, entities: (list) of strings
    '''
    # conditions for keeping extracted_value and updating extracted_value
    # skip extracted_values with contexts lacking keywords/entities
    if extracted_value == 100 or extracted_value == 0 or extracted_value < 10:
        extracted_value = None
        return extracted_value
    if not any(token in context for token in keywords + entities) or context == '' or any('costs' == token for token in context.split()) or ('interest' in context and 'rate' in context.split()):
        extracted_value = None
        return extracted_value
    if 'recover' in context and any(word in context for word in plaintiff_split + ['plaintiff']):
#         print('plaintiff recovers percent subtract %s from 100'%extracted_value)
        extracted_value = 100 - extracted_value
    if any(word1 in context and word2 in context for word1 in defendant_split + ['defendant'] for word2 in ['liable', 'responsible', 'fault', 'against']):
#         print('defendat is %s liable, subtract from 100'%extracted_value)
        extracted_value = 100 - extracted_value
    return extracted_value

In [None]:
keywords = ['against', 'reduce', 'liability', 'liable', 'contributor', 'fault', 'apportion', 'recover', 'responsible']
entities = ['defendant', 'plaintiff', 'she', 'he', 'John', 'Jane']
context1 = 'was held 90%'
test =keywords + entities

if not any(word in context for word in test):
    print('fdsfm')

In [None]:
def contributory_negligence_successful_fun(context, keywords):
    '''Given text containing percent reduction and a list of keywords to check for,
    confirm presence of keywords and return whether or not contributory negligence was successful
    --------------
    Arguments:
    context (str)
    keywords(list)
    Returns: True or None (bool)'''
    if any(word in context for word in keywords):
        if 'plaintiff' or 'damages' or 'defendant' in context:
            contributory_negligence_successful = True
            return contributory_negligence_successful
    return

In [None]:
def get_percent_reduction_and_contributory_negligence_success(case_dict, case, min_score = 0.9):
    paragraphs = paragraph_tokenize(case)
    case_title = case_dict['case_title']
    assert paragraphs[0] == case_title
    # default value for contributory negligence success is FALSE
    contributory_negligence_successful = False
    percent_pattern = r'([0-9][0-9|\.]*(?:%|\sper\s?cent))'
    # entities and keywords used to filter percent values
    keywords = ['against', 'reduce', 'liability', 'liable', 'contributor', 'fault', 'apportion', 'recover', 'responsible']
    # extract plaintiff and defendant name for use in %reduction conditions
    plaintiff_defendant_pattern = r'([A-Za-z|-|\.]+(:? \(.*\))?)+ v\. ([A-Za-z|-]+)+' # group 1 is plaintiff group 2 is defendant
    if re.search(plaintiff_defendant_pattern, case_title):
        plaitiff_defendant = re.search(plaintiff_defendant_pattern, case_title).groups() # tuple (plaintiff, defendant)
    else:
        plaitiff_defendant = ('Plaintiff', 'Defendant')
    plaintiff_split = [word.lower() for word in plaitiff_defendant[0].split()]
    defendant_split = [word.lower() for word in plaitiff_defendant[-1].split()]
    entities = ['defendant', 'plaintiff'] + plaintiff_split + defendant_split 

    if case_dict['contributory_negligence_raised'] and case_dict['plaintiff_wins']:
        #### troubleshooting~~
        if case_title in case_titles_incorrect:
            print(case_title)
        percent_reduction = None
        best_percent = None
        best_score = 0
        for j, paragraph in enumerate(paragraphs[1:]):
            score = float((j+1)/int(case_dict['decision_length']))
            paragraph = paragraph.lower()
            if not score >= min_score: ## min score not existant in bcj parser
                continue

            percent_mentioned = re.findall(percent_pattern, paragraph, re.IGNORECASE)
            extracted_value_tie_breaker = Counter()
            if len(percent_mentioned) > 0:
                for percent in percent_mentioned:
                    context, extracted_value = get_context_and_float(percent, paragraph)
                    # conditions for keeping extracted_value and updating extracted_value
                    # skip extracted_values with contexts lacking keywords/entities
                    if context == '':
                        continue
                    extracted_value = conditions_for_extracted_value(context, extracted_value, keywords, plaintiff_split, defendant_split, entities)
                    if not extracted_value:
                        
                        continue
                        
                    extracted_value_tie_breaker.update([extracted_value])
                
                    # conditions for contributory negligence successful
                    if not contributory_negligence_successful and extracted_value:
                        contributory_negligence_successful = contributory_negligence_successful_fun(context, keywords)
                    #### troubleshooting~~
                    if case_title in case_titles_incorrect:
                        print(extracted_value_tie_breaker, context)
                        
                    # matches patter "PERCENT against plaintiff"
                    if ('against' in context or 'fault' in context) and any(plaintiff_word in context for plaintiff_word in plaintiff_split+['plaintiff']):
                        best_percent = extracted_value
                        best_score = score
                        break                    
                    
                    # choose most common percent mentioned in highest scoring paragraph
                    if extracted_value_tie_breaker != Counter():
                        if score > best_score:
                            best_score = score
                            best_percent = extracted_value_tie_breaker.most_common(1)[0][0]

                #### troubleshooting~~
                if case_title in case_titles_incorrect:
                    print("paragraph:", best_score, best_percent)
                    print('======')
             # if no percent found, check for equal apportionment
            else:
                equal_apportionment = re.findall(r'.{20} (?:liability|fault) [a-zA-Z]{1,3} apportione?d? equally .{20}', paragraph)
                if len(equal_apportionment) > 0:
                    if contributory_negligence_successful_fun(equal_apportionment[0], keywords):
                        best_percent = 50.0
                        contributory_negligence_successful = True
        
        if best_score == 0 or not best_percent or not contributory_negligence_successful:
            # no percents found in paragraphs - time to check summary - same process
            summary = summary_tokenize(case)
            if summary:
                summary = summary.lower()
                percent_mentioned = re.findall(percent_pattern, summary, re.IGNORECASE)
                #### troubleshooting~~
                if case_title in case_titles_incorrect:
                    print('checking summary...')
                    print(percent_mentioned)
                extracted_value_tie_breaker = Counter()
                if len(percent_mentioned) > 0:
                    for percent in percent_mentioned:
                        context, extracted_value = get_context_and_float(percent, summary)
                        #### troubleshooting~~
                        if case_title in case_titles_incorrect:
                            print(extracted_value, context)
                        # conditions for keeping extracted_value and updating extracted_value
                        # skip extracted_values with contexts lacking keywords/entities
                        extracted_value = conditions_for_extracted_value(context, extracted_value, keywords, plaintiff_split, defendant_split, entities)
                        if not extracted_value:
                            continue
                        extracted_value_tie_breaker.update([extracted_value])
                                                   
                        # conditions for contributory negligence successful
                        if not contributory_negligence_successful and extracted_value:
                            contributory_negligence_successful = contributory_negligence_successful_fun(context, keywords) 
                            
                        # matches patter "PERCENT against plaintiff"
                        if ('against' in context or 'fault' in context) and any(plaintiff_word in context for plaintiff_word in plaintiff_split+['plaintiff']):
                            best_percent = extracted_value
                            best_score = score
                            break 
                        # choose most common percent mentioned in summary
                        if extracted_value_tie_breaker != Counter():
                            best_percent = extracted_value_tie_breaker.most_common(1)[0][0]
                        
                        #### troubleshooting~~
                        if case_title in case_titles_incorrect:
                            print("summary:", best_score, best_percent)
                            print('======')
               # if no percent found, check for equal apportionment
                else:
                    #### troubleshooting~~
                    if case_title in case_titles_incorrect:
                        print('checking equal apportionment...')
                    equal_apportionment = re.findall(r'.{20} (?:liability|fault) [a-zA-Z]{1,3} apportione?d? equally .{20}', summary)
                    if len(equal_apportionment) > 0:
                        if contributory_negligence_successful_fun(equal_apportionment[0], keywords):
                            best_percent = 50.0
                            contributory_negligence_successful = True
        if contributory_negligence_successful:
            percent_reduction = best_percent
    else:
        percent_reduction = None
    #### troubleshooting~~
    if case_title in case_titles_incorrect:
        print(percent_reduction)    
    return percent_reduction, contributory_negligence_successful

In [None]:
text = '''Case Summary
Damages — Mitigation — In tort — Personal injuries, treatment for — Torts — Negligence — Standard of care, particular persons and relationships — Police officers — Motor vehicle, standard of care of driver — Keeping a proper lookout — Emergencies — Circumstances requiring caution or extreme caution — Emergency or police vehicles — Defences — Contributory negligence — Apportionment of fault.
Action by Blackburn for damages for injuries suffered when her vehicle collided with a police car at an intersection. Constable Leyh was on his way to another accident and was proceeding through the intersection with his lights and siren activated. He slowed down as he approached the intersection and began speeding up half-way through it. Blackburn was a hearing impaired 17-year-old driver who was completely deaf without her hearing aids. Although she was wearing her hearing aids, witnesses stated that her radio was playing loudly. Blackburn did not hear the siren at all. Several vehicles in the lane next to Blackburn's lane had stopped at the green light. Blackburn proceeded through the intersection on the green light and was struck by the police car proceeding through the intersection on the red light. Constable Leyh had stopped at his home for his rain coat prior to leaving for the scene of the accident. As a result of the collision, Blackburn suffered a moderately severe cervical sprain and post-traumatic stress. However, she failed to follow the recommended course of treatment. 

HELD: Action allowed in part.
 Constable Leyh took a calculated risk that was not proportionate to the urgency of the situation. The amount of time consumed by retrieving his raincoat was much greater than the few seconds that would have been consumed by momentarily delaying his acceleration. Speeding up where there was still a risk of a vehicle approaching was not justified by the urgency of the situation. Blackburn failed to observe that traffic in the lane next to her had stopped at the green light and did not reduce her speed or exercise appropriate caution as she approached the intersection. Visual attentiveness took on an added importance in respect of a deaf person driving a car. Blackburn should have paid special attention to visual clues and should not have had the volume of her radio turned up loudly. Fault for the accident was apportioned 80 per cent to Blackburn and 20 percent to Constable Leyh. Taking into account Blackburn's failure to mitigate her damages, her non-pecuniary losses were assessed at $30,000 subject to the liability apportionment.'''
get_context_and_float('80', text)

## Annotations of Numbers/Money

In [None]:
my_annotations = set(gold.iloc[:44, 1].values)

In [None]:
path= '../data/Lexis Cases txt/'
list_of_files = os.listdir(path)
to_annotate = dict()

# f = open('../data/ilanas_annotations_.txt', 'w')
for file in list_of_files:
    if file != ".DS_Store" and file != '.ipynb_checkpoints':
        with open(path+file, encoding='utf-8') as document:
            document_data = document.read()
        document_data = document_data.split('End of Document\n') # Always split on 'End of Document\n'
        for i in range(len(document_data)):
            case = document_data[i]
            if len(case) == 0: # Skip empty lines
                continue
            lines = case.split('\n')
            case_title = lines[0]
            if case_title in my_annotations:
                to_annotate[case_title] = re.findall(r'\$ ?[0-9][0-9|,|.]+[0-9]', case)
#                 f.write(case + 'End of Document\n')
            
# f.close()

## Classification

### Step 1: Read in Annotations

In [111]:
from nltk.corpus import stopwords
import re
stop_words = set(stopwords.words('english'))
from bs4 import BeautifulSoup
from collections import defaultdict, Counter
from nltk import sent_tokenize

In [289]:
def case2tags(case):
    '''Input case is a string from the legal negligence case, with xml tags indicating damage types.
    Return a list of quantities tagged (str) and a list of the associated damage type tags corresponding to those values
    -------------------
    Example: 
    case = "I asses non-pecuniary damages of <damage type=non pecuniary>$1,000,000</damage>"
    case2tags(case) = ['$1,000,000'], ['non pecuniary']
    '''
    # your code here
    soup = BeautifulSoup('<xml>'+case+'</xml>', "xml")
    tags = []
    values = []
    full_match = []
    for damage in soup.find_all('damage'):
        if 'non-pecuniary' in damage['type']:
            tags.append(damage['type'].replace('non-pecuniary', 'non pecuniary'))
        else:
            tags.append(damage['type'])
        values.append(damage.get_text())
    return values, tags

In [282]:
import os


In [291]:
tag_distribution

Counter({'total': 105,
         'other': 1670,
         'total after': 16,
         'non pecuniary': 126,
         'past wage loss': 108,
         'sub-future wage loss': 22,
         'future care': 97,
         'special': 117,
         'future wage loss': 91,
         'reduction': 2,
         'sub-past wage loss': 18,
         'sub-special': 58,
         'sub-future care': 41,
         'general': 31,
         'punitive': 7,
         'in trust': 4,
         'reduction by': 5,
         'aggravated': 8,
         'sub-non pecuniary': 8,
         'sub-total': 12,
         'sub-general': 15,
         'reduction to': 8,
         'past income loss': 1})

In [318]:
case_info = defaultdict(dict)
case_tags = []
tag_distribution = Counter()

# define context length for use in features
context_length = 5
features_per_case = []
path = "../data/"
list_of_files = os.listdir(path)

#iterate over new annotations
for file in list_of_files:
    if file == "ilanas_annotations.txt" or file == "ravi_annotations.txt":
        with open(path+file) as f:
            cases = f.read()
        cases = cases.split('End of Document\n')
        for i in range(len(cases)):
            case = cases[i]
            if len(case) == 0: # Skip empty lines
                continue
            lines = case.split('\n')
            case_title = lines[0]
            case_type = lines[1]

            # skip irrelevant cases
            if 'R. v.' in case_title or '(Re)' in case_title: # Skip crown cases, Skip (Re) cases
                    continue

            # Skip client/solicitor cases (not same as plaintiff/defendant)
            regex_client_solicitor = re.search(r'(Between.*([C|c]lient[s]?).*([S|s]olicitor[s]?|[L|l]awyer[s]?))', case)
            if regex_client_solicitor:
                continue
            regex_solicitor_client = re.search(r'(Between.*([L|l]awyer[s]?|[S|s]olicitor[s]?).*([C|c]lient[s]?))', case)
            if regex_solicitor_client:
                continue

            key_words = ['appellant', 'respondent', 'claimant', 'petitioner', 'plaintiff', 'defendant',
                'appellants', 'respondents', 'claimants', 'petitioners', 'plaintiffs', 'defendants']
            regex_in_matter_of = re.search(r'IN THE MATTER OF .*\n\([0-9]+ paras.\)', case)
            if regex_in_matter_of:
                remove = True
                for key in key_words:
                    if key in regex_in_matter_of.group(0).lower().strip():
                        remove = False
                if remove:
                    continue
                    
            # Make sure we're dealing with a B.C.J. case   
            if 'British Columbia Judgments' in case_type: 
                
                #remove stopwords and lower case
                case = ' '.join([word for word in case.lower().split() if word not in stop_words])
                #get tagged values from annotations
                values, tags = case2tags(case)
                case_tags.append(tags)
                
                # save tags, values per case in dictionary for reference
                case_info[case_title]['values'] = values
                case_info[case_title]['tags'] = tags
                tag_distribution.update(tags)

                # get context of tagged values in case
                print(case_title)
                print(len(values))
                visited_indices = set()
                case_feats = []
                for value, tag in zip(values, tags):
                    # save features in dictionary
                    features = dict()
                    
                    # temp value is equal to value without $ symbol - useful for regex escape issues
                    temp = value
                    if '$' in value:
                        temp = value.replace('$', '')
                    pattern = re.compile('''<damage type=['|"]%s['|"]>\$?%s<\/damage>'''%(tag, temp))
                    matches = pattern.finditer(case)
                    for match in matches:
                        if match.start() in visited_indices: # dont want to add the same context twice
                            continue
                        start_idx = match.start()
                        end_idx = match.end()
                        
                        # first use nltk sent_tokenize to get sentence on either side of value only
                        tokens = sent_tokenize(case[:start_idx])[-1] + " "+ value + " " + sent_tokenize(case[end_idx:])[0] #sentence before and after value
                        tokens = tokens.split()
                        # get indices of quantity value in text
                        loc = [i for i, token in enumerate(tokens) if value == tokens[i] or i+1<len(tokens) and value == tokens[i]+" "+tokens[i+1]] #case $3 million split on whitespace

                        # remove tags from text for context
                        tokens = " ".join(tokens)
                        tokens = re.sub("<damage", "", tokens)
                        tokens = re.sub("<\/damage>", "", tokens)
                        tokens = re.sub("type='[a-z| |-]+'>", '', tokens).split()
                        
                        # there should always be (at least) one match - ideally one...
                        if len(loc) < 1:
                            print("WHYY")
                            print(match.start())
                            print(value)
                            continue
                        if len(loc) > 1:
                            print(loc)
                            print(value)
                            print(tokens[loc[0]-5:loc[0]], tokens[loc[-1]-5:loc[-1]])
                            loc = loc[-1] #if more than one match, choose last one
                        else:
                            loc = loc[0] 


                        # context before and after of length: context-length    
                        if loc - context_length >= 0 and loc + context_length < len(tokens):
                            context_before = " ".join(tokens[loc - context_length:loc+1])
                            context_after = " ".join(tokens[loc+1:loc + context_length + 1])
                            context = " ".join(tokens[loc - context_length:loc + context_length + 1])
                        elif loc - context_length < 0 and loc + context_length < len(tokens):
                            beg = abs(loc -context_length)
                            context_before = " ".join(tokens[loc-context_length + beg:loc+1])
                            context_after =  " ".join(tokens[loc+1:loc + context_length + 1])
                            context = " ".join(tokens[loc-context_length + beg:loc + context_length + 1])
                        elif loc - context_length > 0 and loc + context_length > len(tokens): 
                            context_before = " ".join(tokens[loc - context_length:loc+1])
                            context_after = " ".join(tokens[loc +1 : len(tokens)])
                            context = " ".join(tokens[loc - context_length : len(tokens)])

        #                 assert any(val in tokens[loc] for val in value)
                        count+= 1

                        visited_indices.add(match.start())
                        # features to engineer...
                        features['value'] = value
                        features['context_before'] = context_before
                        features['context_after'] = context_after
                        features['context'] = context
                        features['float'] = clean_money_amount([temp])
                        features['start_idx_ratio'] = match.start()/len(case)
                        features['greater_than_1000'] = features['float'] > 1000
                        break #only add one match at a time - features need to have same order as values list
                    case_feats.append(features)
                features_per_case.append(case_feats)

Mennonite Church British Columbia v. Sur-Del Roofing Ltd., [2010] B.C.J. No. 297
12
Paskall v. Scheithauer, [2012] B.C.J. No. 2601
23
Salgado v. Toth, [2009] B.C.J. No. 2230
47
Gray v. Fraser Health Authority (c.o.b. Ridge Meadows Hospital), [2009] B.C.J. No. 372
40
Kirkham v. Richardson, [2014] B.C.J. No. 1194
76
Neff v. Patry, [2008] B.C.J. No. 209
14
Hardychuk v. Johnstone, [2012] B.C.J. No. 1909
63
Najdychor v. Swartz, [2009] B.C.J. No. 1202
64
Delgiglio v. Becker, [2012] B.C.J. No. 650
45
Rycroft v. Rego, [2017] B.C.J. No. 447
30
Roger Garside Construction Ltd. v. Stirling, [2013] B.C.J. No. 1777
2
Zhang v. Law, [2009] B.C.J. No. 1468
27
Chamberlain v. Pro Star Mechanical Technologies Ltd., [2014] B.C.J. No. 2669
33
Andrusko v. Alexander, [2013] B.C.J. No. 1161
51
Anderson v. Minhas, [2011] B.C.J. No. 259
22
Berenjian v. Primus, [2013] B.C.J. No. 194
26
Kumar v. Picco, [2007] B.C.J. No. 2463
29
Gillespie v. Yellow Cab Co., [2014] B.C.J. No. 2332
49
Gregory v. Penner, [2010] B.C.J.

In [263]:
for i in range(len(features_per_case)):
    d = features_per_case[i]
    d2 = case_info[i]
    if d ==[]:
        continue
    print(len(d))
    print(len(d2))
    print('===')


90
90
===
23
23
===
60
60
===
14
14
===
27
27
===
38
38
===
4
4
===
2
2
===
17
17
===
8
8
===
38
38
===
153
153
===
63
63
===
54
54
===
2
2
===
7
7
===
21
21
===
31
31
===
7
7
===
41
41
===
45
45
===
7
7
===
18
18
===
7
7
===
54
54
===
55
55
===
84
84
===
38
38
===
6
6
===


In [319]:
for i in range(len(features_per_case[-3])):
    d = features_per_case[-3][i]
    v = case_tags[-3][i]
    print(d, v)
    print('===')

{'value': '$5,647,773', 'context_before': 'pain suffering — plaintiff awarded $5,647,773', 'context_after': 'total damages injuries suffered bicycle', 'context': 'pain suffering — plaintiff awarded $5,647,773 total damages injuries suffered bicycle', 'float': 5647773.0, 'start_idx_ratio': 0.006906599039551071, 'greater_than_1000': True} total
===
{'value': '$5,647,773', 'context_before': '— paralysis — plaintiff awarded $5,647,773', 'context_after': 'total damages injuries suffered bicycle', 'context': '— paralysis — plaintiff awarded $5,647,773 total damages injuries suffered bicycle', 'float': 5647773.0, 'start_idx_ratio': 0.009910250184355833, 'greater_than_1000': True} total
===
{'value': '$5,647,773', 'context_before': 'highway repair — plaintiff awarded $5,647,773', 'context_after': 'total damages injuries suffered bicycle', 'context': 'highway repair — plaintiff awarded $5,647,773 total damages injuries suffered bicycle', 'float': 5647773.0, 'start_idx_ratio': 0.0131297325491465

## Classifier!

In [294]:
from itertools import chain
from sklearn.metrics import f1_score, classification_report

# turn list of lists into one-dimenion
tags = list(chain.from_iterable(case_info))
feats = list(chain.from_iterable(features_per_case))


In [295]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.naive_bayes import MultinomialNB
vectorizer = DictVectorizer()

In [296]:
from sklearn.model_selection import train_test_split #cross validation instead
X_train, X_test, y_train, y_test = train_test_split(
     feats, tags, test_size=0.33, random_state=42)

In [297]:
X_train = vectorizer.fit_transform(X_train)
clf = MultinomialNB()
clf.fit(X_train, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [298]:
X_test = vectorizer.transform(X_test)
y_pred = clf.predict(X_test)
print('macro: %s, micro: %s f-scores' %(f1_score(y_test, y_pred, average = 'macro'), f1_score(y_test, y_pred, average= 'micro')))

macro: 0.09955655785733969, micro: 0.47761194029850745 f-scores


In [299]:
clf.score(X_test, y_test)

0.47761194029850745

In [300]:
classification_report(y_test, y_pred)

  _warn_prf(average, modifier, msg_start, len(result))


'                   precision    recall  f1-score   support\n\n       aggravated       0.00      0.00      0.00         2\n      future care       1.00      0.25      0.40        16\n future wage loss       0.00      0.00      0.00         7\n          general       0.00      0.00      0.00         9\n    non pecuniary       0.00      0.00      0.00        11\n            other       0.69      0.69      0.69       210\n   past wage loss       0.10      0.08      0.09        13\n         punitive       0.00      0.00      0.00         2\n     reduction to       0.00      0.00      0.00         2\n          special       0.00      0.00      0.00        22\n  sub-future care       0.00      0.00      0.00         5\n      sub-general       0.00      0.00      0.00         4\nsub-non pecuniary       0.00      0.00      0.00         3\n      sub-special       0.17      0.20      0.18        10\n        sub-total       0.00      0.00      0.00         3\n            total       0.15      0.5

In [305]:
from sklearn.svm import SVC
svc = SVC()
svc.fit(X_train, y_train)
svc.score(X_test, y_test)
y_pred_svc = svc.predict(X_test)
classification_report(y_test, y_pred_svc)

  _warn_prf(average, modifier, msg_start, len(result))


'                   precision    recall  f1-score   support\n\n       aggravated       0.00      0.00      0.00         2\n      future care       0.00      0.00      0.00        16\n future wage loss       0.00      0.00      0.00         7\n          general       0.00      0.00      0.00         9\n    non pecuniary       0.00      0.00      0.00        11\n            other       0.63      0.99      0.77       210\n   past wage loss       0.00      0.00      0.00        13\n         punitive       0.00      0.00      0.00         2\n     reduction to       0.00      0.00      0.00         2\n          special       0.00      0.00      0.00        22\n  sub-future care       0.00      0.00      0.00         5\n      sub-general       0.00      0.00      0.00         4\nsub-non pecuniary       0.00      0.00      0.00         3\n      sub-special       0.00      0.00      0.00        10\n        sub-total       0.00      0.00      0.00         3\n            total       0.71      0.3

In [28]:
case2tags("I asses non-pecuniary damages of <damage type='non pecuniary'>$1,000,000</damage>")

<damage type="non pecuniary">$1,000,000</damage>


(['$1,000,000'], ['non pecuniary'])

In [25]:
soup = BeautifulSoup('<xml>'+"I asses non-pecuniary damages of <damage type='non pecuniary'>$1,000,000</damage>"+'</xml>', "xml")
soup.find_all('damage')

[<damage type="non pecuniary">$1,000,000</damage>]