#### Libraries

In [730]:
import re
from collections import defaultdict
import nltk
import pandas as pd
import math

#### Global Variables to aid with filenames

In [2]:
# Directory that holds the exported case text files
path_to_data = '../data/Lexis Cases txt/'
# File names are assembled as path_to_data + file_prefix + <number> + file_suffix (e.g. .../P1.txt)
file_prefix = 'P'
file_suffix = '.txt'
file_identifiers = range(1, 86) # Range from 1 to 85

#### Code to parse each single document

# WORK IN PROGRESS
##### Currently able to extract the Decision Length, Judge Name, Year, and Registry Location of any record

In [815]:
def rule_based_parse_BCJ(path):
    '''Given file path (text file) of negligence cases, finds static
    information within each case (information that can be pattern matched).
    Expects a B.C.J. case format (British Columbia Judgments).

    The file is split into cases on 'End of Document\\n'. A case is skipped when:
    - it is empty or has fewer than two lines (no title + case type header)
    - its title contains 'R. v.' (crown case) or '(Re)'
    - its 'Between ...' line names a client/solicitor (or lawyer) pairing
    - it is an 'IN THE MATTER OF' case whose header mentions no
      plaintiff/defendant style party
    - its second line does not contain 'British Columbia Judgments'

    The following fields are filled in for every remaining case:
    - case_number, case_title, year, registry, judge
    - decision_length (in paragraphs), written_decision
    - multiple_defendants, contributory_negligence_raised
    - plaintiff_wins, filename, damages

    Arguments: path (String): Path of the text file. Its name must match
               '/P<number>.txt'; the number is used to build case_number.
    Returns: case_parsed_data (list) of case_dict (Dictionary): List of Dictionaries with rule based parsable fields filled in

    Raises: AttributeError if a B.C.J. case has no '(N paras.)' marker or the
            path does not match '/P<number>.txt'; IndexError if a B.C.J. case
            has fewer than five lines (the judge name is read from line five).
    '''
    with open(path, encoding='utf-8') as document:
        document_data = document.read()

    # In some rare cases we have 'IN THE MATTER OF ..' (rather than 'Between ...') but it is followed by
    # the normal plaintiff/defendant dynamic. Such a case is only kept if one of these terms is mentioned.
    key_words = ['appellant', 'respondent', 'claimant', 'petitioner', 'plaintiff', 'defendant',
                 'appellants', 'respondents', 'claimants', 'petitioners', 'plaintiffs', 'defendants']

    # Registry spellings, tried in order: (pattern, group holding the registry name)
    registry_patterns = [
        (r'(Registry|Registries): ?([A-Za-z0-9 ]+)', 2),  # e.g.) Registry: Vancouver
        (r'([A-Za-z ]+) Registry No.', 1),                # Alt form e.g.) Vancouver Registry No. XXX
        (r'([A-Za-z ]+) No. S[0-9]*', 1),
    ]

    case_parsed_data = []
    # Always split on 'End of Document\n'
    for i, case in enumerate(document_data.split('End of Document\n')):
        case = case.strip() # Make sure to strip!
        if len(case) == 0: # Skip empty chunks
            continue

        lines = case.split('\n')
        if len(lines) < 2:
            # Without a title line and a case-type line there is nothing to parse.
            # Print the fragment for inspection and move on (previously this
            # fell through and crashed on lines[1]).
            print(case)
            continue
        case_title = lines[0].strip()
        case_type = lines[1]

        if 'R. v.' in case_title or '(Re)' in case_title: # Skip crown cases, Skip (Re) cases
            continue

        # Skip client/solicitor cases (not same as plaintiff/defendant), in either order
        if re.search(r'(Between.*([C|c]lient[s]?).*([S|s]olicitor[s]?|[L|l]awyer[s]?))', case):
            continue
        if re.search(r'(Between.*([L|l]awyer[s]?|[S|s]olicitor[s]?).*([C|c]lient[s]?))', case):
            continue

        regex_in_matter_of = re.search(r'IN THE MATTER OF .*\n\([0-9]+ paras.\)', case)
        if regex_in_matter_of:
            header = regex_in_matter_of.group(0).lower().strip()
            if not any(key in header for key in key_words):
                continue

        # Make sure we're dealing with a B.C.J. case
        if 'British Columbia Judgments' not in case_type:
            continue

        # Fields that can be found via pattern matching
        if re.search('contributory negligence', case, re.IGNORECASE):
            contributory_negligence_raised = 'Y'
        else:
            contributory_negligence_raised = 'N'
        case_number = re.search(r'\/P([0-9]+)\.txt', path).group(1)
        decision_len = re.search(r'\(([0-9]+) paras\.?\)', case) # e.g.) (100 paras.)
        written_decision = 'Y' if int(decision_len.group(1)) > 1 else 'N'

        registry = None
        for registry_pattern, group in registry_patterns:
            registry_match = re.search(registry_pattern, case)
            if registry_match:
                registry = registry_match.group(group).strip()
                break
        else:
            print('WARNING: Registry could not be found (This shouldn\'t occur!)')

        # Fields that are always in the same place
        judge_name = lines[4].strip()

        # Extract year from case_title (in case we want to make visualizations, etc.)
        year = re.search(r'20[0-2][0-9]', case_title) # Limit regex to be from 2000 to 2029
        if year:
            year = year.group(0)
        else:
            # Rare case: Sometimes the title is too long. Rely on Heard date.
            year = re.search(r'Heard:.* ([2][0][0-2][0-9])', case)
            if year:
                year = year.group(1)
            else:
                print('WARNING: Year not found')

        case_dict = {
            # NOTE(review): the offset presumably assumes 50 cases per file - confirm
            'case_number': '%s of %s'%(i+1+((int(case_number)-1)*50), case_number),
            'case_title': case_title,
            'year': year,
            'registry': registry,
            'judge': judge_name,
            'decision_length': decision_len.group(1),
            'multiple_defendants': rule_based_multiple_defendants_parse(case),
            'contributory_negligence_raised': contributory_negligence_raised,
            'written_decision': written_decision,
            'plaintiff_wins': plaintiff_wins(case),
            'filename': path,
            'damages': rule_based_damage_extraction(case),
        }
        case_parsed_data.append(case_dict)

    return case_parsed_data

In [1012]:
def rule_based_multiple_defendants_parse(doc):
    ''' Helper function for rule_based_parse_BCJ

    Decides, by pattern matching on the party line of a case, whether the case
    has more than one defendant. The decision rests on whether the wording is
    plural, on how often a defendant-style word occurs, or (as a fallback) on
    the number of commas in the line.

    Arguments: doc (String): The case in text format following the form used in the DOCX to TXT notebook
    Returns: response (String, 'Y', 'N', or 'UNK')
    '''
    plural_markers = ('defendants', 'respondents', 'applicants')
    singular_markers = ('defendant', 'respondent', 'applicant')

    # Case 1) Most common form: "Between A, B, C, Plaintiff(s), X, Y, Z Defendant(s)"
    # ("IN THE MATTER OF ... Plaintiff ... Defendant ..." is accepted as well)
    party_line = re.search(r'((?:Between|IN THE MATTER OF).*(?:[P|p]laintiff[s]?|[C|c]laimant[s]?|[A|a]ppellant[s]?|[P|p]etitioner[s]?|[R|r]espondent[s]?|[A|a]pplicant[s]?).*(?:[D|d]efendant[s]?|[R|r]espondent[s]?|[A|a]pplicant[s]?).*\n)', doc)

    if party_line is None:
        # Case 2) The second party (Defendant/Respondent) is not labelled.
        # Estimate plurality from the number of "," separated pieces in the line.
        plaintiff_only_line = re.search(r'(Between.*([P|p]laintiff[s]?|[C|c]laimant[s]?|[A|a]ppellant[s]?|[P|p]etitioner[s]?).*\n)', doc)
        if plaintiff_only_line is None:
            return 'UNK'
        pieces = plaintiff_only_line.group(0).lower().split(',')
        return 'Y' if len(pieces) > 5 else 'N'

    text = party_line.group(0).lower()

    # A plural label settles it straight away
    if any(match_contains_words(text, (marker,)) for marker in plural_markers):
        return 'Y'

    # Otherwise the first singular label present decides: repeated label -> several defendants
    for marker in singular_markers:
        if marker in text:
            return 'Y' if text.count(marker) > 1 else 'N'

    # No label found in the lowercased line: no answer is produced
    return None

In [1013]:
# Rule based damage extraction REGEX patterns. rule_based_damage_extraction hands
# them to get_matching_text, which searches with re.IGNORECASE.
# NOTE(review): inside a character class '|' is a literal pipe, so classes such as
# [0-9|,|.] or [\w|-] also accept a '|' character - presumably unintended; confirm before changing.

# Optional leading word, then damage/loss/capacity/cost, then the nearest following amount
regex_damages = r'[\w|-]* ?(?:damage|loss|capacity|cost).+?\$? ?[0-9][0-9|,|.]+[0-9]'
#regex_damages = r'(?:[\w|-]* ?){0,3}(?:damage|loss|capacity|cost).+?\$? ?[0-9][0-9|,|.]+[0-9]'
#regex_in_trust = r'(?:in-?trust|award).*?\$? ?[0-9][0-9|,|.]+[0-9]'

# Rule based dmg extraction REGEX patterns
#regex_damages = r'(?![and])(?:[\w|-]* ?){0,2} ?(?:damage|loss|capacity|cost).+?\$? ?[0-9][0-9|,|.]+[0-9]'
# Amount first (not directly preceded by ':'), then optional for/representing and words ending in 'damage(s)'
regex_damages_2 = r'[^:] \$? ?[0-9][0-9|,|.]+[0-9] (?:for|representing)?[ \w\-+]+damages?'
# As regex_damages_2, but continues up to the next ';', '.' or newline
regex_damages_3 = r'[^:] \$? ?[0-9][0-9|,|.]+[0-9] (?:for|representing)?[ \w\-+]+damages?(?:(?:for|representing)?.*?[;.\n])'
# future/past/in-trust/award, optionally loss/cost(s)/income/care, then the nearest following amount
regex_future_care_loss = r'(?:future|past|in[-| ]?trust|award).*?(?:loss|costs?|income|care)?.*?\$? ?[0-9][0-9|,|.]+[0-9]'
# Amount, then ' cost ' and the text up to the next '.'
regex_for_cost_of = r'\$? ?[0-9][0-9|,|.]+[0-9][\w ]*? cost .*?\.'

# Keywords to look in match for categorization
# Each tuple lists words that must all occur in a match; a word prefixed with '!'
# must NOT occur (see match_contains_words). Within a list the first matching tuple is used.
general_damage_keywords = [('general',), ('future', 'income', 'loss'), ('future', 'income'), ('future', 'wage', 'loss'), ('future', 'earning'), ('!past', 'earning', 'capacity'), ('future', 'capacity'), ('future', 'earning'), ('!past', 'loss', 'opportunity'), ('!past', 'loss', 'housekeep'), ('ei', 'benefit')]
special_damage_keywords = [('special',), ('trust',), ('past', 'income', 'loss'), ('past', 'wage'), ('past', 'earning'), ('past', 'income'), ('earning', 'capacity')]
aggravated_damage_keywords = [('aggravated',)]
non_pecuniary_damage_keywords = [('non', 'pecuniary')]
punitive_damage_keywords = [('punitive',)]
future_care_damage_keywords = [('future', 'care'), ('future', 'cost')]

def rule_based_damage_extraction(doc, min_score = 0.9, max_match_len_split = 10):
    '''Helper function for rule_based_parse_BCJ
    
    Given a case, attempts to extract damages using regex patterns.
    
    The case is split into numbered paragraphs; every paragraph containing a
    '$' amount is scored by its relative position in the document. Only
    paragraphs scoring above min_score are searched with the damage patterns.
    Matches from a summary/conclusion section replace all other matches when
    any exist. Each usable match is assigned to the first damage category
    whose keywords it contains.
    
    Arguments: doc (String): The case in text format following the form used in the DOCX to TXT notebook
    min_score (float): The minimum paragraph score to consider having a valid $ number
                       Paragraph has score 1 if its the last paragraph
                       Paragraph has score 0 if its the first paragraph
    max_match_len_split (int): The max amount of items that can appear in a regex match after splitting (no. words)
    
    Returns: damages (Dict): Contains any found damages under the keys 'Total',
             'Pecuniary Total', 'Non-pecuniary', 'Special', 'General', 'Punitive',
             'Aggravated' and 'Future Care' (None where nothing was found).
             Returns None instead of a dict when no paragraph contains a '$'
             amount or the highest scoring such paragraph scores below min_score.
    
    Raises: AttributeError if doc contains no '(N paras.)' marker.
    
    '''
    damages = defaultdict(float)
    repetition_detection = defaultdict(set) # try to stem the repeated values
    no_paras = re.search(r'\(([0-9|,]+) paras?\.?\)', doc).group(1) # Get number of paragraphs
    # A paragraph starts at a newline followed by a number of at most len(no_paras)
    # characters and two whitespace characters. Because the pattern has a capturing
    # group, re.split also returns the captured text as separate list items.
    pattern = r'([.]?)(?=\n[0-9]{1,%s}[\xa0|\s| ]{2})'%len(no_paras) # Used to split into paras
    paras_split = re.split(pattern, doc)
    money_patt = r'\$[0-9|,]+' # Used to get all paragraphs with a money amount
    scored_paras = [] # Score paragraphs based on where they appear in the document
                      # Score of 0.0 would be the first paragraph. Score of 1.0 would be the last paragraph
        
    for i, paragraph in enumerate(paras_split):
        if re.search(money_patt, paragraph):
            scored_paras.append((i / len(paras_split), paragraph)) # (score, paragraph). Score formula: i / number of split items
            
    scored_paras = sorted(scored_paras, key=lambda x:x[0])[::-1] # Store from last paragraph to first
    if len(scored_paras) == 0:
        return None
    if scored_paras[0][0] < min_score: #If highest scored paragraph is less than minimum score.
        return None
    
    patterns = [regex_damages, regex_damages_2, regex_damages_3, regex_future_care_loss, regex_for_cost_of]
    banned_words = ['seek', 'claim', 'propose', 'range', ' v. '] # Skip paragraphs containing these
    counter_words = ['summary', 'dismissed'] # Unless these are mentioned. 
                                             # example) "Special damage is $5k. But claims for aggravated are 'dismissed'" 
    
    # Get money amounts from the text
    total = None
    matches = []
    summary_matches = []
    for i, scored_para in enumerate(scored_paras):
        text = scored_para[1]
        score = scored_para[0]
        
        # Only paragraphs scoring strictly above min_score are searched
        if score > min_score:
            # Paragraph opens with a summary/conclusion heading (within its first four tokens)
            if any(item.startswith('summary') for item in text.lower().split()[:4]) or any(item.startswith('conclusion') for item in text.lower().split()[:4]):
                text_matches = get_matching_text(patterns, text, max_match_len_split)
                for t_m in text_matches:
                    summary_matches.append((score, t_m))
            # The next entry of scored_paras (the next lower scoring money paragraph)
            # ends with a summary/conclusion heading (within its last four tokens)
            elif i+1 < len(scored_paras) and (any(item.startswith('summary') for item in scored_paras[i+1][1].lower().split()[-4:]) or any(item.startswith('conclusion') for item in scored_paras[i+1][1].lower().split()[-4:])):
                text_matches = get_matching_text(patterns, text, max_match_len_split)
                for t_m in text_matches:
                    summary_matches.append((score, t_m))
            else:
                # Banned and counter words are compared case-sensitively against the raw paragraph
                skip = False # Skip paras with banned words
                for banned_word in banned_words: 
                    if banned_word in text:
                        skip = True       
                for counter_word in counter_words:
                    if counter_word in text:
                        skip = False
                if skip:
                    continue

                text_matches = get_matching_text(patterns, text, max_match_len_split)
                for t_m in text_matches:
                    matches.append((score, t_m))
    # print(doc.split()[0])
    # print(matches)
    # Only keep matches from the summary if a summary was found. If not keep all matches.
    if len(summary_matches) > 0: 
        matches = summary_matches
    # Extract $ value. Determine correct column
    regex_number_extraction = r' ?[0-9][0-9|,|.]+[0-9]'
    for score, match in matches:
        skip = False # Banned words should not appear in final matches
        for banned_word in banned_words: 
            if banned_word in match:    
                skip = True
                break
        if skip:
            continue
        
        amount = re.findall(regex_number_extraction, match, re.IGNORECASE)
        extracted_value = clean_money_amount(amount)
        if extracted_value is None: # Make sure we are able to extract a value
            continue
            
        # Categories are tried in a fixed order: General, Special, Non-pecuniary,
        # Aggravated, Punitive, Future Care. The first category whose keywords
        # appear in the match claims the value.
        value_mapped = False # If we mapped the value into a damage category - stop trying to map into other categories
        value_mapped = assign_damage_to_category(extracted_value, general_damage_keywords, match, score, matches, 'General', damages, repetition_detection, repetition_key = ('general',))
        if not value_mapped:
            value_mapped = assign_damage_to_category(extracted_value, special_damage_keywords, match, score, matches, 'Special', damages, repetition_detection, repetition_key = ('special',))
        if not value_mapped:
            value_mapped = assign_damage_to_category(extracted_value, non_pecuniary_damage_keywords, match, score, matches, 'Non-pecuniary', damages, repetition_detection, repetition_key = ('non','pecuniary'))
        if not value_mapped:
            value_mapped = assign_damage_to_category(extracted_value, aggravated_damage_keywords, match, score, matches, 'Aggravated', damages, repetition_detection, repetition_key = ('aggravated',))
        if not value_mapped:
            value_mapped = assign_damage_to_category(extracted_value, punitive_damage_keywords, match, score, matches, 'Punitive', damages, repetition_detection, repetition_key = ('punitive',))
        if not value_mapped:
            # No repetition_key given: repetitions are tracked per matching keyword tuple
            value_mapped = assign_damage_to_category(extracted_value, future_care_damage_keywords, match, score, matches, 'Future Care', damages, repetition_detection) 
        if not value_mapped: # Last attempt: Only use "total amounts" if nothing else was found
            total_keywords = [('total',), ('sum',), ('award',)]
            for keywords in total_keywords:
                if match_contains_words(match.lower(), keywords):
                    if is_best_score(score, matches, keywords):
                        if extracted_value not in repetition_detection[('total',)]:
                            # Recompute the running totals; the candidate is only remembered
                            # while no categorised damages have been found so far
                            damages['Pecuniary Total'] = damages['Special'] + damages['General'] + damages['Punitive'] + damages['Aggravated'] + damages['Future Care']
                            damages['Total'] = damages['Pecuniary Total'] + damages['Non-pecuniary']
                            if damages['Total'] == 0:
                                total = extracted_value
                                repetition_detection[('total',)].add(extracted_value)
                        
    damages['Pecuniary Total'] = damages['Special'] + damages['General'] + damages['Punitive'] + damages['Aggravated'] + damages['Future Care']
    damages['Total'] = damages['Pecuniary Total'] + damages['Non-pecuniary']
    
    if damages['Total'] == 0 and total is not None: # Only use the "total" if we couldnt find anything else!
        damages['Total'] = total
        damages['General'] = total
        
    # Report amounts that are still zero as None (nothing found)
    columns = ['Total', 'Pecuniary Total', 'Non-pecuniary', 'Special', 'General', 'Punitive', 'Aggravated', 'Future Care']
    for c in columns:
        damages[c] = None if damages[c] == 0 else damages[c]
    
    # print(damages)
    return damages

def assign_damage_to_category(damage, damage_keywords, match, match_score, matches, damage_type, damage_dict, repetition_dict, repetition_key = None):
    '''Helper function for rule based damage extraction.
    
    Checks whether the match belongs to a damage category. If it does, and it
    is the highest scoring match for the keywords and its amount was not
    recorded before, the amount is added to that category.
    
    Arguments:
    damage (float) - The damage amount in the match
    damage_keywords (list) - Keyword tuples of the category; the first tuple found in the match is used
    match (string) - The match string itself
    match_score (float) - Score of the paragraph the match came from
    matches (list) - All matches. Used to determine if we found the best match
    damage_type (string) - Key of damage_dict the amount is added to
    damage_dict (dict) - Dictionary storing all damages
                       - Will be modified in place
    repetition_dict (dict) - Dictionary storing the amounts already seen
                           - Will be modified in place
    (Optional) repetition_key (Tuple) - If given (and non-empty), used as the key for repetitions. Else the matching keyword tuple is used
    
    Returns:
    value_belongs (Boolean) - True if the value belongs in the given keyword category. False otherwise
    '''
    lowered_match = match.lower()

    for keywords in damage_keywords:
        if not match_contains_words(lowered_match, keywords):
            continue

        # The match belongs to this category, whether or not the amount gets recorded
        if is_best_score(match_score, matches, keywords):
            already_seen = repetition_dict[repetition_key or keywords]
            if damage not in already_seen:
                damage_dict[damage_type] += damage
                already_seen.add(damage)
        return True

    return False

def clean_money_amount(money_regex_match):
    '''Helper function for rule based damage extraction.
    
    Converts the single money amount found in a match into a float.
    
    Arguments:
    money_regex_match (list of strings, as returned by re.findall) - The $ amounts found in a match
    
    Returns:
    None if a bad match (no amount at all, or more than one amount)
    extracted_value (float) - The money amount in float form
    
    Raises:
    ValueError if the remaining text is not a valid number
    '''
    # If our regex contains more than 1 or 0 money values. We cannot use the match.
    if len(money_regex_match) > 1:
        return None
    if len(money_regex_match) == 0:
        print('Error: No Money in match!', money_regex_match)
        return None

    amount = money_regex_match[0].replace(',', '')
    # Deals with money at end of sentence. example) ... for '5,000.00.' -> '5000.00'
    if amount.endswith('.'):
        amount = amount[:-1]

    # Deals with a rare typo in some cases. example) 50.000.00 -> 50000.00
    # Only the last dot is a decimal point; every earlier one is dropped.
    if amount.count('.') > 1:
        whole_part, _, decimal_part = amount.rpartition('.')
        amount = whole_part.replace('.', '') + '.' + decimal_part

    return float(amount)

def get_matching_text(patterns, text, max_match_len_split):
    '''Helper function for rule based damage extraction.
    
    Collects every piece of text that one of the regex patterns finds
    (case-insensitively). A find is dropped when it contains 'and' or when it
    splits into more than max_match_len_split whitespace separated items.
    
    Arguments:
    patterns (list) - List of regex patterns in string format
    text (string) - Text to search for matches in
    max_match_len_split (int) - Largest number of whitespace separated items a kept match may have
    
    Returns:
    matches (list) - List containing all kept matches in text format, grouped by pattern order
    '''
    kept = []
    for pattern in patterns:
        candidates = re.findall(pattern, text, re.IGNORECASE)
        kept.extend(
            candidate for candidate in candidates
            if 'and' not in candidate and len(candidate.split()) <= max_match_len_split
        )
    return kept

def is_best_score(score, matches, keywords):
    '''Helper function for rule based damage extraction.
    
    Tells whether no match that contains all the keywords has a score
    strictly higher than the given score.
    
    Arguments:
    score (float) - The score of the item you're inspecting
    matches (list) - List of matches where each element is of form (score, match text)
    keywords (tuple) - All words that should appear in the match
    
    Returns: True or False
    
    '''
    beaten = any(
        all(word in other_text.lower() for word in keywords) and other_score > score
        for other_score, other_text in matches
    )
    return not beaten

def match_contains_words(match, words):
    '''Helper function for rule based damage extraction.
    
    Checks a piece of text against a group of words. Plain words must all
    occur in the text; words written with a leading '!' (i.e. '!past') must
    all be absent from it. Both kinds may be mixed freely.
    
    Arguments:
    match (String) - The text to look for words in
    words (list) - List of words to check for. If word begins with ! the remainder of the word cannot appear in the text
    
    Returns:
    True if all plain words are present and no '!' word is present
    False otherwise
    
    '''
    terms = list(words)
    required = [term for term in terms if not term.startswith('!')]
    forbidden = [term[1:] for term in terms if term.startswith('!')]

    has_all_required = all(term in match for term in required)
    return has_all_required and not any(term in match for term in forbidden)


    
def rule_based_convert_cases_to_DF(cases):
    '''Given a list of parsed cases returns a dataframe

    Every case dictionary becomes one row. The damage columns are read from
    case['damages'] and are None when no damages dictionary is present.
    '''
    # (dataframe column, key in the case dictionary)
    leading_fields = (
        ('Case Number', 'case_number'),
        ('Case Name', 'case_title'),
        ('Year', 'year'),
    )
    # (dataframe column, key in case['damages'])
    damage_fields = (
        ('Total Damage', 'Total'),
        ('Total Pecuniary', 'Pecuniary Total'),
        ('Non Pecuniary', 'Non-pecuniary'),
        ('General', 'General'),
        ('Special', 'Special'),
        ('Punitive', 'Punitive'),
        ('Aggravated', 'Aggravated'),
        ('Future Care', 'Future Care'),
    )
    trailing_fields = (
        ('Judge Name', 'judge'),
        ('Decision Length', 'decision_length'),
        ('Multiple defendants?', 'multiple_defendants'),
        ('File', 'filename'),
        ('Plaintiff Wins?', 'plaintiff_wins'),
        ('Contributory Negligence Raised', 'contributory_negligence_raised'),
        ('Written Decision?', 'written_decision'),
        ('Registry', 'registry'),
    )

    column_values = defaultdict(list)
    for case in cases:
        for header, field in leading_fields:
            column_values[header].append(case[field])

        found_damages = case['damages']
        for header, damage_key in damage_fields:
            column_values[header].append(found_damages[damage_key] if found_damages is not None else None)

        for header, field in trailing_fields:
            column_values[header].append(case[field])

    # Columns are added one by one, in the order they were first filled
    frame = pd.DataFrame()
    for header, values in column_values.items():
        frame[header] = values

    return frame

def plaintiff_wins(case):
    '''Estimates from the text of a single case whether the plaintiff won.

    The line containing 'HELD' is checked first. If it gives no answer, the
    second-to-last line of the case is checked for typical closing wording
    (for a case that consists of one line only, that line is checked).

    Arguments: case (String): The case in text format
    Returns: (String) 'Y' if the plaintiff appears to win, 'N' if the case
             appears to be dismissed, 'OpenCase' if neither could be determined
    '''
    # regex search for keyword HELD in cases, which determines if case was allowed or dismissed
    held = re.search(r'HELD.+', case)
    if held:
        held_text = held.group(0)
        if "allowed" in held_text or "favour" in held_text or "awarded" in held_text:
            return 'Y'
        if "dismissed" in held_text:
            return 'N'

    lines = case.strip().split('\n')
    # NOTE(review): presumably the last line is a signature and the line before
    # it holds the closing order - confirm against the data.
    # A one-line case has no second-to-last line; use the only line available
    # (previously this raised IndexError).
    closing_line = lines[-2] if len(lines) >= 2 else lines[-1]

    # A dismissal takes precedence over everything else, unless negated
    if re.search(r'dismiss(.+)?.+', closing_line) and "not dismissed" not in closing_line:
        return 'N'

    plaintiff_patterns = (
        # 'award' followed by further text
        r'award(.+)?.+?(plaintiff(.+)?)?',
        # plaintiff/defendant/applicant....entitled/have...costs
        r'(plaintiff|defendant.?|applicant)(.+)?(entitle(.)?(.+)?|have).+?cost(.+)?',
        # successful...(case)
        r'successful(.+)?.+?',
        # costs ... awarded / in the cause
        r'costs.+?(award(.+)?|cause).+?',
    )
    if any(re.search(pattern, closing_line) for pattern in plaintiff_patterns):
        return 'Y'

    return "OpenCase"

#### Code driver

In [1014]:
# Parse every case file in turn and pool the resulting case dictionaries in one list
data = []
for file_number in file_identifiers:
    print('## Processing ' + path_to_data + file_prefix + str(file_number) + file_suffix + ' ##\n')
    #if file_number == 33:
    data.extend(rule_based_parse_BCJ(path_to_data + file_prefix + str(file_number) + file_suffix))
        
    #break 

## Processing ../data/Lexis Cases txt/P1.txt ##

## Processing ../data/Lexis Cases txt/P2.txt ##

## Processing ../data/Lexis Cases txt/P3.txt ##

## Processing ../data/Lexis Cases txt/P4.txt ##

## Processing ../data/Lexis Cases txt/P5.txt ##

## Processing ../data/Lexis Cases txt/P6.txt ##

## Processing ../data/Lexis Cases txt/P7.txt ##

## Processing ../data/Lexis Cases txt/P8.txt ##

## Processing ../data/Lexis Cases txt/P9.txt ##

## Processing ../data/Lexis Cases txt/P10.txt ##

## Processing ../data/Lexis Cases txt/P11.txt ##

## Processing ../data/Lexis Cases txt/P12.txt ##

## Processing ../data/Lexis Cases txt/P13.txt ##

## Processing ../data/Lexis Cases txt/P14.txt ##

## Processing ../data/Lexis Cases txt/P15.txt ##

## Processing ../data/Lexis Cases txt/P16.txt ##

## Processing ../data/Lexis Cases txt/P17.txt ##

## Processing ../data/Lexis Cases txt/P18.txt ##

## Processing ../data/Lexis Cases txt/P19.txt ##

## Processing ../data/Lexis Cases txt/P20.txt ##

## Proces

In [895]:
# Tabulate the parsed cases: one dataframe row per case dictionary
dev_df = rule_based_convert_cases_to_DF(data)

In [619]:
# Inspect the column names of df.
# NOTE(review): the recorded output lists 'Title' and 'Judge', which differ from the
# 'Case Name' and 'Judge Name' columns built by rule_based_convert_cases_to_DF -
# presumably output of an earlier run; confirm.
df.columns

Index(['Case Number', 'Title', 'Year', 'Total Damage', 'Total Pecuniary',
       'Non Pecuniary', 'General', 'Special', 'Punitive', 'Aggravated',
       'Future Care', 'Judge', 'Decision Length', 'Multiple Defendants',
       'File', 'Plaintiff Wins', 'Contributory Negligence Raised',
       'Written Decision'],
      dtype='object')

In [568]:
# Write df to 'third_pass.csv' without the index column
df.to_csv('third_pass.csv', index=False)

In [315]:
# Sanity check: 'housekeep' is found as a substring of 'housekeeping'
match_contains_words('loss of housekeeping capacity: $11,000', ('loss', 'housekeep'))

True

In [682]:
# Load the gold annotations; header=2 takes the column names from the third row of the file
df= pd.read_csv('../data/gold_annotations.csv', header=2)

In [684]:
# NOTE(review): 'names' is not used anywhere in the visible part of the notebook - confirm it is still needed
names = ['Mawani v. Pitcairn, [2012] B.C.J. No. 1819', 'd']
df.dropna(how = 'all', inplace=True) # If all values are empty in a row. Drop that row

In [1010]:
def _normalise_eval_value(value):
    '''Brings a cell value into comparable form: float if possible, None for NaN, lowercased/stripped if text.'''
    # Convert to float if possible
    try:
        value = float(value)
    except (TypeError, ValueError):
        pass
    # NaN floats stand for an empty field
    if isinstance(value, float) and math.isnan(value):
        return None
    if isinstance(value, str):
        return value.lower().strip()
    return value


def evaluate(dev_data, gold_data, subset=None):
    '''Prints per-column accuracy of the extracted data against the gold annotations.

    Rows are paired through the 'Case Name' column; only case names present in
    both dataframes are compared. For each column that exists in both frames
    (after renaming ours to the gold names) the accuracy is printed separately
    for fields that are empty in the gold data, for filled fields, and overall.
    Numeric values count as equal when they differ by at most 1.

    Arguments:
    dev_data (DataFrame) - Extracted data, as built by rule_based_convert_cases_to_DF
    gold_data (DataFrame) - Gold annotations; must contain a 'Case Name' column
    subset (list, optional) - Column names (gold naming) to evaluate. All shared columns if None

    Returns: None (results are printed)
    '''
    print('#### Evaluation ####')

    # Use case name as 'primary key'
    dev_case_names = list(dev_data['Case Name'])
    gold_case_names = list(gold_data['Case Name'])

    # Filter data to only use overlapping items
    gold_data = gold_data[gold_data['Case Name'].isin(dev_case_names)]
    dev_data = dev_data[dev_data['Case Name'].isin(gold_case_names)]

    # Mapping from our variable names to Lachlan's column names
    column_mapping = {'Decision Length': 'Decision Length: paragraphs)',
                      'Total Damage': '$ Damages total before contributory negligence',
                      'Non Pecuniary': '$ Non-Pecuniary Damages',
                      'Total Pecuniary': '$ Pecuniary Damages Total',
                      'Special': '$ Special damages Pecuniary (ie. any expenses already incurred)',
                      'Future Care': 'Future Care Costs (General Damages)',
                      'General': '$ General Damages',
                      'Punitive': '$ Punitive Damages',
                      'Aggravated': '$Aggravated Damages'}
    # rename() returns a new frame; renaming the filtered slice in place
    # triggers pandas' SettingWithCopyWarning
    dev_data = dev_data.rename(columns=column_mapping)

    if subset is None: # Use all columns if no subset specified
        subset = dev_data.columns

    for column in dev_data.columns:
        if column not in gold_data.columns or column not in subset:
            continue

        empty_correct = 0
        non_empty_correct = 0
        total_empty = 0
        total_non_empty = 0
        for case_name in list(dev_data['Case Name']):
            # If a case name occurs several times, its first row is used
            dev_value = _normalise_eval_value(list(dev_data[dev_data['Case Name'] == case_name][column])[0])
            gold_value = _normalise_eval_value(list(gold_data[gold_data['Case Name'] == case_name][column])[0])

            if gold_value is None:
                total_empty += 1
                if dev_value is None:
                    empty_correct += 1
            else:
                total_non_empty += 1
                if isinstance(dev_value, float) and isinstance(gold_value, float):
                    if math.isclose(dev_value, gold_value, abs_tol=1): # Tolerance within 1
                        non_empty_correct += 1
                elif dev_value == gold_value:
                    non_empty_correct += 1

        total_correct = empty_correct + non_empty_correct
        total_cases = total_non_empty + total_empty

        print('-------')
        print('COLUMN:', column)
        if total_empty != 0:
            print('Empty field accuracy:', empty_correct / total_empty * 100, '%', empty_correct, '/', total_empty)
        if total_non_empty != 0:
            print('Filled field accuracy:', non_empty_correct / total_non_empty * 100, '%', non_empty_correct, '/', total_non_empty)
        if total_cases != 0:
            print('Overall accuracy:', total_correct / total_cases * 100, '%', total_correct, '/', total_cases)
        else:
            # No overlapping case names: nothing to divide by
            print('Overall accuracy: n/a (no overlapping cases)')


def _eval_first_values(frame, column):
    '''Map every case name in `frame` to the first value it has in `column`.'''
    lookup = {}
    for name, value in zip(frame['Case Name'], frame[column]):
        lookup.setdefault(name, value)  # first occurrence wins
    return lookup


def _eval_normalize(value):
    '''Prepare one cell for comparison.

    Numeric-looking values become floats, NaN becomes None and strings are
    lowercased and stripped. Anything else is returned unchanged.
    '''
    try:
        value = float(value)
    except (TypeError, ValueError, OverflowError):
        pass  # not numeric: compare it as text (or as-is)
    if isinstance(value, float) and math.isnan(value):
        return None
    if isinstance(value, str):
        return value.lower().strip()
    return value


def _eval_judge_matches(dev_val, gold_val):
    '''Return True if the first word of the gold judge name occurs in the dev name.

    A missing/empty gold name only matches a missing/empty dev name.
    '''
    gold_tokens = gold_val.split() if isinstance(gold_val, str) else []
    dev_name = dev_val if isinstance(dev_val, str) else ''
    if not gold_tokens:
        return not dev_name.strip()
    return gold_tokens[0].lower() in dev_name.lower()


def evaluate_2(dev_data, gold_data, subset=None):
    '''Compare parsed (dev) values against gold annotations, column by column.

    Only cases whose 'Case Name' appears in both frames are compared. The dev
    columns are renamed to the gold column names first. For each evaluated
    column the mismatching cases and an accuracy line are printed.

    Arguments:
        dev_data (DataFrame): parsed data, must contain a 'Case Name' column
        gold_data (DataFrame): gold annotations, must contain a 'Case Name' column
        subset: column name (str) or collection of column names (gold naming)
                to evaluate. None evaluates every column shared by both frames.
    Returns: (dev_data, gold_data) restricted to the shared cases, with the
             dev columns renamed. The frames passed in are not modified.
    '''
    dev_case_names = list(dev_data['Case Name'])
    gold_case_names = list(gold_data['Case Name'])

    gold_data = gold_data[gold_data['Case Name'].isin(dev_case_names)]
    dev_data = dev_data[dev_data['Case Name'].isin(gold_case_names)]

    # mapping from our format to lachlan
    column_mapping = {'Decision Length': 'Decision Length: paragraphs)',
                      'Total Damage': '$ Damages total before contributory negligence',
                      'Non Pecuniary': '$ Non-Pecuniary Damages',
                      'Total Pecuniary': '$ Pecuniary Damages Total',
                      'Special': '$ Special damages Pecuniary (ie. any expenses already incurred)',
                      'Future Care': 'Future Care Costs (General Damages)',
                      'General': '$ General Damages',
                      'Punitive': '$ Punitive Damages',
                      'Aggravated': '$Aggravated Damages'}

    # Rename into a new frame: an in-place rename on a filtered slice
    # triggers pandas' SettingWithCopyWarning.
    dev_data = dev_data.rename(columns=column_mapping)

    if subset is None:
        subset = dev_data.columns
    elif isinstance(subset, str):
        # A bare string would make `column in subset` a substring test.
        subset = [subset]
    wanted_columns = set(subset)

    case_names = list(dev_data['Case Name'])

    for column in dev_data.columns:
        if column not in gold_data.columns or column not in wanted_columns:
            continue

        print('----------')
        print(column)
        total = 0
        correct = 0

        # One pass per column instead of a boolean-mask lookup per case.
        dev_lookup = _eval_first_values(dev_data, column)
        gold_lookup = _eval_first_values(gold_data, column)

        for case_name in case_names:
            if case_name not in dev_lookup or case_name not in gold_lookup:
                continue  # e.g. a NaN case name, which never equals itself

            dev_val = dev_lookup[case_name]
            gold_val = gold_lookup[case_name]

            if column == 'Judge Name':
                if _eval_judge_matches(dev_val, gold_val):
                    correct += 1
                total += 1
                continue

            dev_val = _eval_normalize(dev_val)
            gold_val = _eval_normalize(gold_val)

            if isinstance(dev_val, float) and isinstance(gold_val, float):
                if math.isclose(dev_val, gold_val, abs_tol=1):  # Tolerance within 1
                    correct += 1
                else:
                    print(case_name)
                    print('DEV VALUE', dev_val, type(dev_val))
                    print('GOLD VALUE', gold_val, type(gold_val))
                    print('+')
            elif dev_val is None and gold_val is None:
                correct += 1
            elif dev_val == gold_val:
                correct += 1
            else:
                print(case_name)
                print('DEV VALUE', dev_val, type(dev_val))
                print('GOLD VALUE', gold_val, type(gold_val))
                print('-')
            total += 1

        if total:
            print('Accuracy:', correct/total*100, '%', correct, '/', total)
        else:
            # Nothing to compare: avoid dividing by zero.
            print('Accuracy: n/a (no cases shared between dev and gold data)')

    return dev_data, gold_data

In [1019]:
# Show complete frames when inspecting the evaluation results below.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# The column names of the gold annotation sheet are on its third row.
gold_df = pd.read_csv('../data/gold_annotations.csv', header=2)
gold_df.dropna(how='all', inplace=True)  # drop rows that are entirely empty

dev_df = rule_based_convert_cases_to_DF(data)

# Pass the column as a list: a bare string makes `column in subset` a
# substring test instead of an exact column-name match.
d1, g1 = evaluate_2(dev_df, gold_df, subset=['Multiple defendants?'])

----------
Multiple defendants?
Najdychor v. Swartz, [2009] B.C.J. No. 1202
DEV VALUE n <class 'str'>
GOLD VALUE y <class 'str'>
-
Akbari v. Insurance Corp. of British Columbia, [2012] B.C.J. No. 2451
DEV VALUE n <class 'str'>
GOLD VALUE y <class 'str'>
-
Stegemann v. Pasemko, [2007] B.C.J. No. 1585
DEV VALUE n <class 'str'>
GOLD VALUE y <class 'str'>
-
Accuracy: 97.54098360655738 % 119 / 122


In [930]:
d1

Unnamed: 0,Case Number,Case Name,Year,$ Damages total before contributory negligence,$ Pecuniary Damages Total,$ Non-Pecuniary Damages,$ General Damages,$ Special damages Pecuniary (ie. any expenses already incurred),$ Punitive Damages,$Aggravated Damages,Future Care Costs (General Damages),Judge Name,Decision Length: paragraphs),Multiple defendants?,File,Plaintiff Wins?,Contributory Negligence Raised,Written Decision?,Registry
0,1 of 1,"Mawani v. Pitcairn, [2012] B.C.J. No. 1819",2012,,,,,,,,,S.F. Kelleher J.,115,Y,../data/Lexis Cases txt/P1.txt,Y,Y,Y,Vancouver
1,2 of 1,"Ediger (Guardian ad litem of) v. Johnston, [20...",2009,20000.0,,,20000.0,,,,,H.J. Holmes J.,350,N,../data/Lexis Cases txt/P1.txt,Y,N,Y,Vancouver
2,3 of 1,"Furness v. Guest, [2010] B.C.J. No. 1388",2010,42041.4,42041.4,,,42041.4,,,,D.A. Halfyard J.,97,Y,../data/Lexis Cases txt/P1.txt,Y,Y,Y,Nanaimo
51,53 of 2,"Ruchelski v. Moore, [2013] B.C.J. No. 561",2013,105982.0,105982.0,,75000.0,16982.0,,,14000.0,P. Abrioux J.,110,Y,../data/Lexis Cases txt/P2.txt,Y,N,Y,Vernon
97,100 of 2,"Abbott v. Gerges, [2014] B.C.J. No. 1848",2014,299152.0,249152.0,50000.0,,249152.0,,,,L.A. Warren J.,208,Y,../data/Lexis Cases txt/P2.txt,Y,Y,Y,Vancouver
98,101 of 3,"Brooks-Martin v. Martin, [2011] B.C.J. No. 243",2011,,,,,,,,,D.A. Halfyard J.,172,Y,../data/Lexis Cases txt/P3.txt,Y,Y,Y,Nanaimo
146,152 of 4,"McGavin v. Talbot, [2017] B.C.J. No. 2439",2017,88536.82,88536.82,,72000.0,15636.82,,,900.0,D.M. Masuhara J.,59,Y,../data/Lexis Cases txt/P4.txt,Y,N,Y,Victoria
147,153 of 4,"Aberdeen v. Langley (Township), [2007] B.C.J. ...",2007,5163134.0,4852134.0,311000.0,502381.0,198249.0,,,4151504.0,Groves J.,243,Y,../data/Lexis Cases txt/P4.txt,Y,Y,Y,Vancouver
195,201 of 5,"Mclaren v. Rice, [2009] B.C.J. No. 2108",2009,,,,,,,,,T.R. Brooke J.,46,Y,../data/Lexis Cases txt/P5.txt,Y,Y,Y,Vancouver
197,203 of 5,"Neidermayer v. Gillies, [2012] B.C.J. No. 183",2012,30000.0,,,30000.0,,,,,T.C. Armstrong J.,120,Y,../data/Lexis Cases txt/P5.txt,Y,Y,Y,Nanaimo


In [780]:
g1

Unnamed: 0,Case Number,Case Name,Written Decision?,Plaintiff Wins?,Multiple defendants?,Judge Name,Decision Length: paragraphs),Registry,$ Damages total before contributory negligence,$ Non-Pecuniary Damages,$ Pecuniary Damages Total,$ Special damages Pecuniary (ie. any expenses already incurred),Future Care Costs (General Damages),$ General Damages,$ Punitive Damages,$Aggravated Damages,Contributory Negligence Raised?,Contributory Negligence Successful?,% Reduction as a result of contributory negligence,$ Reduction as a result of contributory negligence,$ Final Award after contributory negligence,Unnamed: 21
0,1 of 1,"Mawani v. Pitcairn, [2012] B.C.J. No. 1819",Y,Y,,Kelleher J,115,Vancouver,,,,,,,,,y,Y,50.0,,,
2,101 of 3,"Brooks-Martin v. Martin, [2011] B.C.J. No. 243",Y,Y,,Halfyard J.,172,Nanaimo,,,,,,,,,y,Y,30.0,,,
4,201 of 5,"Mclaren v. Rice, [2009] B.C.J. No. 2108",n,Y,,Brooke J,46,Vancouver,,,,,,,,,,,,,,
5,251 of 6,"Jacobs v. Basil, [2017] B.C.J. No. 1517",N,Y,,Abrioux J,229,Vernon,290000.0,,290000.0,15000.0,,275000.0,,,y,y,50.0,145000.0,150000.0,
6,301 of 7,"Jackson v. Fisheries and Oceans Canada, [2006]...",Y,Y,,Halfyard J.,38,Prince Rupert,,,,,,,,,,,,,,
7,351 of 8,"Brito (Guardian ad litem of) v. Woolley, [2001...",y,y,,Prowse J,694,Vancouver,1506035.2,260000.0,1246035.2,259239.5,,986795.71,,,,,,,1506035.2,
8,401 of 9,Rackstraw (Litigation guardian of) v. Robertso...,Y,N,,Fisher J,38,Vancouver,,,,,,,,,,,,,,
9,451 of 10,"Intrawest Corp. v. Hart, [2002] B.C.J. No. 301",Y,Y,,Melnick J,76,Vancouver,353520.0,,,,,353520.0,,,,50,176760.0,,176760.0,
10,501 of 11,"Millard v. Singleton, [2015] B.C.J. No. 1234",Y,Y,,Sharma J,70,Vancouver,,,,,,,,,y,N,,,,
11,551 of 12,"C.H. v. British Columbia, [2003] B.C.J. No. 1706",y,y,,Owen-Flood J,234,Victoria,152500.0,,,,,142500.0,,10000.0,,,,,,


# CURRENTLY UNUSED CODE

In [None]:
def parse_chfl_case(doc):
    '''Print the rule-based fields of a single case given as one string.

    The title is read from the first line and the judge name from the fourth
    line (its first 8 characters are dropped). Registry and decision length
    are not extracted and are reported as None and 0.

    Arguments: doc (String): The full text of one case
    Returns: None (results are printed)
    '''
    rows = doc.split('\n')
    judge = rows[3][8:]
    title = rows[0]
    year_match = re.search(r'20[0-2][0-9]', title) # Limit regex to be from 2000 to 2029

    report = [('CASE TITLE:', title), ('REGISTRY:', None)]
    if year_match:
        report.append(('YEAR:', year_match.group(0)))
    report.append(('JUDGE NAME:', judge))
    report.append(('DECISION LENGTH:', 0))

    for label, value in report:
        print(label, value)

    print('====================')
    
def parse_ilr_case(doc):
    '''Print the rule-based fields of a single case given as one string.

    The title is read from the first line and the judge name from the fourth
    line (its first 8 characters are dropped). The registry is not extracted
    and is reported as None. A document with fewer than four lines reports
    the judge name as None instead of raising.

    Arguments: doc (String): The full text of one case
    Returns: None (results are printed)
    '''
    lines = doc.split('\n')
    registry = None
    judge_name = lines[3][8:] if len(lines) > 3 else None
    case_title = lines[0]
    year = re.search(r'20[0-2][0-9]', case_title) # Limit regex to be from 2000 to 2029

    # Search for decision length.
    # Logic: a 'Reasons for Judg...' / 'Introduction' heading sets the count
    #        to 1. After that a line only counts when the number it starts
    #        with is exactly the current count + 1. The whole leading number
    #        is compared, so '30 days' is not taken for paragraph 3.
    decision_len = 0
    for line in lines:
        if line.startswith('Reasons for Judg') or line.lower().startswith('introduction'):
            decision_len = 1

        leading_number = re.match(r'[0-9]+', line)
        if leading_number and int(leading_number.group(0)) == decision_len + 1:
            decision_len += 1

    print('CASE TITLE:', case_title)
    print('REGISTRY:', registry)

    if year:
        print('YEAR:', year.group(0))
    print('JUDGE NAME:', judge_name)
    print('DECISION LENGTH:', decision_len)

    print('====================')

def parse_cnlr_case(doc):
    '''Print the rule-based fields of a single case given as one string.

    The title is read from the first line and the judge name is the whole
    fourth line. The registry is not extracted and is reported as None. A
    document with fewer than four lines reports the judge name as None
    instead of raising.

    Arguments: doc (String): The full text of one case
    Returns: None (results are printed)
    '''
    lines = doc.split('\n')
    registry = None
    judge_name = lines[3] if len(lines) > 3 else None
    case_title = lines[0]
    year = re.search(r'20[0-2][0-9]', case_title) # Limit regex to be from 2000 to 2029

    # Search for decision length.
    # Logic: a 'Reasons for Judg...' / 'Introduction' heading sets the count
    #        to 1. After that a line only counts when the number it starts
    #        with is exactly the current count + 1. The whole leading number
    #        is compared, so '45 percent' is not taken for paragraph 4.
    decision_len = 0
    for line in lines:
        if line.startswith('Reasons for Judg') or line.lower().startswith('introduction'):
            decision_len = 1

        leading_number = re.match(r'[0-9]+', line)
        if leading_number and int(leading_number.group(0)) == decision_len + 1:
            decision_len += 1

    print('CASE TITLE:', case_title)
    print('REGISTRY:', registry)

    if year:
        print('YEAR:', year.group(0))
    print('JUDGE NAME:', judge_name)
    print('DECISION LENGTH:', decision_len)

    print('====================')
    
def parse_dtc_case(doc):
    '''Print the rule-based fields of a single case given as one string.

    The title is read from the first line and the judge name from the sixth
    line (its first 9 characters are dropped). The registry is not extracted
    and is reported as None. A document with fewer than six lines reports
    the judge name as None instead of raising.

    Arguments: doc (String): The full text of one case
    Returns: None (results are printed)
    '''
    lines = doc.split('\n')
    registry = None
    judge_name = lines[5][9:] if len(lines) > 5 else None
    case_title = lines[0]
    year = re.search(r'20[0-2][0-9]', case_title) # Limit regex to be from 2000 to 2029

    # Search for decision length.
    # Logic: a 'Reasons for Judg...' / 'Introduction' heading sets the count
    #        to 1. After that a line only counts when the number it starts
    #        with is exactly the current count + 1. The whole leading number
    #        is compared, so '3000 dollars' is not taken for paragraph 3.
    decision_len = 0
    for line in lines:
        if line.startswith('Reasons for Judg') or line.lower().startswith('introduction'):
            decision_len = 1

        leading_number = re.match(r'[0-9]+', line)
        if leading_number and int(leading_number.group(0)) == decision_len + 1:
            decision_len += 1

    print('CASE TITLE:', case_title)
    print('REGISTRY:', registry)

    if year:
        print('YEAR:', year.group(0))
    print('JUDGE NAME:', judge_name)
    print('DECISION LENGTH:', decision_len)

    print('====================')
    
def parse_bcj_case(doc):
    '''Given a string of the entire document, extract relevant info and print it.

    The title is read from the first line and the judge name from the fifth
    line. The registry is searched for as 'Registry: <name>' and, failing
    that, as '<name> Registry No.'. The decision length is taken from a
    '(<n> paras)' marker. Fields that cannot be found are reported with an
    error line (None for the judge name) instead of raising.

    Arguments: doc (String): The full text of one case
    Returns: None (results are printed)
    '''
    lines = doc.split('\n')

    # Simple Regex Fields
    registry = re.search(r'Registry: ?([A-Za-z ]+)', doc)
    decision_len = re.search(r'\(([0-9]+) paras\.?\)', doc)

    # Fields that are always in the same place
    judge_name = lines[4].strip() if len(lines) > 4 else None
    case_title = lines[0].strip()
    # Extract year from case_title (in case we want to make visualizations, etc.)
    year = re.search(r'20[0-2][0-9]', case_title) # Limit regex to be from 2000 to 2029

    print('CASE TITLE:', case_title)
    if registry:
        print('REGISTRY:', registry.group(1).strip()) # Get rid of surrounding spaces
    else:
        registry = re.search(r'([A-Za-z ]+) Registry No.', doc)
        if registry:
            print('(alt method) REGISTRY:', registry.group(1).strip())
        else:
            print('@@@@ ERROR: UNABLE TO FIND REGISTRY @@@@')

    if year:
        print('YEAR:', year.group(0))
    print('JUDGE NAME:', judge_name)
    if decision_len:
        print('DECISION LENGTH:', decision_len.group(1)) # Pull out dec. length number
    else:
        # Not every document carries a '(<n> paras)' marker.
        print('@@@@ ERROR: UNABLE TO FIND DECISION LENGTH @@@@')

    print('====================')
    