#### Libraries

In [153]:
import re
from collections import defaultdict
import nltk
import pandas as pd

#### Global Variables to aid with filenames

In [2]:
# Directory that holds the exported Lexis case text files.
path_to_data = '../data/Lexis Cases txt/'
# Each input file name is built as path_to_data + file_prefix + <number> + file_suffix
# (for example '../data/Lexis Cases txt/P1.txt').
file_prefix = 'P'
file_suffix = '.txt'
file_identifiers = range(1, 86) # Range from 1 to 85

#### Code to parse each single document

# WORK IN PROGRESS
##### Currently able to print the decision length, judge name, year, and registry location of any record

In [212]:
def parse_chfl_case(doc):
    '''Print the header fields of a single CHFL case (work in progress).

    The case title is the first line of the text and the judge name is the
    fourth line with its first 8 characters dropped. The year is the first
    2000-2029 value found in the title. Registry and decision length are not
    extracted yet, so they are reported as None and 0.

    Arguments: doc (String): The full text of one case
    Returns: None (the fields are only printed)
    '''
    rows = doc.split('\n')
    judge = rows[3][8:]
    title = rows[0]
    year_match = re.search(r'20[0-2][0-9]', title)  # only years 2000 to 2029

    # Collect the report first, then emit it one line at a time.
    report = [f'CASE TITLE: {title}', f'REGISTRY: {None}']
    if year_match:
        report.append(f'YEAR: {year_match.group(0)}')
    report.append(f'JUDGE NAME: {judge}')
    report.append(f'DECISION LENGTH: {0}')  # not extracted for this format yet
    report.append('====================')

    for entry in report:
        print(entry)
    
def parse_ilr_case(doc):
    '''Print the header fields and paragraph count of a single ILR case.

    The case title is the first line and the judge name is the fourth line
    with its first 8 characters dropped. The year is the first 2000-2029
    value found in the title. The registry is not extracted (printed as None).

    Arguments: doc (String): The full text of one case
    Returns: None (the fields are only printed)
    '''
    rows = doc.split('\n')
    judge = rows[3][8:]
    title = rows[0]
    year_match = re.search(r'20[0-2][0-9]', title)  # only years 2000 to 2029

    # Count numbered paragraphs. A heading line ("Reasons for Judg..." or
    # "Introduction") resets the count to 1. After that, a line is counted only
    # when it starts with the next expected number, so lines that begin with
    # some unrelated number are mostly ignored.
    paragraph_count = 0
    for row in rows:
        is_heading = row.startswith('Reasons for Judg') or row.lower().startswith('introduction')
        if is_heading:
            paragraph_count = 1

        next_number = str(paragraph_count + 1)
        if row.startswith(next_number):
            paragraph_count = paragraph_count + 1

    report = [f'CASE TITLE: {title}', f'REGISTRY: {None}']
    if year_match:
        report.append(f'YEAR: {year_match.group(0)}')
    report.extend([
        f'JUDGE NAME: {judge}',
        f'DECISION LENGTH: {paragraph_count}',
        '====================',
    ])
    for entry in report:
        print(entry)

def parse_cnlr_case(doc):
    '''Print the header fields and paragraph count of a single CNLR case.

    The case title is the first line and the judge name is the whole fourth
    line. The year is the first 2000-2029 value found in the title. The
    registry is not extracted (printed as None).

    Arguments: doc (String): The full text of one case
    Returns: None (the fields are only printed)
    '''
    text_lines = doc.split('\n')
    judge = text_lines[3]
    title = text_lines[0]
    found_year = re.search(r'20[0-2][0-9]', title)  # only years 2000 to 2029

    # Paragraph counting: a "Reasons for Judg..." / "Introduction" heading
    # restarts the counter at 1. Otherwise the counter only advances when a
    # line begins with the number that should come next.
    counted = 0
    for current in text_lines:
        if current.startswith('Reasons for Judg'):
            counted = 1
        elif current.lower().startswith('introduction'):
            counted = 1

        if current.startswith(str(counted + 1)):
            counted += 1

    print(f'CASE TITLE: {title}')
    print(f'REGISTRY: {None}')
    if found_year:
        print(f'YEAR: {found_year.group(0)}')
    print(f'JUDGE NAME: {judge}')
    print(f'DECISION LENGTH: {counted}')
    print('====================')
    
def parse_dtc_case(doc):
    '''Print the header fields and paragraph count of a single DTC case.

    The case title is the first line and the judge name is the sixth line
    with its first 9 characters dropped. The year is the first 2000-2029
    value found in the title. The registry is not extracted (printed as None).

    Arguments: doc (String): The full text of one case
    Returns: None (the fields are only printed)
    '''
    def next_count(count, row):
        # A heading restarts numbering at 1; a line beginning with the next
        # expected paragraph number advances the count by one.
        if row.startswith('Reasons for Judg') or row.lower().startswith('introduction'):
            count = 1
        if row.startswith(str(count + 1)):
            count += 1
        return count

    rows = doc.split('\n')
    judge = rows[5][9:]
    title = rows[0]
    year_hit = re.search(r'20[0-2][0-9]', title)  # only years 2000 to 2029

    total_paragraphs = 0
    for row in rows:
        total_paragraphs = next_count(total_paragraphs, row)

    print('CASE TITLE: ' + str(title))
    print('REGISTRY: ' + str(None))
    if year_hit:
        print('YEAR: ' + year_hit.group(0))
    print('JUDGE NAME: ' + str(judge))
    print('DECISION LENGTH: ' + str(total_paragraphs))
    print('====================')
    
def parse_bcj_case(doc):
    '''Given a string of the entire document, print the relevant header info.

    Printed fields: case title (first line), registry, year (first 2000-2029
    value in the title), judge name (fifth line) and decision length (the
    number inside "(N paras.)"). A missing registry or decision length is
    reported with an error line instead of raising.

    Arguments: doc (String): The full text of one B.C.J. case
    Returns: None (the fields are only printed)
    '''
    lines = doc.split('\n')

    # Simple Regex Fields
    registry = re.search(r'Registry: ?([A-Za-z ]+)', doc)
    decision_len = re.search(r'\(([0-9]+) paras\.?\)', doc)

    # Fields that are always in the same place
    judge_name = lines[4].strip()
    case_title = lines[0].strip()
    # Extract year from case_title (in case we want to make visualizations, etc.)
    year = re.search(r'20[0-2][0-9]', case_title) # Limit regex to be from 2000 to 2029

    print('CASE TITLE:', case_title)
    if registry:
        print('REGISTRY:', registry.group(1).strip()) # Get rid of newline
    else:
        # Alternate header form, e.g. "Vancouver Registry No. ..."
        registry = re.search(r'([A-Za-z ]+) Registry No.', doc)
        if registry:
            print('(alt method) REGISTRY:', registry.group(1))
        else:
            print('@@@@ ERROR: UNABLE TO FIND REGISTRY @@@@')

    if year:
        print('YEAR:', year.group(0))
    print('JUDGE NAME:', judge_name)
    if decision_len:
        print('DECISION LENGTH:', decision_len.group(1)) # Pull out dec. length number
    else:
        # Previously this raised AttributeError when "(N paras.)" was absent.
        print('@@@@ ERROR: UNABLE TO FIND DECISION LENGTH @@@@')

    print('====================')
    
def rule_based_parse_BCJ(doc):
    '''Given an entire case, finds static information within the case (information that can be pattern matched)
    Expects a B.C.J. case format (British Columbia Judgments)

    The following fields are currently implemented:
    - Case Title
    - Judge Name
    - Registry
    - Year
    - Decision Length (in paragraphs)

    The following fields are still being implemented & tweaked:
    - Multiple Defendants
    - Damages

    A field that cannot be found is stored as None and a warning is printed;
    the function does not raise for a missing registry, year, judge line or
    paragraph count.

    Arguments: doc (String): The case in text format following the form used in the DOCX to TXT notebook
    Returns: case_dict (Dictionary): Dictionary with rule based parsable fields filled in
             Keys: case_title, year, registry, judge, decision_length,
             multiple_defendants, damages
    '''
    lines = doc.split('\n')
    case_dict = dict()

    # Decision length, e.g.) (100 paras.) -- kept as the matched digit string
    decision_len = re.search(r'\(([0-9]+) paras\.?\)', doc)
    if decision_len:
        decision_len = decision_len.group(1)
    else:
        # Previously decision_len.group(1) raised AttributeError here.
        print('WARNING: Decision length not found')

    # Registry: try each known header form in order and keep the first hit.
    # Each entry is (pattern, index of the group holding the registry name).
    registry_patterns = [
        (r'(Registry|Registries): ?([A-Za-z0-9 ]+)', 2), # e.g.) Registry: Vancouver
        (r'([A-Za-z ]+) Registry No.', 1),               # Alt form e.g.) Vancouver Registry No. XXX
        (r'([A-Za-z ]+) No. S[0-9]*', 1),                # Last resort e.g.) Vancouver No. S123
    ]
    registry = None
    for registry_pattern, group_index in registry_patterns:
        found = re.search(registry_pattern, doc)
        if found:
            registry = found.group(group_index).strip()
            break
    else:
        print('WARNING: Registry could not be found (This shouldn\'t occur!)')

    # Fields that are always in the same place
    case_title = lines[0].strip()
    if len(lines) > 4:
        judge_name = lines[4].strip()
    else:
        # Text too short to contain the judge line; avoid an IndexError.
        judge_name = None
        print('WARNING: Judge name not found')

    # Extract year from case_title (in case we want to make visualizations, etc.)
    year = re.search(r'20[0-2][0-9]', case_title) # Limit regex to be from 2000 to 2029
    if year:
        year = year.group(0)
    else:
        # Rare case: Sometimes the title is too long. Rely on Heard date.
        year = re.search(r'Heard:.* ([2][0][0-2][0-9])', doc)
        if year:
            year = year.group(1)
        else:
            print('WARNING: Year not found')

    case_dict['case_title'] = case_title
    case_dict['year'] = year
    case_dict['registry'] = registry
    case_dict['judge'] = judge_name
    case_dict['decision_length'] = decision_len
    case_dict['multiple_defendants'] = rule_based_multiple_defendants_parse(doc)
    case_dict['damages'] = rule_based_damage_extraction(doc)

    return case_dict
    
def rule_based_multiple_defendants_parse(doc):
    '''
    Given a case. Uses regex/pattern-matching to determine whether we have multiple defendants.
    For the most part the logic relies on whether the language used implies plurality or not.

    Arguments: doc (String): The case in text format following the form used in the DOCX to TXT notebook
    Returns: response (String, 'Y', 'N', or 'UNK' if unable to parse)

    '''
    # Case 1)
    # Traditional/most common. Of form "Between A, B, C, Plaintiff(s), X, Y, Z Defendant(s)"
    # Will also allow "IN THE MATTER OF ... Plaintiff .... Defendant..."
    # The prefix must be an alternation group: the earlier character class
    # "[Between|IN THE MATTER OF]" matched any single one of those letters, so
    # any line mentioning a plaintiff and a defendant was taken as the header.
    regex_between_plaintiff_claimant = re.search(
        r'((?:Between|IN THE MATTER OF).*'
        r'([Pp]laintiffs?|[Cc]laimants?|[Aa]ppellants?|[Pp]etitioners?|[Rr]espondents?).*'
        r'([Dd]efendants?|[Rr]espondents?|[Aa]pplicants?).*\n)',
        doc)

    # Match found
    if regex_between_plaintiff_claimant:
        text = regex_between_plaintiff_claimant.group(0).lower()
        # Defendant/respondent/applicant are treated as the same role.
        plural_roles = ('defendants', 'respondents', 'applicants')
        if any(role in text for role in plural_roles):
            return 'Y'
        # The pattern guarantees at least the singular form is present.
        return 'N'

    # If not found, try other less common cases
    # Case 2)
    # Sometimes it does not mention the name of the second item. (Defendant/Respondent)
    # We can estimate if there are multiple based on the number of "," in the line (Covers all cases in initial data)
    regex_missing_defendent = re.search(
        r'(Between.*([Pp]laintiffs?|[Cc]laimants?|[Aa]ppellants?|[Pp]etitioners?).*\n)',
        doc)
    if regex_missing_defendent:
        text = regex_missing_defendent.group(0).lower()
        if len(text.split(',')) > 5:
            return 'Y'
        return 'N'

    print('Multiple defendants: Unknown! Unable to regex match')
    return 'UNK'
    
def rule_based_damage_extraction(doc, min_score = 0.85, max_match_len_split = 10):
    '''Given a case, attempts to extract damages using regex patterns

    Arguments: doc (String): The case in text format following the form used in the DOCX to TXT notebook
    min_score (float): The minimum paragraph score to consider having a valid $ number
                       Paragraph has score 1 if its the last paragraph
                       Paragraph has score 0 if its the first paragraph
    max_match_len_split (int): The max amount of items that can appear in a regex match after splitting (no. words)

    Returns: damages (Dict): Contains any found damages, keyed by 'General', 'Special',
             'Aggravated', 'Non-pecuniary', 'Punitive', 'Future Care', 'Pecuniary Total'
             and 'Total'. All values are None when no total could be determined.
             Returns None instead of a dict when the paragraph count is missing, when no
             paragraph mentions a $ amount, or when the last $ amount appears before
             min_score.

    '''
    damages = defaultdict(float)
    repetition_detection = defaultdict(set) # try to stem the repeated values

    # Get number of paragraphs. Its digit count bounds the width of a paragraph number below.
    paras_found = re.search(r'\(([0-9|,]+) paras?\.?\)', doc)
    if paras_found is None:
        # Previously this raised AttributeError on .group(1).
        print('WARNING: Paragraph count not found; unable to extract damages')
        return None
    no_paras = paras_found.group(1)
    pattern = r'[\W|\w]?(?=\n[0-9]{1,%s}[\xa0|\s| ]{2})'%len(no_paras) # Used to split into paras
    paras_split = re.split(pattern, doc)
    money_patt = r'\$[0-9|,]+' # Used to get all paragraphs with a money amount
    scored_paras = [] # Score paragraphs based on where they appear in the document
                      # Score of 0.0 would be the first paragraph. Score of 1.0 would be the last paragraph

    for i, paragraph in enumerate(paras_split):
        if re.search(money_patt, paragraph):
            scored_paras.append((i / len(paras_split), paragraph)) # (score, paragraph). Score formula: i/no_paras

    scored_paras = sorted(scored_paras, key=lambda x:x[0])[::-1] # Store from last paragraph to first
    if len(scored_paras) == 0: # If no $ amount is mentioned
        return None
    if scored_paras[0][0] < min_score: # If last $ amount isn't mentioned late enough in the text
        return None

    # Form: X damages: $5 -- i.e. Non-pecuniary damages $5000
    regex_damages = r'[\w|-]* ?(?:damage|loss|capacity).+?\$? ?[0-9][0-9|,|.]+[0-9]'
    regex_alt_damages = r'\$? ?[0-9][0-9|,|.]+[0-9] (?:for|representing)?[ \w\-+]+damages?'
    regex_future_care_loss = r'(?:future|past|in-?trust|award).*?(?:loss|costs?|income|care)?.*?\$? ?[0-9][0-9|,|.]+[0-9]'
    regex_for_cost_of = r'\$? ?[0-9][0-9|,|.]+[0-9]?[\w ]*? cost .*?\.'

    #regex_in_trust = r'(?:in-?trust|award).*?\$? ?[0-9][0-9|,|.]+[0-9]'
    patterns = [regex_damages, regex_alt_damages, regex_future_care_loss, regex_for_cost_of]

    # Words that tend to mean someone is asking for this amount of money
    # Or they are describing a previous case
    banned_words = ['seek', 'claim', 'propose', 'range', ' v. ']
    # If these words appear. Ignore the banned_words.
    counter_words = ['summary']

    # Get money amounts from the text
    total = None
    matches = []
    for score, text in scored_paras:
        if score > min_score:
            skip = False
            for banned_word in banned_words: # Skip paras with banned words
                if banned_word in text:
                    skip = True
            for counter_word in counter_words:
                if counter_word in text:
                    skip = False
            if skip:
                continue

            text_matches = get_matching_text(patterns, text, max_match_len_split)
            for t_m in text_matches:
                matches.append((score, t_m))


    print('Matches:')
    print(matches)
    # Convert money amounts into actual float values
    # Determine which column they should fit into.
    # Each match is assigned to at most one category; the categories are tried in the
    # order below and value_extracted stops later categories from claiming the match.
    # NOTE: the helper defined in this file is is_best_score; the previous calls to
    # best_score_item referred to a name that is not defined here.
    regex_number_extraction = r'[0-9][0-9|,|.]+'
    for score, match in matches:
        skip = False
        for banned_word in banned_words: # If it had "Summary" banned words may appear here.
            if banned_word in match:     # Its ok to be in the paragraph but not the match itself.
                skip=True
                break
        if skip:
            continue

        amount = re.search(regex_number_extraction, match, re.IGNORECASE)
        extracted_value = None
        if amount:
            amount = amount.group(0).replace(',', '')
            if amount[-1] == '.': # Drop a sentence-ending period picked up by the number regex
                amount = amount[:-1]
            extracted_value = float(amount)
        else:
            print('Error: Has no money amount?', match)

        value_extracted = False

        if not value_extracted:
            general_damage_keywords = [('general',), ('future', 'income', 'loss'), ('future', 'income'), ('future', 'wage', 'loss'), ('future', 'earning'), ('earning', 'capacity')]
            for keywords in general_damage_keywords:
                if all(word in match.lower() for word in keywords):
                    value_extracted = True
                    if is_best_score(score, matches, keywords):
                        if extracted_value not in repetition_detection['General']:

                            damages['General'] += extracted_value
                            repetition_detection['General'].add(extracted_value)
                    break

        if not value_extracted:
            special_damage_keywords = [('special',), ('trust',), ('past', 'income', 'loss'), ('past', 'wage'), ('past', 'earning'), ('past', 'income')]
            for keywords in special_damage_keywords:
                if all(word in match.lower() for word in keywords):
                    value_extracted = True
                    if is_best_score(score, matches, ('special',)):
                        if extracted_value not in repetition_detection[('special',)]:
                            damages['Special'] += extracted_value
                            repetition_detection[('special',)].add(extracted_value)
                    # Later keyword tuples would re-check the same value against the same
                    # repetition set, so stopping here gives the same result.
                    break

        if not value_extracted:
            if 'aggravated' in match.lower():
                value_extracted = True
                if is_best_score(score, matches, ('aggravated',)):
                    if extracted_value not in repetition_detection[('aggravated',)]:
                        damages['Aggravated'] += extracted_value
                        repetition_detection[('aggravated',)].add(extracted_value)

        if not value_extracted:
            if 'non' in match.lower() and 'pecuniary' in match.lower():
                value_extracted = True
                if is_best_score(score, matches, ('non', 'pecuniary')):
                    if extracted_value not in repetition_detection[('non', 'pecuniary')]:
                        damages['Non-pecuniary'] += extracted_value
                        repetition_detection[('non', 'pecuniary')].add(extracted_value)

        if not value_extracted:
            if 'punitive' in match.lower():
                value_extracted = True
                if is_best_score(score, matches, ('punitive',)):
                    if extracted_value not in repetition_detection[('punitive',)]:
                        damages['Punitive'] += extracted_value
                        repetition_detection[('punitive',)].add(extracted_value)


        if not value_extracted:
            future_care_keywords = [('future', 'care'), ('future', 'cost')]
            for keywords in future_care_keywords:
                if all(word in match.lower() for word in keywords):
                    value_extracted = True
                    if is_best_score(score, matches, keywords):
                        if extracted_value not in repetition_detection[keywords]:
                            damages['Future Care'] += extracted_value
                            repetition_detection[keywords].add(extracted_value)
                    break

        if not value_extracted:
            total_keywords = [('total',), ('sum',), ('award',)]
            for keywords in total_keywords:
                if all(word in match.lower() for word in keywords):
                    value_extracted=True
                    if is_best_score(score, matches, keywords):
                        # Only set the "total" if does not match any if statement above and
                        # it is not the same as adding up all of the values manually.
                        if extracted_value not in repetition_detection[('total',)]:
                            damages['Pecuniary Total'] = damages['Special'] + damages['General'] + damages['Punitive'] + damages['Aggravated'] + damages['Future Care']
                            damages['Total'] = damages['Pecuniary Total'] + damages['Non-pecuniary']
                            if damages['Total'] == 0:
                                total = extracted_value
                                repetition_detection[('total',)].add(extracted_value)

    damages['Pecuniary Total'] = damages['Special'] + damages['General'] + damages['Punitive'] + damages['Aggravated'] + damages['Future Care']
    damages['Total'] = damages['Pecuniary Total'] + damages['Non-pecuniary']

    if damages['Total'] == 0: # only use the "total" if we couldnt find anything else. Otherwise add it up

        damages['Total'] = total
        damages['General'] = total


    # Nothing usable was found: blank every field rather than reporting zeros.
    if damages['Total'] == 0 or damages['Total'] is None:
        damages['Pecuniary Total'] = None
        damages['Special'] = None
        damages['General'] = None
        damages['Punitive'] = None
        damages['Aggravated'] = None
        damages['Future Care'] = None
        damages['Non-pecuniary'] = None
        damages['Total'] = None

    print(damages)
    return damages


def get_matching_text(patterns, text, max_match_len_split):
    '''
    Collect every regex hit in `text` that passes the filters below.

    Patterns are applied case-insensitively, in the order given. A hit is kept
    only if it does not contain the substring 'and' and it splits (on
    whitespace) into at most max_match_len_split pieces.

    Arguments:
    patterns (list) - List of regex patterns in string format
    text (string) - Text to search for matches in
    max_match_len_split (int) - Max number of whitespace-separated pieces a hit may have

    Returns:
    matches (list) - List containing all kept matches in text format
    '''
    return [
        hit
        for regex in patterns
        for hit in re.findall(regex, text, re.IGNORECASE)
        if 'and' not in hit and len(hit.split()) <= max_match_len_split
    ]
    
def is_best_score(score, matches, keywords):
    '''
    Tell whether `score` is at least as high as the score of every match that
    contains all of the given keywords.

    Arguments:
    score (float) - The score of the item you're inspecting
    matches (list) - List of matches where each element is of form (score, match text)
    keywords (tuple) - All words that should appear in the match (compared against the lowercased text)

    Returns: True or False (False as soon as a qualifying match scores strictly higher)

    '''
    def mentions_all(candidate_text):
        lowered = candidate_text.lower()
        return all(word in lowered for word in keywords)

    beaten = any(
        other_score > score
        for other_score, other_text in matches
        if mentions_all(other_text)
    )
    return not beaten
    
def rule_based_convert_to_DF(cases):
    '''Temporary helper: turn parsed case dictionaries into a pandas DataFrame.

    Arguments: cases (list): Dictionaries with the keys 'case_title', 'year', 'judge',
               'decision_length', 'multiple_defendants' and 'damages'. 'damages' is
               either None or a mapping with the keys 'Total', 'Pecuniary Total',
               'Non-pecuniary', 'General', 'Special', 'Aggravated', 'Future Care'
               and 'Punitive'.
    Returns: df (DataFrame): One row per case. An empty list gives an empty frame
             that still has all the columns (it used to raise IndexError).
    '''
    # (output column, key inside case['damages']) in output order
    damage_columns = [
        ('Total Damage', 'Total'),
        ('Total Pecuniary', 'Pecuniary Total'),
        ('Non pecuniary', 'Non-pecuniary'),
        ('General', 'General'),
        ('Special', 'Special'),
        ('Aggravated', 'Aggravated'),
        ('Future Care', 'Future Care'),
        ('Punitive', 'Punitive'),
    ]
    # (output column, key inside the case dictionary) for the trailing columns
    trailing_columns = [
        ('Judge', 'judge'),
        ('Decision Length', 'decision_length'),
        ('Multiple Defendants', 'multiple_defendants'),
    ]

    # Insertion order of this dict is the column order of the result.
    table = {'Title': [], 'Year': []}
    for column, _ in damage_columns + trailing_columns:
        table[column] = []

    for case in cases:
        table['Title'].append(case['case_title'])
        table['Year'].append(case['year'])

        damages = case['damages']
        for column, damage_key in damage_columns:
            # Cases without extracted damages get an empty cell in every damage column.
            table[column].append(None if damages is None else damages[damage_key])

        for column, case_key in trailing_columns:
            table[column].append(case[case_key])

    return pd.DataFrame(table)

#### Code driver

In [213]:
# Driver: read every input file, split it into individual cases, filter out case
# kinds we do not want, and parse the remaining B.C.J. cases into `data`.
case_type_counts = defaultdict(int) # not updated anywhere in this cell
cases = 0 # number of B.C.J. cases that were parsed
data = [] # one dictionary per parsed case (output of rule_based_parse_BCJ)
for file_number in file_identifiers:

    print('## Processing ' + path_to_data + file_prefix + str(file_number) + file_suffix + ' ##\n')
    document_data = None
    
    with open(path_to_data + file_prefix + str(file_number) + file_suffix, 'r') as document:
        document_data = document.read()
        
    document_data = document_data.split('End of Document\n') # Always split on 'End of Document\n'
    
    case_parsed_data = [] # currently unused (see the commented-out append below)
    for case in document_data:
        case = case.strip() # Make sure to strip!

        if len(case) == 0: # Skip empty lines
            continue

        # First line of a case is its title, second line is the publication/case type.
        case_title = case.split('\n')[0]
        case_type = case.split('\n')[1]

        # Skip crown cases
        # Skip Re: cases
        if 'R. v.' in case_title or '(Re)' in case_title: 
            continue
            
        # Skip client/solicitor cases (not same as plaintiff/defendant)
        regex_client_solicitor = re.search(r'(Between.*([C|c]lient[s]?).*([S|s]olicitor[s]?|[L|l]awyer[s]?))', case)
        if regex_client_solicitor:
            continue
        
        # Same check with the two parties named in the opposite order.
        regex_solicitor_client = re.search(r'(Between.*([L|l]awyer[s]?|[S|s]olicitor[s]?).*([C|c]lient[s]?))', case)
        if regex_solicitor_client:
            continue
            
        # Skip "IN THE MATTER OF" cases unless the matched header line names one of these party roles.
        key_words = ['appellant', 'respondent', 'claimant', 'petitioner', 'plaintiff', 'defendant',
        'appellants', 'respondents', 'claimants', 'petitioners', 'plaintiffs', 'defendants']
        regex_in_matter_of = re.search(r'IN THE MATTER OF .*\n\([0-9]+ paras.\)', case)
        if regex_in_matter_of:
            remove = True
            for key in key_words:
                if key in regex_in_matter_of.group(0).lower().strip():
                    remove = False
                    
            if remove:
                continue

        if 'British Columbia Judgments' in case_type: # Make sure we're dealing with a B.C.J. case
            cases += 1
            #case_parsed_data.append(rule_based_parse_BCJ(case))
            print(case_title)
            data.append(rule_based_parse_BCJ(case))
            
        # NOTE(review): this break is unconditional, so the loop stops at the first case
        # that is not skipped above (B.C.J. or not). At most one case per file is parsed.
        # Confirm this is intended and not a debugging leftover.
        break
        
        
print('Total cases being used:', cases)


## Processing ../data/Lexis Cases txt/P1.txt ##

Mawani v. Pitcairn, [2012] B.C.J. No. 1819
## Processing ../data/Lexis Cases txt/P2.txt ##

## Processing ../data/Lexis Cases txt/P3.txt ##

Brooks-Martin v. Martin, [2011] B.C.J. No. 243
## Processing ../data/Lexis Cases txt/P4.txt ##

McGavin v. Talbot, [2017] B.C.J. No. 2439
Matches:
[(0.9747899159663865, 'General Damages (First Accident): $65,000.00'), (0.9747899159663865, 'General Damages (Second Accident): $7,000.00'), (0.9747899159663865, ' Loss of Past Income Earning Opportunity: $9,000.00'), (0.9747899159663865, 'Special Damages: $6,636.82'), (0.9747899159663865, '$65,000.00 General Damages'), (0.9747899159663865, '$900.00 Special Damages'), (0.9747899159663865, 'Past Income Earning Opportunity: $9,000.00'), (0.9747899159663865, 'Future Care: $900.00'), (0.9747899159663865, '$9,000.00 Cost of Future Care: $900.'), (0.957983193277311, 'special damages is $6,636.82'), (0.8739495798319328, 'award of $1,800'), (0.8571428571428571, '

Tabori v. Renaud, [2016] B.C.J. No. 1424
## Processing ../data/Lexis Cases txt/P32.txt ##

Maruna v. Lopatka, [2001] B.C.J. No. 484
## Processing ../data/Lexis Cases txt/P33.txt ##

Cresswell Investments Ltd. v. Pavone, [2011] B.C.J. No. 1523
## Processing ../data/Lexis Cases txt/P34.txt ##

Drucker, Inc. v. Gui, [2009] B.C.J. No. 808
Matches:
[(0.967741935483871, '$5,000 in punitive damages'), (0.967741935483871, 'award $5,000')]
defaultdict(<class 'float'>, {'Punitive': 5000.0, 'Special': 0.0, 'General': 0.0, 'Aggravated': 0.0, 'Future Care': 0.0, 'Pecuniary Total': 5000.0, 'Non-pecuniary': 0.0, 'Total': 5000.0})
## Processing ../data/Lexis Cases txt/P35.txt ##

Bell v. Thorner, [2009] B.C.J. No. 63
## Processing ../data/Lexis Cases txt/P36.txt ##

## Processing ../data/Lexis Cases txt/P37.txt ##

Thom v. Laird Custom Homes Ltd., [2017] B.C.J. No. 1762
## Processing ../data/Lexis Cases txt/P38.txt ##

Aldred v. Colbeck, [2010] B.C.J. No. 89
Matches:
[]
defaultdict(<class 'float'>, {'

Budget Rent a Car System, Inc. v. Philadelphia Indemnity Insurance Co., [2018] B.C.J. No. 165
## Processing ../data/Lexis Cases txt/P64.txt ##

Home Equity Development Inc. v. Crow, [2002] B.C.J. No. 3169
## Processing ../data/Lexis Cases txt/P65.txt ##

Woods v. Plewes, [2014] B.C.J. No. 350
## Processing ../data/Lexis Cases txt/P66.txt ##

R.G. v. Vancouver (City) Police Board, [2012] B.C.J. No. 62
## Processing ../data/Lexis Cases txt/P67.txt ##

Mutual Construction (2000) Ltd. v. Hardwick, [2009] B.C.J. No. 183
## Processing ../data/Lexis Cases txt/P68.txt ##

336332 B.C. Ltd. v. Imperial Oil Ltd., [2002] B.C.J. No. 844
## Processing ../data/Lexis Cases txt/P69.txt ##

Mann v. Kathuria, [2017] B.C.J. No. 2462
## Processing ../data/Lexis Cases txt/P70.txt ##

Lenning v. Hicks, [2014] B.C.J. No. 1272
## Processing ../data/Lexis Cases txt/P71.txt ##

Cornish v. Khunkhun, [2015] B.C.J. No. 1014
Matches:
[]
defaultdict(<class 'float'>, {'Special': None, 'General': None, 'Punitive': None

In [214]:
data

[{'case_title': 'Mawani v. Pitcairn, [2012] B.C.J. No. 1819',
  'year': '2012',
  'registry': 'Vancouver',
  'judge': 'S.F. Kelleher J.',
  'decision_length': '115',
  'multiple_defendants': 'Y',
  'damages': None},
 {'case_title': 'Brooks-Martin v. Martin, [2011] B.C.J. No. 243',
  'year': '2011',
  'registry': 'Nanaimo',
  'judge': 'D.A. Halfyard J.',
  'decision_length': '172',
  'multiple_defendants': 'Y',
  'damages': None},
 {'case_title': 'McGavin v. Talbot, [2017] B.C.J. No. 2439',
  'year': '2017',
  'registry': 'Victoria',
  'judge': 'D.M. Masuhara J.',
  'decision_length': '59',
  'multiple_defendants': 'Y',
  'damages': defaultdict(float,
              {'General': 72000.0,
               'Special': 16536.82,
               'Future Care': 9900.0,
               'Punitive': 0.0,
               'Aggravated': 0.0,
               'Pecuniary Total': 98436.82,
               'Non-pecuniary': 0.0,
               'Total': 98436.82})},
 {'case_title': 'Mclaren v. Rice, [2009] B.C.J. 

In [215]:
df = rule_based_convert_to_DF(data)

In [216]:
df.to_csv('second_pass.csv', index=False)

In [None]:
# IN THE MATTER OF (Re) cases
# client/solicitor

# 

len(t)

In [244]:
bcj

45

In [245]:
t

[<re.Match object; span=(317, 628), match='Between N&C Transportation Ltd., T&S Transportati>,
 <re.Match object; span=(390, 887), match='Between Christopher Radke, plaintiff, and M.S., a>,
 <re.Match object; span=(394, 509), match='Between Roger Garside Construction Ltd., Plaintif>,
 <re.Match object; span=(280, 375), match='Between Texas Rutter, Plaintiff, and George Willi>,
 <re.Match object; span=(384, 632), match='Between Salico Property Marketing Corporation, Cr>,
 <re.Match object; span=(294, 375), match='Between David Douglas Schlachter, Plaintiff, and >,
 <re.Match object; span=(303, 404), match='Between Charles Suzuki, plaintiff, and Jeremy Dun>,
 <re.Match object; span=(297, 447), match='Between Rebecca Thompson, an infant by her litiga>,
 <re.Match object; span=(311, 408), match='Between Tammy Thon and Tanya Thon, plaintiffs, an>,
 <re.Match object; span=(293, 405), match='Between Jaime Jocelyn Patricia Slater, Plaintiff,>,
 <re.Match object; span=(293, 364), match='Between

In [158]:
# Maps each publication abbreviation to its full name.
_case_type_pairs = [
    ("BCJ", "British Columbia Judgments"),
    ("CHFL", "Canadian Health Facilities Law Guide"),
    ("ILR", "Canadian Insurance Law Reporter Cases"),
    ("CCLG", "Canadian Commercial Law Guide"),
    ("OCLG", "Ontario Corporations Law Guide"),
    ("CCSG", "Canadian Corporate Secretary's Guide"),
    ("CBPG", "Canadian Employment Benefits & Pension Guide"),
    ("ACLG", "Alberta Corporations Law Guide"),
    ("BREG", "British Columbia Real Estate Law Guide"),
    ("CNLR", "Canadian Native Law Reporter"),
    ("DTC", "Dominion Tax Cases"),
    ("CLLC", "Canadian Labour Law Reporter"),
    ("BCLG", "British Columbia Corporations Law Guide"),
]
case_type_dict = dict(_case_type_pairs)

In [162]:
# Print one line per case type: "<key> (<full name>): <count>" when the key is a
# known abbreviation in case_type_dict, otherwise just "<key>: <count>".
for key, count in case_type_counts.items():
    description = case_type_dict.get(key)
    if description is None:
        # Unknown key: an explicit lookup replaces the former bare `except:`,
        # which would also have hidden unrelated errors.
        print(key + ': ' + str(count))
    else:
        print(key + ' (' + description + '): ' + str(count))

BCJ (British Columbia Judgments): 3885
CHFL (Canadian Health Facilities Law Guide): 29
ILR (Canadian Insurance Law Reporter Cases): 128
Crown Cases (R. v. ___): 43
CCLG (Canadian Commercial Law Guide): 5
OCLG (Ontario Corporations Law Guide): 1
CCSG (Canadian Corporate Secretary's Guide): 2
CBPG (Canadian Employment Benefits & Pension Guide): 2
ACLG (Alberta Corporations Law Guide): 1
BREG (British Columbia Real Estate Law Guide): 16
CNLR (Canadian Native Law Reporter): 2
DTC (Dominion Tax Cases): 2
CLLC (Canadian Labour Law Reporter): 1
BCLG (British Columbia Corporations Law Guide): 1


In [174]:
#4,118
total

4118