#### Libraries

In [127]:
import re
from collections import defaultdict

#### Global Variables to aid with filenames

In [2]:
# Each input file is located at path_to_data + file_prefix + <number> + file_suffix,
# e.g. '../data/Lexis Cases txt/P1.txt'.
path_to_data = '../data/Lexis Cases txt/'
file_prefix, file_suffix = 'P', '.txt'
file_identifiers = range(1, 85 + 1) # File numbers 1 through 85 (the range end is exclusive)

#### Code to parse each single document

# WORK IN PROGRESS
##### Currently prints the decision length, judge name, year, and registry location of each record (the registry is only extracted for B.C.J. cases so far)

In [146]:
def parse_chfl_case(doc):
    '''Print a summary of one CHFL case and return the extracted fields.

    doc: the full text of a single case as one string.

    Returns a dictionary with the keys 'case_title', 'registry', 'year',
    'judge_name' and 'decision_length'. The registry and the decision length
    are not extracted for this format yet, so they are always None and 0.
    'year' is the first 2000-2029 match in the title (a string) or None.
    'judge_name' is None when the document has fewer than four lines.
    '''
    lines = doc.split('\n')
    case_title = lines[0]

    # The judge is read from the fourth line after dropping its first 8 characters.
    # NOTE(review): presumably those 8 characters are a label such as 'Before: ' -- confirm.
    # Guarded so a truncated document no longer raises IndexError.
    judge_name = lines[3][8:] if len(lines) > 3 else None

    year = re.search(r'20[0-2][0-9]', case_title) # Limit regex to be from 2000 to 2029

    doc_dict = {
        'case_title': case_title,
        'registry': None,        # not extracted for this format yet
        'year': year.group(0) if year else None,
        'judge_name': judge_name,
        'decision_length': 0,    # not computed for this format yet
    }

    print('CASE TITLE:', doc_dict['case_title'])
    print('REGISTRY:', doc_dict['registry'])

    if year:
        print('YEAR:', doc_dict['year'])
    print('JUDGE NAME:', doc_dict['judge_name'])
    print('DECISION LENGTH:', doc_dict['decision_length'])

    print('====================')

    return doc_dict
    
def _count_numbered_paragraphs(lines):
    '''Estimate the decision length from the paragraph numbers that start lines.

    Logic: a line starting with 'Reasons for Judg' or (case-insensitively)
    'introduction' sets the count to 1. After that, a line is counted only when
    it starts with exactly the next expected number, so a line that begins with
    some other number is skipped. The number must not be followed by another
    digit: while waiting for paragraph 3, a line such as '31 Elm Street' is no
    longer mistaken for it (a plain prefix test accepted it).
    '''
    decision_len = 0
    for line in lines:
        if line.startswith('Reasons for Judg') or line.lower().startswith('introduction'):
            decision_len = 1
        elif re.match(str(decision_len + 1) + r'(?!\d)', line):
            decision_len += 1
    return decision_len


def _parse_reporter_case(doc, judge_line, judge_prefix_len):
    '''Shared parser for the reporter formats that differ only in where the judge name sits.

    doc: the full text of a single case as one string.
    judge_line: 0-based index of the line holding the judge name.
    judge_prefix_len: number of leading characters to drop from that line.

    Prints the same summary the individual parsers always printed and returns
    the values as a dictionary ('case_title', 'registry', 'year', 'judge_name',
    'decision_length'). The registry is not extracted for these formats yet.
    '''
    lines = doc.split('\n')
    case_title = lines[0]

    # Guarded so a truncated document yields None instead of raising IndexError.
    judge_name = lines[judge_line][judge_prefix_len:] if len(lines) > judge_line else None

    year = re.search(r'20[0-2][0-9]', case_title) # Limit regex to be from 2000 to 2029

    doc_dict = {
        'case_title': case_title,
        'registry': None,
        'year': year.group(0) if year else None,
        'judge_name': judge_name,
        'decision_length': _count_numbered_paragraphs(lines),
    }

    print('CASE TITLE:', doc_dict['case_title'])
    print('REGISTRY:', doc_dict['registry'])

    if year:
        print('YEAR:', doc_dict['year'])
    print('JUDGE NAME:', doc_dict['judge_name'])
    print('DECISION LENGTH:', doc_dict['decision_length'])

    print('====================')

    return doc_dict


def parse_ilr_case(doc):
    '''Parse an I.L.R.-format case: judge on the fourth line after an 8-character prefix.

    NOTE(review): the 8 dropped characters are presumably a label such as 'Before: ' -- confirm.
    Prints a summary and returns the extracted fields as a dictionary.
    '''
    return _parse_reporter_case(doc, judge_line=3, judge_prefix_len=8)


def parse_cnlr_case(doc):
    '''Parse a C.N.L.R.-format case: the whole fourth line is taken as the judge name.

    Prints a summary and returns the extracted fields as a dictionary.
    '''
    return _parse_reporter_case(doc, judge_line=3, judge_prefix_len=0)


def parse_dtc_case(doc):
    '''Parse a DTC-format case: judge on the sixth line after a 9-character prefix.

    NOTE(review): what the 9 dropped characters contain is not visible here -- confirm.
    Prints a summary and returns the extracted fields as a dictionary.
    '''
    return _parse_reporter_case(doc, judge_line=5, judge_prefix_len=9)
    
def parse_bcj_case(doc):
    '''given a string of the entire document, extract relevant info - return dictionary of values

    Prints a summary of the case and returns a dictionary with the keys
    'case_title', 'case_no', 'registry', 'year', 'judge_name' and
    'decision_length'. A field that cannot be found is None; 'decision_length'
    is an int taken from the "(N paras.)" marker.
    '''
    lines = doc.split('\n')

    # Simple Regex Fields
    # Citation such as "[2015] B.C.J. No. 123". The character class is just capitals
    # and dots; the earlier pattern also had a stray '[' and '|' inside the class.
    case_no = re.search(r'\[[0-9]{4}\] [A-Z.]+ No\. [0-9]+', doc)
    registry = re.search(r'Registry: ?([A-Za-z ]+)', doc)
    decision_len = re.search(r'\(([0-9]+) paras\.?\)', doc)

    # Fields that are always in the same place
    # Guarded so a truncated document yields None instead of raising IndexError.
    judge_name = lines[4] if len(lines) > 4 else None
    case_title = lines[0]
    # Extract year from case_title (in case we want to make visualizations, etc.)
    year = re.search(r'20[0-2][0-9]', case_title) # Limit regex to be from 2000 to 2029

    print('CASE TITLE:', case_title)
    registry_name = None
    if registry:
        registry_name = registry.group(1).strip() # Drop surrounding spaces
        print('REGISTRY:', registry_name)
    else:
        # Alternative layout: "<place> Registry No. ..."
        registry = re.search(r'([A-Za-z ]+) Registry No.', doc)
        if registry:
            registry_name = registry.group(1).strip()
            print('REGISTRY:', registry_name)
            print('ALT WAY')
            print()
            print()
        else:
            print('@@@@ ERROR: UNABLE TO FIND REGISTRY @@@@')

    if year:
        print('YEAR:', year.group(0))
    print('JUDGE NAME:', judge_name)

    # A document without a "(N paras.)" marker used to crash on None.group(1);
    # report it the same way a missing registry is reported.
    decision_length = None
    if decision_len:
        decision_length = int(decision_len.group(1)) # Pull out dec. length number
        print('DECISION LENGTH:', decision_length)
    else:
        print('@@@@ ERROR: UNABLE TO FIND DECISION LENGTH @@@@')

    print('====================')

    doc_dict = {
        'case_title': case_title,
        'case_no': case_no.group(0) if case_no else None,
        'registry': registry_name,
        'year': year.group(0) if year else None,
        'judge_name': judge_name,
        'decision_length': decision_length,
    }
    return doc_dict

#### Code driver

In [167]:
# Ordered (text searched for in the case's second line, counter key) pairs.
# The first matching entry wins, so the order mirrors the original if/elif chain.
# The trailing comments name the parser written for each format; the parser calls
# themselves are not made by this cell.
reporter_markers = [
    ('Canadian Health Facilities Law Guide', 'CHFL'),          # parse_chfl_case
    ('British Columbia Judgments', 'BCJ'),                     # parse_bcj_case (B.C.J.)
    ('Canadian Insurance Law Reporter Cases', 'ILR'),          # parse_ilr_case (I.L.R.)
    ('Canadian Commercial Law Guide', 'CCLG'),                 # Same format as I.L.R.
    ('Ontario Corporations Law Guide', 'OCLG'),                # Same format as I.L.R.
    ('Canadian Corporate Secretary\'s Guide', 'CCSG'),         # Same format as I.L.R.
    ('Canadian Employment Benefits & Pension Guide', 'CBPG'),  # Same format as I.L.R.
    ('Alberta Corporations Law Guide', 'ACLG'),                # Same format as I.L.R.
    ('British Columbia Real Estate Law Guide', 'BREG'),        # Same format as I.L.R.
    ('Canadian Native Law Reporter', 'CNLR'),                  # parse_cnlr_case (slightly different judge name format)
    ('Dominion Tax Cases', 'DTC'),                             # parse_dtc_case (slightly different)
    ('Canadian Labour Law Reporter', 'CLLC'),                  # Same format as I.L.R.
    ('British Columbia Corporations Law Guide', 'BCLG'),       # Same format as I.L.R.
]

stop = False                          # Set when an unrecognised case type ends the run early
case_type_counts = defaultdict(int)   # Counter key -> number of cases seen
total = 0                             # Number of 'End of Document' separators seen across all files
for file_number in file_identifiers:
    file_path = path_to_data + file_prefix + str(file_number) + file_suffix
    print('\nProcessing ' + file_path + '\n')
    # NOTE(review): no encoding is given, so the platform default is used -- confirm the files' encoding.
    with open(file_path, 'r') as document:
        document_text = document.read()

    document_data = document_text.split('End of Document\n') # Must have \n as the phrase appears in one of the cases
    total += len(document_data)-1
    for case in document_data:
        case = case.strip()

        # In case we have an empty case in the list
        if len(case) == 0:
            continue

        header_lines = case.split('\n', 2)
        case_title = header_lines[0]
        # A one-line case has no type line; treat it as unknown instead of raising IndexError.
        case_type = header_lines[1] if len(header_lines) > 1 else ''

        # Crown cases are recognised by their title, whatever reporter they come from
        if 'R. v.' in case_title:
            case_type_counts['Crown Cases (R. v. ___)'] += 1
            continue

        for marker, key in reporter_markers:
            if marker in case_type:
                case_type_counts[key] += 1
                break
        else:
            # No marker matched: report it and abandon the whole run
            print('UNKNOWN CASE TYPE --- STOPPING')
            print(case_title)
            stop = True
            break

    if stop:
        break


Processing ../data/Lexis Cases txt/P1.txt


Processing ../data/Lexis Cases txt/P2.txt


Processing ../data/Lexis Cases txt/P3.txt


Processing ../data/Lexis Cases txt/P4.txt


Processing ../data/Lexis Cases txt/P5.txt


Processing ../data/Lexis Cases txt/P6.txt


Processing ../data/Lexis Cases txt/P7.txt


Processing ../data/Lexis Cases txt/P8.txt


Processing ../data/Lexis Cases txt/P9.txt


Processing ../data/Lexis Cases txt/P10.txt


Processing ../data/Lexis Cases txt/P11.txt


Processing ../data/Lexis Cases txt/P12.txt


Processing ../data/Lexis Cases txt/P13.txt


Processing ../data/Lexis Cases txt/P14.txt


Processing ../data/Lexis Cases txt/P15.txt


Processing ../data/Lexis Cases txt/P16.txt


Processing ../data/Lexis Cases txt/P17.txt


Processing ../data/Lexis Cases txt/P18.txt


Processing ../data/Lexis Cases txt/P19.txt


Processing ../data/Lexis Cases txt/P20.txt


Processing ../data/Lexis Cases txt/P21.txt


Processing ../data/Lexis Cases txt/P22.txt


Processing ../data

In [156]:
case_type_counts

defaultdict(int,
            {'BCJ': 3885,
             'CHFL': 29,
             'ILR': 128,
             'Crown Cases (R. v. ___)': 43,
             'CCLG': 5,
             'OCLG': 1,
             'CCSG': 2,
             'CBPG': 2,
             'ACLG': 1,
             'BREG': 16,
             'CNLR': 2,
             'DTC': 2,
             'CLLC': 1,
             'BCLG': 1})

In [158]:
# Counter key (reporter abbreviation) -> full reporter name, used when printing the counts
case_type_dict = {
    'BCJ': 'British Columbia Judgments',
    'CHFL': 'Canadian Health Facilities Law Guide',
    'ILR': 'Canadian Insurance Law Reporter Cases',
    'CCLG': 'Canadian Commercial Law Guide',
    'OCLG': 'Ontario Corporations Law Guide',
    'CCSG': "Canadian Corporate Secretary's Guide",
    'CBPG': 'Canadian Employment Benefits & Pension Guide',
    'ACLG': 'Alberta Corporations Law Guide',
    'BREG': 'British Columbia Real Estate Law Guide',
    'CNLR': 'Canadian Native Law Reporter',
    'DTC': 'Dominion Tax Cases',
    'CLLC': 'Canadian Labour Law Reporter',
    'BCLG': 'British Columbia Corporations Law Guide',
}

In [162]:
# Print one line per case type, adding the full reporter name when one is known.
for key, count in case_type_counts.items():
    # Explicit lookup instead of a bare except, so only a missing key selects the
    # short form and any other error is no longer hidden.
    full_name = case_type_dict.get(key)
    if full_name is None:
        # e.g. the 'Crown Cases (R. v. ___)' key has no entry in case_type_dict
        print(key + ': ' + str(count))
    else:
        print(key + ' (' + full_name + '): ' + str(count))

BCJ (British Columbia Judgments): 3885
CHFL (Canadian Health Facilities Law Guide): 29
ILR (Canadian Insurance Law Reporter Cases): 128
Crown Cases (R. v. ___): 43
CCLG (Canadian Commercial Law Guide): 5
OCLG (Ontario Corporations Law Guide): 1
CCSG (Canadian Corporate Secretary's Guide): 2
CBPG (Canadian Employment Benefits & Pension Guide): 2
ACLG (Alberta Corporations Law Guide): 1
BREG (British Columbia Real Estate Law Guide): 16
CNLR (Canadian Native Law Reporter): 2
DTC (Dominion Tax Cases): 2
CLLC (Canadian Labour Law Reporter): 1
BCLG (British Columbia Corporations Law Guide): 1


In [166]:
# Total number of cases counted across all files (4,118 on the last run)
total

4118