In [3]:
# Imports and one-time NLTK resource downloads for the notebook.
# We are going to use the NLTK package for Natural Language Processing.
import numpy as np
import pandas as pd
# For the first model we will use the word tokenizer.
# This means that we are going to treat words separately. The other choice would be the sentence tokenizer.
import nltk

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from string import punctuation

# Fetch the NLTK data packages used by the tokenizer, stop-word list and WordNet.
nltk.download("punkt")
nltk.download("stopwords")
nltk.download("wordnet")

# RegexpTokenizer lets us define our own tokenizer from a regular expression
from nltk.tokenize import RegexpTokenizer

# os is used to list the case files; Document is used to open each .docx file
import os
from docx import Document


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\vinic\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\vinic\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\vinic\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [4]:
# List the names of all entries in the '154 cases' folder (shown as the cell output)
os.listdir('154 cases') 


['Alexandra M. Lisowski v. New Jersey Transit, BER-L-13231-04.docx',
 'Allen Williams v. State Farm Insurance Co, MID-L-4430-05.docx',
 'Ana Budimlic v. National Retail Transportation, et al, 04-00884.docx',
 'Andrea Farro v. New Jersey Transit, Allstate Insurance Co, SAL-L-43-04.docx',
 'Anthony Faranca, Carolyn Faranca v. Luis Bonilla, et al, 05-01749.docx',
 'Anthony J. Conte III v. Gina Marie Rongone, Karen A. Rongone, 05-03951.docx',
 'Antoinette Sheffer, James Sheffer v. Linda Sullivan, BUR-L-748-04.docx',
 'Arthur Harriatt v. Cynthia Scott and Simon Wilson, UNN-L-3463-03.docx',
 'Ashley Dwyer, Jody Dwyer v. Vera Armoogan, Keith Armoogan, MID-L-005468-04.docx',
 'Bora Kochar v. J&B Leasing and Carlos Calzadilla, 05-02682.docx',
 'Boris Lipski et al. v. Mark Vanselous et al, 04-06009.docx',
 'Brian C. Hawkins v. United States of America, 03-3979.docx',
 'Brian Keir, as Guardian Ad Litem of Megan Keir, a minor child v. Winslow Township High School, et al.docx',
 'Charles Sjogren v.

In [9]:
# Collect the case files that the rest of the notebook iterates over.
# os.listdir() returns entries in arbitrary order, and the position of a file in
# `list_` becomes its row id downstream, so sort the names to make the ids
# reproducible across machines and re-runs (case-insensitive, which mirrors the
# alphabetical listing shown above).
# Only real .docx files are kept: anything else in the folder (e.g. Word's "~$..."
# lock files) would make Document() fail when the files are loaded.
list_ = sorted(
    (
        name
        for name in os.listdir('154 cases')
        if name.lower().endswith('.docx') and not name.startswith('~$')
    ),
    key=str.lower,
)

# total number of files
len(list_)

154

In [11]:
# Build `documents`: a dictionary holding the full text of every case file.
# Keys are running integers (0, 1, 2, ...) assigned in the order of `list_`.
documents = {}

for doc_id, filename in enumerate(list_):
    # Open the .docx file so its paragraphs can be read one by one
    temp = Document('154 cases/{}'.format(filename))

    # Glue all paragraphs into one string; each paragraph is preceded by a
    # single space, so the text of a non-empty file starts with a space
    doc = ''.join(' ' + p.text for p in temp.paragraphs)

    documents[doc_id] = doc
    

In [12]:
# For every document, extract the injury headline.
# `injury` maps document index -> the text from "Injury:" up to (not including)
# "Court:", or "NA" when the headline cannot be located.
injury = {}

for x in range(0, len(documents)):
    # split document into individual words
    words = documents[x].split()

    try:
        # The headline starts at the first "Injury:" token ...
        start = words.index("Injury:")
        # ... and ends at the next "Court:" token. Searching from `start` matters:
        # a "Court:" that appears earlier in the document would otherwise give
        # end < start and silently store an empty phrase.
        end = words.index("Court:", start)
    except ValueError:
        # No usable "Injury:" ... "Court:" span in this document (the original
        # author notes this happens for cases from year 2006): mark as missing.
        injury[x] = "NA"
        continue

    # join individual words back together
    phrase = ' '.join(words[start:end])

    # add to injury dictionary
    injury[x] = phrase
    print(phrase)

Injury: Compression fracture at T-12 in lower back, fusion of five vertebrae with steel rod
Injury: Half of pinky amputated; head and facial injuries
Injury: Internal damage to the knee, which required arthroscopic surgery; disc herniation, cuts and bruises
Injury: Fractured pelvic bone, ulna and two ribs, closed head injuries
Injury: Thoracic disc herniation at T6-T7, calcifying hemotoma on the left anterior tibia
Injury: Fractured arm, fractured nose
Injury: Multiple fractures and facial scarring
Injury: Fractured wrist, fractured femur, bilateral distal radial shaft fractures
Injury: Pelvic fractures, back injuries and traumatic brain injuries
Injury: Severe back injury
Injury: Unspecified
Injury: Disc herniation resulting in fusion surgery
Injury: Torn meniscus in knee, herniation at L4-5, laminectomy, radical diskectomy and forminatomy procedures in L3 to S1 regions of the spine, spinal leak
Injury: Death
Injury: Greg Riley - back injuries; Laurie Riley - broken back, broken pelvi

In [13]:
# Number of cases we have: `injury` holds one entry per document, including "NA" placeholders
len(injury)

154

In [14]:
# create dataframe

# One 0/1 indicator column per body part / outcome, plus a flag for cases with
# more than one plaintiff. Categories and some of their synonyms:
# abdominal/abdomen, ankle, arm, back, brain, burn, chest, ear, elbow, eye, face/facial, foot, leg, genital, 
# hand, head, heart, hip, knee, mouth, neck, nose, pelvic, shoulder, spinal, thigh, wrist, psychological, death
columns = ['abdominal', 'ankle', 'arm', 'back', 'brain', 'burn', 'chest', 'ear', 'elbow', 'eye', 'face', 'foot', 'leg', 'genital', 'hand', 'head', 'heart', 'hip', 'knee', 'mouth', 'neck', 'nose', 'pelvic', 'shoulder', 'spinal', 'thigh', 'wrist', 'psychological', 'death', 'multiplePlaintiff']

# One row per case, labelled 0..len(injury)-1 to match the keys of `injury`.
# The index is derived from the data instead of a hard-coded range(0, 153),
# which was one row short of the 154 cases.
df = pd.DataFrame(index=range(len(injury)), columns=columns)

In [15]:
# fill in dataframe

# Keywords that switch on each indicator column. A column is set to 1 when at
# least one of its keywords occurs in the injury text, otherwise 0.
# NOTE(review): matching is by substring, so short keywords also fire inside
# longer words (e.g. 'ear' in "heart"/"tear", 'hip' in "whiplash") - confirm
# this is acceptable or tighten the keywords.
# NOTE(review): 'upper extremity' is listed under 'abdominal', and 'clavical' /
# 'carpel tunnel' look like misspellings of "clavicle" / "carpal tunnel" -
# verify against the source documents.
INJURY_KEYWORDS = {
    'abdominal': ['abdominal', 'abdomen', 'torso', 'spleen', ' renal artery', 'upper extremity'],
    'ankle': ['ankle'],
    'arm': ['arm', 'forearm', 'ulna', 'brachial plexus'],
    'back': ['back'],
    'brain': ['brain', 'cerebral', 'concussion', 'loss of consciousness'],
    'burn': ['burn', 'burns'],
    'chest': ['chest', 'rib', 'ribs', 'lung', 'clavical', 'pulmonary', 'thoracic'],
    'ear': ['ear', 'hearing'],
    'elbow': ['elbow'],
    'eye': ['eye', 'eyes', 'blindness'],
    'face': ['face', 'facial', 'forehead'],
    'foot': ['foot', 'feet', 'heel'],
    'leg': ['leg', 'tibia', 'tibial', 'sciatica'],
    'genital': ['genital'],
    'hand': ['hand', 'pinky', 'finger', 'thumb'],
    'head': ['head', 'headaches'],
    'heart': ['heart'],
    'hip': ['hip'],
    'knee': ['knee'],
    'mouth': ['mouth', 'lips', 'lip', 'jaw'],
    'neck': ['neck', 'cervical'],
    'nose': ['nose', 'nasal'],
    'pelvic': ['pelvic', 'pelvis', 'acetabular', 'pubic ramus', 'sacroiliac'],
    'shoulder': ['shoulder', 'clavicle', 'rotator', 'sternoclavicular', 'subacromial decompression', 'acromioplasty'],
    'spinal': ['spinal', 'spine', 'disc', 'discs', 'vertebrae', 'radiculopathy', 'laminectomy', 'forminatomy', 'diskectomy', 'T8', 'lumbar', 'herniation'],
    'thigh': ['thigh', 'femur', 'shaft'],
    'wrist': ['wrist', 'carpel tunnel'],
    'psychological': ['psychological', 'cognitive', 'memory'],
    'death': ['death'],
}


def has_any_keyword(text, keywords):
    """Return 1 if any keyword occurs in `text`, else 0.

    Matching is a case-insensitive substring test against the whole text (with
    runs of whitespace collapsed to single spaces). Compared with testing each
    whitespace-separated word on its own, this means:
      * capitalised mentions such as "Death" or "Pelvic fractures" are found, and
      * multi-word keywords such as 'loss of consciousness' can actually match.
    For single-word keywords the result is otherwise the same as the per-word test.
    """
    normalized = ' '.join(text.lower().split())
    return int(any(keyword.lower() in normalized for keyword in keywords))


for x in range(0, len(injury)):

    # injury description of case x ("NA" when none was found, which matches no keyword)
    a = injury[x]

    # set every body-part / outcome indicator for this case
    for column, keywords in INJURY_KEYWORDS.items():
        df.loc[x, column] = has_any_keyword(a, keywords)

In [18]:
# Cases that involve more than 1 person:

# for every document grab the plaintiff profile information.
# `plaintiff` maps document index -> the text from "Profile" up to (not
# including) "Defendant", or "NA" when that span cannot be located.
plaintiff = {}

for x in range(0, len(documents)):
    # split document into individual words
    words = documents[x].split()

    try:
        # The profile starts at the first "Profile" token ...
        start = words.index("Profile")
        # ... and ends at the next "Defendant" token. Searching from `start`
        # avoids picking up a "Defendant" that occurs earlier in the document,
        # which would give end < start and silently store an empty phrase.
        end = words.index("Defendant", start)
    except ValueError:
        # No "Profile" ... "Defendant" span in this document: mark as missing
        plaintiff[x] = "NA"
        continue

    # join individual words back together and add to plaintiff dictionary
    plaintiff[x] = ' '.join(words[start:end])

plaintiff

{0: 'Profile Alexandra M. Lisowski',
 1: 'Profile Allen Williams',
 2: 'Profile Ana Budimlic',
 3: 'Profile Andrea Farro',
 4: 'Profile Anthony Faranca, Carolyn Faranca',
 5: 'Profile Anthony J. Conte III',
 6: 'Profile Antoinette Sheffer, James Sheffer',
 7: 'Profile Arthur Harriatt',
 8: 'Profile Ashley Dwyer, Jody Dwyer',
 9: 'Profile Bora Kochar',
 10: 'Profile Boris Lipski, Natasha Lipski',
 11: 'Profile Brian C. Hawkins',
 12: 'Profile Brian Keir, as guardian ad litem of Megan Keir, a minor child',
 13: 'Profile Charles Sjogren',
 14: 'Profile Chris Ruediger',
 15: 'Profile Christine Kelly',
 16: 'Profile Christopher Sage',
 17: 'Profile Chukwu Akalegbere',
 18: 'Profile Cornelio Dossantos',
 19: 'Profile Crystal Burkert',
 20: 'Profile Daniel McCurdy',
 21: 'Profile Daniel McCurdy',
 22: 'Profile Daniela Mirante',
 23: 'Profile David LaPolice and Nancy Lukomski-LaPolice',
 24: 'Profile David Shade and Theresa Shade',
 25: 'Profile Deborah Bonner',
 26: 'Profile Dennis Gillespie'

In [19]:
# fill in dataframe
# identify "and" or "," in the Plaintiff Profile to label cases that have more than 1 person,
# and just label it in a column

for x in range(0, len(plaintiff)):

    # plaintiff profile text of case x ("NA" when none was found)
    a = plaintiff[x]

    # split up profile into individual words
    words = a.split()

    # "and" must be a word of its own: a plain substring test would also fire on
    # names that merely contain the letters, e.g. "Alexandra".
    has_and = any(word.lower() == 'and' for word in words)

    # A comma anywhere in the profile separates several names.
    # NOTE(review): commas also occur in profiles such as
    # "..., as guardian ad litem of ..., a minor child" - check those manually.
    has_comma = ',' in a

    if has_and or has_comma:
        df.loc[x, 'multiplePlaintiff'] = 1
    else:
        df.loc[x, 'multiplePlaintiff'] = 0


In [20]:
# Save the table of injury indicators (and the multiplePlaintiff flag) to injury.csv
df.to_csv('injury.csv')