In [135]:
# import individual files from folder
# We are going to use the NLTK package for Natural language Processing. 
import numpy as np
import pandas as pd
# For the first model we will use the word tokenizer. 
# This means that we are going to treat words separately. The other choice would be sentence tokenizer. 
import nltk

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from string import punctuation

nltk.download("punkt")
nltk.download("stopwords")
nltk.download("wordnet")

# This is our own tokenizer
from nltk.tokenize import RegexpTokenizer

import os
from docx import Document

path_to_folder = '/Users/Felicia/Dropbox/Predicting Settlement Value/'
os.chdir('{}154 cases'.format(path_to_folder))

[nltk_data] Downloading package punkt to /Users/Felicia/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/Felicia/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/Felicia/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [136]:
# list all file names
os.listdir('/Users/Felicia/Dropbox/Predicting Settlement Value/154 cases') 


['Allen Williams v. State Farm Insurance Co, MID-L-4430-05.docx',
 'Boris Lipski et al. v. Mark Vanselous et al, 04-06009.docx',
 'Robert Barrall, Barbara Barrall v. Christine Bachman, BUR-L-1934-05(2).docx',
 'Alexandra M. Lisowski v. New Jersey Transit, BER-L-13231-04.docx',
 'Ronye Leder v. Kumi Sarpong, et al, MER-L-0240-01.docx',
 'Mohammed M. Rahman et al. v. Bobby Dwain Watson et al, 06-00155.docx',
 'Jesenia Jimenez v. William Baglieri, et al, L 007120 93.docx',
 'Ashley Dwyer, Jody Dwyer v. Vera Armoogan, Keith Armoogan, MID-L-005468-04.docx',
 'Joan M. Rider, as executrix of the estate of Robert Rider, deceased, and individually v. Township of.docx',
 'Kebyna Caynard, Berlynda Caynard, Ketsia Jean v. Daniel A. Perez, Lucrecia Cruz, Heraux Jean, Allsta.docx',
 'Sara Petronis v. Roger Venorder, ATL-L-001199-04.docx',
 'Veronica Zapata v. Laura Roberts, WAR-L-602-02.docx',
 'Frani Feit, individually and assignee of David J. Feit v. Great-West Life And Annuity Insurance Comp.docx

In [137]:
# total number of files
list = os.listdir('/Users/Felicia/Dropbox/Predicting Settlement Value/154 cases') 

len(list)

155

In [138]:
# This code creates a dictionary called documents, where the files are stored. 
# Each document has a number as its index. 

# Initializing the dictionary
documents = {}
# Setting initial value for the first document
i = 0

# Initializing for loop over all files in folder
for filename in os.listdir():
    # Initialize document
    doc = ''
    # Create a temporary file. Within this file we will get every paragraph 
    temp = Document('{}'.format(filename))
    
    # Create a for loop for every paragraph in the file to form the complete file
    for p in temp.paragraphs:
        
        doc = doc + ' ' + p.text

    documents[i] = doc
    
    i+=1
    

PackageNotFoundError: Package not found at '.DS_Store'

In [None]:
# for every document
injury = {}

for x in range(0, len(documents)):
    a = documents[x]
    
    # split document into individual words
    words = a.split()
    
    # find when "Injury:" and "Court:" occurs and get all the words in between
    try:
        start = words.index("Injury:")
        end = words.index("Court:")
        i = words[start:end]

        # join individual words together 
        phrase = ' '.join(i[0:0+len(i)])

        # add to injury dictionary
        injury[x] = phrase
        print(phrase)

    # if no headline that starts with "Injury" just ignore, from year 2006
    except ValueError:
        injury[x] = "NA"    

In [None]:
# number of cases we have
len(injury)

In [None]:
# create dataframe

# abdominal/abdomen, ankle, arm, back, brain, burn, chest, ear, elbow, eye, face/facial, foot, leg, genital, 
# hand, head, heart, hip, knee, mouth, neck, nose, pelvic, shoulder, spinal, thigh, wrist, psychological, death
columns = ['abdominal', 'ankle', 'arm', 'back', 'brain', 'burn', 'chest', 'ear', 'elbow', 'eye', 'face', 'foot', 'leg', 'genital', 'hand', 'head', 'heart', 'hip', 'knee', 'mouth', 'neck', 'nose', 'pelvic', 'shoulder', 'spinal', 'thigh', 'wrist', 'psychological', 'death']
df = pd.DataFrame(index=range(0,153), columns=columns)

In [None]:
# fill in dataframe

for x in range(0, len(injury)):

    # identify injury
    a = injury[x]

    # split up injury into individual words
    injurywords = a.split()
    
    # define keywords
    keywords = ['abdominal', 'abdomen', 'torso',  'spleen', ' renal artery',  'upper extremity']
    z = [i for e in keywords for i in injurywords if e in i]
    if len(z) == 0:
        df.loc[x, 'abdominal'] = 0
    else:
        df.loc[x, 'abdominal'] = 1    

    keywords = ['ankle']
    z = [i for e in keywords for i in injurywords if e in i]
    if len(z) == 0:
        df.loc[x, 'ankle'] = 0
    else:
        df.loc[x, 'ankle'] = 1           
 
    keywords = ['arm', 'forearm', 'ulna',  'brachial plexus']
    z = [i for e in keywords for i in injurywords if e in i]
    if len(z) == 0:
        df.loc[x, 'arm'] = 0
    else:
        df.loc[x, 'arm'] = 1  

    keywords = ['back']
    z = [i for e in keywords for i in injurywords if e in i]
    if len(z) == 0:
        df.loc[x, 'back'] = 0
    else:
        df.loc[x, 'back'] = 1          
        
    keywords = ['brain', 'cerebral', 'concussion',  'loss of consciousness']
    z = [i for e in keywords for i in injurywords if e in i]
    if len(z) == 0:
        df.loc[x, 'brain'] = 0
    else:
        df.loc[x, 'brain'] = 1          
        
    keywords = ['burn', 'burns']
    z = [i for e in keywords for i in injurywords if e in i]
    if len(z) == 0:
        df.loc[x, 'burn'] = 0
    else:
        df.loc[x, 'burn'] = 1  
        
    keywords = ['chest', 'rib', 'ribs', 'lung', 'clavical', 'pulmonary', 'thoracic']
    z = [i for e in keywords for i in injurywords if e in i]
    if len(z) == 0:
        df.loc[x, 'chest'] = 0
    else:
        df.loc[x, 'chest'] = 1          
                
    keywords = ['ear', 'hearing']
    z = [i for e in keywords for i in injurywords if e in i]
    if len(z) == 0:
        df.loc[x, 'ear'] = 0
    else:
        df.loc[x, 'ear'] = 1          
        
    keywords = ['elbow']
    z = [i for e in keywords for i in injurywords if e in i]
    if len(z) == 0:
        df.loc[x, 'elbow'] = 0
    else:
        df.loc[x, 'elbow'] = 1          
        
    keywords = ['eye', 'eyes', 'blindness']
    z = [i for e in keywords for i in injurywords if e in i]
    if len(z) == 0:
        df.loc[x, 'eye'] = 0
    else:
        df.loc[x, 'eye'] = 1              
        
    keywords = ['face', 'facial', 'forehead']
    z = [i for e in keywords for i in injurywords if e in i]
    if len(z) == 0:
        df.loc[x, 'face'] = 0
    else:
        df.loc[x, 'face'] = 1          
        
    keywords = ['foot', 'feet', 'heel']
    z = [i for e in keywords for i in injurywords if e in i]
    if len(z) == 0:
        df.loc[x, 'foot'] = 0
    else:
        df.loc[x, 'foot'] = 1          
        
    keywords = ['leg', 'tibia', 'tibial',  'sciatica']
    z = [i for e in keywords for i in injurywords if e in i]
    if len(z) == 0:
        df.loc[x, 'leg'] = 0
    else:
        df.loc[x, 'leg'] = 1          
        
    keywords = ['genital']
    z = [i for e in keywords for i in injurywords if e in i]
    if len(z) == 0:
        df.loc[x, 'genital'] = 0
    else:
        df.loc[x, 'genital'] = 1          
        
    keywords = ['hand', 'pinky', 'finger', 'thumb']
    z = [i for e in keywords for i in injurywords if e in i]
    if len(z) == 0:
        df.loc[x, 'hand'] = 0
    else:
        df.loc[x, 'hand'] = 1          
        
    keywords = ['head', 'headaches']
    z = [i for e in keywords for i in injurywords if e in i]
    if len(z) == 0:
        df.loc[x, 'head'] = 0
    else:
        df.loc[x, 'head'] = 1          
        
    keywords = ['heart']
    z = [i for e in keywords for i in injurywords if e in i]
    if len(z) == 0:
        df.loc[x, 'heart'] = 0
    else:
        df.loc[x, 'heart'] = 1          
        
    keywords = ['hip']
    z = [i for e in keywords for i in injurywords if e in i]
    if len(z) == 0:
        df.loc[x, 'hip'] = 0
    else:
        df.loc[x, 'hip'] = 1          
        
    keywords = ['knee']
    z = [i for e in keywords for i in injurywords if e in i]
    if len(z) == 0:
        df.loc[x, 'knee'] = 0
    else:
        df.loc[x, 'knee'] = 1          
        
    keywords = ['mouth', 'lips', 'lip', 'jaw']
    z = [i for e in keywords for i in injurywords if e in i]
    if len(z) == 0:
        df.loc[x, 'mouth'] = 0
    else:
        df.loc[x, 'mouth'] = 1     

    keywords = ['neck', 'cervical']
    z = [i for e in keywords for i in injurywords if e in i]
    if len(z) == 0:
        df.loc[x, 'neck'] = 0
    else:
        df.loc[x, 'neck'] = 1          
        
    keywords = ['nose', 'nasal']
    z = [i for e in keywords for i in injurywords if e in i]
    if len(z) == 0:
        df.loc[x, 'nose'] = 0
    else:
        df.loc[x, 'nose'] = 1     
        
    keywords = ['pelvic', 'pelvis', 'acetabular', 'pubic ramus', 'sacroiliac']
    z = [i for e in keywords for i in injurywords if e in i]
    if len(z) == 0:
        df.loc[x, 'pelvic'] = 0
    else:
        df.loc[x, 'pelvic'] = 1          
        
    keywords = ['shoulder', 'clavicle', 'rotator',  'sternoclavicular','subacromial decompression','acromioplasty']
    z = [i for e in keywords for i in injurywords if e in i]
    if len(z) == 0:
        df.loc[x, 'shoulder'] = 0
    else:
        df.loc[x, 'shoulder'] = 1     

    keywords = ['spinal', 'spine', 'disc', 'discs', 'vertebrae', 'radiculopathy', 'laminectomy', 'forminatomy', 'diskectomy', 'T8', 'lumbar', 'herniation']
    z = [i for e in keywords for i in injurywords if e in i]
    if len(z) == 0:
        df.loc[x, 'spinal'] = 0
    else:
        df.loc[x, 'spinal'] = 1          
        
    keywords = ['thigh', 'femur', 'shaft']
    z = [i for e in keywords for i in injurywords if e in i]
    if len(z) == 0:
        df.loc[x, 'thigh'] = 0
    else:
        df.loc[x, 'thigh'] = 1    

    keywords = ['wrist', 'carpel tunnel']
    z = [i for e in keywords for i in injurywords if e in i]
    if len(z) == 0:
        df.loc[x, 'wrist'] = 0
    else:
        df.loc[x, 'wrist'] = 1     

    keywords = ['psychological', 'cognitive', 'memory']
    z = [i for e in keywords for i in injurywords if e in i]
    if len(z) == 0:
        df.loc[x, 'psychological'] = 0
    else:
        df.loc[x, 'psychological'] = 1          
        
    keywords = ['death']
    z = [i for e in keywords for i in injurywords if e in i]
    if len(z) == 0:
        df.loc[x, 'death'] = 0
    else:
        df.loc[x, 'death'] = 1            

In [None]:
os.chdir('/Users/Felicia/Documents/Felicia Zhang/Felicia/Princeton/DataScience/ComputationalLaw')

df.to_csv('injury.csv')

In [None]:
# can we remove cases that in involve more than 1 person?
# identify "and" or "," for Plaintiff Profile to label cases that have more than 1 person, and just label it in a column
