In [1]:
import os
import numpy as np
import pandas as pd
import nltk
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from string import punctuation

# nltk.download("punkt")
# nltk.download("stopwords")
# nltk.download("wordnet")

# This is our own tokenizer
from nltk.tokenize import RegexpTokenizer

from docx import Document

# path_to_folder = '/Users/Felicia/Dropbox/Predicting Settlement Value/pred-case-outcomes/'
path_to_folder = '/Users/thymol/Desktop/Comp/pred-case-outcomes/'

os.chdir('{}154 cases'.format(path_to_folder))

# Change this if needed
NUM_OF_FILES= 151 

In [2]:
# Clean up file list. We do not want the files with filenames that start with '~'. 

file_list= []
for file_name in sorted(os.listdir()):
    if not file_name.startswith('~') and file_name.endswith('.docx'):
        file_list.append(file_name)
        
#print(file_list)


In [3]:
# Check if the number of files in the list is correct. 
assert(len(file_list) == NUM_OF_FILES)

In [4]:
# This code creates a dictionary called documents, where the files are stored. Each document has a number as its index. 

# Initializing the dictionary
documents = {}
headlines= {}
backgrounds= {}
results= {}
# Setting initial value for the first document
i = 0

# Vocabulary for gender searching
female_vocab= set(['she', 'her', 'woman', 'women', 'girl', 'girls', 'lady', 'ladies']);
male_vocab= set(['he', 'his', 'him', 'man', 'men', 'boy', 'boys', 'gentleman', 'gentlemen']);

# Initialize case ID list. 
case_id_list= []
case_year_list= []

# Initializing for loop over all files in folder
for filename in file_list:
    # Initialize document
    doc = ''
    #Create a temporary file. Within this file we will get every paragraph 
    temp = Document('{}'.format(filename))

    # Counter for paragraph
    j= 0
    for p in temp.paragraphs:
#         print('Paragraph ' + str(j))
#         print(p.text)
#         print('###########')
        doc = doc + ' ' + p.text
        lower_text= str.lower(p.text)
        
        # Get case ID (which is in paragraph 3). 
        if j == 3:
            case_id= p.text
        
        # Get year (which is in paragraph 4). 
        if j == 4:
            case_year= p.text[-4:len(p.text)]
        j+= 1        
            
        # Get headline
        if lower_text.startswith('headline'):
            headline= lower_text
           
        # Get background
        if lower_text.startswith('background'):
            background= lower_text
        
        # Get result
        if lower_text.startswith('result'):
            result= lower_text
            
    # Update the lists of case ID and Year
    case_id_list.append(case_id)
    case_year_list.append(case_year)

    documents[i] = doc
    headlines[i] = headline
    backgrounds[i]= background
    results[i]= result
    i+=1

In [5]:
#####################################
# Get gender and money (from Zhe)####
#####################################

In [6]:
# return_gender searches for gender in the text. It takes in lower_text, female_vocab, and male_vocab. 
# lower_text is a string in lower case. 
# female_vocab is a set of strings. 
# male_vocab is a set of strings. 
def return_gender(lower_text, female_vocab, male_vocab):
    # Clean up and split by space. 
    lower_text= lower_text.replace(u'\xa0', u' ')
    lower_text_list= lower_text.split(' ')
    
    # Iterate over the text list. 
    # Return gender immediately if a word matches the gender vocabs. 
    for word in lower_text_list:
        if word in female_vocab:
            return 0
        elif word in male_vocab:
            return 1
    
    # Return -1 if no word matches the gender vocabs. 
    return -1

In [7]:
def get_gender_data(headlines, backgrounds, female_vocab, male_vocab):
    gender_list= []

    for i in range(len(headlines)):
        headline= headlines[i]
        background= backgrounds[i]
        gender= return_gender(headline + background, female_vocab, male_vocab)
        gender_list.append(gender)

    data_gender= pd.DataFrame({'gender':gender_list})
    return data_gender

In [8]:
def get_money_data(results):
    money_list= []

    for i in range(len(results)):
        result= results[i]

        money_str_list= re.findall(r'\$\d[\d,\.]* million|\$\d[\d,\.]*', result)
    #     print(money_str_list)
        if len(money_str_list) == 0:
            money= -1.0
        else:
            money_temp_list= []
            for word in money_str_list:
                word= word.replace('$','').replace(',','')
                if ' million' not in word:
                    number= float(word)
                else:
                    number= float(word.replace(' million','')) * 1E6
                money_temp_list.append(number)
    #         print(money_temp_list)
            money= max(money_temp_list)

        money_list.append(money)
    
    data_money= pd.DataFrame({'money':money_list})
    return data_money

# test= ['$1,000,000 (abc million)','$1 million','$1.5 million','$1,000,000','$1 million(abc)',
#        '$5,820,304 ($3 million for suffering$, $425,000 for 83 and $3 nilliom','$508.40','settlement']
# data_test= get_money_data(test)
# data_test

In [9]:
#############################
# Get age (from Vinicius)####
#############################

In [10]:
def get_age_data(documents, list_):
    pattern = ['then','now','years old','is','was']
    age = []

    #print(documents[0])

    for i in range(len(documents)):
        temp = word_tokenize(documents[i])
        #print(temp)
        for j in range(len(temp)-1):
            try:
                int(temp[j])
                if temp[j+1] !='percent':
                    if (temp[j-1] in pattern) | (temp[j+1]+' '+temp[j+2] in pattern):
                        age.append([i,list_[i],temp[j]])
                        break

                    elif (temp[j-1]==',') & (temp[j+1] == ',') &(int(temp[j])<100):
                        age.append([i,list_[i],temp[j]])
                        break


            except:
                pass
    
    data_age = pd.DataFrame(age, columns = ['index','file name','age'])
    data_age.set_index('index', inplace=True)
    return data_age

In [11]:
#######################################
# Get injury (from Felicia and Joe)####
#######################################

In [12]:
# for every document find the injury 

injury = {}
for x in range(0, len(documents)):
    a = documents[x]
    
    # find when "Injury:" and "Court:" occurs and get all the words in between
    try:
        start = a.find('Injury:\xa0') 
        
        if start == -1: #Can't find "Injury:"
            injury[x] = "NA"
        else:
            start = a.find('Injury:\xa0') + len('Injury:\xa0')
            end = a.find('Court:\xa0')
            # add to injury dictionary
            injury[x] = a[start:end]

    # if no headline that starts with "Injury" just ignore, from year 2006
    except ValueError:
        injury[x] = "NA"

#injury        

In [13]:
# create empty dataframe

# abdominal/abdomen, ankle, arm, back, brain, burn, chest, ear, elbow, eye, face/facial, foot, leg, genital, 
# hand, head, heart, hip, knee, mouth, neck, nose, pelvic, shoulder, spinal, thigh, wrist, psychological, death
columns = ['abdominal', 'ankle', 'arm', 'back', 'brain', 'burn', 'chest', 'ear', 'elbow', 'eye', 'face', 'foot', 'leg', 'genital', 'hand', 'head', 'heart', 'hip', 'knee', 'mouth', 'neck', 'nose', 'pelvic', 'shoulder', 'spinal', 'thigh', 'wrist', 'psychological', 'death', 'multiplePlaintiff','Plaintiffs','multipleDefendants','Defendants','Judge','Court','State']
df = pd.DataFrame(index=range(0,NUM_OF_FILES), columns=columns)

In [14]:
# fill in dataframe with injuries

for x in range(0, len(injury)):

    # identify injury
    a = injury[x]
    
    if a == 'NA': # if injury is missing, paste -1 in row
        df.iloc[x,df.columns.get_loc("abdominal"):df.columns.get_loc("death")] = -1
        
    else: # if injury is not missing       
        # split up injury into individual words
        injurywords = a.split()

        # define keywords
        keywords = ['abdominal', 'abdomen', 'torso',  'spleen', ' renal artery',  'upper extremity']
        z = [i for e in keywords for i in injurywords if e in i]
        if len(z) == 0:
            df.loc[x, 'abdominal'] = 0
        else:
            df.loc[x, 'abdominal'] = 1    

        keywords = ['ankle']
        z = [i for e in keywords for i in injurywords if e in i]
        if len(z) == 0:
            df.loc[x, 'ankle'] = 0
        else:
            df.loc[x, 'ankle'] = 1           

        keywords = ['arm', 'forearm', 'ulna',  'brachial plexus']
        z = [i for e in keywords for i in injurywords if e in i]
        if len(z) == 0:
            df.loc[x, 'arm'] = 0
        else:
            df.loc[x, 'arm'] = 1  

        keywords = ['back']
        z = [i for e in keywords for i in injurywords if e in i]
        if len(z) == 0:
            df.loc[x, 'back'] = 0
        else:
            df.loc[x, 'back'] = 1          

        keywords = ['brain', 'cerebral', 'concussion',  'loss of consciousness']
        z = [i for e in keywords for i in injurywords if e in i]
        if len(z) == 0:
            df.loc[x, 'brain'] = 0
        else:
            df.loc[x, 'brain'] = 1          

        keywords = ['burn', 'burns']
        z = [i for e in keywords for i in injurywords if e in i]
        if len(z) == 0:
            df.loc[x, 'burn'] = 0
        else:
            df.loc[x, 'burn'] = 1  

        keywords = ['chest', 'rib', 'ribs', 'lung', 'clavical', 'pulmonary', 'thoracic']
        z = [i for e in keywords for i in injurywords if e in i]
        if len(z) == 0:
            df.loc[x, 'chest'] = 0
        else:
            df.loc[x, 'chest'] = 1          

        keywords = ['ear', 'hearing']
        z = [i for e in keywords for i in injurywords if e in i]
        if len(z) == 0:
            df.loc[x, 'ear'] = 0
        else:
            df.loc[x, 'ear'] = 1          

        keywords = ['elbow']
        z = [i for e in keywords for i in injurywords if e in i]
        if len(z) == 0:
            df.loc[x, 'elbow'] = 0
        else:
            df.loc[x, 'elbow'] = 1          

        keywords = ['eye', 'eyes', 'blindness']
        z = [i for e in keywords for i in injurywords if e in i]
        if len(z) == 0:
            df.loc[x, 'eye'] = 0
        else:
            df.loc[x, 'eye'] = 1              

        keywords = ['face', 'facial', 'forehead']
        z = [i for e in keywords for i in injurywords if e in i]
        if len(z) == 0:
            df.loc[x, 'face'] = 0
        else:
            df.loc[x, 'face'] = 1          

        keywords = ['foot', 'feet', 'heel']
        z = [i for e in keywords for i in injurywords if e in i]
        if len(z) == 0:
            df.loc[x, 'foot'] = 0
        else:
            df.loc[x, 'foot'] = 1          

        keywords = ['leg', 'tibia', 'tibial',  'sciatica']
        z = [i for e in keywords for i in injurywords if e in i]
        if len(z) == 0:
            df.loc[x, 'leg'] = 0
        else:
            df.loc[x, 'leg'] = 1          

        keywords = ['genital']
        z = [i for e in keywords for i in injurywords if e in i]
        if len(z) == 0:
            df.loc[x, 'genital'] = 0
        else:
            df.loc[x, 'genital'] = 1          

        keywords = ['hand', 'pinky', 'finger', 'thumb']
        z = [i for e in keywords for i in injurywords if e in i]
        if len(z) == 0:
            df.loc[x, 'hand'] = 0
        else:
            df.loc[x, 'hand'] = 1          

        keywords = ['head', 'headaches']
        z = [i for e in keywords for i in injurywords if e in i]
        if len(z) == 0:
            df.loc[x, 'head'] = 0
        else:
            df.loc[x, 'head'] = 1          

        keywords = ['heart']
        z = [i for e in keywords for i in injurywords if e in i]
        if len(z) == 0:
            df.loc[x, 'heart'] = 0
        else:
            df.loc[x, 'heart'] = 1          

        keywords = ['hip']
        z = [i for e in keywords for i in injurywords if e in i]
        if len(z) == 0:
            df.loc[x, 'hip'] = 0
        else:
            df.loc[x, 'hip'] = 1          

        keywords = ['knee']
        z = [i for e in keywords for i in injurywords if e in i]
        if len(z) == 0:
            df.loc[x, 'knee'] = 0
        else:
            df.loc[x, 'knee'] = 1          

        keywords = ['mouth', 'lips', 'lip', 'jaw']
        z = [i for e in keywords for i in injurywords if e in i]
        if len(z) == 0:
            df.loc[x, 'mouth'] = 0
        else:
            df.loc[x, 'mouth'] = 1     

        keywords = ['neck', 'cervical']
        z = [i for e in keywords for i in injurywords if e in i]
        if len(z) == 0:
            df.loc[x, 'neck'] = 0
        else:
            df.loc[x, 'neck'] = 1          

        keywords = ['nose', 'nasal']
        z = [i for e in keywords for i in injurywords if e in i]
        if len(z) == 0:
            df.loc[x, 'nose'] = 0
        else:
            df.loc[x, 'nose'] = 1     

        keywords = ['pelvic', 'pelvis', 'acetabular', 'pubic ramus', 'sacroiliac']
        z = [i for e in keywords for i in injurywords if e in i]
        if len(z) == 0:
            df.loc[x, 'pelvic'] = 0
        else:
            df.loc[x, 'pelvic'] = 1          

        keywords = ['shoulder', 'clavicle', 'rotator',  'sternoclavicular','subacromial decompression','acromioplasty']
        z = [i for e in keywords for i in injurywords if e in i]
        if len(z) == 0:
            df.loc[x, 'shoulder'] = 0
        else:
            df.loc[x, 'shoulder'] = 1     

        keywords = ['spinal', 'spine', 'disc', 'discs', 'vertebrae', 'radiculopathy', 'laminectomy', 'forminatomy', 'diskectomy', 'T8', 'lumbar', 'herniation']
        z = [i for e in keywords for i in injurywords if e in i]
        if len(z) == 0:
            df.loc[x, 'spinal'] = 0
        else:
            df.loc[x, 'spinal'] = 1          

        keywords = ['thigh', 'femur', 'shaft']
        z = [i for e in keywords for i in injurywords if e in i]
        if len(z) == 0:
            df.loc[x, 'thigh'] = 0
        else:
            df.loc[x, 'thigh'] = 1    

        keywords = ['wrist', 'carpel tunnel']
        z = [i for e in keywords for i in injurywords if e in i]
        if len(z) == 0:
            df.loc[x, 'wrist'] = 0
        else:
            df.loc[x, 'wrist'] = 1     

        keywords = ['psychological', 'cognitive', 'memory']
        z = [i for e in keywords for i in injurywords if e in i]
        if len(z) == 0:
            df.loc[x, 'psychological'] = 0
        else:
            df.loc[x, 'psychological'] = 1          

        keywords = ['death']
        z = [i for e in keywords for i in injurywords if e in i]
        if len(z) == 0:
            df.loc[x, 'death'] = 0
        else:
            df.loc[x, 'death'] = 1            

In [15]:
# fill in Plaintiffs and multiple plaintiffs info

for x in range(0, len(documents)):
    a = documents[x]
    
    # find when "Injury:" and "Court:" occurs and get all the words in between
    try:
        start = a.find('Plaintiff Profile') 
        
        if start == -1:  #Can't find "plaintiff profile"
            df.loc[x, 'Plaintiffs'] = "NA" 
            df.loc[x, 'multiplePlaintiff'] = "NA"
        else: 
            start = a.find('Plaintiff Profile') + len('Plaintiff Profile')
            end = a.find('Defendant Profile')
            # add to injury dictionary
            df.loc[x, 'Plaintiffs'] = a[start:end]
#             print(a[start:end])
            plaintiffinfo = a[start:end].split()

            # define keywords to identify multiple plaintiffs
            keywords = [' and ', ','] #HELP: this also captures Alexandra, Fernandes, and Fernandez...
            z = [i for key in keywords for i in plaintiffinfo if key in i]
            #print(z)
            if len(z) == 0:
                df.loc[x, 'multiplePlaintiff'] = 0
            else:
                df.loc[x, 'multiplePlaintiff'] = 1

    # if no headline that starts with "Injury" just ignore, from year 2006
    except ValueError:
        df.loc[x, 'Plaintiffs'] = "NA" 

In [16]:
# fill in Defendants and multiple Defendants info

for x in range(0, len(documents)):
    a = documents[x]
    
    try:
        start = a.find('Defendant Profile') 
        
        if start == -1: #Can't find "defendant profile"
            df.loc[x, 'Defendants'] = "NA" 
            df.loc[x, 'multipleDefendants'] = "NA" 
        else:
            start = a.find('Defendant Profile') + len('Defendant Profile')
            end = a.find('Plaintiff Counsel')
            
            df.loc[x, 'Defendants'] = a[start:end]
#             print(a[start:end])
            defendantinfo = a[start:end].split()
            
            # define keywords to identify multiple Defendants
            keywords = [' and ', ','] #HELP: this also captures Alexandra, Fernandes, and Fernandez...
            z = [i for key in keywords for i in defendantinfo if key in i]
            #print(z)
            if len(z) == 0:
                df.loc[x, 'multipleDefendants'] = 0
            else:
                df.loc[x, 'multipleDefendants'] = 1            

    # if no headline that starts with "Injury" just ignore, from year 2006
    except ValueError:
        df.loc[x, 'Defendants'] = "NA" 

In [17]:
# fill in Judge info

for x in range(0, len(documents)):
    a = documents[x]
    
    # find when it says "Judge"
    try:
        start = a.find('Judge:\xa0') 
        
        if start == -1: #can't find "Judge"
            df.loc[x, 'Judge'] = "NA" 
        else:         
            start = a.find('Judge:\xa0') + len('Judge:\xa0')
            end = a.find('Plaintiff Profile')
            # add to injury dictionary
            df.loc[x, 'Judge'] = a[start:end]

    # if no headline that starts with "Injury" just ignore, from year 2006
    except ValueError:
        df.loc[x, 'Judge'] = "NA" 
        

In [18]:
# fill in Court info

for x in range(0, len(documents)):
    a = documents[x]
    
    # find when it says "Judge"
    try:
        start = a.find('Court:\xa0') 
        if start == -1: #can't find "Court"
            df.loc[x, 'Court'] = "NA" 
            df.loc[x, 'State'] = "NA" 
        else:         
            start = a.find('Court:\xa0') + len('Court:\xa0')
            end = a.find('Judge:\xa0')
            if end == -1: #can't find "Judge"
                end = a.find('Plaintiff Profile') #use Plaintiff profile instead
            
            courtinfo = a[start:end]
            # add to injury dictionary
            df.loc[x, 'Court'] = courtinfo
            
            if courtinfo[0] == 'D':
                df.loc[x, 'State'] = courtinfo[3:7]
            else:            
                df.loc[x, 'State'] = courtinfo[0:4]
            
    # if no headline that starts with "Injury" just ignore, from year 2006
    except ValueError:
        df.loc[x, 'Court'] = "NA" 
        df.loc[x, 'State'] = "NA" 
        

In [19]:
# Combine all data frames
data_case_id= pd.DataFrame({'case_id':case_id_list})
data_case_year= pd.DataFrame({'case_year':case_year_list})
data_gender= get_gender_data(headlines, backgrounds, female_vocab, male_vocab)
data_money= get_money_data(results)
data_age= get_age_data(documents, file_list)
#data_gender_money= data_case_id.join(data_gender).join(data_money)
data_all= data_case_id.join(data_age['age']).join(data_case_year).join(data_gender).join(data_money).join(df)

In [20]:
# View final data frame
data_all

Unnamed: 0,case_id,age,case_year,gender,money,abdominal,ankle,arm,back,brain,...,wrist,psychological,death,multiplePlaintiff,Plaintiffs,multipleDefendants,Defendants,Judge,Court,State
0,BER-L-13231-04,33,2007,0,1500000.00,0,0,0,1,0,...,0,0,0,0,Alexandra M. Lisowski,0,New Jersey Transit,Elijah Miller Jr.,"N.J. Super., Bergen Co.",N.J.
1,MID-L-4430-05,,2006,1,3000000.00,0,0,0,0,0,...,0,0,0,0,Allen Williams,0,State Farm Insurance Co,Phillip Paley,"N.J. Super., Middlesex Co.",N.J.
2,04-00884,,2007,0,1038.30,0,0,0,0,0,...,0,0,0,0,Ana Budimlic,1,"National Retail Transportation, Ramon Aleman",Susan D. Wigenton,D. N.J.,N.J.
3,SAL-L-43-04,,2007,0,600000.00,0,0,1,0,0,...,0,0,0,0,Andrea Farro,1,"New Jersey Transit, Allstate Insurance Co",G. Thomas Bowen,"N.J. Super., Salem Co.",N.J.
4,05-01749,,2006,1,75000.00,-1,-1,-1,-1,-1,...,-1,-1,,1,"Anthony Faranca, Carolyn Faranca",1,"Luis Bonilla, Horizon Medical Day Care Center...",Mary L. Cooper,D. N.J.,N.J.
5,05-03951,,2006,1,-1.00,-1,-1,-1,-1,-1,...,-1,-1,,0,Anthony J. Conte III,0,Gina Marie and Karen A. Rongone,Freda L. Wolfson,D. N.J.,N.J.
6,BUR-L-748-04,64,2007,0,224627.84,0,0,0,0,0,...,0,0,0,1,"Antoinette Sheffer, James Sheffer",0,Linda Sullivan,Karen L. Suter,"N.J. Super., Burlington Co.",N.J.
7,UNN-L-3463-03,,2006,1,48107.05,-1,-1,-1,-1,-1,...,-1,-1,,0,Arthur Harriatt,0,Cynthia Scott and Simon Wilson,John Pisansky,"N.J. Super., Union Co.",N.J.
8,MID-L-005468-04,17,2006,0,98000.00,0,0,1,0,0,...,0,0,0,1,"Ashley Dwyer, Jody Dwyer",1,"Vera Armoogan, Keith Armoogan",Jessica Mayer,"N.J. Super., Middlesex Co.",N.J.
9,05-02682,,2006,1,-1.00,-1,-1,-1,-1,-1,...,-1,-1,,0,Bora Kochar,0,J&B Leasing and Carlos Calzadilla,Jose L. Linares,D. N.J.,N.J.


In [21]:
# Save to computer
# os.chdir('/Users/Felicia/Documents/ComputationalLaw/')
os.chdir('/Users/thymol/Desktop/Comp/pred-case-outcomes')

data_all.to_csv('combined_data.csv', index=False)