In [1]:
# All packages needed by this notebook are imported here. Lines that start with an exclamation mark (!) install packages that
# are not very common. If you don't have the package, just remove the comment symbol (#) in front of that line.

#! pip install python-docx

import os
import re
import pandas as pd

from docx import Document

# Base folder of the project. It defaults to the original location, but can be overridden
# with the PRED_CASE_OUTCOMES_DIR environment variable so the notebook can run on another
# machine without editing this cell.
path_to_folder = os.environ.get('PRED_CASE_OUTCOMES_DIR',
                                '/Users/thymol/Desktop/Comp/pred-case-outcomes/')

# Work inside the sub-folder that holds the .docx case files. os.path.join accepts the base
# folder with or without a trailing separator.
os.chdir(os.path.join(path_to_folder, '154 cases'))

# Expected number of .docx case files in that folder. Change this if needed.
NUM_OF_FILES = 151

In [2]:
# Build the list of case files: every '.docx' file of the working directory, in sorted order.
# We do not want the files with filenames that start with '~' (presumably temporary files
# left behind by the editor).
file_list = [
    file_name
    for file_name in sorted(os.listdir())
    if file_name.endswith('.docx') and not file_name.startswith('~')
]

# Check if the number of files in the list is correct, and report what was found if it is not.
assert len(file_list) == NUM_OF_FILES, (
    'Expected {} .docx files but found {}'.format(NUM_OF_FILES, len(file_list))
)

In [3]:
# This code reads every case file and stores, under the document's number (0, 1, 2, ... in
# file_list order):
#   documents   - the full text (every paragraph, each preceded by a space)
#   headlines   - the lower-cased paragraph that starts with 'headline'
#   backgrounds - the lower-cased paragraph that starts with 'background'
#   results     - the lower-cased paragraph that starts with 'result'
# case_id_list holds the case ID of every document, in the same order.
# A section that is missing from a document is stored as an empty string.

# Initializing the dictionaries
documents = {}
headlines = {}
backgrounds = {}
results = {}

# Vocabulary for gender searching
female_vocab = set(['she', 'her', 'woman', 'women', 'girl', 'girls', 'lady', 'ladies'])
male_vocab = set(['he', 'his', 'him', 'man', 'men', 'boy', 'boys', 'gentleman', 'gentlemen'])

# Initialize case ID list.
case_id_list = []

# Loop over all files in the folder; i is the document number.
for i, filename in enumerate(file_list):
    # Open the file. From it we will get every paragraph.
    temp = Document(filename)

    # Reset the per-document fields, so that a file lacking one of them does not silently
    # inherit the value found in the previous file.
    case_id = ''
    headline = ''
    background = ''
    result = ''
    paragraph_texts = []

    for j, p in enumerate(temp.paragraphs):
        paragraph_texts.append(p.text)
        lower_text = p.text.lower()

        # Get case ID (which is in paragraph 3, counting from 0).
        if j == 3:
            case_id = p.text

        # Get headline, background and result. If several paragraphs start with the same
        # keyword, the last one is kept.
        if lower_text.startswith('headline'):
            headline = lower_text
        if lower_text.startswith('background'):
            background = lower_text
        if lower_text.startswith('result'):
            result = lower_text

    # Update the list of case IDs.
    case_id_list.append(case_id)

    # Joining once at the end avoids rebuilding the whole string for every paragraph.
    documents[i] = ''.join(' ' + text for text in paragraph_texts)
    headlines[i] = headline
    backgrounds[i] = background
    results[i] = result

In [4]:
# Display the extracted headlines to check by eye that they were parsed as expected.
headlines

{0: 'headline:\xa0new jersey woman seriously injured when her car collided with train awarded $1.5 million by state jury',
 1: 'headline:\xa0jury awards $3 million to new jersey man injured in accident with phantom car',
 2: 'headline:\xa0new jersey federal jury awards woman hurt in auto accident $1,038.30',
 3: 'headline:\xa0passenger injured when new jersey transit bus collides with car settles for $600,000',
 4: 'headline:\xa0new jersey man receives $75,000 settlement in auto accident',
 5: 'headline:\xa0parties agree to settlement in new jersey auto accident',
 6: 'headline:\xa0woman rear-ended by drunk driver awarded $224,627.84 in new jersey state court',
 7: 'headline:\xa0jury awards $48,107.05 to plaintiff in new jersey auto accident',
 8: 'headline:\xa0new jersey auto accident case settles for $98,000',
 9: 'headline:\xa0settlement reached in new jersey auto accident',
 10: 'headline:\xa0settlement reached in 3-car new jersey accident',
 11: 'headline:\xa0judge rules in favor 

In [5]:
def return_gender(lower_text, female_vocab, male_vocab):
    """Search a text for the first gendered word and return its gender code.

    lower_text is a string in lower case.
    female_vocab is a set of strings (plain lower-case words).
    male_vocab is a set of strings (plain lower-case words).

    Returns 0 if the first gendered word found belongs to female_vocab, 1 if it
    belongs to male_vocab, and -1 if no word matches either vocabulary.
    """
    # Tokenize on runs of letters instead of splitting on single spaces, so that words
    # attached to punctuation ("woman,", "her.") or separated by non-breaking spaces,
    # tabs or newlines are still recognised.
    for word in re.findall(r'[a-z]+', lower_text):
        # Return gender immediately if a word matches the gender vocabs.
        if word in female_vocab:
            return 0
        if word in male_vocab:
            return 1

    # Return -1 if no word matches the gender vocabs.
    return -1

In [6]:
def get_gender_data(headlines, backgrounds, female_vocab, male_vocab):
    """Build a one-column DataFrame with the gender code found for each document.

    headlines and backgrounds are indexable by 0 .. len(headlines) - 1 and hold the
    headline and background text of each document.
    female_vocab and male_vocab are the sets of strings handed to return_gender.

    Returns a DataFrame with a single 'gender' column holding, for each document,
    the value returned by return_gender for its headline followed by its background.
    """
    gender_list = [
        # The separating space keeps the last word of the headline from fusing with the
        # first word of the background, which would make that word impossible to match.
        return_gender(headlines[i] + ' ' + backgrounds[i], female_vocab, male_vocab)
        for i in range(len(headlines))
    ]

    return pd.DataFrame({'gender': gender_list})

In [7]:
# Compute the gender code of every document from its headline and background, then display it.
data_gender= get_gender_data(headlines, backgrounds, female_vocab, male_vocab)
data_gender

Unnamed: 0,gender
0,0
1,1
2,0
3,0
4,1
5,1
6,0
7,1
8,0
9,1


In [8]:
def get_money_data(results):
    """Extract the largest dollar amount mentioned in each result text.

    results is indexable by 0 .. len(results) - 1 (a list, or a dict keyed by document
    number) and holds one lower-case string per document.

    Recognised amounts look like '$1,038.30', '$600,000' or '$1.5 million' (the latter
    is multiplied by one million).

    Returns a DataFrame with a single 'money' column holding, for each text, the
    largest amount found as a float, or -1.0 if the text contains no amount.
    """
    # '$', then digits with optional thousands commas, an optional decimal part and an
    # optional 'million'. The decimal part must be followed by digits, so the full stop or
    # comma that ends a sentence ('$1,038.30.') is not swallowed into the number. \s also
    # matches the non-breaking spaces found in the documents.
    money_pattern = re.compile(r'\$(\d[\d,]*(?:\.\d+)?)(\s*million)?')

    money_list = []
    for i in range(len(results)):
        amounts = [
            float(number.replace(',', '')) * (1E6 if million else 1.0)
            for number, million in money_pattern.findall(results[i])
        ]
        # Keep the largest amount; -1.0 marks a text without any amount.
        money_list.append(max(amounts) if amounts else -1.0)

    return pd.DataFrame({'money': money_list})

# Try get_money_data on hand-written strings: a plain amount, amounts followed by 'million',
# an amount with decimals, several amounts in one string (the largest one is kept) and a
# string without any amount (which gives -1.0).
test= ['$1,000,000 (abc million)','$1 million','$1.5 million','$1,000,000','$1 million(abc)',
       '$5,820,304 ($3 million for suffering$, $425,000 for 83 and $3 nilliom','$508.40','settlement']
data_test= get_money_data(test)
data_test

Unnamed: 0,money
0,1000000.0
1,1000000.0
2,1500000.0
3,1000000.0
4,1000000.0
5,5820304.0
6,508.4
7,-1.0


In [10]:
# Extract the largest dollar amount from the result text of every document, then display it.
data_money= get_money_data(results)
data_money

Unnamed: 0,money
0,1500000.00
1,3000000.00
2,1038.30
3,600000.00
4,75000.00
5,-1.00
6,224627.84
7,48107.05
8,98000.00
9,-1.00


In [13]:
# Go back to the project folder. Changing to its absolute path (instead of '..') keeps this
# cell idempotent: running it again does not climb one directory further each time.
os.chdir(path_to_folder)
os.getcwd()

'/Users/thymol/Desktop/Comp/pred-case-outcomes'

In [16]:
# Put the case IDs in a DataFrame, then add the gender and money columns next to them.
# join matches the rows on the index, i.e. on the document number.
data_case_id= pd.DataFrame({'case_id':case_id_list})
data_gender_money= data_case_id.join(data_gender).join(data_money)

In [17]:
# Display the combined table (case ID, gender code, money amount).
data_gender_money

Unnamed: 0,case_id,gender,money
0,BER-L-13231-04,0,1500000.00
1,MID-L-4430-05,1,3000000.00
2,04-00884,0,1038.30
3,SAL-L-43-04,0,600000.00
4,05-01749,1,75000.00
5,05-03951,1,-1.00
6,BUR-L-748-04,0,224627.84
7,UNN-L-3463-03,1,48107.05
8,MID-L-005468-04,0,98000.00
9,05-02682,1,-1.00


In [18]:
# Save the combined table as a CSV file in the current working directory, without the index column.
data_gender_money.to_csv('gender_and_money.csv', index=False)