In [1]:
# All packages needed by this notebook are imported here. Lines that start with an exclamation mark (!) install packages
# that are not commonly pre-installed. If you don't have such a package, remove the comment symbol (#) from its line.

#! pip install python-docx

import os
import re
import string

from docx import Document

# Root folder of the project; the case files live in its '154 cases' sub-folder.
path_to_folder = '/Users/thymol/Desktop/Comp/pred-case-outcomes/'

# Make the case folder the working directory so later cells can open files by bare name.
cases_dir = '{}154 cases'.format(path_to_folder)
os.chdir(cases_dir)


In [2]:
# Collect the case files from the working directory in sorted order,
# leaving out every entry whose name begins with '~'.
NUM_OF_FILES = 154
file_list = [entry for entry in sorted(os.listdir()) if not entry.startswith('~')]

# Sanity check: the folder must contain exactly the expected number of case files.
assert len(file_list) == NUM_OF_FILES


In [3]:
def get_gender(lower_text, female_vocab, male_vocab):
    """Return the gender indicated by the first gendered word in the text.

    Parameters
    ----------
    lower_text : str
        Text to search; expected to be lower case already.
    female_vocab : set of str
        Words that indicate a female subject.
    male_vocab : set of str
        Words that indicate a male subject.

    Returns
    -------
    str
        'female' or 'male' for the first word that belongs to one of the
        vocabularies, or 'na' when no word matches.
    """
    # split() without an argument breaks on any run of whitespace, which covers
    # the non-breaking space (\xa0) as well as tabs and newlines.
    for token in lower_text.split():
        # Drop surrounding punctuation so that tokens such as "woman," or
        # "(he)" are still recognised.
        word = token.strip(string.punctuation)
        if word in female_vocab:
            return 'female'
        if word in male_vocab:
            return 'male'

    # No word matched either vocabulary.
    return 'na'
        

In [4]:
def get_money(lower_text):
    """Extract the first dollar amount mentioned in the text.

    Parameters
    ----------
    lower_text : str
        Text to search; expected to be lower case already.

    Returns
    -------
    float
        The first amount that follows a '$' sign, multiplied by 1E6 when the
        word 'million' occurs anywhere in the text. Returns -1.0 when the text
        contains no parsable dollar amount.
    """
    # A '$', optional whitespace, then digits with optional thousands separators
    # and an optional decimal part (or a bare decimal such as ".5"). Searching
    # the whole string keeps surrounding punctuation, e.g. "($250,000);", from
    # breaking the parse.
    amount_match = re.search(
        r'\$\s*([0-9][0-9,]*(?:\.[0-9]+)?|\.[0-9]+)', lower_text
    )
    if amount_match is None:
        # Always return the sentinel, including when a '$' is present but no
        # number follows it.
        return -1.0

    money = float(amount_match.group(1).replace(',', ''))  # Convert string to number

    # 'million' is looked up in the whole text, not only right after the amount;
    # the letter guards stop it from matching inside words such as "millionaire".
    if re.search(r'(?<![a-z])million(?![a-z])', lower_text):
        money = money * 1E6
    return money
    

In [5]:
# This cell builds `documents`, a dictionary that maps a running case number to the full text of
# each file, and extracts the case ID, gender and settlement amount of every case along the way.

# Initializing the dictionary
documents = {}
# Running case number, used as the key into `documents`
i = 0

# Vocabulary for gender searching
female_vocab = {'she', 'her', 'woman', 'women', 'girl', 'girls', 'lady', 'ladies'}
male_vocab = {'he', 'his', 'him', 'man', 'men', 'boy', 'boys', 'gentleman', 'gentlemen'}

# Case IDs are not unique across the files, so the results are kept in parallel lists
# (one entry per file, in file_list order) instead of a dictionary keyed by case ID.
case_id_list = []
gender_list = []
money_list = []

for filename in file_list:
    # Reset the per-file results so that a file without a case-ID, Headline/Background or
    # Result paragraph is reported as missing instead of inheriting the previous file's values.
    case_id = 'na'
    gender = 'na'
    money = -1.0

    # Paragraph texts are collected here and joined once after the loop.
    paragraph_texts = []

    for paragraph_index, paragraph in enumerate(Document(filename).paragraphs):
        text = paragraph.text
        lower_text = text.lower()
        paragraph_texts.append(text)

        # The case ID is the paragraph at index 3.
        if paragraph_index == 3:
            case_id = text

        # The three prefixes are mutually exclusive, so one if/elif chain is enough.
        if lower_text.startswith('headline'):
            # Get gender from Headline
            gender = get_gender(lower_text, female_vocab, male_vocab)
        elif lower_text.startswith('background'):
            # If gender is not found in Headline, check Background
            if gender == 'na':
                gender = get_gender(lower_text, female_vocab, male_vocab)
        elif lower_text.startswith('result'):
            # Get money from Result
            money = get_money(lower_text)

    # Every paragraph is preceded by a single space, as in the stored text format.
    doc = ''.join(' ' + text for text in paragraph_texts)

    # Update the lists of case ID, gender, and money.
    case_id_list.append(case_id)
    gender_list.append(gender)
    money_list.append(money)

    # Print out cases in which gender is not found.
    if gender == 'na':
        print('Gender not found')
        print('Case ID: ' + case_id)
        print('Case number: ' + str(i))
        print('##############')

    # Print out cases in which money is not found.
    if money == -1.0:
        print('Money not found')
        print('Case ID: ' + case_id)
        print('Case number: ' + str(i))
        print('###################')

    documents[i] = doc
    i += 1

Money not found
Case ID: 05-03951
Case number: 5
###################
Money not found
Case ID: 05-02682
Case number: 9
###################
Money not found
Case ID: 04-06009
Case number: 10
###################
Money not found
Case ID: 03-3979
Case number: 11
###################
Money not found
Case ID: 04-05754
Case number: 13
###################
Money not found
Case ID: 04-04541
Case number: 14
###################
Money not found
Case ID: 05-00540
Case number: 15
###################
Money not found
Case ID: 06-02178
Case number: 17
###################
Money not found
Case ID: 05-00616
Case number: 20
###################
Money not found
Case ID: 05-00616
Case number: 21
###################
Money not found
Case ID: 05-01254
Case number: 22
###################
Money not found
Case ID: 05-04261
Case number: 23
###################
Money not found
Case ID: 04-00488
Case number: 28
###################
Money not found
Case ID: 05-03231
Case number: 31
###################
Gender not found
Case I

In [6]:
# Show the extracted case IDs, genders and settlement amounts, in that order.
for extracted in (case_id_list, gender_list, money_list):
    print(extracted)

# Each list must hold exactly one entry per case file.
for extracted in (case_id_list, gender_list, money_list):
    assert len(extracted) == NUM_OF_FILES

['BER-L-13231-04', 'MID-L-4430-05', '04-00884', 'SAL-L-43-04', '05-01749', '05-03951', 'BUR-L-748-04', 'UNN-L-3463-03', 'MID-L-005468-04', '05-02682', '04-06009', '03-3979', 'CAM-L-2026-04', '04-05754', '04-04541', '05-00540', 'ESX-L-8362-04', '06-02178', '04-06047', 'CAM-L-522210', '05-00616', '05-00616', '05-01254', '05-04261', '04-02146', 'MRS-L-658-05', '02-03566', '05-00331', '04-00488', '05-04211', '04-04432', '05-03231', 'MON-L-647-04', 'BER-L-14309-04', '06-00989', 'CUM-L-387-04', 'ATL-L-2787-03', 'L-3558-04', '05-00671', 'MON-L-2251-3', '03-02948', 'MID-L-004535-03', 'L-682-06', 'CAM-L-1839-03', 'MRS-L-2145-04; MRS-L-616-05; MRS-L-782-05', 'L 000906 90', 'BUR-L-002853-03', '03-04624', 'L 005316 95', '05-03021', '04-01261', 'BUR-L-838-05', 'BER-L-1048-05', '06-02775', '06-00065', 'OCE-L-001252-04', '03-02918', '05-01351', 'MID-L-734-04', 'SOM-L-1094-04', 'L 007120 93', 'SSX-L-723-04', '04-01283', '05-01364', '04-00237', 'L-6458-95', 'L-3150-03', 'BUR-L-003587-03', '05-04720', '

In [7]:
# Look up the text stored for case number 0; the trailing semicolon suppresses the cell's output display.
documents[0];