In [1]:
# All the packages needed are imported here. Lines that start with an exclamation mark (!) install packages that are
# not commonly pre-installed. If you don't have such a package, just remove the comment symbol (#) in front of that line.

#! pip install python-docx

import csv
import os

from docx import Document

# Project root folder; the case documents live in its '154 cases' sub-folder.
path_to_folder = '/Users/thymol/Desktop/Comp/pred-case-outcomes/'
# Make the case folder the working directory so later cells can open files by bare name.
os.chdir(path_to_folder + '154 cases')

# Expected number of .docx case files. Change this if needed.
NUM_OF_FILES = 151

In [2]:
# Gather the .docx files of the working directory in sorted order.
# Names beginning with '~' are left out (presumably Word's temporary lock files -- confirm).
file_list = [
    file_name
    for file_name in sorted(os.listdir())
    if file_name.endswith('.docx') and not file_name.startswith('~')
]

# Guard against a missing or extra file before any parsing starts.
assert len(file_list) == NUM_OF_FILES

In [3]:
def get_gender(lower_text, female_vocab, male_vocab):
    """Infer a gender code from the first gendered word that appears in a text.

    Parameters
    ----------
    lower_text : str
        Text to scan, already converted to lower case.
    female_vocab : set of str
        Lower-case words that indicate a female subject.
    male_vocab : set of str
        Lower-case words that indicate a male subject.

    Returns
    -------
    int
        0 if the first matching word is in ``female_vocab``,
        1 if it is in ``male_vocab``,
        -1 if no word of the text matches either vocabulary.
    """
    # Punctuation removed from both ends of every token, so that "she,", "(her)"
    # or "woman." are recognised as the bare vocabulary word.
    edge_punctuation = '.,;:!?"\'()[]{}\u201c\u201d\u2018\u2019'

    # str.split() without an argument splits on any run of Unicode whitespace:
    # spaces, tabs, line breaks and also the non-breaking space (U+00A0).
    for token in lower_text.split():
        word = token.strip(edge_punctuation)
        # Return immediately on the first word that matches a vocabulary.
        if word in female_vocab:
            return 0
        if word in male_vocab:
            return 1

    # No word matched either vocabulary.
    return -1
        

In [4]:
import re
def get_money(lower_text):
    """Return the largest dollar amount mentioned in a text.

    An amount is a '$' directly followed by digits, with optional thousands
    separators (',') and an optional decimal part, e.g. '$425,000' or '$508.40'.
    If the amount is followed by whitespace and the word 'million', it is
    multiplied by one million, e.g. '$1.5 million' -> 1500000.0.

    Parameters
    ----------
    lower_text : str
        Text to scan, already converted to lower case.

    Returns
    -------
    float
        The maximum amount found, or -1.0 if the text contains no amount.
    """
    # The decimal part must be '.' followed by digits, so sentence punctuation
    # right after an amount ("$508.40." or "$1,000,000,") is not captured and
    # cannot break the float conversion. '\s' also matches a non-breaking space
    # between the number and 'million'.
    amount_pattern = r'\$(\d[\d,]*(?:\.\d+)?)(\s+million)?'

    amounts = []
    for number_str, million_suffix in re.findall(amount_pattern, lower_text):
        value = float(number_str.replace(',', ''))
        if million_suffix:
            value *= 1E6
        amounts.append(value)

    # -1.0 is the sentinel for "no specific amount found".
    return max(amounts) if amounts else -1.0

In [5]:
# Manual spot checks of get_money: one printed result per sample text.
for sample_text in (
    '$1,000,000 (abc million)',
    '$1 million',
    '$1.5 million',
    '$1,000,000',
    '$1 million(abc)',
    '$5,820,304 ($3 million for suffering$, $425,000 for 83 and $3 nilliom',
    '$508.40',
    'settlement',
):
    print(get_money(sample_text))

1000000.0
1000000.0
1500000.0
1000000.0
1000000.0
5820304.0
508.4
-1.0


In [6]:
# # get_money searches for the specific settlement value in the text. It takes in lower_text and return money. 
# # lower_text is a string in lower case. 
# # money is a float number. 
# def get_money_2(lower_text):
#     # If a specific number for settlement is not found, return -1.0
#     if '$' not in lower_text:
#         return -1.0

#     # Clean up and split by space. 
#     lower_text= lower_text.replace(u'\xa0', u' ')
#     lower_text_list= lower_text.split(' ')
#     print(lower_text_list)

#     # Iterate over the text list: 
#     for i in range(len(lower_text_list)):
#         word= lower_text_list[i]
#         if word.startswith('$'):
#             money= float(word.replace('$','').replace(',','')) # Convert string to number
#             try:
#                 next_word= lower_text_list[i+1]
#                 if next_word == 'million':
#                     money= money * 1E6
#             except:
#                 pass
#             return money

In [7]:
# Read every case file, store its full text in the dictionary `documents`
# (keyed by a running file index), and extract per-case fields into parallel lists.

# Initializing the dictionary
documents = {}
# Index of the document currently being processed
i = 0

# Vocabulary for gender searching
female_vocab = {'she', 'her', 'woman', 'women', 'girl', 'girls', 'lady', 'ladies'}
male_vocab = {'he', 'his', 'him', 'man', 'men', 'boy', 'boys', 'gentleman', 'gentlemen'}

# Parallel lists of case ID, gender code, settlement amount and result text.
# Lists are used instead of a dictionary keyed by case ID because some cases
# share the same ID.
case_id_list = []
gender_list = []
money_list = []
result_list = []

# Loop over all files in the folder
for filename in file_list:
    temp = Document(filename)

    # Per-file defaults. They must be reset for every file: otherwise a document
    # without a Headline/Result paragraph (or with fewer than four paragraphs)
    # would silently reuse the values extracted from the previous file.
    case_id = ''    # no case ID paragraph found
    gender = -1     # same "not found" code that get_gender returns
    money = -1.0    # same "not found" value that get_money returns
    result = ''     # no Result paragraph found
    paragraph_texts = []

    for j, p in enumerate(temp.paragraphs):
        lower_text = p.text.lower()
        paragraph_texts.append(p.text)

        # Get case ID (which is in paragraph 3, counting from 0).
        if j == 3:
            case_id = p.text

        # Get gender from Headline
        if lower_text.startswith('headline'):
            gender = get_gender(lower_text, female_vocab, male_vocab)

        # If gender is not found in Headline, check Background
        if lower_text.startswith('background') and gender == -1:
            gender = get_gender(lower_text, female_vocab, male_vocab)

        # Get money from Result
        if lower_text.startswith('result'):
            money = get_money(lower_text)
            # Commas are replaced by spaces so the text fits in one CSV column.
            result = lower_text.replace(',', ' ')

    # Update the lists of case ID, gender, money and result.
    case_id_list.append(case_id)
    gender_list.append(gender)
    money_list.append(money)
    result_list.append(result)

    # Full text of the document: every paragraph preceded by a single space.
    doc = ''.join(' ' + text for text in paragraph_texts)
    documents[i] = doc
    i += 1

In [8]:
# Print out the lists of case ID, gender, and money. 
# print(case_id_list)
# print(gender_list)
# print(money_list)

# Every extracted list must hold exactly one entry per case file.
assert all(
    len(extracted) == NUM_OF_FILES
    for extracted in (case_id_list, gender_list, money_list)
)

In [9]:
# Look up the text of the first document; the trailing semicolon suppresses the cell output.
documents[0];

In [10]:
# Step up to the parent of the current working directory, then display
# the new working directory as the cell output.
os.chdir(os.pardir)
os.getcwd()

'/Users/thymol/Desktop/Comp/pred-case-outcomes'

In [11]:
# Write one row per case to gender_and_money.csv (no header row):
# running index, case ID, gender code, settlement amount, result text.
#
# csv.writer quotes any field that contains a comma, a quote or a line break,
# so such a field can no longer shift or split the columns of its row.
# newline='' is what the csv module requires for files it writes to, and the
# explicit encoding keeps non-ASCII text writable regardless of the platform default.
with open('gender_and_money.csv', 'w', newline='', encoding='utf-8') as f_out:
    # lineterminator='\n' keeps the same line endings as a plain '\n' write.
    writer = csv.writer(f_out, lineterminator='\n')
    case_rows = zip(case_id_list, gender_list, money_list, result_list)
    for file_counter, (case_id, gender, money, result) in enumerate(case_rows):
        writer.writerow([file_counter, case_id, gender, money, result])
