## Loughran and McDonald Parser
Requires Python3

Make sure to change filepaths to TARGET_FILES, MASTER_DICTIONARY_FILE, OUTPUT_FILE and md in Load_MasterDictionary.py

In [7]:

"""
Program to provide generic parsing for all files in user-specified directory.
The program assumes the input files have been scrubbed,
  i.e., HTML, ASCII-encoded binary, and any other embedded document structures that are not
  intended to be analyzed have been deleted from the file.

Dependencies:
    Python:  Load_MasterDictionary.py
    Data:    LoughranMcDonald_MasterDictionary_2014.csv

The program outputs:
   1.  File name
   2.  File size (in bytes)
   3.  Number of words (based on LM_MasterDictionary
   4.  Proportion of positive words (use with care - see LM, JAR 2016)
   5.  Proportion of negative words
   6.  Proportion of uncertainty words
   7.  Proportion of litigious words
   8.  Proportion of modal-weak words
   9.  Proportion of modal-moderate words
  10.  Proportion of modal-strong words
  11.  Proportion of constraining words (see Bodnaruk, Loughran and McDonald, JFQA 2015)
  12.  Number of alphanumeric characters (a-z, A-Z, 0-9)
  13.  Number of alphabetic characters (a-z, A-Z)
  14.  Number of digits (0-9)
  15.  Number of numbers (collections of digits)
  16.  Average number of syllables
  17.  Averageg word length
  18.  Vocabulary (see Loughran-McDonald, JF, 2015)

  ND-SRAF
  McDonald 2016/06
"""

import csv
import glob
import re
import string
import sys
import time
#sys.path.append('D:\GD\Python\TextualAnalysis\Modules')  # Modify to identify path for custom modules
import Load_MasterDictionary as LM

# User defined directory for files to be parsed
TARGET_FILES = r'/home/traugerjacob/researchNCSABrunner/LM/2004QTR3/textOnly/*'
# User defined file pointer to LM dictionary
MASTER_DICTIONARY_FILE = r'/home/traugerjacob/researchNCSABrunner/LM/LoughranMcDonald_MasterDictionary_2014.csv'
# User defined output file
OUTPUT_FILE = r'/home/traugerjacob/researchNCSABrunner/LM/LMParse8KData.csv'
# Setup output
OUTPUT_FIELDS = ['file name,', 'file size,', 'number of words,', '% positive,', '% negative,',
                 '% uncertainty,', '% litigious,', '% modal-weak,', '% modal moderate,',
                 '% modal strong,', '% constraining,', '# of alphanumeric,', '# of digits,',
                 '# of numbers,', 'avg # of syllables per word,', 'average word length,', 'vocabulary']

lm_dictionary = LM.load_masterdictionary(MASTER_DICTIONARY_FILE, True)


def main():

    f_out = open(OUTPUT_FILE, 'w')
    wr = csv.writer(f_out, lineterminator='\n')
    wr.writerow(OUTPUT_FIELDS)

    file_list = glob.glob(TARGET_FILES)
    for file in file_list:
        print(file)
        with open(file, 'r', encoding='UTF-8', errors='ignore') as f_in:
            doc = f_in.read()
        doc_len = len(doc)
        doc = re.sub('(May|MAY)', ' ', doc)  # drop all May month references
        doc = doc.upper()  # for this parse caps aren't informative so shift

        output_data = get_data(doc)
        output_data[0] = file
        output_data[1] = doc_len
        wr.writerow(output_data)


def get_data(doc):

    vdictionary = {}
    _odata = [0] * 17
    total_syllables = 0
    word_length = 0
    
    tokens = re.findall('\w+', doc)  # Note that \w+ splits hyphenated words
    for token in tokens:
        if not token.isdigit() and len(token) > 1 and token in lm_dictionary:
            _odata[2] += 1  # word count
            word_length += len(token)
            if token not in vdictionary:
                vdictionary[token] = 1
            if lm_dictionary[token].positive: _odata[3] += 1
            if lm_dictionary[token].negative: _odata[4] += 1
            if lm_dictionary[token].uncertainty: _odata[5] += 1
            if lm_dictionary[token].litigious: _odata[6] += 1
            if lm_dictionary[token].weak_modal: _odata[7] += 1
            if lm_dictionary[token].moderate_modal: _odata[8] += 1
            if lm_dictionary[token].strong_modal: _odata[9] += 1
            if lm_dictionary[token].constraining: _odata[10] += 1
            total_syllables += lm_dictionary[token].syllables

    _odata[11] = len(re.findall('[A-Z]', doc))
    _odata[12] = len(re.findall('[0-9]', doc))
    # drop punctuation within numbers for number count
    doc = re.sub('(?!=[0-9])(\.|,)(?=[0-9])', '', doc)
    doc = doc.translate(str.maketrans(string.punctuation, " " * len(string.punctuation)))
    _odata[13] = len(re.findall(u'\b[-+\(]?[$€£]?[-+(]?\d+\)?\b',doc))
    _odata[14] = total_syllables / _odata[2]
    _odata[15] = word_length / _odata[2]
    _odata[16] = len(vdictionary)
    
    # Convert counts to %
    for i in range(3, 10 + 1):
        _odata[i] = (_odata[i] / _odata[2]) * 100
    # Vocabulary
        
    return _odata


if __name__ == '__main__':
    print('\n' + time.strftime('%c') + '\nGeneric_Parser.py\n')
    main()
    print('\n' + time.strftime('%c') + '\nNormal termination.')

 ...Loading Master Dictionary 85000
Master Dictionary loaded from file: 
  /home/traugerjacob/researchNCSABrunner/LM/LoughranMcDonald_MasterDictionary_2014.csv
  85,131 words loaded in master_dictionary.


Fri Jul  7 10:48:51 2017
Generic_Parser.py

/home/traugerjacob/researchNCSABrunner/LM/2004QTR3/textOnly/0000950152-04-007008.txt
/home/traugerjacob/researchNCSABrunner/LM/2004QTR3/textOnly/0001157523-04-008578.txt
/home/traugerjacob/researchNCSABrunner/LM/2004QTR3/textOnly/0001193125-04-145620.txt
/home/traugerjacob/researchNCSABrunner/LM/2004QTR3/textOnly/0000946275-04-000853.txt
/home/traugerjacob/researchNCSABrunner/LM/2004QTR3/textOnly/0000950134-04-013126.txt
/home/traugerjacob/researchNCSABrunner/LM/2004QTR3/textOnly/0001157523-04-008372.txt
/home/traugerjacob/researchNCSABrunner/LM/2004QTR3/textOnly/0000806085-04-000165.txt
/home/traugerjacob/researchNCSABrunner/LM/2004QTR3/textOnly/0001096906-04-000378.txt
/home/traugerjacob/researchNCSABrunner/LM/2004QTR3/textOnly/0000910680

In [8]:
#Prints the first 10 lines of the output of the parser
with open(OUTPUT_FILE,'r') as f:
    lines = f.readlines()
    for i in range(10):
        print(lines[i])

"file name,","file size,","number of words,","% positive,","% negative,","% uncertainty,","% litigious,","% modal-weak,","% modal moderate,","% modal strong,","% constraining,","# of alphanumeric,","# of digits,","# of numbers,","avg # of syllables per word,","average word length,",vocabulary

/home/traugerjacob/researchNCSABrunner/LM/2004QTR3/textOnly/0000950152-04-007008.txt,54978,1981,1.0600706713780919,1.2115093387178193,0.7067137809187279,0.6057546693589096,0.1514386673397274,0.20191822311963653,0.05047955577990913,0.4543160020191822,25734,4078,0,1.862190812720848,5.56082786471479,624

/home/traugerjacob/researchNCSABrunner/LM/2004QTR3/textOnly/0001157523-04-008578.txt,22003,1507,0.7299270072992701,0.53085600530856,0.8626410086264101,0.5972130059721301,0.13271400132714,0.06635700066357,0.26542800265428,0.33178500331785005,10536,1737,0,1.816191108161911,5.518911745189118,493

/home/traugerjacob/researchNCSABrunner/LM/2004QTR3/textOnly/0001193125-04-145620.txt,37488,1696,0.530660377