### Who the object (focus) of our hymns, praise, and worship songs: self or Other (God)?

Scrapable sites:

www.hymnlyrics.org


In [1]:
files = ([
    '../corpora/hymns/A_Call_To_Prayer.txt',
    '../corpora/hymns/All_Hail_the_Power_of_Jesus_Name.txt',
    '../corpora/hymns/All_That_I_Am.txt'
])

In [2]:
import nltk
from collections import Counter

In [3]:
SELF = 'self'
OTHER = 'other'
UNKNOWN = 'unknown'
BOTH = 'both'

In [4]:
SELF_WORDS = set(['I', 'me', 'my', 'mine', 'we', 'us', 'you', 'your', 'ye', 
                 'saints', 'congregation', 'church', 'martyr', 'martyrs',
                 'sinner', 'sinners', ])

# sometimes "you" lowercase refers to self, the congregation singing to each other. 
# Other times you/You refers to God. It partially depends if I can find a source of lyrics
# that maintains the upper/lower case distinction.

print(len(SELF_WORDS))

16


In [5]:
OTHER_WORDS = set(['You', 'Your', 'Thee', 'Thou', 'Thy', 'Him', 'His',
                  'Lord', 'God', 'King', 'Jesus', 'Savior',
                  ])

print(len(OTHER_WORDS))

12


In [6]:
def objectize(words):

    mwlen = len(SELF_WORDS.intersection(words))
    fwlen = len(OTHER_WORDS.intersection(words))

    if mwlen > 0 and fwlen == 0:
        return SELF
    elif mwlen == 0 and fwlen > 0:
        return OTHER
    elif mwlen > 0 and fwlen > 0:
        return BOTH
    else:
        return UNKNOWN

In [7]:
def count_object_words(sentences):

    sents = Counter()
    words = Counter()

    for sentence in sentences:
        object_words = objectize(sentence)
        sents[object_words] += 1
        words[object_words] += len(sentence)

    return sents, words

In [13]:
def parse_object_words(text):

#     sentences = [
#         [word.lower() for word in nltk.word_tokenize(sentence)]
#         for sentence in nltk.sent_tokenize(text)
#     ]
    
    
    # test difference if I maintain casing (upper/lower)
    sentences = [
        [word for word in nltk.word_tokenize(sentence)]
        for sentence in nltk.sent_tokenize(text)
    ]
    
    # SHOULD ALSO CHANGE IT TO READ LINES INSTEAD OF SENTENCES
    # TK

    sents, words = count_object_words(sentences)
    total = sum(words.values())

    for object_words, count in words.items():
        pcent = (count / total) * 100
        nsents = sents[object_words]

        print(
            "{:0.3f}% {} ({} sentences)".format(pcent, object_words, nsents)
        )

In [14]:
for file in files:
    print(file)
#     with open(file, 'r', encoding='utf8') as f: # UnicodeDecodeError: 'utf-8' codec can't decode 
#                                                 # byte 0x93 in position 103: invalid start byte
    with open(file, 'r', encoding='latin1') as f:
        parse_object_words(f.read())
        print()

../corpora/hymns/A_Call_To_Prayer.txt
58.915% other (3 sentences)
41.085% both (2 sentences)

../corpora/hymns/All_Hail_the_Power_of_Jesus_Name.txt
44.444% other (9 sentences)
55.556% both (7 sentences)

../corpora/hymns/All_That_I_Am.txt
100.000% both (1 sentences)



#### word.lower() output:
```
../corpora/hymns/A_Call_To_Prayer.txt
80.620% unknown (4 sentences)
19.380% self (1 sentences)

../corpora/hymns/All_Hail_the_Power_of_Jesus_Name.txt
44.444% unknown (9 sentences)
55.556% self (7 sentences)

../corpora/hymns/All_That_I_Am.txt
100.000% self (1 sentences)
```

#### word UPPER/LOWER output:
```
../corpora/hymns/A_Call_To_Prayer.txt
58.915% other (3 sentences)               # FROM UNKNOWN TO OTHER
41.085% both (2 sentences)                # FROM SELF TO BOTH

../corpora/hymns/All_Hail_the_Power_of_Jesus_Name.txt
44.444% other (9 sentences)               # FROM UNKNOWN TO OTHER
55.556% both (7 sentences)                # FROM SELF TO BOTH

../corpora/hymns/All_That_I_Am.txt
100.000% both (1 sentences)               # FROM SELF TO BOTH
```

WOULD BE MORE MORE INTERESTING TO SEE BREAKOUT BY LINE SINCE 40% + SENTENCES END UP BOTH.