In [1]:
# Import macula lowfat xml
import lxml.etree as ET
import glob

noun_lemmas_and_the_prepositions_that_govern_them = {}
for file_name in glob.glob("/home/jcuenod/Programming/symphony-stuff/symphony-backend-atlas-internal/data/Clear-Bible/macula-greek/SBLGNT/lowfat/*.xml"):
    # check that the filename begins with a digit
    if not file_name.split("/")[-1][0].isdigit():
        continue
    print(file_name)

    tree = ET.parse(file_name)
    root = tree.getroot()

    # we want prepositional phrases: class=pp
    pps = root.findall(".//wg[@class='pp']")

    # show all the times where a pp has more than 2 immediate children
    for pp in pps:
        # first child is the preposition
        preposition = pp[0]
        if preposition.attrib['class'] != 'prep':
            # there are a few instances in which the "preposition" is not ὡς or καί (or something like that)
            continue
        preposition_lemma = preposition.attrib['lemma']
        
        # if the second child is a wg, get its immediate children that are class=noun
        # otherwise get the second child (which is the noun)
        nouns = []
        if pp[1].tag == 'wg':
            nouns = pp[1].findall("./w[@class='noun']")
        else:
            # there are plenty of pronouns/adjectives that feature in prepositional phrases, we ignore them
            if pp[1].attrib['class'] == 'noun':
                nouns = [pp[1]]
        
        for noun in nouns:
            lemma = noun.attrib['lemma']
            if lemma not in noun_lemmas_and_the_prepositions_that_govern_them:
                noun_lemmas_and_the_prepositions_that_govern_them[lemma] = {}
            if preposition_lemma not in noun_lemmas_and_the_prepositions_that_govern_them[lemma]:
                noun_lemmas_and_the_prepositions_that_govern_them[lemma][preposition_lemma] = {
                    'count': 0,
                    'verses': []
                }
            noun_lemmas_and_the_prepositions_that_govern_them[lemma][preposition_lemma]['count'] += 1
            noun_lemmas_and_the_prepositions_that_govern_them[lemma][preposition_lemma]['verses'].append(preposition.attrib['ref'])

/home/jcuenod/Programming/symphony-stuff/symphony-backend-atlas-internal/data/Clear-Bible/macula-greek/SBLGNT/lowfat/07-1corinthians.xml
/home/jcuenod/Programming/symphony-stuff/symphony-backend-atlas-internal/data/Clear-Bible/macula-greek/SBLGNT/lowfat/15-1timothy.xml
/home/jcuenod/Programming/symphony-stuff/symphony-backend-atlas-internal/data/Clear-Bible/macula-greek/SBLGNT/lowfat/11-philippians.xml
/home/jcuenod/Programming/symphony-stuff/symphony-backend-atlas-internal/data/Clear-Bible/macula-greek/SBLGNT/lowfat/06-romans.xml
/home/jcuenod/Programming/symphony-stuff/symphony-backend-atlas-internal/data/Clear-Bible/macula-greek/SBLGNT/lowfat/05-acts.xml
/home/jcuenod/Programming/symphony-stuff/symphony-backend-atlas-internal/data/Clear-Bible/macula-greek/SBLGNT/lowfat/25-3john.xml
/home/jcuenod/Programming/symphony-stuff/symphony-backend-atlas-internal/data/Clear-Bible/macula-greek/SBLGNT/lowfat/12-colossians.xml
/home/jcuenod/Programming/symphony-stuff/symphony-backend-atlas-inter

In [2]:
print("Number of nouns: " + str(len(noun_lemmas_and_the_prepositions_that_govern_them)))

# show an example
sample_noun = 'θεός'
example = noun_lemmas_and_the_prepositions_that_govern_them[sample_noun]
print(sample_noun)
for preposition in noun_lemmas_and_the_prepositions_that_govern_them[sample_noun]:
    print("\t" + preposition + ": " + str(noun_lemmas_and_the_prepositions_that_govern_them[sample_noun][preposition]['count']))

Number of nouns: 941
θεός
	διά: 10
	ἐνώπιον: 19
	ἀπό: 18
	ἐν: 19
	ἐκ: 33
	ὑπό: 15
	παρά: 24
	εἰς: 12
	κατά: 10
	ἐπί: 7
	πρός: 20
	ὑπέρ: 1
	ἔναντι: 2
	ἔμπροσθεν: 1
	ἐναντίον: 1
	κατέναντι: 2
	χωρίς: 1


In [3]:
import pandas as pd

# we want the structure to be:
# noun, prep1_count, prep1_verses, prep2_count, prep2_verses, ...

# first we need to get all the prepositions
prepositions = set()
for noun in noun_lemmas_and_the_prepositions_that_govern_them:
    for preposition in noun_lemmas_and_the_prepositions_that_govern_them[noun]:
        prepositions.add(preposition)

print("Number of prepositions: " + str(len(prepositions)))
print(prepositions)

Number of prepositions: 49
{'παρεκτός', 'ἐνώπιον', 'ἔξω', 'διά', 'ἔξωθεν', 'ἕνεκεν', 'κύκλῳ', 'παρά', 'ὄπισθεν', 'μέχρι(ς)', 'ἀντιπέρα', 'ὑπέρ', 'ἄντικρυς', 'χωρίς', 'ἀπό', 'ἀντί', 'ἄτερ', 'πέραν', 'ὀψέ', 'κυκλόθεν', 'ἄχρι', 'ἀπέναντι', 'ἀπό', 'ἐκ', 'ἕως', 'πρός', 'περί', 'ἐπάνω', 'ἐν', 'ὀπίσω', 'ὑποκάτω', 'ἀνά', 'ἐγγύς', 'ἐκτός', 'ἔμπροσθεν', 'μετά', 'μεταξύ', 'ἔναντι', 'ἄνευ', 'ἐναντίον', 'εἰς', 'ὑπό', 'ἐπί', 'πρό', 'σύν', 'ἐπέκεινα', 'κατέναντι', 'πλήν', 'κατά'}


In [4]:
# now we can create the dataframe
prepositions_of_interest = [
    'εἰς',
    'ἐκ',
    'ἐν',
    'διά',
]
# p_cols = [(preposition + "_count", preposition + "_verses") for preposition in prepositions_of_interest]
# df = pd.DataFrame(columns=['noun', 'occurrences'] + [item for sublist in p_cols for item in sublist])
df = pd.DataFrame(columns=['noun', 'occurrences'] + [preposition + "_count" for preposition in prepositions_of_interest])

for noun in noun_lemmas_and_the_prepositions_that_govern_them:
    row = {'noun': noun}
    occurrences = 0
    for preposition in noun_lemmas_and_the_prepositions_that_govern_them[noun]:
        if preposition not in prepositions_of_interest:
            continue
        row[preposition + "_count"] = noun_lemmas_and_the_prepositions_that_govern_them[noun][preposition]['count']
        # row[preposition + "_verses"] = noun_lemmas_and_the_prepositions_that_govern_them[noun][preposition]['verses']
        occurrences += noun_lemmas_and_the_prepositions_that_govern_them[noun][preposition]['count']
    row['occurrences'] = occurrences
    df = df.append(row, ignore_index=True)

# remove rows with no occurrences
df = df[df['occurrences'] > 0]

# sort by occurrences
df = df.sort_values(by=['occurrences'], ascending=False)

df.head()

  df = df.append(row, ignore_index=True)
  df = df.append(row, ignore_index=True)
  df = df.append(row, ignore_index=True)
  df = df.append(row, ignore_index=True)
  df = df.append(row, ignore_index=True)
  df = df.append(row, ignore_index=True)
  df = df.append(row, ignore_index=True)
  df = df.append(row, ignore_index=True)
  df = df.append(row, ignore_index=True)
  df = df.append(row, ignore_index=True)
  df = df.append(row, ignore_index=True)
  df = df.append(row, ignore_index=True)
  df = df.append(row, ignore_index=True)
  df = df.append(row, ignore_index=True)
  df = df.append(row, ignore_index=True)
  df = df.append(row, ignore_index=True)
  df = df.append(row, ignore_index=True)
  df = df.append(row, ignore_index=True)
  df = df.append(row, ignore_index=True)
  df = df.append(row, ignore_index=True)
  df = df.append(row, ignore_index=True)
  df = df.append(row, ignore_index=True)
  df = df.append(row, ignore_index=True)
  df = df.append(row, ignore_index=True)
  df = df.append

Unnamed: 0,noun,occurrences,εἰς_count,ἐκ_count,ἐν_count,διά_count
50,οὐρανός,149,24,56.0,69,
3,Χριστός,115,16,1.0,84,14.0
1,θεός,74,12,33.0,19,10.0
4,Ἰησοῦς,69,6,1.0,50,12.0
13,κύριος,60,1,,57,2.0


In [5]:
# write to csv
df.to_csv('noun_preposition.csv', index=False)