In [23]:
# Import macula lowfat xml
import lxml.etree as ET

xml_root = "/home/jcuenod/Programming/symphony-stuff/symphony-backend-atlas-internal/data/Clear-Bible/macula-greek/SBLGNT/lowfat"
file_name = "10-ephesians.xml"
# Parse the lowfat XML file for this book and keep a handle on its root element.
tree = ET.parse(f"{xml_root}/{file_name}")
root = tree.getroot()

# Collect every <w> element anywhere beneath the root.
words = root.findall(".//w")
print(f"Number of words: {len(words)}")

# Preview: the text of the first 5 words.
for preview_word in words[:5]:
    print(preview_word.text)

Number of words: 2416
Παῦλος
ἀπόστολος
Χριστοῦ
Ἰησοῦ
διὰ


In [24]:
# Select <w> elements whose role is 'v' and whose mood is one of
# indicative / imperative / subjunctive / infinitive.
main_clause_candidates = root.xpath(".//w[@role='v' and (@mood='indicative' or @mood='imperative' or @mood='subjunctive' or @mood='infinitive')]")
print(f"Number of main clauses: {len(main_clause_candidates)}")
# Preview: the text of the first 5 matches.
for candidate_verb in main_clause_candidates[:5]:
    print(candidate_verb.text)

Number of main clauses: 170
ἐξελέξατο
ἐχαρίτωσεν
ἔχομεν
ἐπερίσσευσεν
προέθετο


In [25]:
# We're looking for siblings of each main_clause_candidate that contain a descendant <w>
# with a role of 'v' or 'vc' and a mood of 'participle'.
# The sibling must precede the main clause and must itself have role 'adv'.
# Toggle: when True, the search loop prints the text and attributes of the first 5 participles it finds.
SHOULD_PRINT_PARTICIPLE_CANDIDATES = True

def node_does_not_have_nominalized_ancestor(node, ancestor):
    """Check that nothing from `node` up to `ancestor` is a nominalized clause.

    Walks from `node` (inclusive) through its parents up to `ancestor`
    (inclusive) and looks for an element whose ``clauseType`` attribute is
    ``"nominalized"``.

    Parameters:
        node: element to start from; needs ``.attrib`` and ``.getparent()``.
            ``None`` is treated as "nothing to check".
        ancestor: element at which the upward walk stops. Elements above it
            are not inspected. If it is ``None`` (or is never reached), the
            walk continues to the root of the tree.

    Returns:
        ``True`` if no inspected element is nominalized, ``False`` otherwise.
    """
    current = node
    while current is not None:
        if current.attrib.get("clauseType") == "nominalized":
            return False
        if current is ancestor:
            # Reached the upper bound of the search: anything above `ancestor`
            # is outside the subtree the caller asked about.
            return True
        current = current.getparent()
    return True

def first_node_without_nominalized_ancestor(nodes, ancestor):
    """Return the first of `nodes` that passes the nominalized-ancestor check.

    Parameters:
        nodes: iterable of elements to test, in priority order; ``None`` is
            treated like an empty collection.
        ancestor: passed through to ``node_does_not_have_nominalized_ancestor``
            as the second argument for every node.

    Returns:
        The first node for which ``node_does_not_have_nominalized_ancestor``
        returns a truthy value, or ``None`` if there is no such node.
    """
    if nodes is None:
        return None
    # Use the function's own arguments (not module-level loop variables) so the
    # result depends only on what the caller passed in.
    return next(
        (node for node in nodes if node_does_not_have_nominalized_ancestor(node, ancestor)),
        None,
    )

print_counter = 0
adjunct_participle_candidates = []
# Tried in this order for each sibling: role 'v' first, then role 'vc'.
participle_xpaths = (
    ".//w[@role='v'][@mood='participle']",
    ".//w[@role='vc'][@mood='participle']",
)
for clause in main_clause_candidates:
    # walk the siblings that precede the main clause
    for sibling in clause.itersiblings(preceding=True):
        # only siblings whose role is adv qualify (a sibling may have no "role" at all)
        if sibling.attrib.get("role") != "adv":
            continue
        for participle_xpath in participle_xpaths:
            adjunct_participles = sibling.findall(participle_xpath)
            adjunct_participle = first_node_without_nominalized_ancestor(adjunct_participles, sibling)
            if adjunct_participle is None:
                continue
            # record the whole sibling (not just the participle) once, then stop searching it
            adjunct_participle_candidates.append(sibling)
            if SHOULD_PRINT_PARTICIPLE_CANDIDATES and print_counter < 5:
                print(adjunct_participle.text, adjunct_participle.attrib)
                print_counter += 1
            break

print(f"Number of adjunct participle candidates: {len(adjunct_participle_candidates)}")

ἀκούσας {'role': 'v', 'ref': 'EPH 1:15!4', 'after': ' ', 'class': 'verb', '{http://www.w3.org/XML/1998/namespace}id': 'n49001015004', 'lemma': 'ἀκούω', 'normalized': 'ἀκούσας', 'strong': '191', 'number': 'singular', 'gender': 'masculine', 'case': 'nominative', 'tense': 'aorist', 'voice': 'active', 'mood': 'participle', 'english': 'heard', 'mandarin': '听到', 'gloss': 'having heard of', 'domain': '033015', 'ln': '33.212', 'morph': 'V-AAP-NSM', 'unicode': 'ἀκούσας', 'frame': 'A0:n49001001001 A1:n49001015008;n49001015015', 'subjref': 'n49001001001'}
ὄντας {'role': 'vc', 'ref': 'EPH 2:1!3', 'after': ' ', 'class': 'verb', '{http://www.w3.org/XML/1998/namespace}id': 'n49002001003', 'lemma': 'εἰμί', 'normalized': 'ὄντας', 'strong': '5607', 'number': 'plural', 'gender': 'masculine', 'case': 'accusative', 'tense': 'present', 'voice': 'active', 'mood': 'participle', 'english': 'were', 'mandarin': '是', 'gloss': 'being', 'domain': '013001', 'ln': '13.1', 'morph': 'V-PAP-APM', 'unicode': 'ὄντας'}
ὄντ

In [26]:
# now we're going to get the text corresponding to the adjunct

import re
def get_text(node):
    """Return all text inside `node`, with each run of whitespace collapsed to one space."""
    raw_text = ''.join(node.itertext())
    return re.sub(r'\s+', ' ', raw_text)

def get_id_list(node):
    """Return the xml:id of every <w> descendant of `node`, in findall order.

    A <w> without an xml:id attribute contributes None.
    """
    xml_id_key = "{http://www.w3.org/XML/1998/namespace}id"
    id_list = []
    for word_node in node.findall(".//w"):
        id_list.append(word_node.attrib.get(xml_id_key))
    return id_list

print_counter = 0
id_list_by_ref = []
for candidate in adjunct_participle_candidates:
    # It's useful to print with a ref. This is a rough-and-ready way to get one (not perfect):
    # take the first <w> inside the candidate that carries a ref and drop everything from "!" on.
    first_word_with_ref = candidate.find(".//w[@ref]")
    actual_ref = first_word_with_ref.attrib.get("ref").partition("!")[0]

    id_list_by_ref.append((actual_ref, get_id_list(candidate)))

    # only the first 10 candidates are printed; all of them are collected
    if print_counter >= 10:
        continue
    # pad the ref to 10 columns so the texts line up
    print(actual_ref.ljust(10), get_text(candidate))
    print_counter += 1


EPH 1:15    ἀκούσας τὴν καθ’ ὑμᾶς πίστιν ἐν τῷ κυρίῳ Ἰησοῦ καὶ τὴν ἀγάπην τὴν εἰς πάντας τοὺς ἁγίους 
EPH 2:1     ὑμᾶς ὄντας νεκροὺς τοῖς παραπτώμασιν καὶ ταῖς ἁμαρτίαις ὑμῶν 
EPH 2:5     καὶ ὄντας ἡμᾶς νεκροὺς τοῖς παραπτώμασιν 
EPH 2:4     πλούσιος ὢν ἐν ἐλέει 
EPH 2:17    ἐλθὼν 
EPH 4:8     Ἀναβὰς εἰς ὕψος 
EPH 4:15    ἀληθεύοντες ἐν ἀγάπῃ 
EPH 4:25    ἀποθέμενοι τὸ ψεῦδος 
EPH 5:13    ἐλεγχόμενα ὑπὸ τοῦ φωτὸς 
EPH 6:13    ἅπαντα κατεργασάμενοι 


In [27]:
# Differences are mainly the inclusion of the subject in Logos data

# I list more significant differences below.
# False Negatives:

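# False Positives:
# TODO(review): these do not look like Ephesians references (Ephesians has only 6 chapters,
# and chapter 5 has no verse 46); presumably carried over from another book's notebook — verify.
#  - 5:46
#  - 7:11
#  - 8:34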

In [28]:
# export to csv
import csv
# Each (ref, id list) pair becomes one CSV row.
with open('backgrounded-phrases.ephesians.csv', 'w', newline='') as out_file:
    csv.writer(out_file).writerows(id_list_by_ref)

In [29]:
# Let's use the macula TSV to print out verses and highlight backgrounded phrases

# Get the macula TSV
import csv
path_to_macula_tsv = "/home/jcuenod/Programming/symphony-stuff/symphony-backend-atlas-internal/data/Clear-Bible/macula-greek/SBLGNT/tsv/macula-greek-SBLGNT.tsv"
# The file holds Greek and Chinese text, so decode it explicitly instead of relying on the
# platform's locale-dependent default encoding (assumes the file is UTF-8 — TODO confirm).
with open(path_to_macula_tsv, newline='', encoding='utf-8') as csvfile:
    reader = csv.reader(csvfile, delimiter='\t')
    macula_tsv_rows = list(reader)
# first row is the header
header = macula_tsv_rows[0]
print(header)
# convert the remaining rows to dictionaries keyed by the header's column names
# (kept under a separate name from the raw rows so `macula_tsv` is only ever a list of dicts)
macula_tsv = [dict(zip(header, row)) for row in macula_tsv_rows[1:]]

# show the first row
print(macula_tsv[0])

['xml:id', 'ref', 'role', 'class', 'type', 'english', 'mandarin', 'gloss', 'text', 'after', 'lemma', 'normalized', 'strong', 'morph', 'person', 'number', 'gender', 'case', 'tense', 'voice', 'mood', 'degree', 'domain', 'ln', 'frame', 'subjref', 'referent']
{'xml:id': 'n40001001001', 'ref': 'MAT 1:1!1', 'role': '', 'class': 'noun', 'type': 'common', 'english': 'book', 'mandarin': '谱', 'gloss': '[The] book', 'text': 'Βίβλος', 'after': ' ', 'lemma': 'βίβλος', 'normalized': 'βίβλος', 'strong': '976', 'morph': 'N-NSF', 'person': '', 'number': 'singular', 'gender': 'feminine', 'case': 'nominative', 'tense': '', 'voice': '', 'mood': '', 'degree': '', 'domain': '033005', 'ln': '33.38', 'frame': '', 'subjref': '', 'referent': ''}


In [30]:
# Group the rows of macula_tsv by verse so that a whole verse can be fetched at once.
# The key is the row's "ref" with everything from "!" onwards removed.
macula_tsv_by_ref = {}
for row in macula_tsv:
    verse_ref = row.get("ref").split("!")[0]
    macula_tsv_by_ref.setdefault(verse_ref, []).append(row)

In [31]:
# Sanity check: show the first (ref, list of xml:ids) pair in id_list_by_ref.
print(id_list_by_ref[0])

('EPH 1:15', ['n49001015004', 'n49001015005', 'n49001015006', 'n49001015007', 'n49001015008', 'n49001015009', 'n49001015010', 'n49001015011', 'n49001015012', 'n49001015013', 'n49001015014', 'n49001015015', 'n49001015016', 'n49001015017', 'n49001015018', 'n49001015019', 'n49001015020'])


In [32]:
from IPython.display import display, HTML

# Build one table row per (ref, id list) pair: the whole verse is shown, and the words
# whose xml:id is in the id list get a yellow background.
html_parts = ["<table>"]
for ref, id_list in id_list_by_ref:
    html_parts.append(f"<tr><td width='100px'><b>{ref}</b></td><td style='text-align: left !important;'>")
    # all rows of the macula TSV that belong to this ref
    for word in macula_tsv_by_ref[ref]:
        text = word["text"]
        if word["xml:id"] in id_list:
            text = "<span style='background-color: #FFFF00'>" + text + "</span>"
        html_parts.append(text)
        html_parts.append(word["after"])
    html_parts.append("</td></tr>")
html_parts.append("</table>")
html_string = "".join(html_parts)

display(HTML(html_string))


0,1
EPH 1:15,"Διὰ τοῦτο κἀγώ,ἀκούσας τὴν καθ’ ὑμᾶς πίστιν ἐν τῷ κυρίῳ Ἰησοῦ καὶ τὴν ἀγάπην τὴν εἰς πάντας τοὺς ἁγίους,"
EPH 2:1,"Καὶ ὑμᾶς ὄντας νεκροὺς τοῖς παραπτώμασιν καὶ ταῖς ἁμαρτίαις ὑμῶν,"
EPH 2:5,καὶ ὄντας ἡμᾶς νεκροὺς τοῖς παραπτώμασιν συνεζωοποίησεν τῷ Χριστῷ χάριτί ἐστε σεσῳσμένοι
EPH 2:4,"ὁ δὲ θεὸς πλούσιος ὢν ἐν ἐλέει,διὰ τὴν πολλὴν ἀγάπην αὐτοῦ ἣν ἠγάπησεν ἡμᾶς,"
EPH 2:17,καὶ ἐλθὼν εὐηγγελίσατο εἰρήνην ὑμῖν τοῖς μακρὰν καὶ εἰρήνην τοῖς ἐγγύς·
EPH 4:8,"διὸ λέγει·Ἀναβὰς εἰς ὕψος ᾐχμαλώτευσεν αἰχμαλωσίαν,ἔδωκεν δόματα τοῖς ἀνθρώποις."
EPH 4:15,"ἀληθεύοντες δὲ ἐν ἀγάπῃ αὐξήσωμεν εἰς αὐτὸν τὰ πάντα,ὅς ἐστιν ἡ κεφαλή,Χριστός,"
EPH 4:25,"Διὸ ἀποθέμενοι τὸ ψεῦδος λαλεῖτε ἀλήθειαν ἕκαστος μετὰ τοῦ πλησίον αὐτοῦ,ὅτι ἐσμὲν ἀλλήλων μέλη."
EPH 5:13,"τὰ δὲ πάντα ἐλεγχόμενα ὑπὸ τοῦ φωτὸς φανεροῦται,"
EPH 6:13,"διὰ τοῦτο ἀναλάβετε τὴν πανοπλίαν τοῦ θεοῦ,ἵνα δυνηθῆτε ἀντιστῆναι ἐν τῇ ἡμέρᾳ τῇ πονηρᾷ καὶ ἅπαντα κατεργασάμενοι στῆναι."
