In [63]:
# Parse a single Macula "lowfat" XML file with lxml and keep its root element.
import lxml.etree as ET

xml_root = "/home/jcuenod/Programming/symphony-stuff/symphony-backend-atlas-internal/data/Clear-Bible/macula-greek/SBLGNT/lowfat"
file_name = "01-matthew.xml"
tree = ET.parse(f"{xml_root}/{file_name}")
root = tree.getroot()

# Collect every <w> element found anywhere below the root.
words = root.findall(".//w")
print(f"Number of words: {len(words)}")

# Preview: text of the first 10 <w> elements, one per line.
first_words = words[:10]
for word in first_words:
    print(word.text)

Number of words: 18329
Βίβλος
γενέσεως
Ἰησοῦ
χριστοῦ
υἱοῦ
Δαυὶδ
υἱοῦ
Ἀβραάμ
Ἀβραὰμ
ἐγέννησεν


In [64]:
# Select <w> elements whose role is 'v' and whose mood is one of the moods
# listed below. The XPath predicate is assembled from the tuple so the list of
# accepted moods lives in one place.
candidate_moods = ("indicative", "imperative", "subjunctive", "infinitive")
mood_condition = " or ".join(f"@mood='{mood}'" for mood in candidate_moods)
main_clause_candidates = root.xpath(f".//w[@role='v' and ({mood_condition})]")
print(f"Number of main clauses: {len(main_clause_candidates)}")

# Preview the text of the first 10 matches.
for verb in main_clause_candidates[:10]:
    print(verb.text)

Number of main clauses: 2744
ἐγέννησεν
ἐγέννησεν
ἐγέννησεν
ἐγέννησεν
ἐγέννησεν
ἐγέννησεν
ἐγέννησεν
ἐγέννησεν
ἐγέννησεν
ἐγέννησεν


In [78]:
# For each main-clause candidate, walk the siblings that precede it and keep
# every sibling with role="adv" that contains a participle <w> whose role is
# 'v' or 'vc'.
SHOULD_PRINT_PARTICIPLE_CANDIDATES = True
PARTICIPLE_ROLES = ("v", "vc")  # tried in this order; the first role that matches wins
adjunct_participle_candidates = []
for clause in main_clause_candidates:
    for sibling in clause.itersiblings(preceding=True):
        # .get() because a sibling may have no "role" attribute at all
        if sibling.attrib.get("role") == "adv":
            for role in PARTICIPLE_ROLES:
                participle = sibling.find(f".//w[@role='{role}'][@mood='participle']")
                if participle is None:
                    continue
                # The sibling is recorded once, even if both roles would match.
                adjunct_participle_candidates.append(sibling)
                if SHOULD_PRINT_PARTICIPLE_CANDIDATES:
                    print(participle.text, participle.attrib)
                break

print(f"Number of adjunct participle candidates: {len(adjunct_participle_candidates)}")

θέλων {'role': 'v', 'ref': 'MAT 1:19!10', 'after': ' ', 'class': 'verb', '{http://www.w3.org/XML/1998/namespace}id': 'n40001019010', 'lemma': 'θέλω', 'normalized': 'θέλων', 'strong': '2309', 'number': 'singular', 'gender': 'masculine', 'case': 'nominative', 'tense': 'present', 'voice': 'active', 'mood': 'participle', 'english': 'wanting', 'mandarin': '愿意', 'gloss': 'willing', 'domain': '025001', 'ln': '25.1', 'morph': 'V-PAP-NSM', 'unicode': 'θέλων', 'frame': 'A0:n40001019001 A1:n40001019012', 'subjref': 'n40001019001'}
ἐγερθεὶς {'role': 'v', 'ref': 'MAT 1:24!1', 'after': ' ', 'class': 'verb', '{http://www.w3.org/XML/1998/namespace}id': 'n40001024001', 'lemma': 'ἐγείρω', 'normalized': 'ἐγερθείς', 'strong': '1453', 'number': 'singular', 'gender': 'masculine', 'case': 'nominative', 'tense': 'aorist', 'voice': 'passive', 'mood': 'participle', 'english': 'woke up', 'mandarin': '醒来', 'gloss': 'Having been awoken', 'domain': '023005', 'ln': '23.74', 'morph': 'V-APP-NSM', 'unicode': 'ἐγερθεὶς

In [106]:
# now we're going to get the text corresponding to the adjunct

import re
def get_text(node):
    """Return all text under ``node`` as one string with whitespace collapsed.

    Every string yielded by ``node.itertext()`` is concatenated, then each run
    of whitespace is replaced by a single space. Leading and trailing
    whitespace is collapsed to one space as well; it is not stripped.
    """
    return re.sub(r'\s+', ' ', ''.join(node.itertext()))

def get_id_list(node):
    """Return the ``xml:id`` of every ``<w>`` element found below ``node``.

    Ids are listed in the order ``node.findall(".//w")`` returns the elements.
    A ``<w>`` without an ``xml:id`` attribute contributes ``None``.
    """
    xml_id_key = "{http://www.w3.org/XML/1998/namespace}id"
    ids = []
    for w_element in node.findall(".//w"):
        ids.append(w_element.attrib.get(xml_id_key))
    return ids

id_list_by_ref = []
for candidate in adjunct_participle_candidates[:100]:
    # Rough-and-ready verse label (not perfect): take the first descendant <w>
    # that has a "ref" attribute and keep the part before the "!".
    first_word = candidate.find(".//w[@ref]")
    actual_ref = first_word.attrib.get("ref").partition("!")[0]

    # Left-justify the label to 10 columns so the printed phrases line up.
    print(actual_ref.ljust(10), get_text(candidate))
    id_list_by_ref.append((actual_ref, get_id_list(candidate)))


MAT 1:19    δίκαιος ὢν καὶ μὴ θέλων αὐτὴν δειγματίσαι 
MAT 1:24    ἐγερθεὶς ἀπὸ τοῦ ὕπνου 
MAT 2:3     ἀκούσας 
MAT 2:4     συναγαγὼν πάντας τοὺς ἀρχιερεῖς καὶ γραμματεῖς τοῦ λαοῦ 
MAT 2:7     λάθρᾳ καλέσας τοὺς μάγους 
MAT 2:8     πέμψας αὐτοὺς εἰς Βηθλέεμ 
MAT 2:8     Πορευθέντες 
MAT 2:8     ἐλθὼν 
MAT 2:9     ἀκούσαντες τοῦ βασιλέως 
MAT 2:9     ἐλθὼν 
MAT 2:10    ἰδόντες τὸν ἀστέρα 
MAT 2:11    ἐλθόντες εἰς τὴν οἰκίαν 
MAT 2:11    πεσόντες 
MAT 2:11    ἀνοίξαντες τοὺς θησαυροὺς αὐτῶν 
MAT 2:12    χρηματισθέντες κατ’ ὄναρ μὴ ἀνακάμψαι πρὸς Ἡρῴδην 
MAT 2:13    Ἐγερθεὶς 
MAT 2:14    ἐγερθεὶς 
MAT 2:16    ἰδὼν ὅτι ἐνεπαίχθη ὑπὸ τῶν μάγων 
MAT 2:16    ἀποστείλας 
MAT 2:20    Ἐγερθεὶς 
MAT 2:21    ἐγερθεὶς 
MAT 2:22    ἀκούσας ὅτι Ἀρχέλαος βασιλεύει τῆς Ἰουδαίας ἀντὶ τοῦ πατρὸς αὐτοῦ Ἡρῴδου 
MAT 2:22    χρηματισθεὶς κατ’ ὄναρ 
MAT 2:23    ἐλθὼν 
MAT 3:7     Ἰδὼν πολλοὺς τῶν Φαρισαίων καὶ Σαδδουκαίων ἐρχομένους ἐπὶ τὸ βάπτισμα αὐτοῦ 
MAT 3:15    ἀποκριθεὶς 
MAT 3:16    βαπτισθεὶς 
MAT 4:

In [80]:
# Differences are mainly the inclusion of the subject in Logos data

# I list more significant differences below.
# False Negatives:

# False Positives:
#  - 5:46
#  - 7:11
#  - 8:34

In [107]:
# export to csv
import csv

# Each entry of id_list_by_ref becomes one CSV row. The second item of every
# entry is a Python list, which csv.writer stringifies, so that column holds
# the list's str() form rather than one id per cell.
with open('backgrounded-phrases.csv', 'w', newline='') as out_file:
    csv.writer(out_file).writerows(id_list_by_ref)

In [89]:
# Let's use the macula TSV to print out verses and highlight backgrounded phrases

# Get the macula TSV
import csv
path_to_macula_tsv = "/home/jcuenod/Programming/symphony-stuff/symphony-backend-atlas-internal/data/Clear-Bible/macula-greek/SBLGNT/tsv/macula-greek-SBLGNT.tsv"
# The file holds non-ASCII text (Greek words, Mandarin glosses), so decode it as
# UTF-8 explicitly instead of relying on the platform's default encoding.
with open(path_to_macula_tsv, newline='', encoding='utf-8') as csvfile:
    reader = csv.reader(csvfile, delimiter='\t')
    tsv_rows = list(reader)
# first row is the header
header = tsv_rows[0]
print(header)
# Convert the remaining rows to dictionaries keyed by column name. The raw rows
# keep their own name so `macula_tsv` is only ever a list of dicts.
macula_tsv = [dict(zip(header, row)) for row in tsv_rows[1:]]

# show the first 10 rows
for row in macula_tsv[:10]:
    print(row)

['xml:id', 'ref', 'role', 'class', 'type', 'english', 'mandarin', 'gloss', 'text', 'after', 'lemma', 'normalized', 'strong', 'morph', 'person', 'number', 'gender', 'case', 'tense', 'voice', 'mood', 'degree', 'domain', 'ln', 'frame', 'subjref', 'referent']
{'xml:id': 'n40001001001', 'ref': 'MAT 1:1!1', 'role': '', 'class': 'noun', 'type': 'common', 'english': 'book', 'mandarin': '谱', 'gloss': '[The] book', 'text': 'Βίβλος', 'after': ' ', 'lemma': 'βίβλος', 'normalized': 'βίβλος', 'strong': '976', 'morph': 'N-NSF', 'person': '', 'number': 'singular', 'gender': 'feminine', 'case': 'nominative', 'tense': '', 'voice': '', 'mood': '', 'degree': '', 'domain': '033005', 'ln': '33.38', 'frame': '', 'subjref': '', 'referent': ''}
{'xml:id': 'n40001001002', 'ref': 'MAT 1:1!2', 'role': '', 'class': 'noun', 'type': 'common', 'english': 'genealogy', 'mandarin': '族', 'gloss': 'of [the] genealogy', 'text': 'γενέσεως', 'after': ' ', 'lemma': 'γένεσις', 'normalized': 'γενέσεως', 'strong': '1078', 'morph

In [93]:
# Group the TSV rows by verse: the key is each row's "ref" value up to the "!".
macula_tsv_by_ref = {}
for row in macula_tsv:
    verse_ref = row.get("ref").split("!")[0]
    # setdefault creates the verse's list the first time the verse is seen
    macula_tsv_by_ref.setdefault(verse_ref, []).append(row)
print(f"Number of verses: {len(macula_tsv_by_ref)}")
print(macula_tsv_by_ref["MAT 1:1"])

Number of verses: 7939
[{'xml:id': 'n40001001001', 'ref': 'MAT 1:1!1', 'role': '', 'class': 'noun', 'type': 'common', 'english': 'book', 'mandarin': '谱', 'gloss': '[The] book', 'text': 'Βίβλος', 'after': ' ', 'lemma': 'βίβλος', 'normalized': 'βίβλος', 'strong': '976', 'morph': 'N-NSF', 'person': '', 'number': 'singular', 'gender': 'feminine', 'case': 'nominative', 'tense': '', 'voice': '', 'mood': '', 'degree': '', 'domain': '033005', 'ln': '33.38', 'frame': '', 'subjref': '', 'referent': ''}, {'xml:id': 'n40001001002', 'ref': 'MAT 1:1!2', 'role': '', 'class': 'noun', 'type': 'common', 'english': 'genealogy', 'mandarin': '族', 'gloss': 'of [the] genealogy', 'text': 'γενέσεως', 'after': ' ', 'lemma': 'γένεσις', 'normalized': 'γενέσεως', 'strong': '1078', 'morph': 'N-GSF', 'person': '', 'number': 'singular', 'gender': 'feminine', 'case': 'genitive', 'tense': '', 'voice': '', 'mood': '', 'degree': '', 'domain': '010002 033003', 'ln': '10.24 33.19', 'frame': '', 'subjref': '', 'referent': '

In [108]:
# Sanity check: show the first entry of id_list_by_ref, a (ref, list of xml:id) tuple.
print(id_list_by_ref[0])

('MAT 1:19', ['n40001019006', 'n40001019007', 'n40001019008', 'n40001019009', 'n40001019010', 'n40001019011', 'n40001019012'])


In [125]:
from IPython.display import display, HTML

# Build one table row per (ref, id_list) entry: the left cell shows the ref,
# the right cell shows every word of that verse, with the words whose xml:id
# is in id_list wrapped in a yellow-background <span>.
HIGHLIGHT_OPEN = "<span style='background-color: #FFFF00'>"
table_parts = ["<table>"]
for ref, id_list in id_list_by_ref:
    # all TSV rows (words) belonging to this verse
    verse_words = macula_tsv_by_ref[ref]

    table_parts.append(f"<tr><td width='100px'><b>{ref}</b></td><td style='text-align: left !important;'>")
    for word in verse_words:
        if word["xml:id"] in id_list:
            table_parts.append(HIGHLIGHT_OPEN + word["text"] + "</span>")
        else:
            table_parts.append(word["text"])
        # "after" holds whatever follows the word in the TSV row
        table_parts.append(word["after"])
    table_parts.append("</td></tr>")
table_parts.append("</table>")
html_string = "".join(table_parts)

display(HTML(html_string))


0,1
MAT 1:19,"Ἰωσὴφ δὲ ὁ ἀνὴρ αὐτῆς,δίκαιος ὢν καὶ μὴ θέλων αὐτὴν δειγματίσαι,ἐβουλήθη λάθρᾳ ἀπολῦσαι αὐτήν."
MAT 1:24,ἐγερθεὶς δὲ ὁ Ἰωσὴφ ἀπὸ τοῦ ὕπνου ἐποίησεν ὡς προσέταξεν αὐτῷ ὁ ἄγγελος κυρίου καὶ παρέλαβεν τὴν γυναῖκα αὐτοῦ·
MAT 2:3,"ἀκούσας δὲ ὁ βασιλεὺς Ἡρῴδης ἐταράχθη καὶ πᾶσα Ἱεροσόλυμα μετ’ αὐτοῦ,"
MAT 2:4,καὶ συναγαγὼν πάντας τοὺς ἀρχιερεῖς καὶ γραμματεῖς τοῦ λαοῦ ἐπυνθάνετο παρ’ αὐτῶν ποῦ ὁ χριστὸς γεννᾶται.
MAT 2:7,"Τότε Ἡρῴδης λάθρᾳ καλέσας τοὺς μάγους ἠκρίβωσεν παρ’ αὐτῶν τὸν χρόνον τοῦ φαινομένου ἀστέρος,"
MAT 2:8,"καὶ πέμψας αὐτοὺς εἰς Βηθλέεμ εἶπεν·Πορευθέντες ἐξετάσατε ἀκριβῶς περὶ τοῦ παιδίου·ἐπὰν δὲ εὕρητε,ἀπαγγείλατέ μοι,ὅπως κἀγὼ ἐλθὼν προσκυνήσω αὐτῷ."
MAT 2:8,"καὶ πέμψας αὐτοὺς εἰς Βηθλέεμ εἶπεν·Πορευθέντες ἐξετάσατε ἀκριβῶς περὶ τοῦ παιδίου·ἐπὰν δὲ εὕρητε,ἀπαγγείλατέ μοι,ὅπως κἀγὼ ἐλθὼν προσκυνήσω αὐτῷ."
MAT 2:8,"καὶ πέμψας αὐτοὺς εἰς Βηθλέεμ εἶπεν·Πορευθέντες ἐξετάσατε ἀκριβῶς περὶ τοῦ παιδίου·ἐπὰν δὲ εὕρητε,ἀπαγγείλατέ μοι,ὅπως κἀγὼ ἐλθὼν προσκυνήσω αὐτῷ."
MAT 2:9,"οἱ δὲ ἀκούσαντες τοῦ βασιλέως ἐπορεύθησαν,καὶ ἰδοὺ ὁ ἀστὴρ ὃν εἶδον ἐν τῇ ἀνατολῇ προῆγεν αὐτούς,ἕως ἐλθὼν ἐστάθη ἐπάνω οὗ ἦν τὸ παιδίον."
MAT 2:9,"οἱ δὲ ἀκούσαντες τοῦ βασιλέως ἐπορεύθησαν,καὶ ἰδοὺ ὁ ἀστὴρ ὃν εἶδον ἐν τῇ ἀνατολῇ προῆγεν αὐτούς,ἕως ἐλθὼν ἐστάθη ἐπάνω οὗ ἦν τὸ παιδίον."
