In [1]:
import os
import unicodedata

from tf.fabric import Fabric

# Where the text-fabric-data checkout lives, and which module/version inside it to use.
DATABASE = "~/Programming/text-fabric-data"
BHSA = "bhsa/tf/2017"

TF = Fabric(locations=[DATABASE], modules=[BHSA], silent=False)

# Whitespace-separated list of the features the export cells read:
#   sp nu gn ps vt vs st  - word morphology (part of speech, number, gender,
#                           person, verbal tense, verbal stem, state)
#   rela typ              - written as the "role" / "class" of clauses and phrases
#   otype                 - used to iterate over nodes of a given type
#   g_word_utf8 lex_utf8  - the word text and its lemma
#   gloss                 - written as the "gloss" attribute of each word
FEATURE_SPEC = '''
    sp nu gn ps vt vs st
    rela typ
    otype
    g_word_utf8 lex_utf8
    gloss
'''

api = TF.load(FEATURE_SPEC)
# Later cells use F, L and T as bare globals; presumably this call is what
# injects them into the notebook namespace.
api.makeAvailableIn(globals())

This is Text-Fabric 3.1.1
Api reference : https://github.com/Dans-labs/text-fabric/wiki/Api
Tutorial      : https://github.com/Dans-labs/text-fabric/blob/master/docs/tutorial.ipynb
Example data  : https://github.com/Dans-labs/text-fabric-data

115 features found and 0 ignored
  0.00s loading features ...
   |     0.03s B otype                from /home/jcuenod/Programming/text-fabric-data/bhsa/tf/2017
   |     0.18s B g_word_utf8          from /home/jcuenod/Programming/text-fabric-data/bhsa/tf/2017
   |     0.17s B lex_utf8             from /home/jcuenod/Programming/text-fabric-data/bhsa/tf/2017
   |     0.09s B sp                   from /home/jcuenod/Programming/text-fabric-data/bhsa/tf/2017
   |     0.10s B nu                   from /home/jcuenod/Programming/text-fabric-data/bhsa/tf/2017
   |     0.08s B gn                   from /home/jcuenod/Programming/text-fabric-data/bhsa/tf/2017
   |     0.10s B ps                   from /home/jcuenod/Programming/text-fabric-data/bhsa/tf/2017
 

In [2]:
def sectionToRef(section_tuple):
    """Render a section tuple as a dotted reference such as ``Gen.1.1``.

    Only the first three items of ``section_tuple`` are used. The first
    item must already be a string; the second and third are converted
    with ``str()``.
    """
    book, chapter, verse = section_tuple[0], section_tuple[1], section_tuple[2]
    return ".".join((book, str(chapter), str(verse)))

# Maps the person codes found in the `ps` feature to the plain integers
# written to the XML; any other value is passed through unchanged.
person_conversion = {"p1": 1, "p2": 2, "p3": 3}

# Feature values that carry no information and are therefore not written out.
_EMPTY_VALUES = frozenset(("NA", "", "unknown"))


def setWordData(n, element):
    """Copy the features of word node ``n`` onto ``element`` as attributes.

    Parameters
    ----------
    n : int
        Node number of the word; also written as the ``n`` attribute.
    element
        Any object with a ``set(key, value)`` method (an lxml element in
        this notebook). It is modified in place.

    Each value is converted with ``str()`` before being set. Attributes are
    skipped when the value is ``"NA"``, ``""``, ``"unknown"`` or ``None``.
    Skipping ``None`` keeps a missing feature from being serialised as the
    literal text ``"None"``.

    Relies on the global ``F`` feature API being available.
    """
    lemma = F.lex_utf8.v(n)
    person = F.ps.v(n)
    r = {
        "n": n,
        # Normalising None would raise, so a missing lemma is left as None
        # and dropped by the filter below.
        "lemma": None if lemma is None else unicodedata.normalize("NFC", lemma),
        "partOfSpeech": F.sp.v(n),
        "person": person_conversion.get(person, person),
        "number": F.nu.v(n),
        "gender": F.gn.v(n),
        "tense": F.vt.v(n),  # vt = verbal tense
        "stem": F.vs.v(n),  # vs = verbal stem
        "state": F.st.v(n),  # construct/absolute/emphatic
        "gloss": F.gloss.v(n),
    }
    for key, value in r.items():
        if value is None or value in _EMPTY_VALUES:
            continue
        element.set(key, str(value))

In [3]:
from lxml import etree, builder

em = builder.ElementMaker()
# Collects every book so the whole corpus can be written as one file later.
root_document = em.base()

OUTPUT_ROOT = "./output/"


def _make_milestone(ref_tuple):
    """Build a verse <milestone> whose text and ``id`` are the dotted reference."""
    ref = sectionToRef(ref_tuple)
    milestone_element = em.milestone(ref, unit="verse")
    milestone_element.set("id", ref)
    return milestone_element


def _make_word_group(node):
    """Build the <wg> element for a clause or phrase node (no children yet)."""
    word_group = em.wg(n=str(node))
    word_group.set("class", F.typ.v(node))
    relation = F.rela.v(node)
    if relation != "NA":
        word_group.set("role", relation)
    return word_group


# Create the output directory up front so the per-book write below does not
# fail when it is missing.
os.makedirs(OUTPUT_ROOT, exist_ok=True)

print("Processing...")
for book in F.otype.s("book"):
    book_name = T.bookName(book, lang="osis")
    print(" - ", book_name)
    book_doc = em.book()
    book_doc.set('id', book_name)

    for sentence in L.d(book, otype="sentence"):
        refTuple = T.sectionFromNode(sentence, lang="osis")

        # Each sentence opens with a milestone for the verse it starts in,
        # followed by its running text.
        sentence_element = em.sentence(
            _make_milestone(refTuple),
            em.p(T.text(L.d(sentence, 'word'))),
            n=str(sentence)
        )
        for clause in L.d(sentence, otype="clause"):
            # Insert another milestone whenever a clause belongs to a
            # different section than the last milestone emitted.
            clauseRefTuple = T.sectionFromNode(clause, lang="osis")
            if clauseRefTuple != refTuple:
                sentence_element.append(_make_milestone(clauseRefTuple))
                refTuple = clauseRefTuple
            clause_element = _make_word_group(clause)

            for phrase in L.d(clause, otype="phrase"):
                phrase_element = _make_word_group(phrase)

                for word in L.d(phrase, otype="word"):
                    w_element = em.w(unicodedata.normalize("NFC", F.g_word_utf8.v(word)))
                    setWordData(word, w_element)
                    phrase_element.append(w_element)
                clause_element.append(phrase_element)
            sentence_element.append(clause_element)
        book_doc.append(sentence_element)

    # Write the book on its own first, then attach it to the combined document.
    OUTPUT_FILE = os.path.join(OUTPUT_ROOT, book_name.lower() + ".xml")
    etree.ElementTree(book_doc).write(OUTPUT_FILE, pretty_print=True, xml_declaration=True, encoding="utf-8")

    root_document.append(book_doc)

Processing...
 -  Gen
 -  Exod
 -  Lev
 -  Num
 -  Deut
 -  Josh
 -  Judg
 -  1Sam
 -  2Sam
 -  1Kgs
 -  2Kgs
 -  Isa
 -  Jer
 -  Ezek
 -  Hos
 -  Joel
 -  Amos
 -  Obad
 -  Jonah
 -  Mic
 -  Nah
 -  Hab
 -  Zeph
 -  Hag
 -  Zech
 -  Mal
 -  Ps
 -  Job
 -  Prov
 -  Ruth
 -  Song
 -  Eccl
 -  Lam
 -  Dan
 -  Ezra
 -  Neh
 -  Esth
 -  1Chr
 -  2Chr


In [4]:
# Collect every phrase node in the corpus and report how many there are.
phrase_nodes = list(F.otype.s("phrase"))
print(len(phrase_nodes))

253187


In [5]:
# Count the <wg> elements nested inside a sentence-level <wg>, i.e. the
# phrase elements built by the export loop, for comparison with the phrase
# node count.
phrase_elements = root_document.xpath("//sentence/wg/wg")
len(phrase_elements)

253187

In [6]:
# Write the combined document (every book under one root element) to disk.
OUTPUT_FILE = OUTPUT_ROOT + "root.xml"
combined_tree = etree.ElementTree(root_document)
combined_tree.write(
    OUTPUT_FILE,
    encoding="utf-8",
    xml_declaration=True,
    pretty_print=True,
)