# Converting ETCBC's BHSA to XML

The reason for this conversion is basically to try out ETCBC's data with XML querying tools (as Jonathan Robie has done at https://github.com/biblicalhumanities/greek-new-testament/blob/master/labnotes/lxml-tutorial.ipynb).

**NOTE:** You will need the OSIS book names (included in this repo). I simply copied them into my text-fabric data folder.

To begin with we need to do some basic imports and load text-fabric features.

In [1]:
import unicodedata
from tf.fabric import Fabric

# Location of the text-fabric data on disk, and the dataset/version to load.
DATABASE = "~/Programming/text-fabric-data"
BHSA = "bhsa/tf/2017"

TF = Fabric(
    locations=[DATABASE],
    modules=[BHSA],
    silent=False,
)

# Features needed by the cells below: word-level morphology and gloss,
# clause/phrase relation and type codes, node types, and the UTF-8 word and
# lexeme forms.
api = TF.load('''
    sp nu gn ps vt vs st gloss
    rela typ
    otype
    g_word_utf8 g_lex_utf8
''')

# Publish the API handles into the notebook namespace so that later cells can
# use names such as F, L and T directly.
api.makeAvailableIn(globals())

This is Text-Fabric 3.1.1
Api reference : https://github.com/Dans-labs/text-fabric/wiki/Api
Tutorial      : https://github.com/Dans-labs/text-fabric/blob/master/docs/tutorial.ipynb
Example data  : https://github.com/Dans-labs/text-fabric-data

115 features found and 0 ignored
  0.00s loading features ...
   |     0.02s B otype                from /home/jcuenod/Programming/text-fabric-data/bhsa/tf/2017
   |     0.16s B g_lex_utf8           from /home/jcuenod/Programming/text-fabric-data/bhsa/tf/2017
   |     0.22s B g_word_utf8          from /home/jcuenod/Programming/text-fabric-data/bhsa/tf/2017
   |     0.10s B sp                   from /home/jcuenod/Programming/text-fabric-data/bhsa/tf/2017
   |     0.12s B nu                   from /home/jcuenod/Programming/text-fabric-data/bhsa/tf/2017
   |     0.12s B gn                   from /home/jcuenod/Programming/text-fabric-data/bhsa/tf/2017
   |     0.11s B ps                   from /home/jcuenod/Programming/text-fabric-data/bhsa/tf/2017
 

Here we're going to set up some functions that will come in handy later.
- `sectionToRef` converts a section tuple (book, chapter, verse) into an osisID such as `Gen.1.1`
- `setWordData` takes a node and an xml element and sets the attributes of the xml element according to data associated with the word node in text-fabric

In [2]:
def sectionToRef(section_tuple):
    """Return the dotted reference "<book>.<chapter>.<verse>" for a section.

    Only the first three items of ``section_tuple`` are used. The first must
    already be a string; the second and third are converted with ``str``.
    """
    book = section_tuple[0]
    chapter = section_tuple[1]
    verse = section_tuple[2]
    return ".".join((book, str(chapter), str(verse)))

# Maps the person codes stored in the `ps` feature to plain numbers.
person_conversion = {"p1": 1, "p2": 2, "p3": 3}


def setWordData(n, element):
    """Copy the data of word node ``n`` onto ``element`` as XML attributes.

    Parameters:
        n: text-fabric node number of a word.
        element: object exposing ``set(name, value)`` (an XML element); it is
            modified in place.

    Values that carry no information are skipped: "NA", "", "unknown", and
    None (no value available for the node). Skipping None keeps the literal
    string "None" out of the generated attributes. Every remaining value is
    written with ``str``.
    """
    person = F.ps.v(n)
    lemma = F.g_lex_utf8.v(n)
    r = {
        "n": n,
        # Normalize so the lemma uses composed (NFC) code points.
        "lemma": unicodedata.normalize("NFC", lemma) if lemma is not None else None,
        "partOfSpeech": F.sp.v(n),
        # Known person codes become numbers; anything else passes through.
        "person": person_conversion.get(person, person),
        "number": F.nu.v(n),
        "gender": F.gn.v(n),
        "tense": F.vt.v(n),  # vt = verbal tense
        "stem": F.vs.v(n),  # vs = verbal stem
        "state": F.st.v(n),  # construct/absolute/emphatic
        # The gloss is read from the first `lex` node above the word.
        "gloss": F.gloss.v(L.u(n, otype='lex')[0]),
    }
    for key, value in r.items():
        if value is None or value in ("NA", "", "unknown"):
            continue
        element.set(key, str(value))

Now we build up our XML tree. Note the export at the end of each book element.

In [3]:
from lxml import etree, builder

# Directory (relative to the notebook) that receives the generated XML files.
OUTPUT_ROOT = "./output/"

# Element factory used for every node we create, and the <base> element that
# each finished book element is appended to.
em = builder.ElementMaker()
root_document = em("base")

# Human-readable labels for the `rela` codes of clauses.
clause_relations = {
    "Adju": "Adjunctive clause",
    "Attr": "Attributive clause",
    "Cmpl": "Complement clause",
    "Coor": "Coordinated clause",
    "Objc": "Object clause",
    "PrAd": "Predicative adjunct clause",
    "PreC": "Predicative complement clause",
    "ReVo": "Referral to the vocative",
    "Resu": "Resumptive clause",
    "RgRc": "Regens/rectum connection",
    "Spec": "Specification clause",
    "Subj": "Subject clause",
}

# Human-readable labels for the `rela` codes of phrases.
phrase_relations = {
    "PrAd": "Predicative adjunct",
    "Resu": "Resumption",
}

# Human-readable labels for the `typ` codes of clauses.
clause_types = {
    "AjCl": "Adjective clause",
    "CPen": "Casus pendens",
    "Defc": "Defective clause atom",
    "Ellp": "Ellipsis",
    "InfA": "Infinitive absolute clause",
    "InfC": "Infinitive construct clause",
    "MSyn": "Macrosyntactic sign",
    "NmCl": "Nominal clause",
    "Ptcp": "Participle clause",
    "Reop": "Reopening",
    "Unkn": "Unknown",
    "Voct": "Vocative clause",
    "Way0": "Wayyiqtol-null clause",
    "WayX": "Wayyiqtol-X clause",
    "WIm0": "We-imperative-null clause",
    "WImX": "We-imperative-X clause",
    "WQt0": "We-qatal-null clause",
    "WQtX": "We-qatal-X clause",
    "WxI0": "We-x-imperative-null clause",
    "WXIm": "We-X-imperative clause",
    "WxIX": "We-x-imperative-X clause",
    "WxQ0": "We-x-qatal-null clause",
    "WXQt": "We-X-qatal clause",
    "WxQX": "We-x-qatal-X clause",
    "WxY0": "We-x-yiqtol-null clause",
    "WXYq": "We-X-yiqtol clause",
    "WxYX": "We-x-yiqtol-X clause",
    "WYq0": "We-yiqtol-null clause",
    "WYqX": "We-yiqtol-X clause",
    "xIm0": "x-imperative-null clause",
    "XImp": "X-imperative clause",
    "xImX": "x-imperative-X clause",
    "XPos": "Extraposition",
    "xQt0": "x-qatal-null clause",
    "XQtl": "X-qatal clause",
    "xQtX": "x-qatal-X clause",
    "xYq0": "x-yiqtol-null clause",
    "XYqt": "X-yiqtol clause",
    "xYqX": "x-yiqtol-X clause",
    "ZIm0": "Zero-imperative-null clause",
    "ZImX": "Zero-imperative-X clause",
    "ZQt0": "Zero-qatal-null clause",
    "ZQtX": "Zero-qatal-X clause",
    "ZYq0": "Zero-yiqtol-null clause",
    "ZYqX": "Zero-yiqtol-X clause",
}

# Human-readable labels for the `typ` codes of phrases.
phrase_types = {
    "VP": "Verbal phrase",
    "NP": "Nominal phrase",
    "PrNP": "Proper-noun phrase",
    "AdvP": "Adverbial phrase",
    "PP": "Prepositional phrase",
    "CP": "Conjunctive phrase",
    "PPrP": "Personal pronoun phrase",
    "DPrP": "Demonstrative pronoun phrase",
    "IPrP": "Interrogative pronoun phrase",
    "InjP": "Interjectional phrase",
    "NegP": "Negative phrase",
    "InrP": "Interrogative phrase",
    "AdjP": "Adjective phrase",
}

import os

# Create the output directory up front; otherwise the first per-book write
# below fails when the directory does not exist yet.
os.makedirs(OUTPUT_ROOT, exist_ok=True)


def _verseMilestone(section_tuple):
    """Build a <milestone unit="verse"> whose text and id are the section's reference."""
    osis_id = sectionToRef(section_tuple)
    milestone = em.milestone(osis_id, unit="verse")
    milestone.set("id", osis_id)
    return milestone


print("Processing...")
for book in F.otype.s("book"):
    # Look the name up once; it is used for the log line, the id and the file name.
    book_name = T.bookName(book, lang="osis")
    print(" - ", book_name)
    book_doc = em.book()
    book_doc.set('id', book_name)

    for sentence in L.d(book, otype="sentence"):
        refTuple = T.sectionFromNode(sentence, lang="osis")

        # Each sentence opens with a milestone for its own section and a <p>
        # holding the text of all its words.
        sentence_element = em.sentence(
            _verseMilestone(refTuple),
            em.p(T.text(L.d(sentence, 'word'))),
            n=str(sentence)
        )
        for clause in L.d(sentence, otype="clause"):
            # When a clause reports a different section than the last one seen,
            # emit a new milestone before it.
            clauseRefTuple = T.sectionFromNode(clause, lang="osis")
            if clauseRefTuple != refTuple:
                sentence_element.append(_verseMilestone(clauseRefTuple))
                refTuple = clauseRefTuple

            clause_element = em.wg(n=str(clause), level="clause")
            clause_element.set("class", clause_types[F.typ.v(clause)])
            clause_rela = F.rela.v(clause)
            if clause_rela != "NA":
                clause_element.set("role", clause_relations[clause_rela])

            for phrase in L.d(clause, otype="phrase"):
                phrase_element = em.wg(n=str(phrase), level="phrase")
                phrase_element.set("class", phrase_types[F.typ.v(phrase)])
                phrase_rela = F.rela.v(phrase)
                if phrase_rela != "NA":
                    phrase_element.set("role", phrase_relations[phrase_rela])

                for word in L.d(phrase, otype="word"):
                    # Element text is the NFC-normalized word form; the
                    # remaining word data goes into attributes.
                    w_element = em.w(unicodedata.normalize("NFC", F.g_word_utf8.v(word)))
                    setWordData(word, w_element)

                    phrase_element.append(w_element)
                clause_element.append(phrase_element)
            sentence_element.append(clause_element)
        book_doc.append(sentence_element)

    # Write this book to its own file, then add it to the combined document.
    OUTPUT_FILE = os.path.join(OUTPUT_ROOT, book_name.lower() + ".xml")
    etree.ElementTree(book_doc).write(OUTPUT_FILE, pretty_print=True, xml_declaration=True, encoding="utf-8")

    root_document.append(book_doc)

Processing...
 -  Gen
 -  Exod
 -  Lev
 -  Num
 -  Deut
 -  Josh
 -  Judg
 -  1Sam
 -  2Sam
 -  1Kgs
 -  2Kgs
 -  Isa
 -  Jer
 -  Ezek
 -  Hos
 -  Joel
 -  Amos
 -  Obad
 -  Jonah
 -  Mic
 -  Nah
 -  Hab
 -  Zeph
 -  Hag
 -  Zech
 -  Mal
 -  Ps
 -  Job
 -  Prov
 -  Ruth
 -  Song
 -  Eccl
 -  Lam
 -  Dan
 -  Ezra
 -  Neh
 -  Esth
 -  1Chr
 -  2Chr


Export the whole tree (all books together) to `root.xml`.

In [4]:
# Serialize the combined document to a single UTF-8 XML file with an XML
# declaration and pretty-printed indentation.
OUTPUT_FILE = OUTPUT_ROOT + "root.xml"
combined_tree = etree.ElementTree(root_document)
combined_tree.write(
    OUTPUT_FILE,
    encoding="utf-8",
    xml_declaration=True,
    pretty_print=True,
)
print("done - see ", OUTPUT_FILE)

done - see  ./output/root.xml
