## How to use the usfm-grammar Python APIs


### Installation

#### From PyPI

In [None]:
# Good to set up a virtual environment
# requires python >= 3.10
!pip install usfm-grammar

#### From code base

In [None]:
! pip install -e ./../python-usfm-parser/ # from the code base

In [None]:
# `!` hands the line to the system shell; this needs usfm-grammar installed (see the installation cells).
! usfm-grammar -h # to view the command line options

In [None]:
# To pick up changes after updating the local tree-sitter-usfm grammar,
# run the following in a terminal from the project root:
# >>> python python-usfm-parser/src/grammar_rebuild.py ./tree-sitter-usfm3/ python-usfm-parser/src/usfm_grammar/my-languages.so

### Parsing an input USFM

In [None]:
from usfm_grammar import USFMParser, Filter # importing from the local module, not from an installed library

In [None]:
# Sample USFM used by the rest of the notebook.
# Every USFM marker backslash is written as "\\" so that no marker is read as a
# Python string escape (a bare "\w" is an invalid escape sequence).
input_usfm_str = '''
\\id EXO 02EXOGNT92.SFM, Good News Translation, June 2003
\\h പുറപ്പാടു്
\\toc1 പുറപ്പാടു്
\\toc2 പുറപ്പാടു്
\\mt പുറപ്പാടു്
\\c 1
\\p
\\v 1 യാക്കോബിനോടുകൂടെ കുടുംബസഹിതം ഈജിപ്റ്റിൽ വന്ന 
\\p യിസ്രായേൽമക്കളുടെ പേരുകൾ : 
\\v 2 രൂബേൻ, ശിമെയോൻ, ലേവി,
\\v 3 
\\li1 യെഹൂദാ, 
\\li1 യിസ്സാഖാർ, 
\\li1 സെബൂലൂൻ, 
\\li1 ബെന്യാമീൻ
\\p
\\v 4 ദാൻ, നഫ്താലി, ഗാദ്, ആശേർ.
\\v 12-83 They presented their offerings in the following order:
\\tr \\th1 Day \\th2 Tribe \\th3 Leader
\\tr \\tcr1 1st \\tc2 Judah \\tc3 Nahshon son of Amminadab
\\tr \\tcr1 2nd \\tc2 Issachar \\tc3 Nethanel son of Zuar
\\tr \\tcr1 3rd \\tc2 Zebulun \\tc3 Eliab son of Helon
\\p
\\v 5 യാക്കോബിന്റെ സന്താനപരമ്പരകൾ എല്ലാം കൂടി എഴുപതു പേർ ആയിരുന്നു; യോസേഫ് മുമ്പെ തന്നെ ഈജിപ്റ്റിൽ ആയിരുന്നു. \\w gracious|grace\\w* and then a few words later \\w gracious|lemma="grace" x-myattr="metadata"\\w*
\\c 2
\\s1 A Prayer of Habakkuk
\\p
\\v 1 This is a prayer of the prophet Habakkuk:
\\b
\\q1
\\v 2 O \\nd Lord\\nd*, I have heard of what you have done,
\\q2 and I am filled with awe.
\\q1 Now do again in our times
\\q2 the great deeds you used to do.
\\q1 Be merciful, even when you are angry.
\\p
\\v 20 Adam \\f + \\fr 3.20: \\fk Adam: \\ft This name in Hebrew means “all human beings.”\\f*
named his wife Eve, \\f + \\fr 3.20: \\fk Eve: \\ft This name sounds similar to the Hebrew
word for “living,” which is rendered in this context as “human beings.”\\f* because she
was the mother of all human beings.
\\v 21 And the \\nd Lord\\nd* God made clothes out of animal skins for Adam and his wife,
and he clothed them.
\\qt-s |sid="qt_123" who="Pilate"\\*“Are you the king of the Jews?”\\qt-e |eid="qt_123"\\*
'''

In [None]:
# Show the sample USFM text with the Python string escapes resolved.
print(input_usfm_str)

In [None]:
# Create a parser for the sample USFM text.
my_parser = USFMParser(input_usfm_str)

In [None]:
# Inspect the parser's errors to validate the input USFM.
# The remaining operations still work despite small errors, provided "ignore_errors" is set.
my_parser.errors 

### USFM-Grammar JSON
Working with the dict/json object

In [None]:
# Convert the parsed USFM into a Python dict/JSON structure.
usfm_json = my_parser.to_dict()

In [None]:
# Display the converted structure as the cell output.
usfm_json

In [None]:
# Generic walker over the dict/JSON structure
def scan_json(json_obj, filters = ["book", "chapter", "verse"]):
    """Walk the structure depth-first and print every node whose 'cat' is in *filters*.

    A list is walked item by item. For a matching node, its "ref" and
    "value" entries (when present) are printed, each followed by "-->",
    and then the category in parentheses. The node's "children", if any,
    are walked with the same filters. Nothing is returned.
    """
    if isinstance(json_obj, list):
        for item in json_obj:
            scan_json(item, filters)
        return

    category = json_obj['cat']
    if category in filters:
        # Build the optional "ref-->value-->" prefix, then emit one line.
        prefix = "".join(
            f"{json_obj[key]}-->" for key in ("ref", "value") if key in json_obj
        )
        print(f"{prefix}({category})")

    if "children" in json_obj:
        for sub_node in json_obj['children']:
            scan_json(sub_node, filters)


In [None]:
# Print the cleaned verse texts (nodes whose 'cat' is "verseText")
scan_json(usfm_json, filters=["verseText"])

In [None]:
# Print footnotes only (nodes whose 'cat' is "footnote" or "noteText")
scan_json(usfm_json, filters=["footnote", "noteText"])

In [None]:
# Show where tables and lists are used in the USFM, printed in document order
# together with the book, chapter and verse nodes
scan_json(usfm_json, filters=["table", "list", "verse", "chapter", "book"])

In [None]:
# Print the versification: every book, chapter and verse node
scan_json(usfm_json, filters=["book", "chapter", "verse"])

In [None]:
def find_attribute(json_obj, filters=[]):
    """Walk the structure and print the requested attributes of each node.

    A list is walked item by item. For a node that has an "attributes"
    mapping, every name in *filters* that is present there is collected as
    "name=value"; if at least one was found, the node's "value" is printed
    followed by " -->" and the comma-joined matches. The node's "children",
    if any, are walked with the same filters. Nothing is returned.
    """
    if isinstance(json_obj, list):
        for entry in json_obj:
            find_attribute(entry, filters)
        return

    if "attributes" in json_obj:
        attrs = json_obj['attributes']
        matches = [f"{name}={attrs[name]}" for name in filters if name in attrs]
        if matches:
            print(f"{json_obj['value']} -->{','.join(matches)}")

    if "children" in json_obj:
        for sub_node in json_obj['children']:
            find_attribute(sub_node, filters)


In [None]:
# Search for Strong's numbers and lemmas.
# The attribute is named "strong" (singular) on the USFM \w marker.
find_attribute(usfm_json, filters=["strong", "lemma"])

In [None]:
# Preview the first lines of a larger USFM file (a local download; adjust the path for your machine).
!head ~/Downloads/eBible_engBBE_2020-04-17_RUT_usfm.txt


In [None]:
# Read the larger USFM file; adjust the path to wherever the file is on your machine.
# "~" is expanded so the location matches the `!head ~/Downloads/...` preview cell,
# and read_text() closes the file as soon as it has been read.
from pathlib import Path
complex_usfm = Path("~/Downloads/eBible_engBBE_2020-04-17_RUT_usfm.txt").expanduser().read_text(encoding='utf-8')

In [None]:
# Parse the larger file and convert it to the dict/JSON structure.
second_parser = USFMParser(complex_usfm)
complex_json = second_parser.to_dict()

In [None]:
# Print the "strong" and "lemma" attributes found in the larger file.
find_attribute(complex_json, filters=['strong', 'lemma'])

In [None]:
# Print the "verseText" nodes of the larger file.
scan_json(complex_json, filters=['verseText'])

In [None]:
# Re-construct a simple USFM
def make_usfm(json_obj, filters=('book', 'chapter', 'verse', 'verseText')):
    """Print a simplified USFM rendering of the dict/JSON structure.

    Args:
        json_obj: A node (mapping with a 'cat' key) or a list of such nodes.
        filters: Categories to emit; nodes of other categories are skipped,
            but their children are still visited.

    For each matching node the marker ("\\" + its "tag") is printed followed
    by a space, then its "value" followed by a newline, when those keys are
    present. Nothing is returned.
    """
    if isinstance(json_obj, list):
        for obj in json_obj:
            make_usfm(obj, filters)
        return
    if json_obj['cat'] in filters:
        if "tag" in json_obj:
            print("\\" + json_obj['tag'], end=" ")
        if "value" in json_obj:
            print(json_obj["value"])
    if "children" in json_obj:
        for child in json_obj['children']:
            # Hand the caller's filters down so they apply at every depth,
            # not only to the top-level node.
            make_usfm(child, filters)

In [None]:
# Print a simplified USFM for the sample, using the default filters.
make_usfm(usfm_json)

In [None]:
# Print a simplified USFM for the larger file, using the default filters.
make_usfm(complex_json)

### Filtering for specific contents

In [None]:
# Baseline: to_dict() called without any filter argument.
my_parser.to_dict()

In [None]:
# Pass a list of Filter members to to_dict(); here only Filter.SCRIPTURE_TEXT.
my_parser.to_dict([Filter.SCRIPTURE_TEXT])

In [None]:
# The same call with Filter.NOTES instead.
my_parser.to_dict([Filter.NOTES])

In [None]:
# Several Filter members can be combined in one list.
my_parser.to_dict([Filter.SCRIPTURE_TEXT, Filter.PARAGRAPHS, Filter.TITLES])

### Converting to other formats - list, table, USX

In [None]:
# Convert the parsed USFM to a list of rows (printed tab-separated in the next cell).
list_output = my_parser.to_list()

In [None]:
# One row per line, cells separated by tabs.
print(*("\t".join(cells) for cells in list_output), sep="\n")

In [None]:
# Rows produced with the MILESTONES and NOTES filters, shown tab-separated.
table_output = my_parser.to_list([Filter.MILESTONES, Filter.NOTES])
print(*("\t".join(cells) for cells in table_output), sep="\n")


In [None]:
# Rows produced with the SCRIPTURE_TEXT filter, shown tab-separated.
table_output = my_parser.to_list([Filter.SCRIPTURE_TEXT])
print(*("\t".join(cells) for cells in table_output), sep="\n")


In [None]:
# Convert to USX and print it as indented XML text.
from lxml import etree
usx_elem = my_parser.to_usx()
# encoding="unicode" asks tostring() for a str instead of bytes.
usx_str = etree.tostring(usx_elem, encoding="unicode", pretty_print=True) 
print(usx_str)

### Work with the syntax tree itself

In [None]:
# Access the parser's syntax tree object and print its `children`.
my_st = my_parser.syntax_tree
print(my_st.children)

In [None]:
# To just view the syntax tree, print what to_syntax_tree() returns
print(my_parser.to_syntax_tree())