# Get the Base Information from a Gene File

In [1]:
import os
import xmltodict

In [2]:
def extractBases(xfile):
    """
    Extract the essential information from an XML gene file.

    Reads the han character, the bases, and the start/stop base of every
    stroke (arranged by stroke order) from the file at ``xfile``.

    Parameters
    ----------
    xfile
        Path of the XML gene file to read.

    Returns
    -------
    tuple
        ``(han_char, bases, ordered_stroke_list)`` where ``han_char`` is the
        ``unicode`` attribute of the ``hanReference`` element, ``bases`` is
        the content of the ``bases`` element, and ``ordered_stroke_list``
        holds one ``(baseFirst, baseLast)`` pair of ints per stroke, indexed
        so that entry ``k`` is the stroke whose ``correspondsTo`` is ``k + 1``.

    Raises
    ------
    KeyError
        If an expected element or attribute is absent, or if the
        ``correspondsTo`` values do not cover ``1..n`` without gaps.
    """
    # Context manager so the handle is closed even when reading fails.
    # Gene files are assumed to be UTF-8 (the format written by minXml
    # declares that encoding), so do not depend on the platform default.
    with open(xfile, "r", encoding="utf-8") as gene_file:
        xml_data = gene_file.read()

    root = xmltodict.parse(xml_data)
    bases = root["genome"]["bases"]
    han_reference = root["genome"]["genes"]["gene"]["hanReferences"]["hanReference"]

    strokes = han_reference["strokes"]["stroke"]
    # xmltodict yields a single mapping instead of a list when the parent has
    # only one <stroke> child; normalise so one-stroke characters work too.
    if not isinstance(strokes, list):
        strokes = [strokes]

    # Map each stroke's 1-based "correspondsTo" index to its base range.
    ordered_strokes = {
        int(s["@correspondsTo"]): (int(s["@baseFirst"]), int(s["@baseLast"]))
        for s in strokes
    }
    # Re-emit the ranges as a list sorted by that index (1, 2, ..., n).
    ordered_stroke_list = [
        ordered_strokes[index] for index in range(1, len(ordered_strokes) + 1)
    ]
    han_char = han_reference["@unicode"]
    return (han_char, bases, ordered_stroke_list)

def minXml(han_char, bases, stroke_bases, stroke_order):
    """
    Build the minimal Stylus genome XML document as a UTF-8 bytestring.

    han_char     -- value written to the ``unicode`` attribute of ``hanReference``
    bases        -- base sequence placed inside ``<bases>``; its length is used
                    as the gene's ``baseLast``
    stroke_bases -- indexable collection of ``(first, last)`` pairs
    stroke_order -- iterable of indices into ``stroke_bases``; one ``<stroke>``
                    element is emitted per index, in iteration order, with
                    ``correspondsTo`` set to the index plus one

    Returns the complete document as ``bytes``.
    """
    # Assemble the document as text fragments and encode once at the end.
    pieces = [
        "<?xml version='1.0' encoding='UTF-8' ?>\n<genome xmlns='http://biologicinstitute.org/schemas/stylus/1.5'>\n",
        f"<bases>{bases}</bases>\n",
        f"<genes>\n<gene baseFirst='1' baseLast='{len(bases)}'>\n<hanReferences>\n<hanReference unicode='{han_char}'>\n<strokes>\n",
    ]
    for idx in stroke_order:
        start, stop = stroke_bases[idx]
        pieces.append(
            f"<stroke baseFirst='{start}' baseLast='{stop}' correspondsTo='{idx+1}' />\n"
        )
    pieces.append("</strokes>\n</hanReference>\n</hanReferences>\n</gene>\n</genes>\n</genome>")
    return "".join(pieces).encode("UTF-8")
    

In [3]:
# Read the han character, the bases, and the per-stroke (first, last) base
# ranges from a sample gene file; the next cell reuses these names.
char, bases, strokes = extractBases("./Genes/9E7C.02L.gene")

In [4]:
# Rebuild the minimal XML with the strokes in their stored order
# (indices 0..n-1) and print it as decoded text.
print(minXml(char, bases, strokes, range(len(strokes))).decode())

<?xml version='1.0' encoding='UTF-8' ?>
<genome xmlns='http://biologicinstitute.org/schemas/stylus/1.5'>
<bases>ATGCAGCACCATCAACAAGTCGGGACTGCTACCGGCCCGTTGTTCTTTGGTAGCAGTGTTGGACTGGTAGGCGTTATCATCATAATAATAATCGTGGTGCTCCTCGAACTTCAATCACAGTCTCCACCACCGTCACCGTCACCTCCGGCTAGCAGCGTAGGACGGGCGGGCGGGGGAGGTGGAAGAGGCGGTGTAAGCCGCGTCAGCAGTGTAGGCGGGGTCCAGGTACGTGGAGTAGGTAGAGGGGGGAGTGGGGTACTAGATCTACTCTATTACCAACAACAGCGCGAAGCTGTTCACGCCGGTCCTCCCCCACCGCTGGTTCCGGATCATGAACAGAATAACAACGAGCAGAATGACGTTCAAACTCTACCTGCGTCACAAATCCCATGTCTTCTATGTCGGCGGCGGCGCCGTCGGTGGCGGCGACATCACATCCCACAGCCACAAGTAACCGAGACTCTTCCTCCTCCACCACGAGCTAAATATGTCCCATATCTACCCCCTAGTGGGGGTAGAGGGGGGGGCAGTAGAGGAGCGGAGTACCTGCTGTTCCTTCTGTTATTGTTCTTCCTCCTTTTGTTATTACAGCTCGCGCCCACAGCCGCGACTGCAGCCACAACTCAATACCAGTACTATTACCCTTATTATCAACAGGTGCAGGTGCAGGTTGTTGTGGTGGTCTTAGTTTTGCCGCTACCCGATGCTCCCGCACCTCCGCCACTCCCCGCGGAGCAAGTCATTATCCTTCTAATGCTGCCGTTAGCTGGTGGTGGCAGCGTGCCCTTTTTTTTACCCGGTACAGCTACAACATACCAATATCGGTATTATCGGGATGCGTCCTCTCCATCATAG</bases>
<genes>
<gene baseFirst=