# Get the Base Information from a Gene File

In [1]:
import os
import re
import sys
import xmltodict

In [2]:
# Add a local Stylus source checkout to the module search path before the
# import below. NOTE(review): presumably this is where `stylusengine` lives;
# the path is machine-specific and must be adjusted on other systems.
sys.path.append('/Users/douglasa6/Documents/stylus-master/src')

import stylusengine

# Hand the engine a log file name (as bytes); presumably engine diagnostics
# are written to `errors.log` in the working directory — confirm.
stylusengine.setLogFile(b'errors.log')
# Configure the engine with two file:// URLs inside the installed Stylus.app
# bundle: the `hans` directory and the `stylus/schemas` directory.
# NOTE(review): the meaning of each argument is inferred from the URL paths;
# confirm against the Stylus API.
stylusengine.setScope(
    b'file:///Applications/Stylus.app/Contents/Resources/hans',
    b'file:///Applications/Stylus.app/Contents/Resources/stylus/schemas'
)

2022-06-28T22:27:26.854024Z [INFO ] Stylus initialized - Stylus 1.5.0 [RELEASE - Jun 28 2022 11:17:28] (c) 2006-2009 Biologic Institute


0

In [3]:
def extractBases(xfile):
    """
    Extract essential information including han character, bases, stroke base start/stop points, and stroke order from an XML gene file

    Args:
        xfile: Path of the XML gene file to read.

    Returns:
        A tuple ``(han_char, bases, stroke_series, stroke_order)``:
        the ``unicode`` attribute of the han reference, the content of the
        ``<bases>`` element, a list of ``(baseFirst, baseLast)`` integer
        pairs (one per stroke, in document order), and the list of integer
        ``correspondsTo`` values in the same order.

    Raises:
        KeyError: If the document lacks one of the expected elements or
            attributes.
    """
    # Context manager so the file handle is closed as soon as it is read,
    # instead of being left for the garbage collector.
    with open(xfile, "r") as gene_file:
        xml_data = gene_file.read()
    root = xmltodict.parse(xml_data)
    bases = root["genome"]["bases"]
    han_reference = root["genome"]["genes"]["gene"]["hanReferences"]["hanReference"]
    strokes = han_reference["strokes"]["stroke"]
    # xmltodict yields a single mapping (not a list) when an element occurs
    # exactly once; normalise so a one-stroke gene is handled like any other.
    if isinstance(strokes, dict):
        strokes = [strokes]
    stroke_series = [(int(s["@baseFirst"]), int(s["@baseLast"])) for s in strokes]
    stroke_order = [int(s["@correspondsTo"]) for s in strokes]
    han_char = han_reference["@unicode"]
    return (han_char, bases, stroke_series, stroke_order)

def minXml(han_char, bases, stroke_bases, stroke_order):
    """
    Serialize the minimal Stylus genome description (han character, bases,
    per-stroke base ranges and stroke order) as a UTF-8 encoded XML bytestring

    ``stroke_bases`` is a sequence of ``(first, last)`` pairs; the i-th pair is
    written together with ``stroke_order[i]`` as its ``correspondsTo`` value.
    """
    # Everything up to and including the opening <strokes> tag.
    opening = (
        "<?xml version='1.0' encoding='UTF-8' ?>\n"
        "<genome xmlns='http://biologicinstitute.org/schemas/stylus/1.5'>\n"
        f"<bases>{bases}</bases>\n"
        f"<genes>\n<gene baseFirst='1' baseLast='{len(bases)}'>\n"
        f"<hanReferences>\n<hanReference unicode='{han_char}'>\n<strokes>\n"
    )
    # One self-closing <stroke> element per base range, in input order.
    stroke_tags = [
        f"<stroke baseFirst='{first}' baseLast='{last}' correspondsTo='{stroke_order[idx]}' />\n"
        for idx, (first, last) in enumerate(stroke_bases)
    ]
    closing = "</strokes>\n</hanReference>\n</hanReferences>\n</gene>\n</genes>\n</genome>"
    # Assemble as text and encode once at the end.
    return "".join([opening, *stroke_tags, closing]).encode("UTF-8")
    

In [4]:
# Parse one sample gene file and unpack the tuple returned by extractBases:
# han character code, base sequence, per-stroke (first, last) base ranges,
# and the stroke order.
char, bases, strokes, order = extractBases("./Genes/testing set/maint_0.2 on 5EA6.01/5EA6.01.1.gene")

In [5]:
# Rebuild a minimal genome XML bytestring from the extracted pieces and print
# it (decoded to text) for visual inspection.
x = minXml(char, bases, strokes, order)
print(x.decode())

<?xml version='1.0' encoding='UTF-8' ?>
<genome xmlns='http://biologicinstitute.org/schemas/stylus/1.5'>
<bases>ATGCACTATCCTTGTTCAAGTGGTGGTGTCGAAGTATTTTTACAGTGTAGAGCGCCCACTACGACTACCCGGTCCCGATGTGCTCGGTTGGAAAGGTCAATTAGTGTATTCAGCAAACCACCGTTACCCGACACTATAGAAGGTCCACACATAGGTGGCAGCGGTAGGAGCAGGACGAAGACGTGTCACTATCACTACTACCAGAAGGACTCAGGTAAAGATACCGTAGAATCCGCAATTGCACTAGCTTTAACGTTGATCGCGGTGGCCAGGGAATGGAGACGCAAGTGGCACTGTACGGTAAGAATCCTGTCCTGGAAGCTCTATTTTGATCGGACCTATCGACAGCCCTCACCATCCTCTCATTCAGTTTCGGTTGTTTGA</bases>
<genes>
<gene baseFirst='1' baseLast='384'>
<hanReferences>
<hanReference unicode='5EA6'>
<strokes>
<stroke baseFirst='4' baseLast='12' correspondsTo='6' />
<stroke baseFirst='19' baseLast='27' correspondsTo='7' />
<stroke baseFirst='34' baseLast='42' correspondsTo='5' />
<stroke baseFirst='52' baseLast='69' correspondsTo='4' />
<stroke baseFirst='115' baseLast='123' correspondsTo='1' />
<stroke baseFirst='154' baseLast='174' correspondsTo='2' />
<stroke baseFirst='187' baseLast='222' corre

In [6]:
# Load the minimal genome XML into the engine, then read the genome back as
# text. NOTE(review): the second setGenome argument (b"") and the [b"all"]
# selector are passed through unchanged; their meaning is defined by the
# Stylus API — confirm there.
stylusengine.setGenome(x, b"")
g = stylusengine.getGenome([b"all"]).decode()
# Pull the first score='...' attribute out of the returned XML text. The
# search is run once and its match object reused.
s = re.search(r"score='([e\d.+-]+)'", g)
if s is None:
    # Fail with a clear message instead of an AttributeError on None.
    raise ValueError("no score attribute found in the genome returned by Stylus")
score = float(s.group(1))
score

0.2020960507841868