# Run exaustive search for stroke arrangement using stylusengine to check scores and evaluate results

In [1]:
import os
import re
import sys

import matplotlib.pyplot as plt
import numpy as np

from itertools import permutations
from math import factorial

from xmlparse import loadRef, loadGeometryBases, minXml
from score_strokes import alignStrokes

In [2]:
#sys.path.append('/Users/douglasa6/Documents/stylus-master/src')

import stylusengine


stylusengine.setLogFile(b'errors.log')

stylusengine.setScope(
    b'file:///home/tulip/Documents/College/Stewart/stylusapp/hans',
    b'file:///home/tulip/Documents/College/Stewart/stylus/schemas'
)


2023-03-31T21:16:45.390510Z [INFO ] Stylus initialized - Stylus 1.5.0 [RELEASE - Feb 20 2023 13:35:24] (c) 2006-2009 Biologic Institute


0

In [3]:
def getXmlScore(xml_bstring):
    stylusengine.setGenome(xml_bstring, b"")
    g = stylusengine.getGenome([b"all"]).decode()
    print(g)
    score = float(
            re.search(r"score='([e\d.+-]+)'", g).group(1)
        )
    return score


In [4]:
f_read = [f"56DB.08.{i}.gene" for i in range(1, 101)]

han_char = "56DB"

ref_g, ref_l, output_size = loadRef(han_char, "Reference")
g_data, _, base_data, stroke_sets, stroke_orders, _ = loadGeometryBases("Genes//maint_0.001 on 56DB.08/", han_char, output_size, f_read = f_read)
"""
bad_data = []
for i in range(len(g_data)):
    if len(ref_g) is not len(g_data[i][0]):
        bad_data.append(i)
for i in bad_data[::-1]:
    g_data.pop(i)
    base_data.pop(i)
    stroke_sets.pop(i)
    stroke_orders.pop(i)
"""

good_characters = []
for i in range(100):
    if len(g_data[i][0]) == 5:
        print(f"{i} is good")
        good_characters.append(i)
        
character_num = good_characters[2]
print(f"Testing character {character_num}")

g, l = g_data[character_num]
bases = base_data[character_num]
stroke_set = stroke_sets[character_num]
stroke_order = stroke_orders[character_num]


7 is good
14 is good
17 is good
21 is good
22 is good
24 is good
28 is good
33 is good
40 is good
46 is good
73 is good
81 is good
Testing character 17


In [5]:
# reference-gene alignments are flipped along the index and value - reversing it
heuristic_alignments_flipped = alignStrokes(g, ref_g, l, ref_l)
heuristic_alignments = np.zeros(len(heuristic_alignments_flipped), dtype=int)
heuristic_alignments[heuristic_alignments_flipped] = np.array([range(len(heuristic_alignments_flipped))])+1

print(heuristic_alignments, stroke_order)

[1 5 2 4 3] [1 2 5 3 4]


In [6]:
heuristic_xml = minXml(han_char, bases, stroke_set, heuristic_alignments)
original_xml = minXml(han_char, bases, stroke_set, stroke_order)
heuristic_score = getXmlScore(heuristic_xml)
original_score = getXmlScore(original_xml)

<?xml version='1.0' encoding='UTF-8' ?>
<genome xmlns='http://biologicinstitute.org/schemas/stylus/1.5'>
<seed processorID='4AFBBE2E-00CD-40B6-8A81-816F51B31EF6'>kaENEkEu7ANv9yVUoX2RekGu4AzovzN1KFZY13xIxEY+/WK44z3KAoFFEGEQbJd5oL/uWMyQ/cJixV//0b7dIjlf3g8+qR9c4t0qFVROlQF5Iy2oYwwe3kKPnn2YwIqIvVDQGCmIs2SPCjhe7fjMAiHtEGScjhqcCCdecf6zfgARpyndzHzLWBPIiqtxxBmEdDuhxyQE3S/Xh5pyXZfaFD1JPFRaFyrJj8J6Mvr7ABO4lNlgGT0F8idWSy6vFtW7hH31IRaRiQhEEzWylY/9AVqzwN009fDjDvQ0jJJgyDTxKcpDdBQADVMrYZGOZHZWN7VKZAuaAVJVMLhRf40xfVwDm/VdjmaL8hrEm3u4hbcxJV2YMlC9Db/O0mg1e/HvMRRPgzwNwmq7HiDLiwueXtsOcGcrEep93XBHQ55cK+SNdVgTWl6Gc1uufU9TbgWBVltAjL2XzcR4abHFw3ktDT542hS4bx/I3PMe/+MYxmPupjwfP8AmPQkIPZwnzDqMM12GkWHw0B/ZhxGQ4aYrBCv7c+mLDVr081BETl5yKBnqPSsgPloBmU26ao6JxAFoRvRxLyAbHDNnC38+CODGZidSjPmMIYE5CxDDRWzVi/p+waPjhPNUH+csBEc5kwi8qQNF0xkGib9bCOsAebuJK6OfbLnTCAH9QZGd1YmapkGgYW6QehZ6qaN8+UaPjtNjAy7Ll7gqK8VTW95NfxsFTR4Mep1bf+VFdVUJJctmM8LxT3hM0SlGHzMVvytVOity9ZmIf6YXwBiTRWn/Yf8Mq4qxHTp5OttjEGJ4zZzCNYhEztYxbav95YD5b6YQBIzvrpRKbR

In [7]:
def saveExhaustive(ref_char, han_char, f_read, data_dir="HanBitmap", exhaust_dir = "Exhaustive"):
    ref_g, ref_l, output_size = loadRef(ref_char, "Reference")
    g_data, _, base_data, stroke_sets, _, f_names = loadGeometryBases(data_dir, han_char, output_size, f_read = f_read)
    print(f"Pruning bad data from {len(f_read)} files...")
    bad_data = []
    for i in range(len(g_data)):
        if len(ref_g) != len(g_data[i][0]) or len(ref_g) != len(g_data[i][1]):
            bad_data.append(i)
    for i in bad_data[::-1]:
        g_data.pop(i)
        base_data.pop(i)
        stroke_sets.pop(i)
    print(f"Pruning finished, dropped {len(f_read)-len(g_data)}/{len(f_read)} bad samples")
    for i in range(len(g_data)):
        print(f"Generating exhaustive scores for sample {f_read[i]}")
        g, l = g_data[i]
        bases = base_data[i]
        stroke_set = stroke_sets[i]
        exhaustive_alignments = permutations(range(1, len(l)+1))
        exhaustive_scores = np.zeros(factorial(len(l)))
        for j, p in enumerate(exhaustive_alignments):
            p_xml = minXml(ref_char, bases, stroke_set, p)
            exhaustive_scores[j] = getXmlScore(p_xml)
            if j%10000 == 0:
                print(f"Scoring permutation {j} of {len(exhaustive_scores)}")
        f_name_cleaned = f_names[i].replace("/", "_")
        print(f"Wrote exhaustive scores to {exhaust_dir}/exhaust_{ref_char}_{han_char}_{f_name_cleaned}.npy")
        np.save(f"{exhaust_dir}/exhaust_{ref_char}_{han_char}_{f_name_cleaned}.npy", exhaustive_scores)


In [8]:
stylusengine.setGenome(heuristic_xml, b"")
g = stylusengine.getGenome([b"all"]).decode()
score = float(
            re.search(r"score='([e\d.+-]+)'", g).group(1)
        )

In [9]:
score, original_score

(2.348968756405016e-05, 0.001053491840388537)

In [10]:
f_read = ["56DB.08.15.gene"]
ref_char = "56DB"
han_char = "56DB"
data_dir = "Genes//maint_0.001 on 56DB.08/"

In [11]:
saveExhaustive(ref_char, han_char, f_read, data_dir)

Pruning bad data from 1 files...
Pruning finished, dropped 0/1 bad samples
Generating exhaustive scores for sample 56DB.08.15.gene
<?xml version='1.0' encoding='UTF-8' ?>
<genome xmlns='http://biologicinstitute.org/schemas/stylus/1.5'>
<seed processorID='4AFBBE2E-00CD-40B6-8A81-816F51B31EF6'>kaENEkEu7ANv9yVUoX2RekGu4AzovzN1KFZY13xIxEY+/WK44z3KAoFFEGEQbJd5oL/uWMyQ/cJixV//0b7dIjlf3g8+qR9c4t0qFVROlQF5Iy2oYwwe3kKPnn2YwIqIvVDQGCmIs2SPCjhe7fjMAiHtEGScjhqcCCdecf6zfgARpyndzHzLWBPIiqtxxBmEdDuhxyQE3S/Xh5pyXZfaFD1JPFRaFyrJj8J6Mvr7ABO4lNlgGT0F8idWSy6vFtW7hH31IRaRiQhEEzWylY/9AVqzwN009fDjDvQ0jJJgyDTxKcpDdBQADVMrYZGOZHZWN7VKZAuaAVJVMLhRf40xfVwDm/VdjmaL8hrEm3u4hbcxJV2YMlC9Db/O0mg1e/HvMRRPgzwNwmq7HiDLiwueXtsOcGcrEep93XBHQ55cK+SNdVgTWl6Gc1uufU9TbgWBVltAjL2XzcR4abHFw3ktDT542hS4bx/I3PMe/+MYxmPupjwfP8AmPQkIPZwnzDqMM12GkWHw0B/ZhxGQ4aYrBCv7c+mLDVr081BETl5yKBnqPSsgPloBmU26ao6JxAFoRvRxLyAbHDNnC38+CODGZidSjPmMIYE5CxDDRWzVi/p+waPjhPNUH+csBEc5kwi8qQNF0xkGib9bCOsAebuJK6OfbLnTCAH9QZGd1YmapkGgYW6QehZ6qaN8+UaPjtNjAy7