# Small Parsimony in rooted tree

```R
SmallParsimony(T, Character)
    for each node v in tree T
        Tag(v) ← 0
        if v is a leaf
            Tag(v) ← 1
            for each symbol k in the alphabet
                if Character(v) = k
                    sk(v) ← 0
                else
                    sk(v) ← ∞
    while there exist ripe nodes in T
        v ← a ripe node in T
        Tag(v) ← 1
        for each symbol k in the alphabet
            sk(v) ← minimum over all symbols i {si(Daughter(v)) + αi,k} + minimum over all symbols j {sj(Son(v)) + αj,k}
    return minimum over all symbols k {sk(v)}
```

In [1]:
import numpy as np

In [2]:
def MkEdge(node1,node2):
    """Build the directed-edge label ``"<node1>-><node2>"``."""
    return "{}->{}".format(node1, node2)
def RvEdge(edge):
    """Return the edge label with its two endpoints swapped.

    ``edge`` must split into exactly two parts on ``"->"``; otherwise the
    tuple unpacking raises ``ValueError``.
    """
    source, target = edge.split("->")
    return "->".join((target, source))

In [94]:
def HammingDistance(p,q):
    """Count the positions at which ``p`` and ``q`` hold different items.

    The comparison is pairwise via ``zip``, so any surplus tail of the
    longer input is ignored.
    """
    return sum(1 for left, right in zip(p, q) if left != right)
def IsAlphabet(string):
    """Tell whether ``string`` is a symbolic label rather than an integer id.

    Returns ``True`` when ``int(string)`` cannot parse the value and
    ``False`` when it can (previously the numeric case fell through and
    returned ``None``).
    """
    try:
        int(string)
    except (TypeError, ValueError):
        # Only conversion failures mean "not a number"; anything else
        # (e.g. KeyboardInterrupt) must not be swallowed.
        return True
    return False
    
def ParsingInputW3(ip, i = None):
    """Parse a rooted-tree edge list into the tree dictionary used below.

    ``ip`` is a multi-line string.  Its first line is a header that is
    discarded; every following line has the form ``parent->child``.  A child
    that does not parse as an integer (see ``IsAlphabet``) is a leaf label:
    it is given a fresh id ``"0"``, ``"1"``, ... in order of appearance.

    Parameters
    ----------
    ip : str
        The edge list.
    i : int or None
        When given, every leaf label is cut down to its first ``i``
        characters.

    Returns
    -------
    dict
        ``"nodes"``          node id -> list of child ids (``[]`` for leaves),
                             ordered by integer id;
        ``"characters"``     node id -> label (``""`` for internal nodes),
                             ordered by integer id;
        ``"leaves"``         leaf id -> one-element list holding its label;
        ``"adjacency_list"`` empty dict, filled in later.
    """
    T = {
        "nodes": {},
        "characters": {},
        "adjacency_list": {},
        "leaves": {},
    }
    lines = ip.strip().split("\n")
    lines.pop(0)  # header line; its value is not used by the parser

    for edge in lines:
        node1, node2 = edge.split("->")

        if IsAlphabet(node2):
            if i is not None:
                node2 = node2[:i]
            # Leaves are renamed to consecutive ids so that all node keys
            # can be sorted numerically below.
            child = str(len(T["leaves"]))
            T["leaves"][child] = [node2]
            T["characters"][child] = node2
            T["nodes"][child] = []
        else:
            child = node2

        if node1 not in T["nodes"]:
            T["nodes"][node1] = [child]
            T["characters"][node1] = ""
        else:
            T["nodes"][node1].append(child)

    def by_numeric_id(item):
        return int(item[0])

    T["nodes"] = dict(sorted(T["nodes"].items(), key=by_numeric_id))
    T["characters"] = dict(sorted(T["characters"].items(), key=by_numeric_id))
    return T

def IsRipe(node_v,T,Tag):
    """Report whether ``node_v`` is ripe.

    A node is ripe when its own tag is 0 and the tags of its children
    (``T["nodes"][node_v]``) add up to exactly 2.
    """
    children = T["nodes"][node_v]
    if Tag[node_v] != 0:
        return False
    tagged_children = sum(Tag[child] for child in children)
    return tagged_children == 2

    
def UntagExists(Tag):
    """Return ``True`` when the tag values add up to less than the number of entries."""
    tagged_total = sum(Tag.values())
    return tagged_total < len(Tag)

def FindRipeList(T,Tag):
    """Collect, in ``Tag`` order, every untagged node that ``IsRipe`` accepts.

    Returns an empty list straight away when ``UntagExists`` reports that
    nothing is left untagged.
    """
    if not UntagExists(Tag):
        return []
    return [
        candidate
        for candidate in Tag
        if Tag[candidate] == 0 and IsRipe(candidate, T, Tag)
    ]


In [488]:
def SmallParsimony(T, Character):
    """Fill the small-parsimony score table for one character position.

    Parameters
    ----------
    T : dict
        Tree dictionary; ``T["nodes"]`` maps each node id to its list of
        children and ``T["leaves"]`` holds the leaf ids.
    Character : dict
        Leaf id -> the single symbol observed at that leaf.

    Returns
    -------
    dict
        ``S[node][k]``: for a leaf, 0 if its symbol is ``k`` and infinity
        otherwise; for an internal node, the sum over its two children of
        ``min_j (S[child][j] + HammingDistance(j, k))``.

    The alphabet is fixed to A, C, G, T.
    """
    alphabet = ["A","C","G","T"]
    Tag = {}
    S = {}
    for node_v in T["nodes"]:
        S[node_v] = {}
        if node_v in T["leaves"]:
            Tag[node_v] = 1
            for k in alphabet:
                S[node_v][k] = 0 if Character[node_v] == k else float("inf")
        else:
            Tag[node_v] = 0

    RipeList = FindRipeList(T,Tag)
    while RipeList:
        # Every node in the current list already has both children scored,
        # and scoring one of them cannot un-ripen another, so the whole
        # batch is processed before the tree is rescanned.
        for node_v in RipeList:
            son,daughter = T["nodes"][node_v]
            Tag[node_v] = 1
            for k in alphabet:
                son_min_value = np.min(
                    [S[son][j] + HammingDistance(j,k) for j in alphabet]
                )
                daughter_min_value = np.min(
                    [S[daughter][i] + HammingDistance(i,k) for i in alphabet]
                )
                S[node_v][k] = son_min_value+daughter_min_value
        RipeList = FindRipeList(T,Tag)

    return S

def BackTrack(T,S):
    """Pick one symbol per node from the score table ``S``, root first.

    The loop relies on the node numbering visible in the code: ids are the
    strings of consecutive integers, ids below ``len(T["leaves"])`` are
    leaves, the largest id is the root, and visiting ids in descending order
    reaches every internal node after its parent has been labelled.

    Parameters
    ----------
    T : dict
        Tree dictionary (``"nodes"``: id -> two children, ``"leaves"``).
    S : dict
        Score table as produced by ``SmallParsimony``.

    Returns
    -------
    tuple
        ``(score, Backtrack)`` where ``score`` is the smallest entry of the
        root's row of ``S`` (0 if the tree has no internal node) and
        ``Backtrack`` maps node id -> chosen symbol.
    """
    alphabet = ["A","C","G","T"]

    def _pick_child_char(child, parent_char):
        # Mismatches are weighted 1.01 instead of 1 so that, when keeping
        # the parent's symbol and switching symbol cost the same, the
        # argmin lands on the parent's symbol.
        weighted = [
            S[child][c] + HammingDistance(c, parent_char)*1.01
            for c in alphabet
        ]
        return alphabet[np.argmin(weighted)]

    Backtrack = {}
    score = 0
    root_index = len(T["nodes"])-1
    for index in range(root_index, len(T["leaves"])-1, -1):
        node = str(index)
        if index == root_index:
            # The root has no parent: take its cheapest symbol outright.
            root_node_char = min(S[node], key=S[node].get)
            score = S[node][root_node_char]
            Backtrack[node] = root_node_char

        son,daughter = T["nodes"][node]
        k = Backtrack[node]
        son_min_char = _pick_child_char(son, k)
        daughter_min_char = _pick_child_char(daughter, k)
        Backtrack[son] = son_min_char
        Backtrack[daughter] = daughter_min_char
    return score,Backtrack

def SmallParsimonyString(T):
    """Label every internal node with a full string and record edge lengths.

    Each character position of the leaf strings is solved independently
    with ``SmallParsimony`` + ``BackTrack``; the symbol chosen for every
    internal node is appended to ``T["characters"][node]`` and the
    per-position scores are summed into ``T["P_score"]``.  Afterwards each
    parent/child pair is written to ``T["adjacency_list"]`` in both
    directions, keyed by the ``"label->label"`` string, with the Hamming
    distance of the two labels as value.

    ``T`` is modified in place and also returned.  The string length is
    taken from leaf ``"0"``.
    """
    sequence_length = len(T["leaves"]["0"][0])
    total_score = 0

    for column in range(sequence_length):
        Character = {
            leaf: labels[0][column] for leaf, labels in T["leaves"].items()
        }
        column_score, chosen = BackTrack(T, SmallParsimony(T, Character))
        for node in T["nodes"]:
            if node in T["leaves"]:
                continue
            T["characters"][node] += chosen[node]
        total_score += column_score
    T["P_score"] = total_score

    for parent, children in T["nodes"].items():
        if parent in T["leaves"]:
            continue
        for child in children:
            parent_label = T["characters"][parent]
            child_label = T["characters"][child]
            forward = MkEdge(parent_label, child_label)
            backward = RvEdge(forward)
            distance = HammingDistance(parent_label, child_label)
            T["adjacency_list"][forward] = distance
            T["adjacency_list"][backward] = distance

    return T

def TestTree(T):
    """Print whether half the summed edge lengths equals ``T["P_score"]``.

    Prints ``PASSED TEST`` when ``int(total / 2)`` matches the stored score
    and ``DID NOT PASS`` otherwise; nothing is returned.
    """
    total_length = sum(T["adjacency_list"].values())
    matches = int(total_length / 2) == T["P_score"]
    print("PASSED TEST" if matches else "DID NOT PASS")

In [489]:
# Small rooted example: a header line (discarded by ParsingInputW3) followed
# by parent->child edges; children that are not integers are leaf characters.
ip = """
8
8->C
8->C
9->A
9->C
10->G
10->G
11->T
11->C
12->8
12->9
13->10
13->11
14->12
14->13
"""
# Parse, label the internal nodes, then check the score against the edges.
T = ParsingInputW3(ip)
T = SmallParsimonyString(T)
TestTree(T)



PASSED TEST


In [490]:
# Print the total parsimony score, then every labelled edge with its length.
print(int(T["P_score"]))
for k,v in T["adjacency_list"].items():
    print(f"{k}:{v}")

3
C->C:0
C->A:1
A->C:1
G->G:0
C->T:1
T->C:1
C->G:1
G->C:1


In [469]:
ip = """
128
128->CTGTGCCGTTTCATTCCTTCAGTGAGGTCTCTTCACATAACTCGTCTCCTGGAGCGTCTCTGGGCTGGCAGATATGATCCGTGTGCTCACCCATTCTTACTCGCACAGAGGAACGCGTGGATTAGGTATGCGTTTGAACGTCAACCCGTTCGATATCGGATTGGTGCCAAGCTGAGGAGC
128->TCTAGCTTCAGGATCAGTCTGAAGTTGGGAGAAAGGTGCCCTGCGATGACGTCATACGTCTCCCCCATACGATACCTGATCGATACACCCCGTTCGGCAAGACGCTTGCCGAATAAGAGGGTTAAGCTTCTTCCGCTGCCGCGCCTCGCAGCGAATTACCGGGGCAGTAAACACGCTCCT
129->TTGGCAAGGGACATTAACCTCTAGAAGCTTGAAAAATGGGGCCCGAGCAATGCCAAGCGTTCCGTAAAGAGCATCTGCAGGCCACTTCGTCAAGCGTAGCAAAGGCTCAGGAGGAGGCGCCCACATGACGGGCATCTCCGGCATCGGCTTAAACACGGTCCATATGCTTCACGTCAGTCT
129->CTACTCAACTTCAGGTCCTTCGGTGTGATTCCCCAAAATTGCTGCGTGTTATATCAACAGTCCAAGCGACCGTCGACACGGCTACAAATATTTGTAGAATAAGATCGGGGTAAAGTTAAGCTCGCTATCTATAACGGTCCTAACTGTCGGGACTCAAATCATATACCCGCCCACCGCTAG
130->GAAGGCTATTAGATACTCGTCTCGACTACTTGCCGGACACTCGATAGAAGAGAACCTGAGGTACCAAATCGGACCTCTAATTCCTACTGTCATCCCTACTGGCGGTTTCTCCGACCCGACCTGGGGTTCAGGCATTTAACGGGAACTGCCGAACCCGAGCTGTTTTACGAAGGGCGTTAA
130->CCTGCAACTGACGCTATCAAGCTTTTGACCCTGAGGAGGTGGCCGGGACCGAATGTTATTGCGGGTGGCGTACCAAATCCATTTCATCCGGTCGCCAGCCACATTCTCCTGATGCAGAGAACCCCATGTGAGTTGAGATTCGTTTATACAAACGGCATCCCCTGATAGGCTCCAGAGCCC
131->GTAGAGATACGAGAATGCATTAATTGAATAACGGGGCGGCAGAGTTCACGGGGTATGCGATGTCTCCATATCCTTGGGCAGTGATCTGATCCAGTTTAAGGCGCGTTGGGAACCCGTTTCCAGGCATTCAATCCCACCGAGGATCCCCTGAGAGGGGACAACGGCCTTGTCAGATACGCG
131->AGTGTAGGGTGCCACGCGTGTTCGAGCACAATTGACTCGCATGATACAATCGCGCAGTGCTCGGAAGGTCAGTGTGCATTGTAGGCACGGCTAGGGCACGATTTCTACCAGGTCGTGCTCGCAAGTGAGCCGGAATCGAAATGGCGAGCTTATAGACCCCCACGATGAGTATACGTAGAG
132->TAGTTTGGTTGAAGCCGACTCGGGGTAACAAGTTGCGTCGCAAACAAGAGGACCCCAGCGACGGGAGAGTTCGTCTAGAGCTAGGTACAGAATACCAGACCGACGTTCATCGAATTGCTGGGTAGATCACAACGGTCCTTAGGGAGCCATTCAAACAAGTAGGACCACTAGTCTGGGGAT
132->ACGTATCTGGCTCGGCAGCCTATACTGCTCCGGCATAATTTTGTACATCCTGAAGTATTTCGAATAAATGGGAATACTAGCTGGCATAGATGCTGCTCCTCAAACAAATGACGAATACAGAAGGTACTCAACAGCTGTTCATGAGATTTGGTAGTGTACATCGGCTTGGGCATTCGTGCG
133->GTCCGCAGGCGTAGTAGACCCCGGGCCAACCAGCTAACGGAAAACCCACGTGACGCGTCTGCATTCGAAGCCATATGTTCGAATACCCCACCCAACCTCCAAGAACTGATCAGCCCCTTTCCTATAACCGTGTAGTCCTAACATTGATGGCGGTTGGGAGACTGCTATTCCATCTACTTT
133->CCAAGGAAATAGGATGAGTAATTTTAAATCGCATTTAAGGGCGCATGGTAGCAATGGTGTGTCGGGTACACCGAGGCCCTACCCGAATACACACACTAAAAAAAAGATTTTGTCACTAGCTGACGAACACCATTAGGTGTTGGCAAGACGAATGTGATACCATTTGAGCATATGAAAAAT
134->GAGGAACCGGACCGCCATTCTTCGCTCCGAAGGATGGGCTGAAACCATAGCCGAGTTTCTCCCGTTGGAACCTATTGGTCTAGGAAGAAGGACACTCAATCTGAGCTTAAGGAGTTCGGTAACGACTAGAGAAACGACGGATGTATTCTCGTATTTAAATTATGTGCTGCTGTGAATCGC
134->GAAAAGCGTGTATAAATACTGTAGGGAATGCTTGCGACATGGGCCATGCCCGAGTTTCAGTACTCTCACATCCTTCAACAGTGGCGATACAGTCCAAAAGCTGCGGTATTTAATACGGTTTTGAGGGCACTGCATATCCACGATTCAGCCAAAGTGGGGTCAAAATAGCAAATTTCACGC
135->CTCTCTGATTTAAGTAGACTGGTCTCTGTGTATGAGTTGGCTTACAGGCAACTCACGCAACAACGGATTGATGACGATCTACGCACTGCGCCGAGGCCAACGTCCTGGAAAATCTACGTTACCGGGGACAAATGGTGGCATGGTAGGGAAGTCCTTACCCGGCTCCATGACGGGCCAGTC
135->TAAATGTGGAGCTAGATCAATAAATGGCGTGATGGGCCTAAGCTCTTCTGATACTCCATTCATTTATTCGGGACCCTCGAGGCGCCGTGGCCTCCGGCGGGGGCAGCGCCAATTCACGCTCCCACACAAATTGAACGCAGGCCTTACAGTATTTTAATCACATGCCTGGCGAATGCCTCC
136->GGTAATGGCAGGTGTATGTTTCATGCGTTCACTAAATGATTCGAAGAACGTTGGACGATTCAGCTCAACCCGCAGCCCACATATAATTGGTCAACAGCACGCAGGCTGGATATATTATACTCAGGAAAAAATTATTCTGGCTACATTCTTCTGAGGGACAGTAACCTAAGGAGGACATTA
136->TCGTCGGTTCTCGGGCATTCTTCCACTATTGATACCATAGACTGGGAAACTAAGAGGCGTGCTACCTTGTTATCTTGTGGATCCTACGCGGATCGGTTATTTGATCCCGTACACTTTTTAGTCATTGGAAGATGGGAGCTGACCGTTGTCAAACGCTCATCACCCGTTCTGAGTCCGTGG
137->CCTATCCGAGACCCTAAGTAGGCGTCGATAGGGCGTCGTAGCCACCGAGTTCTGTAGAGCAGCAGGTGAACGCGTGTCACCATAACACATCCGAGAAATGTCTCCTTCAGATTGGTGCCGAGGTTACTACACAGCATGGAGTTTCTGTATTACAAGAGAGGATCTTCTTTACTCACTTCC
137->CTTCTGTCGTGGTACCAGCGAACGTAACCCAGTCCTGAAAAGGCGTCCCAGTTATTAACGAGTACGAAGTAGGCTGGAAATCTCTATGCGGGAGGCGGGAATGCCGCTAGACATTAAGTTCAAACAACCTTCACTTGTCCTGGAAGCACAACGTGTAAGATTCAGGCCTAGGGCCCTAAT
138->TTATAGTTCACCGGTTCCCCATAGTAGGCGAAAGATCCATTATTACGGGGGAACAATAAGTAAACTTAGTTCTCACTCTAGGGCGAACATGGTCAAAGGACACTATTCCGCCAAGTCAGTACGACACCACTGTAAACGGCGGTTCTTTGGATCGGAGGTGATTGACGCTGGAAGGCCCGA
138->GGCTCACAAGCGTTCCAATGGCCTAAGGTAGTTCGCGGGTCCATACTACACAGGAAAATGGTGTGGATGCGAAATACAGGTTTCGATGGAAGCCTACCTATTATCTAATTGCGACTTTTTAGTATGTGAACCAATATTGCTTTACCCGCTAACTAATTGACGTCCGTCGACATGCTCTTT
139->CTGCGGTTAGTGCAGTGCTCACTGGGCTGCGTTTCGTTCCGATCACGCTCGTATACTGGAAACCACCCGTTGACACGTGCTAACTCCTCACAACTGAGGTTCAAGGAGAGATAGGGTCAAGGTTCGGGTTTCTATAGATGTCTCCATACGGGGAGACATTCCCAGTCGTGCATATGTGTT
139->TGAATCCCGCGATTGGCTGGTCTTCTCGGACCGCGACTGTGTTCGATGTGACGACCGCGTAGGTACATCCCGTGAATACGCCCTTCGTCCCGTAGGTCTGTAATCTATTTTGGATCATCTTATTAGCAACTCTAATTTTACTGTGGGAATTTATTATTTCTGATTGGGGGCGTGGATCTT
140->GACATACAGTAGACTGGCAGTAGATGGATGCGTGGCCCAGCTTAACTCCTTCAGCGAAGATGATATGACAATCCGATGGAACACTAACTTGAAAAGTTGAGCTGGCCGTCGGGACTGTTTAATCCGCCCCACAATACACCGTACCTAGATGCATCGCGGTATGTGGAAAGGGCATAATCG
140->CTGACGGTATTAACACCCACTTTAGGGAGATCCCTCTTACTCGCAGGACGCCATCTAAACGCTTCCACGCGCCGAAGGAAAACGACGTATTGACGCGTGTTACACGCACAACTATAAGGTAGTAATTTCATCCCGAGTTAACGGACGCAGCTCAGTTCGGGTATTTCTATTTTATTTAAG
141->AGGTTGAGGAAGCGAGTTGACTTGCGATATTGAGTCTCGACGTCCGAGCTCTAGTACCATTCGACTTAGTTATACGGTTACCTAGTGTAACTTACCAACTAATTATGTCTTGCAATACGGCCGGAGCCCGAAGGAAACACTAGTGATATGTAATGAATTGTTTTCGCTTAAGTATAATCG
141->TCTGGAGGGTAGTGTGAAGGTTCGATACGATTTCCACATACATCCATGTAGTCGTGCACCCGCGCACGAGGGCCTACTCACAAATACTACTGCTACACACGAAAATCCAAGCGCGAGGGAGTTCGCTGCCCCGTTAAAGACAAATCTATACATCCTAAGGCGTAATCTCTCGTCCGTCAT
142->GGAATGTCATTTTTGGACACCCTACTGTTCCCATCTAAAAGCATATTTATGAATTCGCAGTGGTGACACGAAATACGCCGGGTTACGAGACGGTAGGACATCATCTCACTGAGAGCTTGCCTACTACCAAAACAGGTCCAGTTCTCAGTAGGCCGCCCCTTCTGAGTAACGCAGTGATGC
142->CTGTGATTCACACCAACAGCGCTCGAAGCTGTCTCAGCCGGCATGTCGAGGCTTTGGTCGTATACGTCAATGTAAACAGTTAACTACGCAGATCCCCAGGTGTGACAACGATATTGGTGACGACGAGCGGGTCCCGGCGCCAAGCGCGAAAGGGGTCCGGGGCTTCGAATGTTCCAGGCA
143->CAAACCTGATGATCAGGCAGATAAGAACTGAGTAGGATGCTAACGGACCTCTTAAGTTGAGTGTCGTATGGTGCCGGTGGGGAAGGCAAGGTATCCCCGAAAGGAGGGTCGCTCTCGTAAGTAGGGGGGTCGGGAAGAGTGTACTGGGGGCCGCGGTTACCTTGACCGAGGCGTTGAAGG
143->TGCTTCGACTGGGCAGACAACTTAGATATCACAAACTTGCGGAACGTGATTTGTTCATTTGCTAGGCGTCGACGAATGCGACTGACCAAACTCGATTATTTGATTCCCGCCCCTATTGCCAATTGGTATGCGATACGCCGCTCATCGGATGCAAATAACGACACCTAGGGTGGGTGTACC
144->CTCCTCACCGCGTTTAAAACCTGCTATACTCCTCTACACAAACCGCCCCTCTCTCAAACGTCAGTGTCGTAGAGGCTTTATTCTGACTCGATGTGCATGACAAGAGCCGCATCTCGAAGTCATGATTTACAATTTTCGGTCTGCATAATATTGGGGAGATTAAAGCCAACTAACTATTGA
144->GGCGCATGCTTGGGCTTGTCGGCGATAAGTGTAAGCCATTGTAATTCACAAGCACTCCGTTCACGATGTATTTACGTCTCTGGTTATGGCACTGTCCCGGCGAGTTGAAGCGATTACAGGCGTTGCGCACAGGACTCTAACGCTAATTATTTGACCCCAATTGTGGTAGCCCATCAGACT
145->TCCTGTACACTTGCGACTCGGCGCCTTGGCGAACCGCTAGGTGGATCACAAGAGTGGCTGGTTTGACAAACGTACACTCCGTCCAGATAAGGGGCTGAGCATAGCTTGTACGGAGATTTCCAAATCTGCGGTTCCTACCGCATTGAGATGGTCTATCGACTGCTCTTCACTAAGTGATAG
145->CACCTGCAAGGACGTATAGAATCTGTCTTCCACAGAGCCCAGTTACGAAAGAGGAACGTTCGCGTGTTACGGGTCGATTCAAATTATTGCAACGAGGTTAGGAGTCGGAAGGATATAACCTGTTTCGGTTGTGCTTGATCGGCCTGCCGCCAATTCTACTTTGCGCCCGTCACACCCAGT
146->GAGAACAGCGGTTCCTTATGACTTAGGTCCGAGGACTGTTCAGGAAGATTAGCGCGTCGCACGGAGGTCCTGTAGCTATTGTGCGCGTTCCGGTGGTCACACAAGCGATGGCGACGAGCTTCGACAACCCTATGTCAAGACGACCCCTCACTGGTCCGGTGGTGCAACTTCTGAAGAAGT
146->TGTGCGGATCCTGAGTTCAATGCACATAGGCGAGGGTTCACTAGCGTTACCGTCTACCAGCTACGTCTTATCGATCACCTGCCTGGCTGAATCCGTAGGCTAATACCCGACTCGTGAATAATACCCTTACGGGTGAGCCATCCGAGGAGTCGGCCAGATACGGGCCTTGAGCCGCGCACG
147->TGTGCGCTGTATACCCATCGGGACTCCTTACGGGTCTAGCCAGGATTATCGGAGCTGGCAATCTACGCGGGTAGCAGACTTTTAGTGTTCGCCCTGCGTCCGAAGGCCGGGCGGGGGATGTTCTGCGTTCACTGCTCCGGGGCATCCATACGGGAGTTCGCTGATAGAAGGGTGAGAATG
147->GGCAGCCTAACCGATTACTCGGCACAACAGTGGTGATATCCTACATGTGGGTTCTGTCCTTCCGAAGAGGGTGGCGACTAAGACAGTGGATTAGACTATAACTTAAGCATAGATCTTTTTAGATTCTCTCCTTACGCGCAGTCCGACTCCAGTATTGGGGTTTTATTCAGAGCGAGCCCA
148->CAAGGATTCACGAGGCGACACGTTCCCGAGGAAAACTTACCGGTTTCATCTAAATGTGTGCGACGGGCTTGAGGCTTCCTGGGTGAGAACGCATCGTCGGTTTCGTCGAATCTTCTTCCAGTTACTTGCCGGTAGAAAACCCTGGATAGCTTAATTCCAGTCCAATCGTTTAAAGGCCCA
148->ACCAGTATAGCGAGTTCCCTCGTGACGGCCGTTGAACGATAAACTACTGTTGCCTGCCAGATCCTCACAGTCAATGCCAAACTCTGTCGTGTCTTGGAAGCTTCGTCCAACTAAGTGGCGCAGTTTACGACACCGATTCTCAGGGCAACACAAATTCCCGTGCCCGCAGCCAGGCTAATA
149->CGGAACTTACTAACTAACCGTTATTGTATTGCGTGGACTGGTACGCGGTAACGAGATAACACTCTAACCTAACATTCGTGTACAGCGAATCTAGAACTATCGAATCGATCAGCCGGCACCCGCTTCACATAAATCGGAGTTCGTACGGCGGACTTTAGGTGACTGCTATCTTATGTTTGC
149->GTCCCATCCGTAGTGTTGTTCTGTGACGTATTACAACGCTTCTTCACACGTTGATGCCGATAGTTGCCCTACCGCGTTATACTTGCAGTACGCGGGACCTGTACGCACATCATTTTAACCCCCAATAGAGATTGTGCCTGATACTAGAACCTTGTTTCAGGACAGGAGTTGGTTTATTCG
150->TCCATTATTGGCGCGTGGGGGGATTATGAAGGGTCGTCGCCCTACATCGTTTCATTCAGTCTCTTAGGGGCCTAACTGAGCGATCAGAAGTCTTATGTTCACTGGACCGGAACTTACTAGAGGGCGGTGCTCATAGTTGGCCTCTTCCTTATCGAGTCCAAAATTACACTCCGGAAGCCT
150->TAAGATGCGTAGTAATACTCTATCCTCTTCGAGATCGAGCAACGTTTCCTCCCAGCTTCCCCGCGGAAGAGGATCTGAAGGCACTGTGCGCGGACTCTAAGCATCCATGGCGGGTGGGCGACATAAGGGACGCCGCGTAACACGCTTATGTGAGGGGAACGAACTGAGGGCCTGAGGCCA
151->CGTAAAGTAATAGAGACTTGGCGACCCCTACACCACGGACTGATGACCGTGTGGTCGAGAGTATAATCGCCTAGACTGCAATGGTCGTCGATGGAGTCTCATACAGAAGCTCTGCCGTATTCGCCTGAAATGGTTCGTATTGTTCTCTGACCCGGGAGGAAGTGACTAATTAGCTTGAAA
151->CGAAGGCTCGCAGGATTGCCCCCCGCAGTTCGCTTCGATGGCTTACTGGGGATCTCGTCAAGGCTTTCTCTTTACGGCTTTCATGACGGACGCTGTAATCTCGATTCGGGGATACGGTAACAGAAGGTAGTCGGCACCTAGGTGAACGCTTGCTTCGATGAATGAACACTCTCGGTATTG
152->ACTTTCGAACAGCTATCGTGGTTGCTTCAGGCATATTCTCCGATATCACGATACTCGAGAACCGACGGGGTATGCTCTAGGCCGGAGGTTCCAGAATGACACTTTAGTGTCAGCTGGTGGCACGGCCTGTCAATCTAAAGTGAAGATCCGTTTTTAGTCGCGCCCGACCCTGACGTTTTT
152->TACGCGTTGTGTGCGCGGCTATGTACGCTTCGATAGTACTAGCTTGTAACTATTAGACTCTTTGTCGAGAGGTCCGGATTTAGCACTGACACCAGCGATCACTAAGCTCATCCTGATGTATTCAATAGCCCGTATAAGGGACAGGGTTAGGAGGATCAGAAGCGCCTAGGGGCAACCACA
153->GGACCAGGAGCGGAATCCACACCCCGTCATAGGCGGAACGCACCCTGGATACAACCCTAATGAATTATTCTGACTAGGTCAGCTCTCCGGTAGTGGAAAACCTGAATGGAAAATTGAGCGTAGTCGTTGATCTACTCCGTTCGGCACTCTTCCCCCTTGCGGCTAATGGGATCGATTGCC
153->GTATTACTTTCGGGACTAACATGTCGCTATACTTGTTTCAACGAATTGAACACACGACCTACTATCTGGAGGGAACGACGTTCGAGACATATAGTGGCATTAGGATTTCAGTTCGACGTGGAAAGTGACCCAGAAGTACCTGGGAGGACATATAGCATGGAATTGATCAAAAATAAATAA
154->CACCTTGAATGACCATACTATTGCGAATAGCCAGATGCTGATGGATCTTCGAAGCCCTGCCTTCTAAAAGGCTCATGGGACAATGGGGGAATAACCGAATTGCGATAGCGATAACTCTCAGAGTTGTCTGGAAGGTTCAATCGCCAGTCGAACCAGAATCGAGCGCGGGTCGGACTTCAT
154->GGGTAACGAAGCGATTGAAGCCTGCGGGACGTCTTCGGACGCTACGTTGGCTAGGACAAACCTATCCCCTGCGCTGGTATCGAGTCTCTTTCCCCCCCAGAATTCGGATGCGCAGGGTTTTGTGATTATATATCCGGCAAGGCACAGCCAACAGCACGGAACTTCCTGAAATCTGAGCCG
155->AACCAACGCGAGCGTGAAGTGAAAGCTGCTCGTAAATTGACTCATATCAAATCTCGACCGGATCATGGCGCATAGGCAGATAGGTAACTTTCCGTATGCGTCGCTGATGCAGATTTATAGATAACGGTACTCGCCCTATTTCCCAATTACTGGATGAGACACGATATTACTCGCACACAG
155->CTTTTGTACGGATAATTGTAAACCCCCGTCCCATCCCGTTATTAGACTTTACCTAGTCCTCGCGTGACCCGTCAATGGGTCGTACTAAAAACATCTAGTAGACAGGTGCCGTAGGTGTCACGGGCATTTCTTGACTGATTCTCGAAAGCTATCCACCATCGTGTGCAGGTCCTGTAGCGG
156->TCTTGCGAGGCAGATTGCACTGGGTGCTAGCTACTAAAAGTGACATGGGTTGCGCTTTAGTAATGGACCTGTCCTCGAGGAGAGAAGTAATGTTCCGCGTGCATCTGAGACGAATGCCCAAAGAATCGTCCGATCTTCGTTGGGAAATGTCTGATTAGCATTATTACATGACCGGTTCAA
156->CGCATCACGATGCCTGAGATTTCTTTAAAGAGATATTCGTTGCTCATAGACGATTCGTGGCTGTGAAAGCGCTACAAACAGCTCCGGCACGCCTCCGGCTCCAGGGCGAGTGGAGGTCAATAAGGCTCCAGAGAACACCTACAAGTCGACCAAGTCGGAGTAACTTCGGGGACAAGGGAT
157->ACGTGGCCCCTGTCGCCCTTAAGCATTGCGCCGATGGAGGTGAGAATAATCCGTTTTCGACCGAGCACGATAACGACCTCCGAACCTTGGCTTCTGCTCACAAGTAGCCGCTTTCATACACCTACCTGGCCCAGCCATGTTTGTCTCCTAAGTATGACTTCAGTTATCATTAGAGGAGCA
157->CTGCCAGTCTCTGCCTACAGAACCAGAGGAACCAGCTCGCGTACCGCGACGGCACATCGTTCATACTTGAAATTCCAATGGCGGGTGATCAAACAACTTATTTTGCCTTCGGCGCAAGGTATGCGAACCCACCGACGCACGAATCTACAATCATACGTTGGGGTACGGGCGTTGTGGTCC
158->GGCTTCACAGTTTTGCGGGCGCCACATGGCGCTGGCTCCGCGTACTTATGTGCGCGCGCTAATTATCGATCACTACATCTAAAGGTGCACAACCCGACCTTTAAATCGTCCAAATGCCGAATCCGTAGCTAAGCGTCTGAAGTATAGGGCCTGTATGCCGGCGTGAAAGTCTCAGCTATG
158->GCCGATCCATATAGAAATATATCCTTTTGGCTGCCACCATGCGTGCTAGGAGTCTGGACATTTGCAGTCTAATCTATTAGTCGGTGGGAATTAGTCTGTAGTCGACCTGCCGAGTATTCCGCCCGGCAGGCGCGGTGTTACCTTGTACTTCGAACGGCGCAGGATAAACGTCTTAGACTC
159->CTGCCAGCACTACAGTTCAGTGTACTCCCGTAATCTTACCGCGTGTATTTATTCACGCCGTAGCCCTTACCGCTGCAAGGAACGAATATCCTCATCATAGGCTACTATCCGTGCATGGAATCTTACCCTAAGCCACTCGTAGCAATGACCATGAAACTGCCGTAATAGCGGAAGACATGA
159->GGACCCGTTATGGTCGTATTAAGGCTGGCGAGTCGTCGAAGAATCAGAGGGGTGGTGTATCCGCGGAACGTCCCACTAATGTTCGTTGCACCGATCGTTTAGGCAGAATGGTTTGGCTTGGGCGGCGTACGCTGTCGACCTAATATACTTGGATCATTGGAAGACAGGCGCAACGCGCGC
160->TCGCTAGGGAGGTGAAGTCACATCTAAGCGCCTCGTAGATCTACATCCTCTCCCTATCTCAATAGCTAGACGAACCCAGCGACGCAGCACATGGGTGCATGGTAATTCAGTGTGGCCAACGGTGTCACCATTCCCCACGAAGCCACGACCCGGATCACAAATAATGTCGCTGTCGTTTCG
160->TTCCTTATGTCGCTCCCATTTGTATTATTTTACTTAGCATACACGATCAACTTCCGTCCTAGAAGAGTTAACTCTAGAGTTGTGGGTTGAATGTAATATAGCCATTGAATACGCGGTGCTTACAGTAAAAGTTCAAATTGGACTGTACATGCAACGTCACCTAGTGGGCCCTCCGGAGCA
161->GGTGGTAACCAGATGCATAGTCGTTTCTACTACATAACTATTAGGTCGTTGCTGGACCGAAGAGGGCCCCCACTGGTTACTAGGGTTGCTTCCTAACGGGATGCGCCGTTGCGGATCAGCGAACCGCCTACAAGTGAAATTAAGCGGAAGTGCAAACCGCCCGTATTAGGAAGCAGTCGA
161->TCGTATTGCACTGCACCATTGCCCACGTAAACGCTCTGGCCCATCAAGGGGAGTGTGCAGCGGCAAGGGCCCCAATTGCTAACTGCACCAGCACTAGTTCCGGTTATACTTAGGATTCGATGACAGCACGATAGTGTATCTGGAGCGCATGTTAACTCACTAACTTAGAATGTACAGTTG
162->GTGCAGGCTAATTTCACTCTATCTCCAGTTACAAAGACTCTCCACACAACGAGCTAACCAGGAAAAGCCAGAAGGGGAGAGTGTCGTACGCCTTCGCAATTGGAACTAAGTATTGTCTCACGGGCATACGGTAGGCGAGGACAGATTAAAAACACCAGCTCCCGATAGCGTCCAACCCGG
162->GGAAAAACCGGGCACTCCGCCGCTATCAGTGATGTACGTATAAGCCGGCCCACCACCTTTGGAGCTTTAATAGGTCGAGATACTCTCCACGCAGGATACCAATCTAGAACGTACGTCGATCCATCTCATAGATCACTAGAATTCAAGAGGATTTGCATGGACGTCTGACAACGTGGCGCA
163->TCTCTAGAATGGATCTGTTGTCTACCCCTCGTTATGACCTGACACAAAGTTTAAGGGAGCAAGGTACTATTATAATACCCGGCACTCTACGAGCCATGTTTATAACCGGCCGTTGTGTGTTGCGGAAGGATATGCGCTCCCGGAGACACCCGAGTCGTCGGATAGTACCGATTCACCCAA
163->TATAATCGCGTGGTTCGGGTTGTAGTACTTAGTGCCGCTTCGCTTACTCGCCGTACGTATTGAGTTACTCGGACCCTACTTGTACCGCCAGACGACAGTGAGGCCCCAAAGTATAATTGGCTCTATCCATAGGCTGCATACTTAAGGTCAAGCACGAACTCCAACCTCAAGGCTTGAAAG
164->GGCCTTCAAGACTCACGGACGGAGCCCGCTATTAGTACCGACTTACCCGGATTAAAATAAGTGGAACGGGCTAGCTATTGACACGAGCAGATAGTGGGTGTGTTGGCGTATTCATCGAGATTCTAAAATGAGCGATGGGAACAAGGCTCGCAACAGGCACATGGACGTCAACACAACTCA
164->CACAATGGCGAAGTGTACGAAAACCGTATGTACGCTGAGTCTGTGGTTCTGCATAACTCAGAATATCTCTAGCTCGCATGAATTTGAGCGCGAAACCTAGCCTCTGATGAGTCCTAATGTTTTAATGCTGTGGCATAAACTAGCTAGCATAAACCCACCCGGCTCCGCGGCGGTGCACCC
165->CCGCTTTGTCCTTGATACCTTCAAGACTTGGATTGCTTGGCCACCTGCAGTACTATACATGTGGGTATCTGGACTGGTCGACAGCCGAGACGCGTAATGCTCACGAATTATCTGTGTAAAGACGTTACCTTGGCCTCTTACTATACGGTTGTATAGACTTGAACACCGAATTCATCTTAT
165->ACAGAGTGTCGGTGCGTACGGGCGCCGTTAATAGAACACCCCTCGAGCCGTTTGGGGACGGCAAACGCGTCACGAGCAGTATGGGCAGGGAGATAATAAACCGTCGAGATTCCCATAGCATTGTGAGACTCACAAACCAGCGGATCTATTAAGGGCGCAACAACGCCCCTGTTGCCTTCA
166->GAACTGGGCGCTATGGCCATGGGAGAAGGGGCAGGTCTAGCTTAGGGCGGGCAACCGTTTGTAGTGGCGCAGCCATGGGGCTTTAGCGCGGTTCCAGCCAACTATGCTCCACGCCATGAACGCGGCTAATTGCCCCTCATTGCGGGACGTCGGGCGCCGTATAGGAACGGGCGAGGGCTG
166->TGCAGCTCAAACTAGCGGGACGGGCTCTTCCATACGGGCGTGGAGCCCTATAACTCACTAACAAATAGCAGATGGTTCATAAAACTGCGGGTTGGAGCCTCTGCGGCCAAGATCGCCTTAAATTTTGGGAACACTCCAAGGTCTAAGACGGGCTAGCTATGTTCCGGAGATCTCATTACA
167->TAGAGAAAAATCCGTAGGGGTGAACACTGAACTATCCTCAGGGGCGCATAGGGCAGTACATAGTCTGGTGGTACAAAGAAGGAACAGCGCCAGTTACGCGTACCGCGTTCATGAGAAAGAGCCGCGCCAGCTTGAATCTAGGGGATCGAACCCGGGGTTTTGTCAGCGAAGTGTTATCAA
167->ACCGCTATAGGTACGTTATTAACCACTAGAACTCTCAGCGCCTAACATGTCAGTCCTCATCAATTTCGCGGTACTAATTCAAAACCGGATCAGCTACGGCGCCGTTGGCACCTGAAAGCAGTGTAGCTTTGGTCATAGAGTTAAACTTATGGATTATAATCGTGCCTGAACTTACGTTAA
168->AACGGAGTCTCCTTCTTGCACTAAACTTTTCCTTTGCCACCATACGCCAAACATTACCGATCGCTCGTTTGAGCGCCTTCCGATTCATCAAATGAAGTGAAGAGCAGCTCAGTTTCAGCACCAATAACGACTCTCTGCCTACCGGCACACACTGAGTGCAATTAGGATAGGAAGTGTCTT
168->AGCGCTGATCGAACGCGCGGCAGTATCAGATTAGGATTGAGCTGGGCTAAAAATGTTTTGAATATGCAACTACCATAACATTGCTTGAAAGAAACGCTTGGAGTCGCTCTAGGATGTTTTGGGATATGCTGGGTTCTAAAGATGAAATGTACTACGGCAGTTGGCATAGAAATCCTTCGA
169->TCTCCCATTCAAGCCTGGCGGTCGGCTTATTACACAACTAAAGGCGGGCCGTTGAACGGGGAGGAGCAGAACCAGTCCGTGCAGCGGCACGGGAATAGTGGTCACTACTATATTCACTCTGGTGGCTTCTGACTACGTGATTCGTAAGGGTTGCCCCACGATTTGCGCAGGAACTGTCTA
169->TCGCAGCAACTGGACGCATGCGCGTCGTGCTCATATGATTACCAGCGTGGGGTCGTCACGAACGCGTACGCTCCGTCCAGTTACAGGTTACCTAAACGAACGTTTCGGGAGACCTCAGAGCATGCCAATGAGGGCCTCTCAAGAGAGGCAGTTGCCAATTGCTTAGAACGTCGGATCAGG
170->GTTGGACAGAAGAAAGTGCAATGTCGAGGCCCCCGTCACTTCCTACCCAAGGCCAGTGGTGGAACCCCGGGAATATGCAAAAACTGCAGTTCTAATATGAGGCTCAGAAAATTACAGATCCTACGAGCCACCCCTATTGAGTCCGTCTTAGAATATGAGCCTAGCCTAACCAGCGTAATT
170->GCGATTATTATCACCCCTCCCGACATTAACCATACACTCCGACAAGGTACGCTATATTGGTCCACACTAAGTGGCACCGTTGTTAGGGGGAGTATTCTAACTTTGACAATCAGATTAAAAGTCCAGCAAATCTCTTACAGACCGCCTCACCGGACATTTGACTCTAGAGGAGACTCAGTT
171->AGGGCGAGCTCAAAACCAGAGACCGGAACGACCCCCTCGAATCCCCTTTGCTAAAGCCAACCATTACCTTTTTTATATAGCTAATCAGGAATGAAGCTCTAGTTGTGTGCGAACCCTCGATATTCTCATAGAGCTGGGTTAAATTTTTGCCGTAGTAGTTTCTGTGCGAATTGTAACGAT
171->ATGCTGACCTTCGTGAAGTAACGAAGAGTGAGACAGAACAGGCTTTACCAGATCCTGCCCGGGAGTACGACGGCACGTTGGCGCGGTGGGATTGCGACAAGGGACCCTATTGTACCGTCGTACTTAGGTCAGACCTTACGGGTTGCCTCGTACCATGACTAAAAGCGTCGAGTATTTTCG
172->GAATCCTTTATAATATGCCTTTGTCAATTGGAATAGGCCAATAGTCTCCAGATTGCTTATACCGGCCTGGTGCTTACCAGTTAGCATAGCTACGCCCTATCCGGACCAGCTTTGGACTGCATGATTTCCTTTAGGAATTACGTCTATCCAAGGCGTTTACTATTATTAGTGATGACGCCC
172->TCACCCATTTCTTAGTGCACATTGGGTCTATGTGAACCTGGCCTCTAAGGACATCCCGGGAATCGACGGATTGGCAGCGTCCTAATCTGCCGCGACTAGGTAGACCCAGGGGTTAATCACAGTCAAAGTTGTGAAGTCTTCACTATATGATCTACTTCCCCGTCCATTATAGTCACCCCA
173->TGCAGAGCGATAGACCCACGGAAGCTGAGAGTATTACTCAAGAATCGTGGGAGCCTGTTAGATATGACGCGTCGAGGTGTTGTGCTCCTCGGACCCAGCGAATTTAGAGGCCCATGCTCTTAAGACGCTTTTGGGATATATATAGACCCGTAATCAAATAGCGCGGCCGGACCTTGAGCG
173->TCATCACGACCGAGGAGTCCACGACTGCAGACCCTCGACGCGATGGGTCTGGGGTCAACAGAGGACTAGTCAGCGTGCAAGGTCAGCTCATGAGCGCATTCGTCGTCACCCTCTAAGCTACAACGAACGGGGGATCGGAACTTACCTCTCTGCGACGATTAATACGGTAAGCTACTCGTA
174->GAACCGTACGGCTGGATTGTAGTCGCCTCCAGGTATTAGTAAATTACTAGAGTACGTACCGGATGCTTCATTGTTATTTGGGCTTCTTCCCAAAATATGGACCATTCAACCTAGGGAATCGACAACGGTCATATGGGTTATTTAAACAACGCTCACAGCTCATATTTTGCCCAAGGCGGA
174->GCCTCACACATATGAAACGTATTTGAGAGCTCGGCCAGGTATCTGTTCTGCACAATTAGAATAGACTATTAACCAGCCAAGGTACGCGGAATCGAAGGAAGACACAAGGCGATAAAGTCCATGGAAGGAATTACTCCGGCGAGGGCATATGCCCTCATATACCCCTGCGAGCTCAGGTGT
175->GGCTTCACCATCCCTGTTCCTGCGATCGACCTAAATGAATCCCTGCTCAGCGGTAGTTAATTTTAACCACTGGTGATTCCCAGACTTCGTCATTTCAGTGTAAGTCGATCCTATGCACCAGATATGTACATTTTTCTTGACGCTTCGGTACCTTACCACACGCACCGGTTTAGCTGAGTG
175->CATTTATACGCTGGGCTGTGCACCAGCTAGATGGTAAAGAGGAAGATCAGCCTGATGCTGCGATTAGTATACTTGCTTTCGAGAATCTCAACGGGGGTACATGTGTTTACTCGTGAGTTGTTCTCTTTCTGACGCGCAGCCCTAATGGCAGTACCAAGGACCATCATGAGATTAACAGTT
176->GCCGCCGGCGAATCGTACATCGACCTAATCCCCCTACTTTCCTCTATGAGAGGGAGACGGCGCTTAGGTTGCGACGGAATGTGTACTCTCAACGTTGATGCGAGGAGCAGCATTGATGATTTTCGGCAAACGGTACCTCGGACCGGATCGCTTACGTTTAGCATTGTCTGCCAATTCCAC
176->CTGAGCGGAAAGAGCTGTACCACACGAAGGACCTTCATTAATGCTGTTTGTAAGTACGCCCTTCGTGATAAGTGTAGCACCGCTTTCTGTCTATTGTGGTCGTGCTGTTCGTAGCTATCCTTAGCTTAGTGATCGTTGGAATTTCTGCCATCCATTTCGACTTGATACCGCTCTAACAGA
177->TCCCAGAATCTACTCCTAAAGTACCTCGGGAGAGGCCAAAAAACGGTCGGCGGAAATTGACGGAGTTAGAGTTTATGTGCCAGACGTATTTATTGTGACGAATAAATGTATCGATATCAGATCCGATTCACGTAGCACACTAGGTACAGTATATTCAAGCGATACTGCAGGTCCTATCTA
177->CTCTGAGCATAACGAACTCTCATGGACACTGCACAAAGACCATGGTGAGGCTTGTGTGAAACGGCACAGCGCGCTTTTAGCGACCGACATCGACCTCTGTGCCTTGGGCTCCACCTTCGCCTTATGAATAGGTGCTCGCCCCACCCCGACCGGGCACGGACAACGTCCATCGTTTCTTAT
178->ATAATCTCTCCTCTACCTGACACGAGTCCTTACTAGATCTAACTCATGGGAATGCTTCCTAATATCACACCCATCCGTTACCTCTACACCCGGGGATTTATTCCTGGGAAACGTACAGATTACTGGAAAAGCGGGTATCTCCCCCCTTAATGCGGTCATACATCTATGACCATCTGCGGT
178->AAGTTACCTCTAGAGCCTACTTACTGAGCTCCACAAACACTAAGACACAACCTACACTCTAAAGGTCACGCGGGAACTCTCGTCGATTCGGTCTTCTACAAAGTAGTACATGGCATAAGTCTAGGTACCGATTCATGTCCTACAACGCGCCCACAACTAAAGCAAGTGAAGAGTGAAACG
179->TAGGTATCAACCTACTCATCAAGAATGAGCAGAACTATATGTCTCGGGTGGAGTACTCCCATCCGAAGCACGCTTCGTACGTTTGCAAACCCACACGACTCTCCCGCAACCGGTGAATGCGAATATCACCCAGAAGCTTATCCTCCCGAGTACACTTGGATCAACGGCTCGTTGTTATTC
179->GACAGCCGGAAGGAGCGTGATTCAGGTACAACCGCTTGCACCATTAGTACTTCGAGTGATTTGCGGCGTGGTTGTCCACGCACTGGATCGCAAGTACTACGAACCGATGCACGAATTCTAAGATCTAAATTGTCATAGAGCACATCCTGGTTCTCCCGCGCCGAGCCACACCGTGACTGC
180->TATGTTTCCATATAACGGACGTACATAGGACTGGCGTTGATACCCGGTCCTTGGAGTGAAAATTATTGGACGATTACGAAACCAGTAACGTCGCGCCTCTCTATAGTTACCCACTATGCTTGTGGCAAAGGTTCTCATAATGTCTCATTTAGGAAGCTGCTCTTAATGTGGTCTATCGGA
180->CTTGAAATCTCATGCTGCGTAAGCCTCGTATGTGCGGTCAGTTACATTACCGACCGATTTGCAGGTAATTATTGTATGTCCGTGAGTTGCAGCGTCTACTAAGTGTTTGCGGTGCATCAACTTGAGACGGCCTGACAAAAGAATGCACCGTTGCGGGAATCTGCAAAACAGAAACCTAGA
181->TATACTAGTCCCGCGAAGATTGAGTCGCTAAGTACGAAACGGTCTAGCAGGCAAGCACTGTATTAAGCCCGCAGGGCAGTTCCGACGCATCACCTCTAGGCCCACGATTGGATCGGGTTTGCTTGACGTAACCATCGACCGCACAATTGCCATTTTGACAGGCGCACAAGGTGTTAACAC
181->CAAAGCCGTAAAAGTGACTCGGGAACCGAGGTCGGTCGCGTCATTGTAATCTTCTGGTCCGGTACTCTACGATTAGGGGAAAGAAATGGTGGACGAACAGCTCCAACCAGGGACGTCTCTACAGCCTTAGGTGATTGTCGCAACGGACTTCAATCTTTTCCCTTCCTCCTTACTCTAAGG
182->ATACCACAACCCGTGTATCACGAATCACATTGTTAGATCCAGGACGGAATCTCTAGGTGCCTGGATAACGAATTAAATCGCCACCCAGGATAAAACGTGCTTGCGCTGACTCGTTGGGGGTGGCCATGCAATCTCCATTCGAGCACATGAGCTACTTGTACCATTTTCGCGTGAACCTCT
182->AGAAAATCACCTTACCAAGAATGATACGACCCGGCCAACAGTTTGTTGGTAGGCAATCCAACACATCCACTTAGCAAATTAAGCGTGATCACGTAGCATCATTCAGATATACAGGCTATGTGCGGGTGTACGGTGAAGAACCGACGGCGACACCCGACCATCTTTCAGTACGTTAGTACA
183->TTCACTTCTAGTGACTACGTACCTTCCGGGTGCAATCCTGCGATGAAATGACTCCGGCGAAAGTGCATATGAGATACCAAGGGTGGCGAGTGCTTACAGATCCCCAGTCTAAACGTCAGTAACCCCATTGACTTGGCGGGGCTGGTATGGTCCCCGCTAATCAATCCAAGACCTACTTTA
183->ACGGTAAATTGATCTTCAAAGTTTAGGATTTTCAATTGCAGAATCTAAGACATAGGTGCTACCAATTAGACTACTGACCCCGTCGAGACCAGGAAGAAGTCGGCTGTGTCAAAAATCTCTGCCACACACAAAAGGGGGGGCTGCGGTAGCCTTCGGCACCTATGAGTCTGTAAGTACATT
184->CTGTCACACCGATATACATGACTTATGGTATTTAATATTTGCGAGTGACTAGCTCCCTGAGGTGGTGTTTGCCATTTGACAGGGATGTGAGGCTGGTCATGGTTACGCTGGGGCCACCACTCTATAGAGATCTCCTACTAGCAGAACACTAACTCTAACCAGATAAATAGCTCAGGCAAG
184->TACTCCTCTACGAGATCATTCAGAGAGACTGGTAAAAACGAATCGTCTGCGGATTCCTGATGTTGTGGCAAGAGCCGGGCGATACAATGACATACGAGTTACCCTGTTTGTATGCCTGACCATGAAGTACAAGCAAGTACATGCACTGTGAGACAAAGGGCTACTATGTTGTTTGGCAGC
185->TGAGAGGAAACAGAGCGAATGCGTGAGCGGTCTAGAGCCTATTCTTAACTACAGGCCGGTCCATGGTTGCAGCGGTATCCGTTCCTCCCTCGGGTATGTCGTCCGATTTGATGACCCCGCCCATGTACGACGATTTGCGGTGATCAGTGAGAGTGATGAATAGGTAGCTCTTAGGTCAGT
185->CAACCCCAAGAGCTTAGGCTCGTTTAGAGTTAGTGCGAGATCTTGGCAGGTGTAACCGATTAAAAGTCACACTCCTTCCTATACAAACATCCAGTACTTTTGCATTAGCACCTAAGCTTCTGGCAACTTGCGAAGAGATTATTAAAGGTAATCTTGGACAACTCCATAGTATACTGCAGG
186->TTCTTCAAGTCGTGTCGCATACAAGTATAGGCCAGTATTTGATGTTCCGTCCACGGCCCCCTTGCCCAACTCATTGCCGGTGTAGAGCCAACCCATTAAGTTGACCGTTTTCCGTGCCACGCTGTCATCTGTAAGGAGACAGCAAACTGTCATGCAATAGCAGGTGGGGAGACAAGCGTC
186->AGTGGCCCCTTCGTTACTTACCCAGGAGTTCTTTAAGGATGGCGATCCAGTGGGTGTCATTTTACCCACTTTCGGGAGACGATCTAGCACGCCTCGGAGGCTCCGTTTCCTAGCGATACATTGACTTTTGGATTTTGGGCCTTCAAAACACTAGATTTAGAGATTCCCTCACGAAGGCGC
187->AGGCTACTTGATAATCCTTCGGAGTCATTTGATTCATGCTTCATGCGCTTCTGTCTGTCTGTGGGCCTCTTACACAAGCGTGTGGACGCGCTCGTCACCCTAGTGCCCCAAAAAGAAGTGACTGCCAATGATCCCGCCACATATGCTCTCCTCACAAGTCCGGCTACGAAAGGGGTGAGA
187->CTATATTCCGGCGTAACGTTAACCAGACGTCTAAACACACAGCTTGCCTAGAATCTACTGGGAAGACAGGAGCTTAGGGAGGTGGCCTTTTTAACATGACAAGGGGACTTTCTAACGGACACCCCATAAAGCCCCCAAGGTGGGGTATGGAATTGATGGAGGGATATACTGTAGGACAGA
188->TTGTGACCTCTACAATAGGCGCGGCAAATAGTACATATAAGTCCTTGCGTTACGAACTCGCTTTAGGGAATAACTAAGGACGATGAGTTGTTCTCGTGCACGAAGCGCATAGAGAGTCGCTTTCCATTGTGCTTAGATATCATTCGAAGCATCTTCGTTTGTAACGACCTCGTTTGCCAT
188->GCGAGACTAAACGTCGGCCCACACCTCGAGCCATGCATTACGCTGGTCTCTGGGTACAAGTCGAGCACCTTTCGTCCACGTGTGTGCCTCGCAACCGACTCACTTCGGGACTGGGATGCGCCAAGAGCACGGTCCGATACGTATTATAGCACCTGGAAAGGTGGGAGCGAATAACTATGT
189->TACTTGTTATAGCACCGCCAATAAGTTTCAACTGCTCCTTTTGCCCTTGCGTGCGTGCATTGTTATATACATAGATAGAAGCCTCGAGGATCCATACTCAGTTATCGTGCGCTAGCTACCAAATATGGTGGTGTAGTCAACATTGGTATGTCGGACGCGGCCGCATCGGCCCGACTGGAT
189->TCTCCTACGGTCTTGTGCAGATCTGCACCGATCATGAGAAGAGCACGCTCAAGGATTCACTATTCTTGATCCGATAATCTATACCCCCGCTTTAGGCAACCACGGGTACGCTCAATTTCACGCCTTGGAACTATAAATAAAACCTTGAACAAGGCGTCCTTGTGACTATGAGCTATGTCG
190->AATCTCCATAAAGAGAATTCATCCCACCATTACTGATAACTTCCGGTAAAGTGGTGGGGCAAGCGGCGCTTTGGTGTATCTCTGAACTCACGCGCGATTATTGCTTGCGCTAGGACGAAAGAATTAAATCTGAACACCTATGGACCGCGCTGCTAATGAGGGCGGCCTAGGCCGAGAGCT
190->TGTTGAGCCACGACCACTCTGGTCTCGTTCAGGGTAAACGCCGGGCTACCATGTGCGCTTACGAGAGTGCTCAGGCTCTCCCAGTATTGGTGCCTGGCACAATCCGATGCGACGAAATAAGGTTAGGTTGAATCGGTCTTCCCCGTCAGGTACATTAAGTGGCGGAAGCATTTGCATCAG
191->CCTGATAAGTGCTTTCAGCGAAATTGATTTATTATTACTCAGCACTTCTTTTAATCATATTTGAGGCTCTATCCGTCGGGTAGGCCGAAACCCTATCTATTATGAGTTGCGAGTGAATCTGGTGTAGCATTCCGCTCAATTCTCTACGTGTACGTGCAGACGGGGATGCATGGACCACGA
191->ACGGCTCTTCTTGGCTGTCGTGGGATAGTTACAAAACAGCACCTATATAAAGCATGGGGATCGTAATACTATTTTTTTCGGGAGCGTGTGTCCGGTAGTGTAAACGGCTGGCCGTATAATTACCGACTTTATCTAAGGGGTGCGGTACGTCAGGTACCCCACGTTACCCCCGGAGGTAGA
192->190
192->183
193->187
193->171
194->181
194->131
195->166
195->137
196->177
196->175
197->179
197->143
198->176
198->136
199->128
199->184
200->156
200->138
201->178
201->173
202->158
202->152
203->189
203->129
204->147
204->130
205->182
205->159
206->148
206->134
207->142
207->133
208->153
208->185
209->151
209->149
210->169
210->186
211->139
211->188
212->162
212->141
213->165
213->172
214->164
214->155
215->150
215->161
216->167
216->157
217->174
217->154
218->160
218->144
219->146
219->163
220->145
220->170
221->180
221->191
222->140
222->132
223->168
223->135
224->203
224->199
225->207
225->221
226->219
226->218
227->216
227->215
228->201
228->202
229->193
229->211
230->217
230->210
231->223
231->200
232->214
232->209
233->205
233->192
234->206
234->212
235->208
235->204
236->198
236->222
237->195
237->220
238->213
238->197
239->194
239->196
240->226
240->225
241->224
241->227
242->235
242->236
243->239
243->230
244->234
244->233
245->228
245->237
246->231
246->229
247->238
247->232
248->247
248->244
249->246
249->242
250->241
250->240
251->245
251->243
252->250
252->251
253->248
253->249
254->252
254->253
"""
T = ParsingInputW3(ip)
# SmallParsimonyString returns only the annotated tree dictionary; the score
# is stored inside it, so it is read from T["P_score"] instead of being
# unpacked from the return value.
T = SmallParsimonyString(T)
P_score = T["P_score"]

print(int(P_score))
for k,v in T["adjacency_list"].items():
    print(f"{k}:{v}")

# sum_HD = 0
# for k,v in T["adjacency_list"].items():
#     sum_HD +=v
# print(sum_HD/2)

12641
TTGAGCCTCTTGATTTCTCTAATGATGGCAGTTAAAATCTCTGCGTTCCTGGATCGCCTCTCGGCTAGAAGATACCTGACGGATACACGCCGTTTGTCACAACCACGGCGGAATGAGAGGCTTAAGGATATTTCAAACCAGCACCCCGTTAGATATAAGCCGGGTATGAAGCTAGGGACC->CTGTGCCGTTTCATTCCTTCAGTGAGGTCTCTTCACATAACTCGTCTCCTGGAGCGTCTCTGGGCTGGCAGATATGATCCGTGTGCTCACCCATTCTTACTCGCACAGAGGAACGCGTGGATTAGGTATGCGTTTGAACGTCAACCCGTTCGATATCGGATTGGTGCCAAGCTGAGGAGC:73
CTGTGCCGTTTCATTCCTTCAGTGAGGTCTCTTCACATAACTCGTCTCCTGGAGCGTCTCTGGGCTGGCAGATATGATCCGTGTGCTCACCCATTCTTACTCGCACAGAGGAACGCGTGGATTAGGTATGCGTTTGAACGTCAACCCGTTCGATATCGGATTGGTGCCAAGCTGAGGAGC->TTGAGCCTCTTGATTTCTCTAATGATGGCAGTTAAAATCTCTGCGTTCCTGGATCGCCTCTCGGCTAGAAGATACCTGACGGATACACGCCGTTTGTCACAACCACGGCGGAATGAGAGGCTTAAGGATATTTCAAACCAGCACCCCGTTAGATATAAGCCGGGTATGAAGCTAGGGACC:73
TTGAGCCTCTTGATTTCTCTAATGATGGCAGTTAAAATCTCTGCGTTCCTGGATCGCCTCTCGGCTAGAAGATACCTGACGGATACACGCCGTTTGTCACAACCACGGCGGAATGAGAGGCTTAAGGATATTTCAAACCAGCACCCCGTTAGATATAAGCCGGGTATGAAGCTAGGGACC->TCTAGCTTCAGGATCAGTCTGAAGTTGGGAGAAAGGTGCCCTGCGATGACGTCATACGTCTCCCCCATACGATACCTGAT

In [239]:
# SmallParsimonyString stores every edge in both directions, so half of the
# summed lengths is printed for comparison with the parsimony score.
sum_HD = 0
for k,v in T["adjacency_list"].items():
    sum_HD +=v
print(sum_HD/2)

12641.0


# Small parsimony in unrooted tree

In [495]:
def IsAlphabet(string):
    """Return True when *string* cannot be parsed as an integer.

    The parsers use this to tell leaf labels (sequence strings) apart
    from internal-node labels (integer strings).

    Args:
        string: The node label to classify.

    Returns:
        True if ``int(string)`` fails, False if it succeeds.
    """
    try:
        int(string)
    # Only conversion failures mean "not a number"; a bare ``except``
    # would also swallow KeyboardInterrupt and SystemExit.
    except (TypeError, ValueError):
        return True
    # Explicit False instead of the implicit None the fall-through gave.
    return False
    
def ParsingInputW3_Unrooted(ip, i = None):
    """Parse an unrooted-tree edge list into the tree dictionary ``T``.

    The first line of ``ip`` must be an integer; every following line is
    an edge written ``a->b``. Labels for which ``IsAlphabet`` is truthy
    are treated as leaves and renumbered "0", "1", ... in order of first
    appearance; all other labels are kept as they are.

    Args:
        ip: The raw multi-line input text.
        i: Accepted but not used by this function.

    Returns:
        A dict with these keys:
          "nodes"          - leaf id -> [] (leaves only at this stage)
          "characters"     - leaf id -> its sequence; other label -> ""
          "adjacency_list" - empty dict
          "leaves"         - leaf id -> [sequence]
          "edges"          - label/id -> list of labels/ids it points to,
                             one entry per input line
    """
    lines = ip.strip().split("\n")
    # The header line is consumed and must parse as an integer, but its
    # value is not needed afterwards.
    int(lines.pop(0))

    # Distinct labels in order of first appearance (dict keeps insertion order).
    seen = {}
    for line in lines:
        left, right = line.split("->")
        seen.setdefault(left)
        seen.setdefault(right)

    T = {
        "nodes": {},
        "characters": {},
        "adjacency_list": {},
        "leaves": {},
        "edges": {},
    }

    # sequence -> leaf id, used to translate edge endpoints below.
    leaf_ids = {}
    for label in seen:
        if IsAlphabet(label):
            leaf_id = str(len(leaf_ids))
            leaf_ids[label] = leaf_id
            T["characters"][leaf_id] = label
            T["nodes"][leaf_id] = []
        else:
            T["characters"][label] = ""

    for line in lines:
        src, dst = (
            leaf_ids[end] if IsAlphabet(end) else end
            for end in line.split("->")
        )
        T["edges"].setdefault(src, []).append(dst)

    # Flip the lookup so that "leaves" maps leaf id -> [sequence].
    T["leaves"] = {leaf_id: [label] for label, leaf_id in leaf_ids.items()}

    return T

def AddRoot(T):
    """Turn the bidirectional edge map in ``T`` into a rooted tree, in place.

    With ``N = len(T["edges"])``, a new node labelled ``str(N)`` is
    inserted on the edge between nodes ``str(N-1)`` ("son") and
    ``str(N-2)`` ("daughter"); the function asserts that this edge
    exists. Every edge pointing back towards the root is then removed so
    that ``T["edges"]`` maps each node to its children only.

    Side effects on ``T``:
      * "edges" is mutated as described above;
      * "characters" gains an empty entry for the new root and is
        re-sorted by integer label;
      * "changes" records the new root, son and daughter labels;
      * "nodes" becomes a copy of "edges" sorted by integer label (the
        child lists themselves are shared with "edges").

    Returns:
        The same ``T`` object.
    """
    edges = T["edges"]
    node_total = len(edges)
    new_root, son, daughter = (str(node_total - offset) for offset in range(3))
    assert son in edges[daughter]

    # Splice the root into the son-daughter edge.
    T["characters"][new_root] = ""
    edges[new_root] = [son, daughter]
    edges[son].remove(daughter)
    edges[daughter].remove(son)

    # Remember what was done so the rooting can be reverted later.
    T["changes"] = {"new_root": new_root, "son": son, "daughter": daughter}

    # Walk outwards level by level. A node's parent link is already gone
    # when the node is visited, so what remains in its list are children;
    # drop the child -> parent direction of each of those edges.
    frontier = [son, daughter]
    levels = 0
    while frontier:
        levels += 1
        next_frontier = []
        for parent in frontier:
            for child in edges[parent]:
                edges[child].remove(parent)
            next_frontier.extend(edges[parent])
        frontier = next_frontier
        # Safety valve against a walk that never empties the frontier.
        if levels > 9999999:
            print("Time Out", levels)
            break

    # Present nodes and characters in ascending numeric label order.
    T["nodes"] = dict(sorted(edges.items(), key=lambda item: int(item[0])))
    T["characters"] = dict(
        sorted(T["characters"].items(), key=lambda item: int(item[0]))
    )
    return T

def ReverseRootedTree(T):
    """Undo the rooting recorded in ``T["changes"]``, in place.

    Removes the artificial root from ``T["nodes"]``, reconnects the two
    nodes it separated by listing the daughter as a child of the son,
    and drops the "changes" log. Other entries of ``T`` are not touched.

    Returns:
        The same ``T`` object.
    """
    log = T["changes"]
    root_label, son_label, daughter_label = (
        log[field] for field in ("new_root", "son", "daughter")
    )

    tree_nodes = T["nodes"]
    tree_nodes.pop(root_label)
    tree_nodes[son_label].append(daughter_label)

    T.pop("changes")
    return T

def SmallParsimonyString_Unrooted(T):
    """Label the internal nodes of an unrooted tree and fill its edge lengths.

    The tree is rooted with ``AddRoot``, solved one sequence position at
    a time with ``SmallParsimony`` / ``BackTrack`` (both defined
    elsewhere in this file), and un-rooted again with
    ``ReverseRootedTree``.

    Side effects on ``T``:
      * "characters" of every non-leaf node is extended by one symbol
        per sequence position;
      * "P_score" is set to the sum of the per-position scores;
      * "adjacency_list" gets, for every parent/child pair, the entries
        "<parent seq>-><child seq>" and its reverse, both mapped to the
        Hamming distance between the two sequences.

    Returns:
        The updated ``T``. The score is not returned separately; read it
        from ``T["P_score"]``.
    """
    T = AddRoot(T)

    # Leaf "0" supplies the sequence length for the whole tree.
    sequence_length = len(T["leaves"]["0"][0])
    total = 0
    for column in range(sequence_length):
        # One symbol per leaf for this position.
        column_symbols = {
            leaf: labels[0][column] for leaf, labels in T["leaves"].items()
        }

        S = SmallParsimony(T, column_symbols)
        column_score, assignment = BackTrack(T, S)
        for node in T["nodes"]:
            if node in T["leaves"]:
                continue
            T["characters"][node] += assignment[node]
        total += column_score
    T["P_score"] = total

    T = ReverseRootedTree(T)

    distances = T["adjacency_list"]
    for parent, children in T["nodes"].items():
        if parent in T["leaves"]:
            continue
        for child in children:
            parent_seq = T["characters"][parent]
            child_seq = T["characters"][child]
            forward = MkEdge(parent_seq, child_seq)
            backward = RvEdge(forward)
            length = HammingDistance(parent_seq, child_seq)
            # Record the edge in both directions.
            distances[forward] = length
            distances[backward] = length
    return T

In [492]:
ip = """32
ATACACCCAAGGCTGACAGTTTTCAAAATTAATC->32
32->ATACACCCAAGGCTGACAGTTTTCAAAATTAATC
GGGCCTGTTAGCTAGTCGAGGCACTCGTTTGGCT->32
32->GGGCCTGTTAGCTAGTCGAGGCACTCGTTTGGCT
TGGTCAATAATCAGGTGAGCACGGACGTGATTCC->33
33->TGGTCAATAATCAGGTGAGCACGGACGTGATTCC
CAATGAACGAACGACGTTTGATGGATCCAGTTCC->33
33->CAATGAACGAACGACGTTTGATGGATCCAGTTCC
GTGAGCTATGTACGGTTAAAAGGTTCGCGGATCG->34
34->GTGAGCTATGTACGGTTAAAAGGTTCGCGGATCG
CAAATGGCATACCCTTCCCATCAATGAGTCTCCC->34
34->CAAATGGCATACCCTTCCCATCAATGAGTCTCCC
GTTCATAAAGCACATTTCGGCTACCACCATGACC->35
35->GTTCATAAAGCACATTTCGGCTACCACCATGACC
CAAAAGTCAACTCTATCGCGTTTCAACACGTGAG->35
35->CAAAAGTCAACTCTATCGCGTTTCAACACGTGAG
TACCAAAGAATTTGCGGCCTTAGACGTTTTAGCG->36
36->TACCAAAGAATTTGCGGCCTTAGACGTTTTAGCG
AATACGGATCTGATTAGCGGGAGGGCCCGAACCT->36
36->AATACGGATCTGATTAGCGGGAGGGCCCGAACCT
ATATGCCTATTGCACGTCTTAGTACATACATGGG->37
37->ATATGCCTATTGCACGTCTTAGTACATACATGGG
CGAACTATGAGAGTTTAAGAAGATTGGCTTAAGG->37
37->CGAACTATGAGAGTTTAAGAAGATTGGCTTAAGG
TCTCCTTTGGACGCGGATGTCCCTAATCGGTAGC->38
38->TCTCCTTTGGACGCGGATGTCCCTAATCGGTAGC
TCAGAGTCCTTGTCCAACGCTCCTTAAACTCACG->38
38->TCAGAGTCCTTGTCCAACGCTCCTTAAACTCACG
ATGTTCATTAATGGTTGCGTCCGGCCAAGAAAAG->39
39->ATGTTCATTAATGGTTGCGTCCGGCCAAGAAAAG
ACGTGGTCGTCATCGCCGCTGACCTTTCCAGCAA->39
39->ACGTGGTCGTCATCGCCGCTGACCTTTCCAGCAA
TCAGTAAACAGCCCCCAGGGGCATTTAGCGAATC->40
40->TCAGTAAACAGCCCCCAGGGGCATTTAGCGAATC
GGACTACGCTTGACGCCCCCGAGATGCGCGCGCC->40
40->GGACTACGCTTGACGCCCCCGAGATGCGCGCGCC
CTGTACCCATGGGGCAGATAGGACAGCGCGAGAT->41
41->CTGTACCCATGGGGCAGATAGGACAGCGCGAGAT
TAAGGCTCTCTGTACTGAGTAGGAGGGTTGGCTG->41
41->TAAGGCTCTCTGTACTGAGTAGGAGGGTTGGCTG
TGCGGGTGACAGGACAAACTATGATGTCGCTGGA->42
42->TGCGGGTGACAGGACAAACTATGATGTCGCTGGA
AGGATTAGCAGTATCCAGGTAGGCAGGTATTTCT->42
42->AGGATTAGCAGTATCCAGGTAGGCAGGTATTTCT
CTATCGCTACACGCGATACTACTGAGCTCGAAAG->43
43->CTATCGCTACACGCGATACTACTGAGCTCGAAAG
GGCGTTACAATATGTCCGTTCTCCATGCCTCTGT->43
43->GGCGTTACAATATGTCCGTTCTCCATGCCTCTGT
ATTAAGTGTTATGAATGAGGCGAAACCCGCCGGG->44
44->ATTAAGTGTTATGAATGAGGCGAAACCCGCCGGG
CACATGTCACGTGCTAGCGCCCGAATTATGATAG->44
44->CACATGTCACGTGCTAGCGCCCGAATTATGATAG
ACCTACGCGAACTTGACTCGTGCAGCAGATTACA->45
45->ACCTACGCGAACTTGACTCGTGCAGCAGATTACA
TGGTCCAAGATTCAACAGAATCGATGTAAAACGT->45
45->TGGTCCAAGATTCAACAGAATCGATGTAAAACGT
TACACTATGCAGGCAATCGGGCAACTTAGTCCGA->46
46->TACACTATGCAGGCAATCGGGCAACTTAGTCCGA
TGCCTATACGCGGGAGTCATTCATCTTCCACTAG->46
46->TGCCTATACGCGGGAGTCATTCATCTTCCACTAG
GAAGCTGTTAACGCGGGCCTCTACGCGATGAAAG->47
47->GAAGCTGTTAACGCGGGCCTCTACGCGATGAAAG
GAGTACTTGTCTGGATAAGTTTGAAGGGTTCATC->47
47->GAGTACTTGTCTGGATAAGTTTGAAGGGTTCATC
46->48
48->46
34->48
48->34
38->49
49->38
47->49
49->47
41->50
50->41
40->50
50->40
35->51
51->35
42->51
51->42
45->52
52->45
36->52
52->36
32->53
53->32
48->53
53->48
43->54
54->43
51->54
54->51
44->55
55->44
53->55
55->53
39->56
56->39
37->56
56->37
50->57
57->50
55->57
57->55
49->58
58->49
57->58
58->57
56->59
59->56
33->59
59->33
54->60
60->54
59->60
60->59
52->61
61->52
58->61
61->58
60->61
61->60
"""


In [493]:
# Unrooted case: parse the edge list in `ip`, label the internal nodes,
# then hand the finished tree to the checker.
T = SmallParsimonyString_Unrooted(ParsingInputW3_Unrooted(ip))
TestTree(T)

PASSED TEST


In [494]:
# SmallParsimonyString_Unrooted returns only the tree and keeps its score
# in T["P_score"]. A bare `P_score` here would be whatever an earlier cell
# left in the global namespace, not the score of this tree.
print(int(T["P_score"]))
for k,v in T["adjacency_list"].items():
    print(f"{k}:{v}")

585
TAACACTCAAGCGAGACCGGTCACACAATTAGCG->ATACACCCAAGGCTGACAGTTTTCAAAATTAATC:14
ATACACCCAAGGCTGACAGTTTTCAAAATTAATC->TAACACTCAAGCGAGACCGGTCACACAATTAGCG:14
TAACACTCAAGCGAGACCGGTCACACAATTAGCG->GGGCCTGTTAGCTAGTCGAGGCACTCGTTTGGCT:18
GGGCCTGTTAGCTAGTCGAGGCACTCGTTTGGCT->TAACACTCAAGCGAGACCGGTCACACAATTAGCG:18
CAATGAACAAACGAGTTCGGACGGACCCAATTCC->TGGTCAATAATCAGGTGAGCACGGACGTGATTCC:14
TGGTCAATAATCAGGTGAGCACGGACGTGATTCC->CAATGAACAAACGAGTTCGGACGGACCCAATTCC:14
CAATGAACAAACGAGTTCGGACGGACCCAATTCC->CAATGAACGAACGACGTTTGATGGATCCAGTTCC:8
CAATGAACGAACGACGTTTGATGGATCCAGTTCC->CAATGAACAAACGAGTTCGGACGGACCCAATTCC:8
TAAATCTCATACCCGTTCGATCAATCAAGGACCG->GTGAGCTATGTACGGTTAAAAGGTTCGCGGATCG:19
GTGAGCTATGTACGGTTAAAAGGTTCGCGGATCG->TAAATCTCATACCCGTTCGATCAATCAAGGACCG:19
TAAATCTCATACCCGTTCGATCAATCAAGGACCG->CAAATGGCATACCCTTCCCATCAATGAGTCTCCC:12
CAAATGGCATACCCTTCCCATCAATGAGTCTCCC->TAAATCTCATACCCGTTCGATCAATCAAGGACCG:12
CAAAAGACAACTCAGTTGCGATGCAACCATTACG->GTTCATAAAGCACATTTCGGCTACCACCATGACC:16
GTTCATAAAGCACATTTCGGCTACCACCATGACC->