# Imports

In [1]:
import numpy as np
from ete3 import Tree
import pprint
import random



# Variables

In [2]:
# Substitution cost matrix. compute_new_values adds row i of this matrix to a
# child's cost vector, and add_to_backtrack_dic labels cost-vector positions
# with CHAR_ORDER, so rows and columns follow the CHAR_ORDER ordering.
STEP_MATRIX = np.array([[0, 3, 4, 9],
                       [3, 0, 2, 4],
                       [4, 2, 0, 4],
                       [9, 4, 4, 0]])

# Number of rows of STEP_MATRIX. Despite the name, compute_new_values uses it
# as the length of a cost vector, i.e. the number of character states.
N_SPECIES = STEP_MATRIX.shape[0]

# Separator used to build the name of an ancestor from its two children.
STR_JOIN = "+"

# Character label for each position of a cost vector.
CHAR_ORDER = ["A", "C", "G", "T"]

# Leaf name -> cost vector (0 at one position, infinity elsewhere).
# This dictionary is mutated in place by add_to_backtrack_dic (new ancestor
# entries) and init_solo_leaves (leaf entries are wrapped in a list), and
# traceback relies on its insertion order.
DIC1 = {
          'Probopass':np.array([0,np.inf,np.inf,np.inf]),
          'Aggron':np.array([np.inf,np.inf,np.inf,0]),
          'Bastiodon':np.array([np.inf,np.inf,np.inf,0]),
          'Regirock':np.array([np.inf,np.inf,0,np.inf]),
          'Registeel':np.array([np.inf,np.inf,0,np.inf]),
          'Regice':np.array([np.inf,np.inf,0,np.inf]),
          'Klingklang':np.array([np.inf,np.inf,0,np.inf]),
          'Metagross':np.array([np.inf,0,np.inf,np.inf]),
          'Genesect':np.array([0,np.inf,np.inf,np.inf]),
          'Porygon=Z':np.array([np.inf,0,np.inf,np.inf]),
          'Magnezone':np.array([np.inf,0,np.inf,np.inf]),
          'Forretress':np.array([np.inf,np.inf,np.inf,0]),
          'Electrode':np.array([0,np.inf,np.inf,np.inf]),
          'Ferrothorn':np.array([np.inf,np.inf,0,np.inf]),
       }

# The two candidate topologies, written as Newick strings over the leaf names
# used as keys of DIC1.
N1 = "(((( Electrode , Magnezone) ,Porygon=Z) , (((( Aggron , Bastiodon ) , Forretress ) , Ferrothorn ) , ((((( Regirock , Regice ) , Registeel ) , Metagross ) , Klingklang ) , Genesect ))) , Probopass );"
N2 = "((((( Regirock , Regice ) , Registeel ) , (( Metagross , Klingklang ) , Genesect )) , ((( Aggron , Bastiodon ) ,( Forretress , Ferrothorn )) , Probopass )) ,( Porygon=Z,( Magnezone , Electrode )));"

# Trees

<div>
<img src="tree1.png" width="500px">
<img src="tree2.png" width="500px">
</div>

# Sankoff algorithm

In [3]:
# Build the ete3 tree for the first topology, then show the raw Newick string
# followed by the ASCII rendering of the tree.
tree1 = Tree(N1)
print(N1, tree1, sep="\n")

(((( Electrode , Magnezone) ,Porygon=Z) , (((( Aggron , Bastiodon ) , Forretress ) , Ferrothorn ) , ((((( Regirock , Regice ) , Registeel ) , Metagross ) , Klingklang ) , Genesect ))) , Probopass );

            /-Electrode
         /-|
      /-|   \-Magnezone
     |  |
     |   \-Porygon=Z
     |
     |            /-Aggron
     |         /-|
     |      /-|   \-Bastiodon
   /-|     |  |
  |  |   /-|   \-Forretress
  |  |  |  |
  |  |  |   \-Ferrothorn
  |  |  |
  |  |  |               /-Regirock
  |  |  |            /-|
  |   \-|         /-|   \-Regice
--|     |        |  |
  |     |      /-|   \-Registeel
  |     |     |  |
  |     |   /-|   \-Metagross
  |     |  |  |
  |      \-|   \-Klingklang
  |        |
  |         \-Genesect
  |
   \-Probopass


In [4]:
def sankoff(tree):
    """Run the whole pipeline on a Newick string and return its score.

    The steps are, in order:

    1. ``parse_string`` turns the Newick string into a list of leaf lists.
    2. ``merge_cluster`` merges those clusters, which stores one entry per
       internal node in the module-level dictionary.
    3. ``init_solo_leaves`` prepares the leaf entries for the traceback.
    4. ``traceback`` collects characters from the dictionary.
    5. ``compute_score`` turns those characters into the parsimony score.

    Returns whatever ``compute_score`` returns.
    """
    clusters = parse_string(tree)
    merge_cluster(clusters)
    init_solo_leaves()
    return compute_score(traceback())

def parse_string(tree):
    """Turn a Newick string into a list of leaf-name lists.

    Spaces and ``;`` are removed first. Then, for every closing parenthesis
    met from left to right, one list of leaf names is appended to the result,
    so the result has as many entries as the string has ``)`` characters.

    Returns:
        A list of lists of strings (leaf names), in the order the closing
        parentheses appear.

    NOTE(review): where each cluster starts is decided by a heuristic (see
    ``cpt`` below), not by matching parentheses; the original author flagged
    a known bug for the second topology (N2).
    """
    tree = tree.replace(" ","")
    tree = tree.replace(";","")
    new_tree = []
    for i in range(len(tree)):
        if tree[i] == ")":
            # Count the "))" pairs found at positions i, i-2, i-4, ... of the
            # whole prefix; the count is used to tell "mini" clusters from
            # "big" ones and as an offset for the cut below.
            cpt = 0
            for j in range(i, 0, -2):
                if tree[j] == ")" and tree[j-1] == ")":
                    cpt += 1
            # Everything before the current closing parenthesis.
            tmp_tree = tree[:i]
            # Numbers of closing and opening parentheses in that prefix.
            n_close_par = tmp_tree.count(")")
            n_open_par = tmp_tree.count("(")
            if cpt == 1: # Big cluster
                # Ordinal (1-based) of the "(" at which the prefix is cut.
                # Known issue from the original author: FIX BUG HERE WITH N2
                parenthesis_to_cut = n_open_par - n_close_par + cpt + 1
                # find_nth returns -1 when there are fewer "(" than asked, in
                # which case the slice below keeps only the last character.
                nth = find_nth(tmp_tree, "(", parenthesis_to_cut)
                tmp_tree = tmp_tree[nth:] 
            else: # Mini cluster
                # Cut at the last "(" of the prefix.
                index_par = tmp_tree.rfind("(")
                tmp_tree = tmp_tree[index_par:]
            # Drop every remaining parenthesis so only comma-separated leaf
            # names are left.
            tmp_tree = tmp_tree.replace("(","")
            tmp_tree = tmp_tree.replace(")","")
            leaves = tmp_tree.split(",")
            new_tree.append(leaves)
    return new_tree

def merge_cluster(new_tree):
    """Merge the clusters produced by ``parse_string``, in order.

    Every entry of ``new_tree`` triggers exactly one call to ``add_ancester``,
    which is what records the new internal node. Nothing is returned.

    Args:
        new_tree: list of lists of leaf names.

    Raises:
        IndexError: in the "big cluster" branch when fewer than two
            previously created clusters are found inside the current one.
    """
    # Names of the clusters created so far that have not been merged again.
    visited = []
#     species = np.unique(np.concatenate(np.array(new_tree)).flatten()) # Gets the species of the tree
    for i in range(len(new_tree)):
        if len(new_tree[i]) == 2: # Merge of 2 elements
            elt1 = new_tree[i][0]
            elt2 = new_tree[i][1]
#             print("we add", elt1, "and", elt2)
            new_ancester = add_ancester(elt1, elt2)
            visited.append(new_ancester)
        else: # Merge of clusters
            # Join every leaf except the last one with STR_JOIN; this is the
            # name an already-built cluster over those leaves would have.
            tmp_str = new_tree[i][0]
            # NOTE(review): str_possible is filled and reversed here and in
            # the branch below but is never read.
            str_possible = []
            for j in range(1, len(new_tree[i])-1):
                tmp_str += STR_JOIN + new_tree[i][j]
                str_possible.append(tmp_str)
            str_possible.reverse()
            if tmp_str in visited: # Merge of mini clusters
                # The existing cluster is consumed and merged with the one
                # leaf that is left once its members are removed.
                visited.remove(tmp_str)
                del_elt = tmp_str.split(STR_JOIN)
                new_elt = new_tree[i].copy() # Copy because we remove from new_elt after that
                for elt in del_elt:
                    new_elt.remove(elt)
#                 print("we add", tmp_str, "and", new_elt[0])
                new_ancester = add_ancester(tmp_str, new_elt[0])
                visited.append(new_ancester)
            else: # Merge of big clusters
                # tmp_str still holds the prefix built above and is extended
                # again with every leaf after the first one; it is only used
                # as the haystack of the substring test below.
                str_possible = []
                for j in range(1, len(new_tree[i])):
                    tmp_str += STR_JOIN + new_tree[i][j]
                    str_possible.append(tmp_str)
                str_possible.reverse() 
                # Collect every pending cluster whose name occurs in tmp_str.
                # NOTE(review): this is a plain substring test, so a name that
                # is a substring of another name would match as well.
                new_list = []
                for j in range(len(visited)):
                    if visited[j] in tmp_str: # This time there can be multiple clusters
                        new_list.append(visited[j])
#                 print("we add", new_list[0], "and", new_list[1])
                # Only the first two matches are merged, then consumed.
                new_ancester = add_ancester(new_list[0], new_list[1])
                visited.append(new_ancester)
                visited.remove(new_list[0])
                visited.remove(new_list[1])
    
def find_nth(haystack, needle, n):
    """Return the index of the n-th non-overlapping ``needle`` in ``haystack``.

    Occurrences are counted from 1. When ``n`` is 1 or less the first
    occurrence is returned. ``-1`` is returned when ``haystack`` holds fewer
    than ``n`` occurrences.
    """
    width = len(needle)
    position = haystack.find(needle)
    remaining = n
    while remaining > 1:
        if position < 0:
            # Ran out of occurrences before reaching the requested one.
            return position
        position = haystack.find(needle, position + width)
        remaining -= 1
    return position
    
def _cost_vector(entry):
    """Return the cost vector held by a dictionary entry.

    An entry is either a bare cost vector or a record whose first item is the
    cost vector (followed by backtrack characters). The two are told apart by
    the first item: a scalar means ``entry`` is the vector itself. This does
    not depend on the number of character states, unlike a length test.
    """
    return entry if np.ndim(entry[0]) == 0 else entry[0]


def add_ancester(elt1, elt2):
    """Create the ancestor of two existing nodes and register it.

    The ancestor is named ``elt1 + STR_JOIN + elt2``. Its cost vector is
    computed from the cost vectors of both children with
    ``compute_new_values`` and stored through ``add_to_backtrack_dic``.

    Args:
        elt1: key of the first child in ``DIC1``.
        elt2: key of the second child in ``DIC1``.

    Returns:
        The name of the new ancestor.

    Raises:
        KeyError: if either child is not a key of ``DIC1``.
    """
    new_ancester = elt1 + STR_JOIN + elt2
    # A child may be stored as a bare vector or as a record with backtrack
    # characters; only the vector is needed here.
    val1 = _cost_vector(DIC1[elt1])
    val2 = _cost_vector(DIC1[elt2])
    new_values = compute_new_values(val1, val2)
    add_to_backtrack_dic(val1, val2, new_ancester, new_values)
    return new_ancester
    
def compute_new_values(val1, val2):
    """Compute the cost vector of a parent from its two children.

    For each state ``s`` (``N_SPECIES`` of them), the parent's cost is the
    cheapest way to reach the first child from ``s`` plus the cheapest way to
    reach the second child from ``s``, where reaching a child adds row ``s``
    of ``STEP_MATRIX`` to the child's cost vector.

    Args:
        val1: cost vector of the first child.
        val2: cost vector of the second child.

    Returns:
        A float array of length ``N_SPECIES``.
    """
    def cheapest(state, child_costs):
        return np.min(STEP_MATRIX[state] + child_costs)

    totals = [
        cheapest(state, val1) + cheapest(state, val2)
        for state in range(N_SPECIES)
    ]
    return np.array(totals, dtype=float)

def add_to_backtrack_dic(val1, val2, new_ancester, new_values):
    """Store a new ancestor in ``DIC1`` together with its backtrack characters.

    The stored record is ``[new_values, char1, char2]`` where ``char1`` and
    ``char2`` are the ``CHAR_ORDER`` labels of the cheapest position of each
    child's own cost vector.

    Args:
        val1: cost vector of the first child.
        val2: cost vector of the second child.
        new_ancester: key under which the record is stored.
        new_values: cost vector of the ancestor.
    """
    child_chars = [CHAR_ORDER[np.argmin(costs)] for costs in (val1, val2)]
    DIC1[new_ancester] = [new_values, *child_chars]
    
def init_solo_leaves():
    """Wrap every leaf entry of ``DIC1`` as ``[cost_vector, character]``.

    Leaves are the keys that do not contain ``STR_JOIN``. The character is the
    ``CHAR_ORDER`` label of the cheapest position of the leaf's cost vector.

    Leaves that are already wrapped are left untouched, so the function can be
    called again (for instance by a second run of the pipeline) without
    wrapping an entry twice.
    """
    for key, entry in DIC1.items():
        if STR_JOIN in key:
            continue  # internal node, already a record
        if np.ndim(entry[0]) != 0:
            # First item is itself a vector: this leaf was wrapped earlier.
            continue
        # Replacing the value of an existing key is safe while iterating.
        DIC1[key] = [entry, CHAR_ORDER[np.argmin(entry)]]
    
def traceback():
    """Pick one backtrack character per entry of ``DIC1``, last entry first.

    Each entry is expected to be a record whose first item is the cost vector
    and whose remaining items are candidate characters; one of those
    candidates is chosen at random for every entry.

    Returns:
        A list with one character per entry of ``DIC1``, in reverse insertion
        order.

    NOTE(review): the previous version also had a branch meant to keep every
    candidate of an entry, but its condition compared a whole record against
    a list of characters and could never hold for records built by this file.
    The choice made here is therefore independent from one entry to the next
    and does not follow the tree; confirm the intended traceback rule.
    """
    new_chars = []
    # list() keeps this working on Python versions where dictionary views are
    # not reversible.
    for record in reversed(list(DIC1.values())):
        candidates = record[1:]  # skip the cost vector
        new_chars.append(random.choice(candidates))
    return new_chars
    
def compute_score(new_chars):
    """Return the parsimony score for the traced-back characters.

    Placeholder: the argument is not used yet and the result is always 0.

    TODO: implement the actual score computation.
    """
    return 0

In [5]:
# Run the whole pipeline on the first topology and print the result.
# NOTE(review): compute_score always returns 0 for now, so 0 is printed
# whatever the tree is.
parcimony_score = sankoff(N1)
print(parcimony_score)
# Uncomment to inspect the dictionary in insertion order:
# pprint.pprint(DIC1, sort_dicts=False)

0
