# Imports

In [None]:
import numpy as np
from ete3 import Tree

# Variables

In [None]:
# Substitution costs between nucleotide states.
# Rows and columns follow the order A, C, G, T.
step_matrix = np.array([
    [0, 3, 4, 9],
    [3, 0, 2, 4],
    [4, 2, 0, 4],
    [9, 4, 4, 0],
])

# Number of rows of the step matrix, i.e. the length of every cost vector.
n_species = len(step_matrix)

# Leaf cost vectors, order: A C G T.
# The observed nucleotide costs 0 and every other state costs infinity.
dic1 = {
    species: np.where(np.arange(n_species) == "ACGT".index(base), 0, np.inf)
    for species, base in [
        ('Probopass', 'A'),
        ('Aggron', 'T'),
        ('Bastiodon', 'T'),
        ('Regirock', 'G'),
        ('Registeel', 'G'),
        ('Regice', 'G'),
        ('Klingklang', 'G'),
        ('Metagross', 'C'),
        ('Genesect', 'A'),
        ('Porygon=Z', 'C'),
        ('Magnezone', 'C'),
        ('Forretress', 'T'),
        ('Electrode', 'A'),
        ('Ferrothorn', 'G'),
    ]
}

# The two candidate topologies, written in Newick format.
N1 = "(((( Electrode , Magnezone) ,Porygon=Z) , (((( Aggron , Bastiodon ) , Forretress ) , Ferrothorn ) , ((((( Regirock , Regice ) , Registeel ) , Metagross ) , Klingklang ) , Genesect ))) , Probopass );"
N2 = "((((( Regirock , Regice ) , Registeel ) , (( Metagross , Klingklang ) , Genesect )) , ((( Aggron , Bastiodon ) ,( Forretress , Ferrothorn )) , Probopass )) ,( Porygon=Z,( Magnezone , Electrode )));"

# Trees

<div>
<img src="tree1.png" width="500px">
<img src="tree2.png" width="500px">
</div>

# Sankoff algorithm

In [None]:
# Parse the first Newick topology with ete3, then show the raw string followed
# by the printed form of the parsed tree.
tree1 = Tree(N1)
print(N1, tree1, sep="\n")

In [None]:
def parse_string(tree):
    """Return the leaf names found under every internal node of a Newick string.

    Spaces and semicolons are removed, then the string is scanned once.  Every
    closing parenthesis is paired with its own opening parenthesis through a
    stack, so the result is correct for any nesting shape (ladder-like or
    balanced), not only for topologies where the deepest pair comes first.

    Args:
        tree: Newick topology made of parentheses, commas and leaf names.
            Branch lengths and internal-node labels are not handled: anything
            between two commas/parentheses is taken as a leaf name.

    Returns:
        A list with one entry per closing parenthesis, in the order the
        parentheses are closed (children always come before their parent).
        Each entry is the list of leaf names of that clade, left to right.

    Raises:
        ValueError: if the parentheses of ``tree`` are not balanced.
    """
    newick = tree.replace(" ", "").replace(";", "")
    open_positions = []  # indices of the "(" that are still waiting for their ")"
    clades = []
    for position, char in enumerate(newick):
        if char == "(":
            open_positions.append(position)
        elif char == ")":
            if not open_positions:
                raise ValueError(
                    "unbalanced Newick string: unexpected ')' at index %d" % position
                )
            start = open_positions.pop()
            # Everything between the matching parentheses belongs to this clade;
            # dropping the nested parentheses leaves a comma-separated leaf list.
            inner = newick[start:position].replace("(", "").replace(")", "")
            clades.append(inner.split(","))
    if open_positions:
        raise ValueError(
            "unbalanced Newick string: %d '(' never closed" % len(open_positions)
        )
    return clades

def merge_cluster(new_tree):
    """Create one ancestor per clade by merging its two children.

    The clades are processed in the given order.  For each clade the leaf list
    is cut into a left part and a right part such that each part is either a
    single leaf or a group that was merged earlier in this call; the two parts
    are then passed to ``add_ancester``.  A merged group is identified by the
    name ``add_ancester`` returns for it, which is its leaf names joined with
    "-".

    This handles every binary shape: leaf + leaf, cluster + leaf,
    leaf + cluster and cluster + cluster.

    Args:
        new_tree: list of leaf-name lists, one per internal node, with children
            listed before their parent (the output of ``parse_string``).

    Raises:
        ValueError: if a clade cannot be split into two already-known children
            (children listed after their parent, or a node that does not have
            exactly two children).

    Note:
        Leaf names are assumed to be unique and to contain no "-"; otherwise
        the joined names become ambiguous.
    """
    built = set()  # names of the internal nodes created so far

    def _is_known(leaves):
        # A lone leaf is expected to be scored already; a group must have been
        # merged by an earlier iteration.
        return len(leaves) == 1 or "-".join(leaves) in built

    for clade in new_tree:
        for cut in range(1, len(clade)):
            left, right = clade[:cut], clade[cut:]
            if _is_known(left) and _is_known(right):
                break
        else:
            raise ValueError(
                "cannot split clade %r into two already merged children" % (clade,)
            )
        built.add(add_ancester("-".join(left), "-".join(right)))

def sankoff(tree):
    """Run the bottom-up Sankoff pass on a Newick string.

    The string is first turned into per-clade leaf lists by ``parse_string``;
    those lists are then handed to ``merge_cluster``.  Nothing is returned.
    """
    merge_cluster(parse_string(tree))
    
def find_nth(haystack, needle, n):
    """Return the index of the n-th non-overlapping ``needle`` in ``haystack``.

    Occurrences are counted from 1; any ``n`` below 1 behaves like 1.
    Returns -1 when there are fewer than ``n`` occurrences.
    """
    position = haystack.find(needle)
    stride = len(needle)
    for _ in range(n - 1):
        if position < 0:
            break
        # Resume the search right after the previous match.
        position = haystack.find(needle, position + stride)
    return position
    
def add_ancester(elt1, elt2):
    """Register the common ancestor of two nodes and return its name.

    The ancestor is named after its children joined with "-", and its cost
    vector (from ``compute_new_values``) is stored in ``dic1`` under that name.
    """
    merged_name = "-".join((elt1, elt2))
    dic1[merged_name] = compute_new_values(elt1, elt2)
    return merged_name
    
def compute_new_values(elt1,elt2):
    """Return the cost vector of the parent of ``elt1`` and ``elt2``.

    For each row of ``step_matrix`` the result is the smallest value of
    (row + cost vector of ``elt1``) plus the smallest value of
    (row + cost vector of ``elt2``).  Both cost vectors are read from ``dic1``.
    """
    left_costs, right_costs = dic1[elt1], dic1[elt2]
    rows = step_matrix[:n_species]
    # Broadcasting adds the child vector to every row at once; the row-wise
    # minimum is the cheapest child state for each parent state.
    cheapest_left = (rows + left_costs).min(axis=1)
    cheapest_right = (rows + right_costs).min(axis=1)
    return (cheapest_left + cheapest_right).astype(float)

In [None]:
# Show the first topology with its spaces removed.
print(N1.replace(" ", ""))
# Run the Sankoff pass on it; the cost vector of every merged node ends up in dic1.
sankoff(N1)
# Dump all cost vectors: the leaves plus the ancestors added by the call above.
print(dic1)