# Import

In [1]:
import numpy as np
from ete3 import Tree



# 1. UPGMA :

# Variables

In [2]:
# distance_matrix = np.array([[ 0, 19, 27,  8, 33, 18, 13],
#                             [19,  0, 31, 18, 36,  1, 13],
#                             [27, 31,  0, 26, 41, 32, 29],
#                             [ 8, 18, 26,  0, 31, 17, 14],
#                             [33, 36, 41, 31,  0, 35, 28],
#                             [18,  1, 32, 17, 35,  0, 12],
#                             [13, 13, 29, 14, 28, 12, 0]])

# Pairwise distances between the six taxa; row/column i corresponds to the
# label chr(65 + i) assigned by get_dict (A, B, C, D, E, F).
distance_matrix = np.array([
    [0.0, 2.0, 4.0, 6.0, 6.0, 8.0],  # A
    [2.0, 0.0, 4.0, 6.0, 6.0, 8.0],  # B
    [4.0, 4.0, 0.0, 6.0, 6.0, 8.0],  # C
    [6.0, 6.0, 6.0, 0.0, 4.0, 8.0],  # D
    [6.0, 6.0, 6.0, 4.0, 0.0, 8.0],  # E
    [8.0, 8.0, 8.0, 8.0, 8.0, 0.0],  # F
])

In [3]:
def upgma_loop(distance_matrix):
    """Run the clustering step repeatedly until only two clusters remain.

    Each iteration prints the current labels and matrix, merges one pair
    through ``upgma`` and prints the result.

    Args:
        distance_matrix: square numpy array of pairwise distances.

    Returns:
        A tuple ``(distance_matrix, dic, save_order)``: the final 2x2 matrix,
        the final index -> label mapping, and the merge log as a flat list of
        ``label_1, label_2, distance`` triples (the last triple is the final
        two clusters and the largest value of the first column).
    """
    dic = get_dict(distance_matrix)  # index -> label mapping
    save_order = []
    cpt = 1
    # Stop once the matrix has been reduced to the last two clusters.
    while distance_matrix.shape[0] > 2:
        print("===================== Nombre d'itération :", cpt, "=====================")
        print("\nDictionnaire :", dic)
        display_distance_matrix(distance_matrix, dic)

        distance_matrix, dic, save_order = upgma(distance_matrix, dic, save_order)

        print("\nNouveau dictionnaire :", dic)
        display_distance_matrix(distance_matrix, dic)
        print("\n")
        cpt += 1

    # Close the log with the two remaining clusters and their distance.
    remaining_labels = list(dic.values())
    save_order.append(remaining_labels[0])
    save_order.append(remaining_labels[1])
    save_order.append(distance_matrix.max(axis=0)[0])
    return distance_matrix, dic, save_order

def upgma(distance_matrix, dic, save_order):
    """Perform one merge step: join the closest pair of clusters.

    Args:
        distance_matrix: current square distance matrix.
        dic: current index -> label mapping.
        save_order: merge log, extended by the step.

    Returns:
        ``(new_matrix, new_dic, save_order)`` where the merged cluster sits at
        index 0 of both the matrix (cast to float) and the mapping.
    """
    _, v1, v2 = get_min_v(distance_matrix, dic)  # the minimum value itself is not needed here
    merged_matrix, save_order = update_distance_matrix(distance_matrix, v1, v2, dic, save_order)
    return merged_matrix.astype(float), update_dic(dic, v1, v2), save_order

def get_min_v(distance_matrix, dic):
    """Find the smallest off-diagonal entry and the labels of its row and column.

    Ties are resolved in favour of the first entry met in row-major order,
    because only a strictly smaller value replaces the current best.

    Args:
        distance_matrix: 2-D numpy array.
        dic: mapping from matrix index to label.

    Returns:
        ``(min_value, row_label, column_label)``.
    """
    n_rows, n_cols = distance_matrix.shape
    # Every (row, col) position except the diagonal, in row-major order.
    off_diagonal = (
        (row, col)
        for row in range(n_rows)
        for col in range(n_cols)
        if row != col
    )
    min_v = np.inf
    for row, col in off_diagonal:
        candidate = distance_matrix[row][col]
        if candidate < min_v:
            min_v, index_v1, index_v2 = candidate, row, col
    return min_v, dic[index_v1], dic[index_v2]

def update_dic(dic, v1, v2):
    """Return the label mapping after merging clusters ``v1`` and ``v2``.

    The merged label ``v1 + v2`` takes index 0; every other label keeps its
    relative order and is renumbered from 1.

    Args:
        dic: mapping with keys ``0 .. len(dic) - 1`` to labels.
        v1, v2: labels of the two clusters being merged.

    Returns:
        A new dict; ``dic`` is not modified.
    """
    merged_label = v1 + v2
    survivors = [
        dic[position]
        for position in range(len(dic))
        if dic[position] not in (v1, v2)
    ]
    return dict(enumerate([merged_label] + survivors))

def update_distance_matrix(distance_matrix, v1, v2, dic, save_order):
    """Build the distance matrix obtained after merging clusters ``v1`` and ``v2``.

    Args:
        distance_matrix: current square distance matrix.
        v1, v2: labels of the clusters to merge.
        dic: current index -> label mapping.
        save_order: merge log, extended by ``compute_new_distances``.

    Returns:
        ``(new_matrix, save_order)``; row 0 and column 0 of ``new_matrix``
        hold the distances of the merged cluster.
    """
    # Matrix indices of the two labels (first match in the mapping).
    row_v1 = [key for key, label in dic.items() if label == v1][0]
    row_v2 = [key for key, label in dic.items() if label == v2][0]

    shrunk = delete_values(distance_matrix, row_v1, row_v2)
    merged_row, save_order = compute_new_distances(distance_matrix, v1, v2, dic, save_order)

    # The merged cluster occupies the first row and the first column.
    shrunk[0] = merged_row
    shrunk[:, 0] = merged_row.T
    return shrunk, save_order

def delete_values(distance_matrix, k1, k2):
    """Remove rows/columns ``k1`` and ``k2`` and prepend an all-zero row and column.

    Both indices refer to the ORIGINAL matrix and are removed in a single
    ``np.delete`` call per axis, so the result does not depend on whether
    ``k1`` is smaller or larger than ``k2``. (Deleting one index and then
    shifting the second by -1 is only valid when ``k1 < k2``.)

    Args:
        distance_matrix: square numpy array.
        k1, k2: indices of the two rows/columns to drop.

    Returns:
        A new array with the two rows/columns removed and a zero-filled row 0
        and column 0 inserted, ready to receive the merged cluster's
        distances. The input array is not modified.
    """
    merged = [k1, k2]
    distance_matrix_u = np.delete(distance_matrix, merged, axis=0)
    distance_matrix_u = np.delete(distance_matrix_u, merged, axis=1)
    # Placeholder row, then placeholder column, for the merged cluster.
    distance_matrix_u = np.insert(distance_matrix_u, 0, np.zeros(distance_matrix_u.shape[1]), axis=0)
    distance_matrix_u = np.insert(distance_matrix_u, 0, np.zeros(distance_matrix_u.shape[0]), axis=1)
    return distance_matrix_u

def compute_new_distances(distance_matrix, v1, v2, dic, save_order):
    """Compute the UPGMA distances from the merged cluster ``v1 + v2`` to every other cluster.

    UPGMA averages over all leaves of the merged cluster, so each of the two
    rows is weighted by the number of leaves in its cluster:

        d(v1 u v2, x) = (|v1| * d(v1, x) + |v2| * d(v2, x)) / (|v1| + |v2|)

    A plain ``(d(v1, x) + d(v2, x)) / 2`` is the WPGMA update and only agrees
    with UPGMA when both clusters have the same size.

    The leaf count of a cluster is taken to be ``len(label)``: ``get_dict``
    assigns one-character labels and ``update_dic`` merges labels by
    concatenation, so the label length equals the number of leaves.

    Args:
        distance_matrix: current square distance matrix.
        v1, v2: string labels of the clusters being merged.
        dic: current index -> label mapping.
        save_order: merge log; ``v1``, ``v2`` and their distance are appended
            to it in place.

    Returns:
        ``(new_distances, save_order)`` where ``new_distances`` has length
        ``n - 1``: slot 0 is the diagonal zero, followed by the distances to
        the remaining clusters in their original index order.
    """
    n = distance_matrix.shape[0]
    new_distances = np.zeros(n - 1)  # slot 0 stays 0: diagonal of the merged cluster
    k1 = [k for k, v in dic.items() if v == v1][0]
    k2 = [k for k, v in dic.items() if v == v2][0]
    size1, size2 = len(v1), len(v2)  # leaf counts of the two clusters
    index = 1
    for i in range(n):
        if k1 != i and k2 != i:
            new_distances[index] = (
                size1 * distance_matrix[i][k1] + size2 * distance_matrix[i][k2]
            ) / (size1 + size2)
            index += 1
    save_order.append(v1)
    save_order.append(v2)
    save_order.append(distance_matrix[k1][k2])
    return new_distances, save_order

def display_distance_matrix(distance_matrix, dic):
    """Print the distance matrix with labels as a column header and row suffix.

    Args:
        distance_matrix: 2-D numpy array.
        dic: index -> label mapping; its values are printed in iteration order.
    """
    print("\nDistance matrix :\n")
    _, n_cols = distance_matrix.shape
    labels = dic.values()

    # Header line: one label per column.
    for label in labels:
        print("    ", label, end="")
    print("")

    # One line per label: the row's values followed by the row label.
    for row_index, label in enumerate(labels):
        row = distance_matrix[row_index]
        for col_index in range(n_cols):
            print("   ", row[col_index], end="")
        print("   ", label)

def get_dict(distance_matrix):
    """Create the initial index -> label mapping for a distance matrix.

    Index ``i`` is mapped to ``chr(i + 65)``, i.e. 'A' for 0, 'B' for 1, ...

    Args:
        distance_matrix: numpy array whose first dimension is the number of taxa.

    Returns:
        A dict with keys ``0 .. n - 1`` in ascending order.
    """
    taxa_count = distance_matrix.shape[0]
    return {position: chr(position + 65) for position in range(taxa_count)}

def display_tree(order):
    """Build a Newick topology string from a merge log and print it with ete3.

    Args:
        order: flat list read as consecutive triples
            ``label_1, label_2, value``. The two labels are strings; the
            value is collected and printed but is not written into the
            Newick string.

    Side effects:
        Prints a header, the tree rendered by ``ete3.Tree`` and the list of
        collected values. Returns nothing.
    """
    print("Representation de notre arbre :\n")
    order_str = ""
    valeurs_arbre = []
    for i in range(0, len(order), 3): # Keep only the species labels and format them as "(a,b)"
        order_str += "(" + order[i] + "," + order[i+1] + ")"
        # NOTE(review): i advances in steps of 3, so when len(order) is a multiple of 3
        # this test is always true and a trailing "|" is emitted; the empty last element
        # produced by split("|") is skipped by the range(len(order_str)-1) loop below.
        if i != len(order)-2:
            order_str += "|"
        valeurs_arbre.append(order[i+2])
    visited = [] # Merged labels (item1+item2) that already have a formatted subtree
    order_str = order_str.split("|")
    final_str = "" # Final string
    save_str = [] # Stack holding the formatted subtree strings
    for i in range(len(order_str)-1): # For each pair of species
        clean = order_str[i].replace("(","")
        clean = clean.replace(")","")
        split = clean.split(",")
        item1 = split[0] # Split the pair into its two labels
        item2 = split[1]
        if item1 in visited and item2 in visited: # Both labels already have a formatted subtree
            save1 = save_str.pop() # Pop the two most recent stack entries: they are
            save2 = save_str.pop() # used as the subtrees of the two labels
            str_tmp = "(" + save1 + "," + save2 + ")"
            save_str.append(str_tmp) # Push the newly formatted string
            final_str = final_str.replace(save1,"") # Remove the now-nested subtrees from the final string
            final_str = final_str.replace(save2,"")
        elif item1 in visited : # Label 1 has a formatted subtree, label 2 does not
            save1 = save_str.pop() # Pop the already formatted subtree
            str_tmp = "(" + save1 + "," + item2 + ")"
            final_str = final_str.replace(save1,"") # Remove the now-nested subtree from the final string
            save_str.append(str_tmp) # Push the newly formatted string
        elif item2 in visited : # Label 2 has a formatted subtree, label 1 does not
            save1 = save_str.pop() # Pop the already formatted subtree
            str_tmp = "(" + item1 + "," + save1 + ")"
            final_str = final_str.replace(save1,"") # Remove the now-nested subtree from the final string
            save_str.append(str_tmp) # Push the newly formatted string
        else: # Neither label has a formatted subtree yet
            str_tmp = "(" + item1 + "," + item2 + ")"
            save_str.append(str_tmp) # Push the newly formatted string
        final_str += str_tmp # Append the string built in this iteration to the final string
        visited.append(item1+item2) # Record the merged label as visited
    final_str += ";" # Newick terminator
    unrooted_tree = Tree(final_str) # Use the ete3 library to render the tree
    print(unrooted_tree)
    print("\nValeurs de l'arbres : ", valeurs_arbre)

In [4]:
# Run the UPGMA loop on the matrix defined above; only the final matrix and the merge log are kept.
print("----------------------------> Algorithme UPGMA : <----------------------------")
distance_matrix_final, _, order_final = upgma_loop(distance_matrix)

# Print the raw merge log, then render it as a tree.
print("----------------------------> Fabrication de l'arbe de philogenie : <----------------------------\n")
print("Arbre final : ", order_final)
display_tree(order_final)

----------------------------> Algorithme UPGMA : <----------------------------

Dictionnaire : {0: 'A', 1: 'B', 2: 'C', 3: 'D', 4: 'E', 5: 'F'}

Distance matrix :

     A     B     C     D     E     F
    0.0    2.0    4.0    6.0    6.0    8.0    A
    2.0    0.0    4.0    6.0    6.0    8.0    B
    4.0    4.0    0.0    6.0    6.0    8.0    C
    6.0    6.0    6.0    0.0    4.0    8.0    D
    6.0    6.0    6.0    4.0    0.0    8.0    E
    8.0    8.0    8.0    8.0    8.0    0.0    F

Nouveau dictionnaire : {0: 'AB', 1: 'C', 2: 'D', 3: 'E', 4: 'F'}

Distance matrix :

     AB     C     D     E     F
    0.0    4.0    6.0    6.0    8.0    AB
    4.0    0.0    6.0    6.0    8.0    C
    6.0    6.0    0.0    4.0    8.0    D
    6.0    6.0    4.0    0.0    8.0    E
    8.0    8.0    8.0    8.0    0.0    F



Dictionnaire : {0: 'AB', 1: 'C', 2: 'D', 3: 'E', 4: 'F'}

Distance matrix :

     AB     C     D     E     F
    0.0    4.0    6.0    6.0    8.0    AB
    4.0    0.0    6.0    6.0    8

# 2. Neighbor Joining :

In [5]:
# Input for Neighbor Joining: pairwise distances between the six taxa;
# row/column i corresponds to the label chr(65 + i) assigned by get_dict.
distance_matrix = np.array([
    [0.0, 2.0, 4.0, 6.0, 6.0, 8.0],  # A
    [2.0, 0.0, 4.0, 6.0, 6.0, 8.0],  # B
    [4.0, 4.0, 0.0, 6.0, 6.0, 8.0],  # C
    [6.0, 6.0, 6.0, 0.0, 4.0, 8.0],  # D
    [6.0, 6.0, 6.0, 4.0, 0.0, 8.0],  # E
    [8.0, 8.0, 8.0, 8.0, 8.0, 0.0],  # F
])

In [6]:
def nj_loop(distance_matrix):
    """Run the Neighbor Joining step repeatedly until only two clusters remain.

    Each iteration prints the current labels and matrix, merges one pair
    through ``nj`` and prints the result.

    Args:
        distance_matrix: square numpy array of pairwise distances.

    Returns:
        A tuple ``(distance_matrix, dic, save_order)``: the final 2x2 matrix,
        the final index -> label mapping, and the merge log as a flat list of
        ``label_1, label_2, distance`` triples (the last triple is the final
        two clusters and the largest value of the first column).
    """
    dic = get_dict(distance_matrix)  # index -> label mapping
    save_order = []
    cpt = 1
    # Stop once the matrix has been reduced to the last two clusters.
    while distance_matrix.shape[0] > 2:
        print("===================== Nombre d'itération :", cpt, "=====================")
        print("\nDictionnaire :", dic)
        display_distance_matrix(distance_matrix, dic)

        distance_matrix, dic, save_order = nj(distance_matrix, dic, save_order)

        print("\nNouveau dictionnaire :", dic)
        display_distance_matrix(distance_matrix, dic)
        cpt += 1

    # Close the log with the two remaining clusters and their distance.
    remaining_labels = list(dic.values())
    save_order.append(remaining_labels[0])
    save_order.append(remaining_labels[1])
    save_order.append(distance_matrix.max(axis=0)[0])
    return distance_matrix, dic, save_order

def nj(distance_matrix, dic, save_order):
    """Perform one Neighbor Joining step: join the pair with the smallest Q value.

    Args:
        distance_matrix: current square distance matrix.
        dic: current index -> label mapping.
        save_order: merge log, extended by the step.

    Returns:
        ``(new_matrix, new_dic, save_order)`` where the merged cluster sits at
        index 0 of both the matrix (cast to float) and the mapping.
    """
    # The pair is selected on the Q matrix, but the distances are updated
    # from the original distance matrix.
    q_values = compute_q(distance_matrix, compute_u(distance_matrix))
    _, v1, v2 = get_min_v(q_values, dic)
    merged_matrix, save_order = update_distance_matrix(distance_matrix, v1, v2, dic, save_order)
    return merged_matrix.astype(float), update_dic(dic, v1, v2), save_order

def compute_u(distance_matrix):
    """Return, for each taxon, the sum of its column of distances divided by ``n - 2``.

    Args:
        distance_matrix: square numpy array with ``n`` rows.

    Returns:
        1-D array of length ``n``.
    """
    taxa_count = distance_matrix.shape[0]
    column_totals = distance_matrix.sum(axis=0)
    return column_totals / (taxa_count - 2)

def compute_q(distance_matrix, u_values):
    """Build the Q matrix used to pick the pair to join.

    Off-diagonal entries are ``d[i][j] - u[i] - u[j]``; the diagonal is left
    at zero.

    Args:
        distance_matrix: 2-D numpy array of distances.
        u_values: per-taxon values as returned by ``compute_u``.

    Returns:
        Float array with the same shape as ``distance_matrix``.
    """
    n_rows, n_cols = distance_matrix.shape
    q_values = np.zeros((n_rows, n_cols))
    u_array = np.asarray(u_values)

    # Row and column index of every cell, and a mask selecting the off-diagonal ones.
    row_index, col_index = np.indices((n_rows, n_cols))
    off_diagonal = row_index != col_index

    q_values[off_diagonal] = (
        distance_matrix[off_diagonal]
        - u_array[row_index[off_diagonal]]
        - u_array[col_index[off_diagonal]]
    )
    return q_values

def compute_new_distances(distance_matrix, v1, v2, dic, save_order): # Redefinition of the function for the NJ algorithm
    """Compute the Neighbor Joining distances from the merged cluster to every other cluster.

    For every remaining cluster ``x`` the new distance is
    ``(d(x, v1) + d(x, v2) - d(v1, v2)) / 2``.

    NOTE: this definition replaces the function of the same name defined
    earlier in the file. ``update_distance_matrix`` looks the name up at call
    time, so once this definition has been executed every later call uses
    this formula.

    Args:
        distance_matrix: current square distance matrix.
        v1, v2: labels of the clusters being merged.
        dic: current index -> label mapping.
        save_order: merge log; ``v1``, ``v2`` and their distance are appended
            to it in place.

    Returns:
        ``(new_distances, save_order)`` where ``new_distances`` has length
        ``n - 1``: slot 0 is the diagonal zero, followed by the distances to
        the remaining clusters in their original index order.
    """
    taxa_count = distance_matrix.shape[0]
    new_distances = np.zeros(taxa_count - 1)  # slot 0 stays 0: diagonal of the merged cluster
    k1 = [key for key, label in dic.items() if label == v1][0]
    k2 = [key for key, label in dic.items() if label == v2][0]

    # Indices of the clusters that are not part of the merge, in matrix order.
    others = [i for i in range(taxa_count) if i not in (k1, k2)]
    for slot, i in enumerate(others, start=1):
        new_distances[slot] = (distance_matrix[i][k1] + distance_matrix[i][k2] - distance_matrix[k1][k2]) / 2

    save_order.append(v1)
    save_order.append(v2)
    save_order.append(distance_matrix[k1][k2])
    return new_distances, save_order

In [7]:
# Run the Neighbor Joining loop on the matrix defined above; only the final matrix and the merge log are kept.
print("----------------------------> Algorithme NJ : <----------------------------")
distance_matrix_final, _, order_final = nj_loop(distance_matrix)

# Print the raw merge log, then render it as a tree.
print("----------------------------> Fabrication de l'arbe de philogenie : <----------------------------\n")
print("Arbre final : ", order_final)
display_tree(order_final)

----------------------------> Algorithme NJ : <----------------------------

Dictionnaire : {0: 'A', 1: 'B', 2: 'C', 3: 'D', 4: 'E', 5: 'F'}

Distance matrix :

     A     B     C     D     E     F
    0.0    2.0    4.0    6.0    6.0    8.0    A
    2.0    0.0    4.0    6.0    6.0    8.0    B
    4.0    4.0    0.0    6.0    6.0    8.0    C
    6.0    6.0    6.0    0.0    4.0    8.0    D
    6.0    6.0    6.0    4.0    0.0    8.0    E
    8.0    8.0    8.0    8.0    8.0    0.0    F

Nouveau dictionnaire : {0: 'AB', 1: 'C', 2: 'D', 3: 'E', 4: 'F'}

Distance matrix :

     AB     C     D     E     F
    0.0    3.0    5.0    5.0    7.0    AB
    3.0    0.0    6.0    6.0    8.0    C
    5.0    6.0    0.0    4.0    8.0    D
    5.0    6.0    4.0    0.0    8.0    E
    7.0    8.0    8.0    8.0    0.0    F

Dictionnaire : {0: 'AB', 1: 'C', 2: 'D', 3: 'E', 4: 'F'}

Distance matrix :

     AB     C     D     E     F
    0.0    3.0    5.0    5.0    7.0    AB
    3.0    0.0    6.0    6.0    8.0   