In [1]:
import numpy as np
import copy

# Week 1: Introduction to Evolutionary Tree Construction

In [2]:
'''
Distances Between Leaves Problem: Compute the distances between leaves in a weighted tree.
Input:  An integer n followed by the adjacency list of a weighted tree with n leaves.
Output: An n x n matrix (d_{i,j}), where d_{i,j} is the length of the path between leaves i and j.
'''

def Distances_Matrix(n, adjacency_list):
    '''Compute the path length between every pair of leaves of a weighted tree.

    The leaves are taken to be the nodes labelled 0 .. n - 1; every other
    label is treated as an internal node.

    Args:
        n: number of leaves.
        adjacency_list: either one newline-separated string or an iterable of
            strings, each entry of the form 'a->b:w' (integer labels a and b,
            integer weight w).  An edge is only followed in the direction in
            which it is listed, so undirected trees list both directions.

    Returns:
        An n x n integer numpy array whose (i, j) entry is the length of the
        path from leaf i to leaf j.  A leaf that cannot be reached from leaf i
        keeps the value 0.
    '''
    if isinstance(adjacency_list, str):
        adjacency_list = adjacency_list.split('\n')

    graph = dict()         # node -> list of neighbouring nodes
    graph_weight = dict()  # (node, neighbour) -> weight of that edge
    for adjacency in adjacency_list:
        adjacency = adjacency.strip()
        if not adjacency:
            # Tolerate blank lines (e.g. a trailing newline in the input).
            continue
        edge, weight = adjacency.split(':')
        source, target = (int(label) for label in edge.split('->'))
        graph_weight[(source, target)] = int(weight)
        graph.setdefault(source, []).append(target)

    length_matrix = np.full([n, n], 0, int)

    for from_ in range(n):
        # Track visited nodes explicitly.  Using "distance == 0" as the
        # visited marker breaks on zero-weight edges: an already processed
        # node looks unvisited, is expanded again and overwrites distances
        # that were already correct.
        distance = {from_: 0}
        pending = [from_]
        while pending:
            node = pending.pop()
            for to in graph.get(node, ()):
                if to in distance:
                    continue
                distance[to] = distance[node] + graph_weight[(node, to)]
                pending.append(to)

        for to in range(n):
            length_matrix[from_, to] = distance.get(to, 0)

    return length_matrix

# Test: sample tree given as one 'a->b:w' entry per line, with every edge
# listed in both directions.  With n = 4, Distances_Matrix treats nodes 0-3
# as the leaves; 4 and 5 are the remaining (internal) nodes.
n = 4
adjacency_list = '''0->4:11
1->4:2
2->5:6
3->5:7
4->0:11
4->1:2
4->5:4
5->4:4
5->3:7
5->2:6'''

# Bare call: the returned matrix is not stored or printed here.
Distances_Matrix(n, adjacency_list)

array([[ 0, 13, 21, 22],
       [13,  0, 12, 13],
       [21, 12,  0, 13],
       [22, 13, 13,  0]])

In [3]:
'''
Code Challenge: Solve the Limb Length Problem.
Input: An integer n, followed by an integer j between 0 and n - 1, followed by a space-separated additive distance matrix D (whose elements are integers).
Output: The limb length of the leaf in Tree(D) corresponding to row j of this distance matrix (use 0-based indexing).
'''

def Limb_Length(j, length_matrix):
    '''Return the limb length of leaf j for a distance matrix.

    The limb length is the minimum of (D[i, j] + D[j, k] - D[i, k]) / 2 over
    all pairs of distinct leaves i and k that both differ from j.

    Args:
        j: 0-based index of the leaf whose limb length is wanted.
        length_matrix: square distance matrix, given either as a numpy array
            (or nested sequence) or as text with one whitespace-separated row
            of integers per line.

    Returns:
        The limb length truncated to an int, or float('Inf') when the matrix
        has fewer than three leaves (no valid pair i, k exists).
    '''
    if isinstance(length_matrix, str):
        # Take the shape from the text itself rather than from a global ``n``,
        # so the function also works outside the notebook session.
        rows = [line.split() for line in length_matrix.splitlines() if line.strip()]
        length_matrix = np.array([[int(value) for value in row] for row in rows])
    else:
        length_matrix = np.asarray(length_matrix)

    others = [leaf for leaf in range(length_matrix.shape[0]) if leaf != j]

    min_length = float('Inf')
    for i in others:
        for k in others:
            if k == i:
                continue
            length = (length_matrix[i, j] + length_matrix[j, k] - length_matrix[i, k]) / 2
            if length < min_length:
                min_length = length

    if min_length == float('Inf'):
        return min_length
    # Truncate once at the end; the comparisons above use the exact values.
    return int(min_length)

# Test: limb length of leaf j = 1 for a 4 x 4 distance matrix given as text
# (one space-separated row per line); n is the matrix dimension.
n = 4
j = 1
length_matrix = '''0 13 21 22
13 0 12 13
21 12 0 13
22 13 13 0'''

# Bare call: the returned limb length is not stored or printed here.
Limb_Length(j, length_matrix)

2

In [4]:
'''
Code Challenge: Implement AdditivePhylogeny to solve the Distance-Based Phylogeny Problem.
Input: An integer n followed by a space-separated n x n distance matrix.
Output: A weighted adjacency list for the simple tree fitting this matrix.
'''

def Attached_Limb(length_matrix, j):
    '''Find two leaves whose connecting path passes through leaf j.

    Intended for a "bald" matrix, i.e. one in which the limb of leaf j has
    already been subtracted, so that j sits on the path between the returned
    leaves.

    Args:
        length_matrix: square distance matrix (numpy array or nested sequence).
        j: index of the leaf to locate.

    Returns:
        The first pair (i, k) in row-major order, with i, k and j pairwise
        distinct, such that D[i, k] == D[i, j] + D[j, k]; None if no such
        pair exists.
    '''
    length_matrix = np.asarray(length_matrix)
    size = length_matrix.shape[0]

    for i in range(size):
        if i == j:
            continue
        for k in range(size):
            # i == k would satisfy the test only in a degenerate way and
            # does not describe a path, so it is skipped.
            if k == j or k == i:
                continue
            # The test must use leaf j itself.  Comparing against j - 1 is
            # trivially true for k == j - 1, whatever the matrix contains.
            if length_matrix[i, k] == length_matrix[i, j] + length_matrix[j, k]:
                return (i, k)

    return None

def Find_Path(nodes, current, final, path, visited, final_path):
    '''Depth-first search for the path from ``current`` to ``final``.

    Args:
        nodes: adjacency mapping, node -> {neighbour: edge length}.
        current: node the search continues from.
        final: node to reach.
        path: nodes walked so far, excluding ``current``; not modified.
        visited: list of nodes already seen; extended in place.
        final_path: output list; the complete path (both end points
            included) is appended to it once ``final`` is reached.

    Returns:
        ``final_path`` in every case.  It is left unchanged when ``final``
        cannot be reached from ``current``.
    '''
    path = path + [current]
    visited.append(current)

    if current == final:
        final_path.extend(path)
        return final_path

    for neighbor in nodes[current]:
        if neighbor in visited:
            continue
        found_before = len(final_path)
        Find_Path(nodes, int(neighbor), final, path, visited, final_path)
        if len(final_path) > found_before:
            # The target was reached through this neighbour; stop searching.
            break

    return final_path
 
def add_to_graph(length_matrix,nodes,n,m,i,k,x):
    '''Attach leaf ``n`` to the tree at distance ``x`` from leaf ``i`` on the path to ``k``.

    Args:
        length_matrix: distance matrix passed to Limb_Length to obtain the
            limb length of leaf ``n``.
        nodes: adjacency mapping, node -> {neighbour: edge length}; modified
            in place.
        n: label of the leaf to add.
        m: one-element list; ``m[0]`` is the label given to a newly created
            internal node and is incremented when one is created.
        i, k: leaves between which the attachment point lies.
        x: distance from ``i`` to the attachment point.

    Returns:
        ``nodes`` with leaf ``n`` attached.

    Raises:
        ValueError: if no point at distance ``x`` exists on the path from
            ``i`` to ``k`` (previously the tree was returned silently without
            the new leaf).
    '''
    final_path = []
    # Find_Path fills final_path in place, so read the list rather than the
    # return value.
    Find_Path(nodes, i, k, [], [], final_path)
    limb_length = Limb_Length(n, length_matrix)

    total_length = 0
    for current_node, next_node in zip(final_path, final_path[1:]):
        length_between = nodes[current_node][next_node]
        total_length = total_length + length_between

        if total_length == x:
            # The attachment point coincides with an existing node.
            nodes[next_node][n] = limb_length
            nodes[n] = {next_node: limb_length}
            return nodes

        if total_length > x:
            # The attachment point lies inside this edge: split the edge with
            # a new internal node and hang the leaf from it.
            length1 = x - (total_length - length_between)
            length2 = total_length - x
            new_node = m[0]

            nodes[current_node].pop(next_node)
            nodes[next_node].pop(current_node)

            nodes[current_node][new_node] = length1
            nodes[next_node][new_node] = length2
            nodes[new_node] = {current_node: length1, next_node: length2, n: limb_length}
            nodes[n] = {new_node: limb_length}
            m[0] = new_node + 1
            return nodes

    raise ValueError(
        'no attachment point at distance ' + str(x) + ' on the path from '
        + str(i) + ' to ' + str(k)
    )

def AdditivePhylogeny(length_matrix,n,m):
    '''Recursively build the simple tree that fits an additive distance matrix.

    Args:
        length_matrix: numpy array of distances between leaves 0 .. n, i.e. an
            (n + 1) x (n + 1) matrix.  Text input must be parsed by the caller.
        n: index of the last leaf (number of leaves minus one).
        m: one-element list holding the label for the next internal node; it
            is passed on to add_to_graph, which advances it.

    Returns:
        Adjacency mapping, node -> {neighbour: edge length}.

    Raises:
        ValueError: if fewer than two leaves are given, or if no pair of
            leaves has leaf ``n`` on its path (the matrix is not additive).
    '''
    if n < 1:
        raise ValueError('AdditivePhylogeny needs at least two leaves')

    if n == 1:
        # Two leaves: a single edge.  Key order (1, then 0) is kept so the
        # printed adjacency list keeps its order.
        return {1: {0: length_matrix[0, 1]}, 0: {1: length_matrix[0, 1]}}

    limb_length = Limb_Length(n, length_matrix)

    # "Bald" matrix: remove the limb of leaf n so that n lies on a path
    # between two of the other leaves.
    sub_matrix = copy.deepcopy(length_matrix)
    sub_matrix[:n, n] = sub_matrix[:n, n] - limb_length
    sub_matrix[n, :n] = sub_matrix[:n, n]

    attachment = Attached_Limb(sub_matrix, n)
    if attachment is None:
        raise ValueError('distance matrix is not additive: leaf ' + str(n)
                         + ' lies on no path between two other leaves')
    (i, k) = attachment

    # Distance from leaf i to the point where leaf n attaches.
    x = sub_matrix[i, n]

    # Solve the problem without leaf n, then attach leaf n to that tree.
    nodes = AdditivePhylogeny(sub_matrix[: -1, : -1], n - 1, m)
    return add_to_graph(length_matrix, nodes, n, m, i, k, x)

# Test: parse the sample 4 x 4 matrix, build its tree and print the
# adjacency list, one 'a->b:w' entry per line.
n = 4
length_matrix = '''0 13 21 22
13 0 12 13
21 12 0 13
22 13 13 0'''
if isinstance(length_matrix, str):
    length_matrix = np.array(
        list(map(int, length_matrix.replace('\n', ' ').split(' ')))
    ).reshape(n, n)

# AdditivePhylogeny takes the index of the last leaf (n - 1) and a
# one-element list holding the first label for internal nodes (n).
tmp = AdditivePhylogeny(length_matrix, n - 1, [n])
for key1, values1 in tmp.items():
    for key2, value2 in values1.items():
        print(f'{key1}->{key2}:{value2}')

1->4:2
0->4:11
4->0:11
4->1:2
4->5:4
2->5:6
5->4:4
5->2:6
5->3:7
3->5:7


# Week 2: More Algorithms for Constructing Trees from Distance Matrices

In [5]:
'''
Code Challenge: Implement UPGMA.
Input: An integer n followed by a space-separated n x n distance matrix.
Output: An adjacency list for the ultrametric tree returned by UPGMA. 
'''

def UPGMA(n, length_matrix):
    '''Build an ultrametric tree with UPGMA.

    The two closest clusters are merged repeatedly.  A merged cluster gets
    age D / 2, where D is the distance between the two merged clusters, and
    its distance to any other cluster is the average of the two old
    distances weighted by cluster size.

    Args:
        n: number of leaves (labelled 0 .. n - 1).
        length_matrix: n x n distance matrix, either a numpy array (or nested
            sequence) or whitespace-separated integers as text.

    Returns:
        Adjacency mapping, node -> {neighbour: edge length}; internal nodes
        are labelled n, n + 1, ... in the order they are created.

    Raises:
        ValueError: if no closest pair can be selected (no off-diagonal entry
            compares below infinity).
    '''
    if isinstance(length_matrix, str):
        values = [int(token) for token in length_matrix.split()]
        length_matrix = np.array(values).reshape(n, n)
    else:
        length_matrix = np.asarray(length_matrix)

    clusters = list(range(n))
    ages = {node: 0 for node in clusters}
    n_node_cluster = {node: 1 for node in clusters}
    graph = dict()

    while len(clusters) > 1:
        size = length_matrix.shape[0]

        # Find the closest pair.  Only entries above the diagonal are looked
        # at, so a zero distance between two distinct clusters is a valid
        # minimum and is not mistaken for a diagonal entry.
        min_length = float('Inf')
        min_location = None
        for row in range(size):
            for col in range(row + 1, size):
                if length_matrix[row, col] < min_length:
                    min_length = length_matrix[row, col]
                    min_location = (row, col)
        if min_location is None:
            raise ValueError('no closest pair of clusters found in the distance matrix')

        i_index, j_index = min_location
        i = clusters[i_index]
        j = clusters[j_index]
        size_i = n_node_cluster[i]
        size_j = n_node_cluster[j]

        # The newest cluster is always in the list, so max + 1 is unused.
        new_cluster = max(clusters) + 1
        n_node_cluster[new_cluster] = size_i + size_j

        clusters.remove(i)
        clusters.remove(j)
        clusters.append(new_cluster)

        ages[new_cluster] = min_length / 2

        # Connect the new cluster with both children (both directions).
        for child in (i, j):
            branch = ages[new_cluster] - ages[child]
            graph.setdefault(new_cluster, {})[child] = branch
            graph.setdefault(child, {})[new_cluster] = branch

        # Distances from the new cluster to every remaining cluster.  Rows
        # are skipped by index, not by "distance is zero", so zero distances
        # to other clusters are kept.
        new_col = [
            (length_matrix[row, i_index] * size_i + length_matrix[row, j_index] * size_j)
            / (size_i + size_j)
            for row in range(size)
            if row != i_index and row != j_index
        ]

        # Drop the merged rows/columns, then append the new cluster last.
        length_matrix = np.delete(length_matrix, [i_index, j_index], 0)
        length_matrix = np.delete(length_matrix, [i_index, j_index], 1)

        length_matrix = np.vstack((length_matrix, np.array(new_col).reshape(1, len(new_col))))
        new_col.append(0)
        length_matrix = np.hstack((length_matrix, np.array(new_col).reshape(len(new_col), 1)))

    return graph

# Test: run UPGMA on the sample 4 x 4 matrix (passed as text) and print the
# resulting adjacency list, one 'a->b:w' entry per line.
n = 4
length_matrix = '''0 20 17 11
20 0 20 13
17 20 0 10
11 13 10 0'''

tmp = UPGMA(n, length_matrix)

for key1, values1 in tmp.items():
    for key2, value2 in values1.items():
        print(f'{key1}->{key2}:{value2}')

4->2:5.0
4->3:5.0
4->5:2.0
2->4:5.0
3->4:5.0
5->0:7.0
5->4:2.0
5->6:1.83333333333
0->5:7.0
6->1:8.83333333333
6->5:1.83333333333
1->6:8.83333333333


In [6]:
'''
Code Challenge: Implement NeighborJoining.
Input: An integer n, followed by an n x n distance matrix.
Output: An adjacency list for the tree resulting from applying the neighbor-joining algorithm. 
'''

def NeighborJoining(n, length_matrix, nodes, m):
    '''Build a tree from a distance matrix with the neighbor-joining algorithm.

    Args:
        n: current number of rows/columns of ``length_matrix``.
        length_matrix: n x n distance matrix (numpy array or nested sequence).
        nodes: node label for each row/column of ``length_matrix``; the
            caller's list is not modified.
        m: label to give the next internal node.

    Returns:
        Adjacency mapping, node -> {neighbour: edge length}.

    Raises:
        ValueError: if n < 2, or if no pair of neighbours can be selected.
    '''
    length_matrix = np.asarray(length_matrix)
    # Work on a copy so the recursion does not alter the caller's list.
    nodes = list(nodes)

    if n < 2:
        raise ValueError('NeighborJoining needs at least two nodes')

    if n == 2:
        return {
            nodes[0]: {nodes[1]: length_matrix[0, 1]},
            nodes[1]: {nodes[0]: length_matrix[1, 0]},
        }

    total_distance = [np.sum(length_matrix[row, :]) for row in range(n)]

    # Find the minimum of the neighbor-joining matrix
    #   D*[i, j] = (n - 2) * D[i, j] - TotalDistance(i) - TotalDistance(j)
    # over i < j.  Restricting to i < j already excludes the diagonal, so
    # entries equal to zero need no special treatment.
    min_length = float('Inf')
    min_location = None
    for row in range(n):
        for col in range(row + 1, n):
            value = (n - 2) * length_matrix[row, col] - total_distance[row] - total_distance[col]
            if value < min_length:
                min_length = value
                min_location = (row, col)
    if min_location is None:
        raise ValueError('no pair of neighbours found in the distance matrix')
    i_index, j_index = min_location

    i = nodes[i_index]
    j = nodes[j_index]

    # Limb lengths of the two joined nodes.
    delta = (total_distance[i_index] - total_distance[j_index]) / (n - 2)
    limb_i = (length_matrix[i_index, j_index] + delta) / 2
    limb_j = (length_matrix[i_index, j_index] - delta) / 2

    # The new internal node replaces i and j and becomes the last row/column.
    new_node = m
    nodes.append(new_node)
    nodes.remove(i)
    nodes.remove(j)

    new_col = [
        (length_matrix[k, i_index] + length_matrix[k, j_index] - length_matrix[i_index, j_index]) / 2
        for k in range(n)
    ]
    length_matrix = np.vstack((length_matrix, np.array(new_col).reshape(1, len(new_col))))
    new_col.append(0)
    length_matrix = np.hstack((length_matrix, np.array(new_col).reshape(len(new_col), 1)))

    length_matrix = np.delete(length_matrix, [i_index, j_index], 0)
    length_matrix = np.delete(length_matrix, [i_index, j_index], 1)

    # Solve the smaller problem, then hang i and j from the new node.
    graph = NeighborJoining(n - 1, length_matrix, nodes, m + 1)

    graph.setdefault(new_node, {})[i] = limb_i
    graph[new_node][j] = limb_j
    graph.setdefault(i, {})[new_node] = limb_i
    graph.setdefault(j, {})[new_node] = limb_j

    return graph

# Test: parse the sample 4 x 4 matrix, run neighbor joining and print the
# resulting adjacency list, one 'a->b:w' entry per line.
n = 4
length_matrix = '''0 23 27 20
23 0 30 28
27 30 0 30
20 28 30 0'''

# Leaves are labelled 0 .. n - 1; internal nodes are numbered from n upwards.
nodes = list(range(n))
m = n
if isinstance(length_matrix, str):
    # Empty tokens (from repeated separators) are dropped before conversion.
    length_matrix = np.array(
        [int(token) for token in length_matrix.replace('\n', ' ').split(' ') if token != '']
    ).reshape(n, n)


tmp = NeighborJoining(n, length_matrix, nodes, m)
for key1, values1 in tmp.items():
    for key2, value2 in values1.items():
        print(f'{key1}->{key2}:{value2}')

4->5:2.0
4->0:8.0
4->3:12.0
5->4:2.0
5->1:13.5
5->2:16.5
1->5:13.5
2->5:16.5
0->4:8.0
3->4:12.0
