# Bioinformatics 2021: Week 4

### Working with trees

In [58]:
# Uncomment to install the dependency into the kernel's environment:
# %pip install dendropy

In [59]:
import dendropy

In [86]:
# Build the tree (((C:4,D:5):1,B:2):1,A:3) by hand, working from the root down.
# Reference: https://dendropy.org/primer/trees.html#building-a-tree-programmatically

taxon_namespace = dendropy.TaxonNamespace(["A", "B", "C", "D"])
tree = dendropy.Tree(taxon_namespace=taxon_namespace)
root = tree.seed_node

# The root splits into an internal node (edge length 1) and leaf A (edge length 3).
clade_bcd = root.new_child(edge_length=1)
leaf_a = root.new_child(edge_length=3, taxon=taxon_namespace.get_taxon("A"))

# That internal node splits into another internal node (edge length 1) and leaf B (edge length 2).
clade_cd = clade_bcd.new_child(edge_length=1)
leaf_b = clade_bcd.new_child(edge_length=2, taxon=taxon_namespace.get_taxon("B"))

# The innermost node carries leaves C (edge length 4) and D (edge length 5).
leaf_c = clade_cd.new_child(edge_length=4, taxon=taxon_namespace.get_taxon("C"))
leaf_d = clade_cd.new_child(edge_length=5, taxon=taxon_namespace.get_taxon("D"))

In [87]:
# Visual representation: print the tree as a text (ASCII-art) plot.
# NOTE(review): in the plot printed for this tree, the C and D branches look equally
# long although their edge lengths are 4 and 5 — presumably the default layout shows
# topology rather than edge lengths; confirm in the DendroPy docs.
print(tree.as_ascii_plot())

                                                                             /-------------------------------------- C
                                      /--------------------------------------+                                        
/-------------------------------------+                                      \-------------------------------------- D
|                                     |                                                                               
+                                     \----------------------------------------------------------------------------- B
|                                                                                                                     
\------------------------------------------------------------------------------------------------------------------- A
                                                                                                                      
                                                

In [88]:
# Newick format: print the tree serialised as a Newick string
# (nested parentheses with "name:edge_length" entries, terminated by ";").
print(tree.as_string("newick"))

(((C:4,D:5):1,B:2):1,A:3);



### Working with BioPython

In [90]:
# Uncomment to install the dependency into the kernel's environment:
# %pip install biopython

In [91]:
from Bio import pairwise2
from Bio.pairwise2 import format_alignment

In [92]:
# Global alignment of two DNA sequences.
# globalms takes explicit scores: match = 2, mismatch = -1.5,
# gap open = -1, gap extend = -0.5.
seq_a = "AGTCGATGCGATAGCCGTA"
seq_b = "AGTCGTGCGAAAGCCCTA"
alignments = pairwise2.align.globalms(seq_a, seq_b, 2, -1.5, -1, -0.5)

# Pretty-print the first alignment in the returned list.
first_alignment = alignments[0]
print(format_alignment(*first_alignment))

AGTCGATGCGATAGCCGTA
||||| |||||.||||.||
AGTCG-TGCGAAAGCCCTA
  Score=28



In [93]:
# Fields of the first returned alignment: both gapped sequences and the score.
top_alignment = alignments[0]
(top_alignment.seqA, top_alignment.seqB, top_alignment.score)

('AGTCGATGCGATAGCCGTA', 'AGTCG-TGCGAAAGCCCTA', 28.0)

### Working with numpy 

In [94]:
import numpy as np

In [95]:
# Build a 3x3 float matrix whose entry [i, j] is (i + 1) + 10 * (j + 1),
# i.e. the ones digit encodes the row and the tens digit encodes the column.
# Broadcasting a column vector against a row vector replaces the nested loops and
# avoids the low-level np.ndarray constructor, which hands back uninitialised memory.
row_part = np.arange(1, 4).reshape(-1, 1)  # column vector: 1, 2, 3
col_part = np.arange(1, 4) * 10            # row vector: 10, 20, 30
arr = (row_part + col_part).astype(float)

In [96]:
arr

array([[11., 21., 31.],
       [12., 22., 32.],
       [13., 23., 33.]])

### Tasks

In [97]:
# Task 1: construct pairwise distance matrix for a list of sequences:
# - align two sequences
# - use hamming_distance

def distance_matrix_pairwise(seqs):
    """Return the pairwise distance matrix for a list of sequences (Task 1 template).

    Intended behaviour, still to be implemented at the placeholder below:
    align each pair of sequences and store the Hamming distance of the
    aligned pair in ``dm[i, j]``.

    Parameters
    ----------
    seqs : list of str
        Sequences to compare, e.g. ``["AA", "AC", "GC"]``.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(len(seqs), len(seqs))``. Until the exercise is
        filled in, every entry is 0.
    """
    n = len(seqs)
    # np.zeros instead of the raw np.ndarray constructor: the latter leaves the
    # buffer uninitialised, so an unfinished solution would return arbitrary values.
    dm = np.zeros((n, n))
    # your code

    return dm

# Example 1: distance_matrix_pairwise(["AA", "AC", "GC"]) ->
#     [[0, 1, 2],
#      [1, 0, 1],
#      [2, 1, 0]]

In [89]:
# Task 2: perform hierarchical clustering

def hierarchical_clustering(dm, taxa):
    """Build a tree over ``taxa`` by hierarchical clustering of ``dm`` (Task 2 template).

    Only the scaffolding exists so far: a taxon namespace is created from
    ``taxa`` and an otherwise empty tree bound to it is returned. ``dm`` is
    not used yet; the clustering itself belongs at the placeholder below.
    """
    namespace = dendropy.TaxonNamespace(taxa)
    tree = dendropy.Tree(taxon_namespace=namespace)
    # your code

    return tree

# Inputs:
# - NxN distance matrix for N objects
# - list of names for N objects

# Brief notes:
# - two nearest objects are joined into a cluster
# - the corresponding edge_length should be taken as their respective distance / 2
# - the algorithm iterates until a single root cluster is formed
# - distance between clusters should be calculated as mean pairwise distance 
#   between all objects in the 1st cluster and all objects in the 2nd cluster (or suggest your idea?)