# Tree Parse

---
## Objectives

1. Convert Newick tree to PhyloXML format.
1. Convert tree to data frame.
1. Add metadata to PhyloXML and data frame.

---
## Setup

### Module Imports

In [1]:
import pandas as pd

from Bio import Phylo

import os
import copy

### Input File Paths

In [2]:
# Input tree (path points at IQ-TREE results) and the tab-separated sample metadata.
tree_path = "../../docs/results/latest/iqtree/iqtree-core_chromosome.filter5.treefile"
#tree_path = "../../results/iqtree/all/iqtree-core_chromosome.filter50.treefile"
metadata_path = "../../docs/results/latest/metadata/metadata.txt"
# Directory that receives every file written by this notebook.
outdir = "../../docs/results/latest/parse_tree"

# Create the output directory together with any missing parent directories.
# exist_ok=True makes this a no-op when the directory is already there, so the
# cell can be re-run safely.
os.makedirs(outdir, exist_ok=True)

### Variables

In [3]:
# --- Metadata handling ---
# Column of the metadata table that holds the sample name (used as its index)
NAME_COL = "Sample"
# Placeholder written wherever a value is missing
NO_DATA_CHAR = "NA"

# --- Branch support ---
# Minimum values for a branch to count as well supported (from IQTREE docs)
UFBOOT_THRESH = 95
ALRT_THRESH = 80

# --- Branch colours ---
TERM_COL = "grey"    # terminal branches
HIGH_COL = "red"     # branches meeting both support thresholds
LOW_COL = "black"    # every other branch

# --- Output formatting ---
# Number of digits after the decimal point when branch lengths are written
# to the newick and nexus files
BRANCH_LEN_SIG_DIG = 12

---
## 1. Convert tree to PhyloXML format

In [4]:
# The PhyloXML copy is an intermediate file; it lives in the output directory
# and is read back in by the next step.
tmp_xml = os.path.join(outdir, "temp.xml")
Phylo.convert(
    in_file=tree_path,
    in_format="newick",
    out_file=tmp_xml,
    out_format="phyloxml",
)

1

## 2. Convert tree to dataframe

### Read in XML tree

In [5]:
# Load the PhyloXML version of the tree that was just written
tree = Phylo.read(file=tmp_xml, format="phyloxml")

# Sort the clades in place so that the deepest ones come first
tree.ladderize(reverse=True)

### Fix node names and confidences

In [6]:
# Running index used to build the NODE<i> names. It advances once for every
# *named* clade (tips included), so the numbers given to internal nodes are
# not necessarily consecutive.
node_i = 0

for clade in tree.find_clades():
    # Clades without any name are left untouched
    if not clade.name:
        continue

    # A name of the form "<aLRT>/<UFboot>" is really a pair of support values
    # that ended up in the name field; anything else is kept as the node name.
    support_fields = clade.name.split("/")
    if len(support_fields) == 2:
        # Give the internal node a proper name, then parse its support values
        clade.name = "NODE" + str(node_i)
        alrt_val, ufboot_val = (float(field) for field in support_fields)
    else:
        alrt_val = ufboot_val = NO_DATA_CHAR

    # Every named clade gets both confidence entries, even when they are missing
    clade.confidences.extend([
        Phylo.PhyloXML.Confidence(alrt_val, type="alrt"),
        Phylo.PhyloXML.Confidence(ufboot_val, type="ufboot"),
    ])

    node_i += 1

### Initialize the tree dataframe

In [7]:
# Column order of the tree dataframe ("Name" becomes the index further down)
tree_df_columns = [
    "Name",
    "UFboot",
    "aLRT",
    "Branch_Length",
    "Branch_Support_Color",
    "coord_x",
    "coord_y",
]

# Collect one record per named clade and build the dataframe in a single step.
# (DataFrame.append copied the whole frame on every call and no longer exists
# in pandas >= 2.0.)
node_rows = []
for c in tree.find_clades():
    if not c.name: continue
    # A missing branch length is stored on the tree as 0
    if not c.branch_length: c.branch_length = 0

    node_rows.append({
        "Name" : c.name,
        "UFboot" : [conf.value for conf in c.confidences if conf.type=="ufboot"][0],
        "aLRT" :  [conf.value for conf in c.confidences if conf.type=="alrt"][0],
        # Colour and coordinates are placeholders here; later steps fill them in
        "Branch_Support_Color" :  NO_DATA_CHAR,
        "Branch_Length" :  c.branch_length,
        "coord_x" :  NO_DATA_CHAR,
        "coord_y" :  NO_DATA_CHAR,
    })

# dtype=object keeps the values exactly as collected: columns mix numbers with
# the NO_DATA_CHAR placeholder.
tree_df = pd.DataFrame(node_rows, columns=tree_df_columns, dtype=object)

# Set the index to the node name
tree_df.set_index("Name", inplace=True)

print(tree_df)

               UFboot  aLRT Branch_Length Branch_Support_Color coord_x coord_y
Name                                                                          
NODE0             100   100             0                   NA      NA      NA
NODE1             100  99.9    5.2248e-06                   NA      NA      NA
NODE2             100   100   1.82301e-05                   NA      NA      NA
NODE3             100   100    1.0679e-05                   NA      NA      NA
NODE4             100  97.9     2.123e-06                   NA      NA      NA
...               ...   ...           ...                  ...     ...     ...
SAMEA104233049     NA    NA     1.871e-07                   NA      NA      NA
SAMEA104233047     NA    NA     4.923e-07                   NA      NA      NA
SAMEA104233050     NA    NA     1.006e-06                   NA      NA      NA
SAMEA104233046     NA    NA    2.7607e-06                   NA      NA      NA
SAMEA3541827       NA    NA    3.8915e-06           

### Add plotting x and y coordinates of each node

In [8]:
# This code is from the biopython Phylo module

def get_x_positions(tree):
    """Return the horizontal position of every clade as {clade: x-coord}.

    The positions are the values reported by ``tree.depths()``. When the
    largest of them is zero (no usable branch lengths), the depths are
    recomputed with unit branch lengths instead.
    """
    x_by_clade = tree.depths()
    if max(x_by_clade.values()):
        return x_by_clade
    # All depths are zero: fall back to counting branches
    return tree.depths(unit_branch_lengths=True)

def get_y_positions(tree):
    """Create a mapping of each clade to its vertical position.

    Returns a dict of {clade: y-coord}. Tips receive the integers
    1..N (N = number of tips) in the order given by ``tree.get_terminals()``:
    the first tip gets 1 and the last gets N. Every internal clade is placed
    halfway between its first and its last child.

    Internal clades are processed with an explicit stack rather than
    recursion, so very deep (ladder-like) trees do not hit Python's
    recursion limit.
    """
    maxheight = tree.count_terminals()
    # Rows are defined by the tips
    heights = {
        tip: maxheight - i for i, tip in enumerate(reversed(tree.get_terminals()))
    }

    if tree.root.clades:
        # Iterative post-order walk: a clade stays on the stack until all of
        # its children have a height, then it is resolved and popped.
        stack = [tree.root]
        while stack:
            clade = stack[-1]
            pending = [child for child in clade.clades if child not in heights]
            if pending:
                # Push in reverse so the first child is handled first
                stack.extend(reversed(pending))
                continue
            stack.pop()
            # Internal nodes: place at midpoint of first and last child
            heights[clade] = (
                heights[clade.clades[0]] + heights[clade.clades[-1]]
            ) / 2.0
    return heights

# Both mappings are keyed by the clade objects of `tree`
x_posns = get_x_positions(tree)
y_posns = get_y_positions(tree)

# Copy the plotting coordinates of every named clade into the tree dataframe
for c in tree.find_clades():
    if not c.name: continue
    # Look the clade up directly instead of scanning the whole mapping for a
    # matching name: one dict access per node, and it cannot pick up the value
    # of a different clade that happens to share the name.
    # x coordinates will be of branch length units
    coord_x = x_posns[c]
    # y coordinates will be arbitrary, based on number of tips
    coord_y = y_posns[c]
    # Add data to tree dataframe
    tree_df.at[c.name, 'coord_x'] = coord_x
    tree_df.at[c.name, 'coord_y'] = coord_y

    # Store them as other attributes
    #other_coord_x = Phylo.PhyloXML.Other(tag="coord_x", value="{:.10f}".format(coord_x), namespace="coord")
    #other_coord_y = Phylo.PhyloXML.Other(tag="coord_y", value="{:.10f}".format(coord_y), namespace="coord")   
    # Add data to tree
    #c.other.append(other_coord_x)
    #c.other.append(other_coord_y)

print(tree_df)

               UFboot  aLRT Branch_Length Branch_Support_Color      coord_x  \
Name                                                                          
NODE0             100   100             0                   NA            0   
NODE1             100  99.9    5.2248e-06                   NA   5.2248e-06   
NODE2             100   100   1.82301e-05                   NA  2.34549e-05   
NODE3             100   100    1.0679e-05                   NA  3.41339e-05   
NODE4             100  97.9     2.123e-06                   NA  3.62569e-05   
...               ...   ...           ...                  ...          ...   
SAMEA104233049     NA    NA     1.871e-07                   NA  3.70928e-05   
SAMEA104233047     NA    NA     4.923e-07                   NA  2.50576e-05   
SAMEA104233050     NA    NA     1.006e-06                   NA   1.5829e-05   
SAMEA104233046     NA    NA    2.7607e-06                   NA  1.64692e-05   
SAMEA3541827       NA    NA    3.8915e-06           

In [9]:
# Debugging aid: show the depth value that tree.depths() reports for each clade
#print([1 + x for x in x_posns.values()])
node_depths = tree.depths()
print(node_depths.values())

dict_values([0, 5.2248e-06, 2.34549e-05, 3.41339e-05, 3.62569e-05, 4.59334e-05, 4.79901e-05, 5.36781e-05, 5.48501e-05, 5.57877e-05, 5.58111e-05, 5.60455e-05, 5.6279899999999996e-05, 5.630329999999999e-05, 5.677209999999999e-05, 5.682189999999999e-05, 5.728799999999999e-05, 5.983929999999999e-05, 6.245279999999999e-05, 6.292159999999999e-05, 6.294499999999999e-05, 6.296839999999998e-05, 6.343719999999998e-05, 6.367159999999998e-05, 6.955809999999998e-05, 6.958149999999998e-05, 6.960489999999997e-05, 6.962829999999997e-05, 7.009709999999997e-05, 7.012049999999997e-05, 7.014389999999996e-05, 7.037829999999996e-05, 7.292989999999996e-05, 7.410189999999996e-05, 7.431179999999996e-05, 7.433519999999996e-05, 7.435859999999996e-05, 7.456939999999996e-05, 7.461189999999996e-05, 7.463529999999996e-05, 7.465869999999996e-05, 7.468209999999995e-05, 7.470549999999995e-05, 7.472889999999995e-05, 7.496329999999995e-05, 7.498669999999994e-05, 7.501009999999994e-05, 7.503349999999994e-05, 7.50568999999

### Add branch support color

In [10]:
# Pick a display colour for the branch leading to every named clade
for c in tree.find_clades():
    if not c.name: continue
    # Terminal branches are grey, all other branches start out as low support
    branch_color = TERM_COL if c.is_terminal() else LOW_COL

    # Scalar lookups by (row, column) label
    alrt_val = tree_df.at[c.name, "aLRT"]
    ufboot_val = tree_df.at[c.name, "UFboot"]

    # Compare against the shared placeholder constant rather than a literal,
    # so the check keeps working if the placeholder is ever changed.
    has_support = alrt_val != NO_DATA_CHAR and ufboot_val != NO_DATA_CHAR
    # High support requires BOTH thresholds to be met; it overrides the
    # colour chosen above.
    if has_support and alrt_val >= ALRT_THRESH and ufboot_val >= UFBOOT_THRESH:
        branch_color = HIGH_COL

    tree_df.at[c.name, "Branch_Support_Color"] = branch_color
    # Store branch color as an other attribute
    #other_branch_color = Phylo.PhyloXML.Other(tag="branch-support", value=branch_color, namespace="color")
    #c.other.append(other_branch_color)

print(tree_df["Branch_Support_Color"])

Name
NODE0              red
NODE1              red
NODE2              red
NODE3              red
NODE4              red
                  ... 
SAMEA104233049    grey
SAMEA104233047    grey
SAMEA104233050    grey
SAMEA104233046    grey
SAMEA3541827      grey
Name: Branch_Support_Color, Length: 1019, dtype: object


### Parse metadata into df

In [11]:
# Load the tab-separated metadata table, replace every missing value with the
# shared NO_DATA_CHAR placeholder, and index the rows by sample name.
metadata_df = (
    pd.read_csv(metadata_path, sep='\t')
    .fillna(NO_DATA_CHAR)
    .set_index(NAME_COL)
)
print(metadata_df)

                                    Strain  Date DateBP  \
Sample                                                    
GCA_009669545.1_ASM966954v1_genomic  42126  2006    -15   
GCA_009669555.1_ASM966955v1_genomic  42123  2005    -16   
GCA_009669565.1_ASM966956v1_genomic  42118  2005    -16   
GCA_009669605.1_ASM966960v1_genomic  42117  2005    -16   
GCA_009669625.1_ASM966962v1_genomic  42116  2005    -16   
...                                    ...   ...    ...   
IP283                                IP283  1994    -27   
IP562                                IP562  1947    -74   
IP542                                IP542  1952    -69   
IP543                                IP543  1953    -68   
IP557                                IP557  1963    -58   

                                                              Country  \
Sample                                                                  
GCA_009669545.1_ASM966954v1_genomic                             China   
GCA_009669555

### Add metadata to tree and tree dataframe

In [12]:
# Names of the tree nodes that have a row in the metadata table. Neither the
# tree nor the metadata index changes below, so this is worked out once.
nodes_with_metadata = [
    clade.name for clade in tree.find_clades() if clade.name in metadata_df.index
]

# Add one dataframe column per metadata attribute. Only the dataframe is
# updated here; the tree object itself is not modified.
for attr in metadata_df.columns:
    # Every node starts out with the missing-data placeholder
    tree_df[attr] = [NO_DATA_CHAR] * len(tree_df)
    # Overwrite the placeholder for the nodes that appear in the metadata
    for node_name in nodes_with_metadata:
        tree_df.at[node_name, attr] = metadata_df[attr][node_name]
print(tree_df)

               UFboot  aLRT Branch_Length Branch_Support_Color      coord_x  \
Name                                                                          
NODE0             100   100             0                  red            0   
NODE1             100  99.9    5.2248e-06                  red   5.2248e-06   
NODE2             100   100   1.82301e-05                  red  2.34549e-05   
NODE3             100   100    1.0679e-05                  red  3.41339e-05   
NODE4             100  97.9     2.123e-06                  red  3.62569e-05   
...               ...   ...           ...                  ...          ...   
SAMEA104233049     NA    NA     1.871e-07                 grey  3.70928e-05   
SAMEA104233047     NA    NA     4.923e-07                 grey  2.50576e-05   
SAMEA104233050     NA    NA     1.006e-06                 grey   1.5829e-05   
SAMEA104233046     NA    NA    2.7607e-06                 grey  1.64692e-05   
SAMEA3541827       NA    NA    3.8915e-06           

### Save tree and tree dataframe

In [13]:
# Output locations, all inside the output directory
out_path_df = os.path.join(outdir, "parse_tree.tsv")
out_path_xml = os.path.join(outdir, "parse_tree.xml")
out_path_nwk = os.path.join(outdir, "parse_tree.nwk")
out_path_nexus = os.path.join(outdir, "parse_tree.nexus")

# printf-style format for branch lengths, shared by the newick and nexus writers
branch_length_fmt = '%1.{}f'.format(BRANCH_LEN_SIG_DIG)

# Tree dataframe as a tab-separated table
tree_df.to_csv(out_path_df, sep="\t")

# Tree in three formats
Phylo.write(tree, out_path_xml, 'phyloxml')
Phylo.write(tree, out_path_nwk, 'newick', format_branch_length=branch_length_fmt)
Phylo.write(tree, out_path_nexus, 'nexus', format_branch_length=branch_length_fmt)

1

---
## Cleanup

In [14]:
# Delete the intermediate PhyloXML file. The existence check lets this cell be
# re-run after the file has already been removed without raising
# FileNotFoundError.
if os.path.exists(tmp_xml):
    os.remove(tmp_xml)