---
# Objectives

1. Convert newick tree to PhyloXML format.
1. Fix internal node names from an IQ-TREE tree.
   - By default, internal nodes end up named after their confidence values (e.g. "99/100").
   - This notebook instead names them NODEi, where i is an integer counter starting from 0.
   - The counter advances for every node (tips included), so internal node numbers are not consecutive.
   - NODE0 is therefore the root.
1. Convert tree to data frame.
1. Combine metadata with tree data frame.

---
# Setup

## Module Imports

In [1]:
import pandas as pd
from Bio import Phylo
import os
import copy
import augur
import json
from matplotlib import colors

## Input File Paths

In [2]:
# Alternative input tree, kept for reference:
#tree_path = "../../docs/results/latest/iqtree/iqtree-core_chromosome.filter5.treefile"
tree_path = "../../docs/results/latest/iqtree/iqtree-core_chromosome.filter5_post.cf.tree"
metadata_path = "../../docs/results/latest/metadata/metadata.tsv"
outdir = "../../docs/results/latest/parse_tree"

# Create the output directory, including any missing parent directories.
# exist_ok=True makes this a no-op when the directory is already there, so
# the cell can be re-run safely and there is no check-then-create race.
os.makedirs(outdir, exist_ok=True)

## Variables

In [3]:
from config import *

# Custom script variables
# Base name shared by every output file written by this notebook
SCRIPT_NAME = "parse_tree"
# Metadata column holding the sample identifier; used as the metadata index
NAME_COL = "Sample"

---
# 1. Convert tree to PhyloXML format

In [4]:
# Temporary PhyloXML copy of the input tree (deleted in the cleanup cell)
tmp_xml = os.path.join(outdir, "temp.xml")
# Convert newick -> PhyloXML; the value echoed by the cell is the number of trees converted
Phylo.convert(tree_path, "newick", tmp_xml, "phyloxml")

1

---
# 2. Fix internal node names

## Read in XML tree

In [5]:
# Read in the new tree
tree = Phylo.read(tmp_xml, "phyloxml")

# Sort clades in place by number of tips. With reverse=False the deepest
# (largest) clades are placed last, so tips and small clades come first.
tree.ladderize(reverse=False)

## Fix node names and confidences

In [6]:
# Walk every clade (internal nodes and tips alike). The running index is
# shared by all of them, which is why internal nodes end up as NODE0, NODE2, ...
for node_i, clade in enumerate(tree.find_clades()):
    # A label such as "99/100" means the support values were parsed as the name
    label_parts = clade.name.split("/") if clade.name else []
    has_support_label = len(label_parts) > 1

    # Unnamed nodes and nodes whose "name" is really support values get renamed
    if has_support_label or not clade.name:
        clade.name = "NODE" + str(node_i)

    # Support values default to the missing-data marker (tips, unnamed nodes)
    support = {"ufboot": NO_DATA_CHAR, "scf": NO_DATA_CHAR}
    if has_support_label:
        support["ufboot"] = float(label_parts[0])
        support["scf"] = float(label_parts[1])

    # Every clade receives both confidences, UFboot first and then sCF
    for conf_type, conf_value in support.items():
        clade.confidences.append(
            Phylo.PhyloXML.Confidence(conf_value, type=conf_type)
        )

---
# 3. Convert tree to dataframe

## Initialize the tree dataframe

In [7]:
# Column layout of the tree dataframe ("Name" becomes the index below)
tree_columns = [
    "Name",
    "UFboot",
    "sCF",
    "Branch_Length",
    "Branch_Support_Color",
    "coord_x",
    "coord_y",
]

# Collect one record per node and build the dataframe in a single call.
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0,
# and calling it once per node copies the whole frame every time.
tree_rows = []
for c in tree.find_clades():
    # Nodes without a branch length (e.g. the root) get an explicit zero;
    # this also updates the tree object itself
    if not c.branch_length:
        c.branch_length = 0.0

    tree_rows.append({
        "Name": c.name,
        # First confidence of each type attached to the node
        "UFboot": next(conf.value for conf in c.confidences if conf.type == "ufboot"),
        "sCF": next(conf.value for conf in c.confidences if conf.type == "scf"),
        "Branch_Length": c.branch_length,
        # Filled in by later cells
        "Branch_Support_Color": NO_DATA_CHAR,
        "coord_x": NO_DATA_CHAR,
        "coord_y": NO_DATA_CHAR,
    })

tree_df = pd.DataFrame(tree_rows, columns=tree_columns)

# Set the index to the node name
tree_df.set_index("Name", inplace=True)

# Visualize data frame
tree_df

Unnamed: 0_level_0,UFboot,sCF,Branch_Length,Branch_Support_Color,coord_x,coord_y
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
NODE0,100,100,0.000000e+00,,,
SAMEA3541827,,,3.850300e-06,,,
NODE2,100,100,1.327050e-05,,,
SAMEA104233046,,,2.248300e-06,,,
NODE4,90.2,100,1.363600e-05,,,
...,...,...,...,...,...,...
GCA_000268965.1_ASM26896v1_genomic,,,4.692000e-07,,,
GCA_000269425.1_ASM26942v1_genomic,,,2.350000e-08,,,
NODE1263,0,97,2.346000e-07,,,
GCA_000269185.1_ASM26918v1_genomic,,,2.346000e-07,,,


## Add plotting x and y coordinates of each node

In [8]:
x_posns = get_x_positions(tree)
y_posns = get_y_positions(tree)


def _positions_by_name(posns):
    """Map node name -> position, keeping the first entry seen for each name."""
    by_name = {}
    for clade, position in posns.items():
        by_name.setdefault(clade.name, position)
    return by_name


# Index the positions by node name once, instead of scanning both position
# dictionaries from start to end for every node in the tree
x_by_name = _positions_by_name(x_posns)
y_by_name = _positions_by_name(y_posns)

# Add x and y coordinates as other attributes
for c in tree.find_clades():
    # x coordinates will be of branch length units
    # y coordinates will be arbitrary, based on number of tips
    # (a node missing from the positions raises KeyError here)
    tree_df.at[c.name, "coord_x"] = x_by_name[c.name]
    tree_df.at[c.name, "coord_y"] = y_by_name[c.name]

# Visualize dataframe
tree_df

Unnamed: 0_level_0,UFboot,sCF,Branch_Length,Branch_Support_Color,coord_x,coord_y
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
NODE0,100,100,0.000000e+00,,0,17.8552
SAMEA3541827,,,3.850300e-06,,3.8503e-06,1
NODE2,100,100,1.327050e-05,,1.32705e-05,3.15625
SAMEA104233046,,,2.248300e-06,,1.55188e-05,2
NODE4,90.2,100,1.363600e-05,,2.69065e-05,4.3125
...,...,...,...,...,...,...
GCA_000268965.1_ASM26896v1_genomic,,,4.692000e-07,,7.69589e-05,631
GCA_000269425.1_ASM26942v1_genomic,,,2.350000e-08,,7.65132e-05,632
NODE1263,0,97,2.346000e-07,,7.67008e-05,633.5
GCA_000269185.1_ASM26918v1_genomic,,,2.346000e-07,,7.69354e-05,633


## Add branch support color

In [9]:
# Colour every branch by support: well-supported branches win over the
# terminal colour, which in turn wins over the low-support default.
for clade in tree.find_clades():
    node_name = clade.name
    scf_val = tree_df.at[node_name, "sCF"]
    ufboot_val = tree_df.at[node_name, "UFboot"]

    # Support can only be compared to the thresholds when both values exist
    missing_support = scf_val == NO_DATA_CHAR or ufboot_val == NO_DATA_CHAR

    if not missing_support and scf_val >= SCF_THRESH and ufboot_val >= UFBOOT_THRESH:
        # High support branches
        branch_color = HIGH_COL
    elif clade.is_terminal():
        # Terminal branches
        branch_color = TERM_COL
    else:
        # Everything else is low support
        branch_color = LOW_COL

    tree_df.at[node_name, "Branch_Support_Color"] = branch_color

tree_df

Unnamed: 0_level_0,UFboot,sCF,Branch_Length,Branch_Support_Color,coord_x,coord_y
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
NODE0,100,100,0.000000e+00,red,0,17.8552
SAMEA3541827,,,3.850300e-06,grey,3.8503e-06,1
NODE2,100,100,1.327050e-05,red,1.32705e-05,3.15625
SAMEA104233046,,,2.248300e-06,grey,1.55188e-05,2
NODE4,90.2,100,1.363600e-05,black,2.69065e-05,4.3125
...,...,...,...,...,...,...
GCA_000268965.1_ASM26896v1_genomic,,,4.692000e-07,grey,7.69589e-05,631
GCA_000269425.1_ASM26942v1_genomic,,,2.350000e-08,grey,7.65132e-05,632
NODE1263,0,97,2.346000e-07,black,7.67008e-05,633.5
GCA_000269185.1_ASM26918v1_genomic,,,2.346000e-07,grey,7.69354e-05,633


---
# 4. Combine metadata into tree dataframe

## Parse metadata into df

In [10]:
# Load the tab-separated metadata, collapse every missing value onto
# NO_DATA_CHAR, and index the table by sample name.
metadata_df = (
    pd.read_csv(metadata_path, sep='\t')
    .fillna(NO_DATA_CHAR)
    .set_index(NAME_COL)
)

metadata_df

Unnamed: 0_level_0,Strain,Date,DateBP,Country,Province,CountryLat,CountryLon,ProvinceLat,ProvinceLon,Biovar,Branch_Major,Branch_Minor,BioSample,BioSampleComment,Branch_Number
Sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
GCA_009909635.1_ASM990963v1_genomic,9_10,1923,-98,Russia,Rostov Oblast,64.6863,97.7453,47.6222,40.7958,Medievalis,2.MED,2.MED1,SAMN13632815,KEEP: Assembly Modern,2
GCA_009669545.1_ASM966954v1_genomic,42126,2006,-15,China,Xinjiang,35.0001,105,42.4805,85.4633,Antiqua,0.ANT,0.ANT1,SAMN07722925,KEEP: Assembly Modern,0
GCA_009669555.1_ASM966955v1_genomic,42123,2005,-16,China,Xinjiang,35.0001,105,42.4805,85.4633,Antiqua,0.ANT,0.ANT1,SAMN07722924,KEEP: Assembly Modern,0
GCA_009669565.1_ASM966956v1_genomic,42118,2005,-16,China,Xinjiang,35.0001,105,42.4805,85.4633,Antiqua,0.ANT,0.ANT1,SAMN07722923,KEEP: Assembly Modern,0
GCA_009669605.1_ASM966960v1_genomic,42117,2005,-16,China,Xinjiang,35.0001,105,42.4805,85.4633,Antiqua,0.ANT,0.ANT1,SAMN07722922,KEEP: Assembly Modern,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
IP542,IP542,1952,-69,Kenya,,1.44197,38.4314,,,Antiqua,1.ANT,1.ANT1,IP542,KEEP: Local Modern,1
IP543,IP543,1953,-68,Democratic Republic of the Congo,,-2.98143,23.8223,,,Antiqua,1.ANT,1.ANT1,IP543,KEEP: Local Modern,1
IP557,IP557,1963,-58,Iran,Kurdistan,32.6475,54.5644,35.6728,47.0124,Medievalis,2.MED,2.MED1,IP557,KEEP: Local Modern,2
IP562,IP562,1947,-74,Iran,Kurdistan,32.6475,54.5644,35.6728,47.0124,Medievalis,2.MED,2.MED1,IP562,KEEP: Local Modern,2


## Add metadata to tree and tree dataframe

In [11]:
# Names of the tree nodes that have a metadata record. This does not depend
# on the attribute, so it is computed once rather than once per column.
nodes_with_metadata = [
    c.name for c in tree.find_clades() if c.name in metadata_df.index
]

# Iterate through the different metadata attributes
for attr in metadata_df.columns:
    # Start every node at the missing-data marker for this attribute
    tree_df[attr] = NO_DATA_CHAR
    # Copy the value over for the nodes that appear in the metadata
    for node_name in nodes_with_metadata:
        tree_df.at[node_name, attr] = metadata_df.at[node_name, attr]

tree_df

Unnamed: 0_level_0,UFboot,sCF,Branch_Length,Branch_Support_Color,coord_x,coord_y,Strain,Date,DateBP,Country,Province,CountryLat,CountryLon,ProvinceLat,ProvinceLon,Biovar,Branch_Major,Branch_Minor,BioSample,BioSampleComment,Branch_Number
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
NODE0,100,100,0.000000e+00,red,0,17.8552,,,,,,,,,,,,,,,
SAMEA3541827,,,3.850300e-06,grey,3.8503e-06,1,RISE509,[-2876:-2677],[-4897:-4698],Russia,Krasnoyarsk Krai,64.6863,97.7453,63.3234,97.098,Bronze Age,0.PRE,0.PRE2,SAMEA3541827,KEEP: SRA Ancient,0
NODE2,100,100,1.327050e-05,red,1.32705e-05,3.15625,,,,,,,,,,,,,,,
SAMEA104233046,,,2.248300e-06,grey,1.55188e-05,2,Gyvakarai1,[-2621:-2472],[-4642:-4493],Lithuania,Panevezys County,55.35,23.75,55.9156,25.0312,Bronze Age,0.PRE,0.PRE1,SAMEA104233046,KEEP: SRA Ancient,0
NODE4,90.2,100,1.363600e-05,black,2.69065e-05,4.3125,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
GCA_000268965.1_ASM26896v1_genomic,,,4.692000e-07,grey,7.69589e-05,631,PY-65,2010,-11,Peru,La Libertad,-6.86997,-75.0459,-8,-78.5,Orientalis,1.ORI,1.ORI1,SAMN02141486,KEEP: Assembly Modern,1
GCA_000269425.1_ASM26942v1_genomic,,,2.350000e-08,grey,7.65132e-05,632,PY-94,2010,-11,Peru,Cajamarca,-6.86997,-75.0459,-6.25,-78.8333,Orientalis,1.ORI,1.ORI1,SAMN02141497,KEEP: Assembly Modern,1
NODE1263,0,97,2.346000e-07,black,7.67008e-05,633.5,,,,,,,,,,,,,,,
GCA_000269185.1_ASM26918v1_genomic,,,2.346000e-07,grey,7.69354e-05,633,PY-99,2010,-11,Peru,La Libertad,-6.86997,-75.0459,-8,-78.5,Orientalis,1.ORI,1.ORI1,SAMN02141501,KEEP: Assembly Modern,1


---
# Export

## Save tree and tree dataframe

In [12]:
# One output path per format, all sharing the SCRIPT_NAME stem
out_path_df, out_path_xml, out_path_nwk, out_path_nexus = (
    os.path.join(outdir, SCRIPT_NAME + ext)
    for ext in (".tsv", ".xml", ".nwk", ".nexus")
)

# Fixed-point branch length format used by the newick and nexus writers
branch_length_fmt = '%1.{}f'.format(BRANCH_LEN_SIG_DIG)

# Dataframe first, then the tree in each format
tree_df.to_csv(out_path_df, sep="\t")
Phylo.write(tree, out_path_xml, 'phyloxml')
Phylo.write(tree, out_path_nwk, 'newick', format_branch_length=branch_length_fmt)
Phylo.write(tree, out_path_nexus, 'nexus', format_branch_length=branch_length_fmt)

1

## Save lat and lon

Write an output tsv that is:  
resolution   name   lat    lon

In [13]:
out_path_lat_lon = os.path.join(outdir, SCRIPT_NAME + "_latlon.tsv")
# Geographic level -> {location name -> {"lat": ..., "lon": ...}}
latlon_dict = {"Country": {}, "Province": {}}

# Columns in data frame will be ex: CountryLat, CountryLon
for clade in tree.find_clades():
    node_name = clade.name
    for level, known_locs in latlon_dict.items():
        loc = tree_df.at[node_name, level]
        # Skip nodes without a location and locations already recorded
        if loc == NO_DATA_CHAR or loc in known_locs:
            continue
        known_locs[loc] = {
            "lat": tree_df.at[node_name, level + "Lat"],
            "lon": tree_df.at[node_name, level + "Lon"],
        }

# One line per location: resolution, name, lat, lon
with open(out_path_lat_lon, "w") as outfile:
    for level, known_locs in latlon_dict.items():
        for loc, coords in known_locs.items():
            fields = [level.lower(), loc, str(coords["lat"]), str(coords["lon"])]
            outfile.write("\t".join(fields) + "\n")

## Save colors

Write an output tsv that is:
attribute   attribute_val   hex_color

In [14]:
out_path_colors = os.path.join(outdir, SCRIPT_NAME + "_colors.tsv")

# NOTE(review): this adds a constant column to tree_df; it is not excluded by
# COLOR_KEYWORD_EXCLUDE, so it is also written to the node data JSON later on.
# Its purpose is not evident from this notebook - confirm before removing.
tree_df["dummy_conf"] = 0

# Lowercased attribute name -> {attribute value -> hex colour}
hex_dict = {}

for attr in ATTRIBUTE_LIST:
    # Unique attribute values across the tips, in tip order, skipping missing data
    tip_vals = (tree_df.at[t.name, attr] for t in tree.get_terminals())
    attr_vals = list(dict.fromkeys(val for val in tip_vals if val != NO_DATA_CHAR))

    # Create the custom color map (pyplot) with one colour per unique value
    cmap = plt.get_cmap(CONT_COLOR_PAL, len(attr_vals))
    # Convert each RGBA entry of the colour map to a hex colour
    attr_hex = [colors.to_hex(cmap(i)) for i in range(cmap.N)]

    # Assign colors to values
    hex_dict[attr.lower()] = dict(zip(attr_vals, attr_hex))

print(hex_dict)

# One line per value: attribute, attribute_val, hex_color.
# The context manager closes the file even if a write fails.
with open(out_path_colors, "w") as file_colors:
    for attr_key, val_colors in hex_dict.items():
        for attr_val, hex_col in val_colors.items():
            file_colors.write(
                "\t".join([str(attr_key), str(attr_val), str(hex_col)]) + "\n"
            )

{'branch_number': {0: '#8000ff', 3: '#00b4ec', 4: '#80ffb4', 2: '#ffb462', 1: '#ff0000'}, 'branch_major': {'0.PRE': '#8000ff', '0.PE': '#4c4ffc', '0.ANT': '#1996f3', '3.ANT': '#1acee3', '4.ANT': '#4df3ce', '2.ANT': '#80ffb4', '2.MED': '#b3f396', '1.PRE': '#e6ce74', '1.ANT': '#ff964f', '1.IN': '#ff4f28', '1.ORI': '#ff0000'}, 'branch_minor': {'0.PRE2': '#8000ff', '0.PRE1': '#7116ff', '0.PE7': '#632cfe', '0.PE2': '#5542fd', '0.PE8': '#4757fb', '0.PE10': '#396cf9', '0.PE4m': '#2b7ff6', '0.PE4h': '#1c92f3', '0.PE4t': '#0ea4f0', '0.PE4a': '#00b4ec', '0.PE5': '#0ec3e7', '0.ANT1': '#1cd1e2', '0.ANT4': '#2adddd', '0.ANT2': '#39e7d7', '0.ANT5': '#47f0d1', '0.ANT3': '#55f6ca', '3.ANT1': '#63fbc3', '3.ANT2': '#71febc', '4.ANT1': '#80ffb4', '2.ANT3': '#8efeac', '2.ANT2': '#9cfba4', '2.ANT1': '#aaf69b', '2.MED0': '#b8f092', '2.MED3': '#c6e789', '2.MED2': '#d4dd80', '2.MED1': '#e3d176', '1.PRE0': '#f1c36c', '1.PRE1': '#ffb462', '1.PRE2': '#ffa457', '1.PRE3': '#ff924d', '1.ANT1': '#ff8042', '1.IN1': '

### JSON

This can then be used for auspice via:

```
augur export v2 \
  --auspice-config auspice.config \
  --tree parse_tree.nwk \
  --node-data parse_tree.json \
  --output parse_tree_auspice.json \
  --lat-longs parse_tree_latlon.tsv
```

In [15]:
# Exclude these internal variables for plotting: any dataframe column whose
# lowercased name contains one of these substrings is left out of the JSON
COLOR_KEYWORD_EXCLUDE = ["color", "coord"]

In [16]:
# Dataframe columns to export: everything whose lowercased name does not
# contain one of the excluded keywords
export_attrs = [
    attr
    for attr in tree_df.columns
    if not any(keyword in attr.lower() for keyword in COLOR_KEYWORD_EXCLUDE)
]

# Node name -> {lowercased attribute name -> value}
# (ex. Branch_Length -> branch_length)
node_dict = {
    c.name: {attr.lower(): tree_df[attr][c.name] for attr in export_attrs}
    for c in tree.find_clades()
}

# There is no alignment used, so that field is left empty
node_data = {
    "alignment": "",
    "input_tree": tree_path,
    "nodes": node_dict,
}

out_path_json = os.path.join(outdir, SCRIPT_NAME + ".json" )
augur.utils.write_json(data=node_data, file_name=out_path_json, indent=2)

---
## Cleanup

In [17]:
# Delete the intermediate PhyloXML file created during the newick conversion
os.remove(tmp_xml)