# BEAST Analysis Notebook

---

# 0. SETUP

In [1]:
import os
import pandas as pd
import seaborn as sns
from Bio import Phylo, AlignIO
from functions import *
import subprocess
from matplotlib import colors
import matplotlib.pyplot as plt

## Paths

In [2]:
WILDCARDS = ["all", "chromosome", "clade", "5"]
project_dir = "/mnt/c/Users/ktmea/Projects/plague-phylogeography-projects/main/"
results_dir = project_dir

READS_ORIGIN = WILDCARDS[0]
LOCUS_NAME = WILDCARDS[1]
PRUNE = WILDCARDS[2]
MISSING_DATA = WILDCARDS[3]

In [3]:
tree_dir            = project_dir + "/beast/all/chromosome/{}/phylogeography/".format(PRUNE)
metadata_path       = project_dir + "/iqtree/all/chromosome/full/filter{}/filter-taxa/metadata.tsv".format(MISSING_DATA)
auspice_config_path = project_dir + "/config/auspice_config.json"

# ------------------------------------------
# Alignment
constant_sites_path = project_dir + "/snippy_multi/all/chromosome/full/snippy-multi.constant_sites.txt".format(PRUNE)
aln_path            = project_dir + "iqtree/all/chromosome/full/filter{}/filter-clades/1.ORI/1.ORI.fasta".format(MISSING_DATA)

# ------------------------------------------
# Output
auspice_dir = os.path.join(results_dir, "auspice/all/chromosome/full/filter{}/beast/".format(MISSING_DATA))
if not os.path.exists(auspice_dir):
    subprocess.run(["mkdir", "-p", auspice_dir]) 
    
augur_dir = os.path.join(results_dir, "augur/all/chromosome/full/filter{}/beast/".format(MISSING_DATA))
if not os.path.exists(augur_dir):
    subprocess.run(["mkdir", "-p", augur_dir]) 

## Variables

In [4]:
pd.set_option("display.max_rows", 10, "display.max_columns", None)

# ------------------------------------------
BRANCH_LIST = {
    "1.ORI" : ["1.ORI1", "1.ORI2", "1.ORI3"],
    "1.IN": ["1.IN1","1.IN2","1.IN3"],  
    "1.ANT": ["1.ANT1"], 
    "1.PRE" : ["1.PRE0","1.PRE1", "1.PRE2", "1.PRE3"],
    "2.MED": ["2.MED0", "2.MED1","2.MED2","2.MED3" ],      
    "2.ANT": ["2.ANT1","2.ANT2","2.ANT3" ],    
    "4.ANT": ["4.ANT1" ],       
    "3.ANT": ["3.ANT1", "3.ANT2" ],  
    "0.ANT": ["0.ANT1", "0.ANT2","0.ANT3","0.ANT5"],         
    "0.ANT4" : ["0.ANT4"], 
    "0.PE": ["0.PE2", "0.PE4m", "0.PE4m", "0.PE4t", "0.PE4a", "0.PE5", "0.PE7", "0.PE8", "0.PE10"],   
    "0.PRE": ["0.PRE1", "0.PRE2"],        
}

NUM_STATES = 10

NO_DATA_CHAR = "NA"
JSON_INDENT=2
CURRENT_YEAR = 2021

# ------------------------------------------
# Alignment
with open(constant_sites_path) as infile:
    data = infile.read().strip().split(",")
    constant_sites = sum([int(count) for count in data])

aln = AlignIO.read(aln_path, "fasta")
variant_sites = len(aln[0].seq)
SEQ_LEN = constant_sites + variant_sites

NO_DATA_CHAR = "NA"
UNKNOWN_CHAR = "?"

POSTERIOR_THRESH = 95

---

# 1. IMPORT

## Metadata

In [5]:
metadata_df = pd.read_csv(metadata_path, sep='\t')
metadata_df.set_index(metadata_df.columns[0], inplace=True)
metadata_df.fillna(NO_DATA_CHAR, inplace=True)

display(metadata_df)

Unnamed: 0_level_0,strain,date,date_bp,country,province,country_lat,country_lon,province_lat,province_lon,biovar,branch_major,branch_minor,biosample_accession,biosample_comment,branch_number,continent,date_mean,date_bp_mean,date_err,lat,lon,host_human,branch_major_color,geometry_size,geometry,root_rtt_dist,clade_rtt_dist
sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1
Reference,CO92,1992,-29,United States of America,Colorado,39.783730,-100.445882,38.7252,-105.608,Orientalis,1.ORI,1.ORI1,SAMEA1705942,KEEP: Assembly Modern Reference,1,North America,1992.0,29.0,0.0,38.725178,-105.607716,Human,#ff0000,1.0,POINT (-105.607716 38.7251776),0.000073,0.000006
GCA_009909635.1_ASM990963v1_genomic,9_10,1923.0,-98,Russia,Rostov Oblast,64.686314,97.745306,47.6222,40.7958,Medievalis,2.MED,2.MED1,SAMN13632815,KEEP: Assembly Modern,2,Europe,1923.0,98.0,0.0,47.622245,40.795794,Human,#b3f396,4.0,POINT (40.7957942 47.6222451),0.000073,0.000010
GCA_009669545.1_ASM966954v1_genomic,42126,2006.0,-15,China,Xinjiang,35.000074,104.999927,42.4805,85.4633,Antiqua,0.ANT,0.ANT1,SAMN07722925,KEEP: Assembly Modern,0,Asia,2006.0,15.0,0.0,42.480495,85.463346,Non-Human,#1996f3,105.0,POINT (85.46334640000001 42.4804953),0.000054,0.000012
GCA_009669555.1_ASM966955v1_genomic,42123,2005.0,-16,China,Xinjiang,35.000074,104.999927,42.4805,85.4633,Antiqua,0.ANT,0.ANT1,SAMN07722924,KEEP: Assembly Modern,0,Asia,2005.0,16.0,0.0,42.480495,85.463346,Non-Human,#1996f3,105.0,POINT (85.46334640000001 42.4804953),0.000055,0.000012
GCA_009669565.1_ASM966956v1_genomic,42118,2005.0,-16,China,Xinjiang,35.000074,104.999927,42.4805,85.4633,Antiqua,0.ANT,0.ANT1,SAMN07722923,KEEP: Assembly Modern,0,Asia,2005.0,16.0,0.0,42.480495,85.463346,Non-Human,#1996f3,105.0,POINT (85.46334640000001 42.4804953),0.000055,0.000012
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
SAMEA7313243_45,Azov38,[1400:1700],[-621:-321],Russia,Rostov Oblast,64.686314,97.745306,47.6222,40.7958,Second Pandemic,1.PRE,1.PRE1,SAMEA7313243_45,KEEP: SRA Ancient Combined Record,1,Europe,1550.0,471.0,150.0,47.622245,40.795794,Human,#e6ce74,4.0,POINT (40.7957942 47.6222451),0.000075,0.000022
SAMEA7313246_49,Gdansk8,[1400:1700],[-621:-321],Poland,Pomeranian Voivodeship,52.215933,19.134422,54.2456,18.1099,Second Pandemic,1.PRE,1.PRE1,SAMEA7313246_49,KEEP: SRA Ancient Combined Record,1,Europe,1550.0,471.0,150.0,54.245560,18.109900,Human,#e6ce74,1.0,POINT (18.1099 54.24556),0.000060,0.000007
SAMEA6651390,AGU010,[1435:1477],[-586:-544],Lithuania,Vilnius County,55.350000,23.750000,54.8227,25.2495,Second Pandemic,1.PRE,1.PRE1,SAMEA6651390,KEEP: SRA Ancient,1,Europe,1456.0,565.0,21.0,54.822692,25.249534,Human,#e6ce74,3.0,POINT (25.24953400167952 54.82269205),0.000060,0.000006
SAMEA6637004,AGU025,[1441:1612],[-580:-409],Lithuania,Vilnius County,55.350000,23.750000,54.8227,25.2495,Second Pandemic,1.PRE,1.PRE1,SAMEA6637004,KEEP: SRA Ancient,1,Europe,1526.5,494.5,85.5,54.822692,25.249534,Human,#e6ce74,3.0,POINT (25.24953400167952 54.82269205),0.000061,0.000007


## Most Recent Sampling Date

In [6]:
out_path = os.path.join(augur_dir, "most_recent_sampling_dates.tsv")
mrsd_dict = {}

with open(out_path, "w") as outfile:
    for branch in BRANCH_LIST:
        branch_df = metadata_df[metadata_df["branch_minor"].isin(BRANCH_LIST[branch])]
        if len(branch_df) == 0: continue
        max_date = max(branch_df["date_mean"])
        outfile.write("{}\t{}\n".format(branch, max_date))
        mrsd_dict[branch] = max_date
        
print(mrsd_dict)

{'1.ORI': 2016.0, '1.IN': 2008.0, '1.ANT': 2004.0, '1.PRE': 1767.5, '2.MED': 2018.0, '2.ANT': 2008.0, '4.ANT': 2015.0, '3.ANT': 2017.0, '0.ANT': 2019.0, '0.ANT4': 765.0, '0.PE': 2014.0, '0.PRE': -1686.0}


## Colors

In [7]:
out_path_colors = os.path.join(augur_dir, "colors.tsv")
attributes = ["country", "province", "branch_major", "branch_minor","continent"]
colors_dict = {}

for attr in attributes:
    # Create the color map
    colors_dict[attr] = {}
    
    # Set branch_major based on list
    if attr == "branch_major":
        states = list(BRANCH_LIST.keys())
        states.reverse()
    else:
        states = set(metadata_df[attr])
        
    # Create the custom color map (pyplot)
    cmap = plt.get_cmap("rainbow", len(states))
    # Convert the color map to a list of RGB values
    cmaplist = [cmap(i) for i in range(cmap.N)]
    # Convert RGB values to hex colors
    attr_hex = [colors.to_hex(col) for col in cmaplist]

    # Assign colors to value
    for state, color in zip(states, attr_hex):
        colors_dict[attr][state] = color
        
    # Add unknown
    colors_dict[attr][NO_DATA_CHAR] = "#969696"
    
print(colors_dict)

with open(out_path_colors, "w") as outfile:
    for attr_key in colors_dict:
        for attr_val in colors_dict[attr_key]:
            outfile.write(str(attr_key) + "\t" + str(attr_val) + "\t" + str(colors_dict[attr_key][attr_val]) + "\n")        

{'country': {'Kyrgyzstan': '#8000ff', 'Uzbekistan': '#7215ff', 'Myanmar': '#652afe', 'France': '#573ffd', 'England': '#4a53fc', 'Tajikistan': '#3c66fa', 'Algeria': '#2f79f7', 'Armenia': '#228bf4', 'Canada': '#149df1', 'Lithuania': '#07adee', 'Azerbaijan': '#07bcea', 'Zimbabwe': '#14c9e5', 'Brazil': '#22d5e0', 'Germany': '#2fe0db', 'Iran': '#3cead5', 'Bolivia': '#4af1d0', 'Norway': '#57f7c9', 'Spain': '#65fcc3', 'Poland': '#72febc', 'Kazakhstan': '#80ffb4', 'Mongolia': '#8dfead', 'Georgia': '#9afca5', 'Turkmenistan': '#a8f79d', 'Vietnam': '#b5f194', 'Kenya': '#c3ea8b', 'Switzerland': '#d0e083', 'China': '#ddd579', 'Nepal': '#ebc970', 'Indonesia': '#f8bc66', 'India': '#ffad5d', 'Russia': '#ff9d53', 'Uganda': '#ff8b49', 'Italy': '#ff793f', 'Democratic Republic of the Congo': '#ff6634', 'The Netherlands': '#ff532a', 'Estonia': '#ff3f20', 'United States of America': '#ff2a15', 'Peru': '#ff150b', 'Madagascar': '#ff0000', 'NA': '#969696'}, 'province': {'Shaanxi': '#8000ff', 'Hovsgel': '#7b07f

## Latitude and Longitude

In [8]:
latlon_country_df = pd.DataFrame()
latlon_province_df = pd.DataFrame()

df_list = [latlon_country_df, latlon_province_df]
attr_list = ["country", "province"]

# Create a mapping of geo name to lat,lon
for df,attr in zip(df_list, attr_list):
    # Get data
    for rec in metadata_df.iterrows():
        node_name = rec[0]
        name = rec[1][attr]
        country = rec[1]["country"]
        
        #node_type = rec[1]["node_type"]
        #if node_type == "internal":
        #    continue

        #if attr == "province" and name == NO_DATA_CHAR and node_type == "terminal" and country != "Russia":
        if attr == "province" and name == NO_DATA_CHAR and country != "Russia":
            # Use country instead
            name = rec[1]["country"]
            lat = rec[1]["country_lat"]
            lon = rec[1]["country_lon"]
        else:
            lat = rec[1][attr + "_lat"]
            lon = rec[1][attr + "_lon"]             


        if name not in df.index:
            df.at[name, "lat"] = lat
            df.at[name, "lon"] = lon
            df.at[name, "size"] = 1
        else:
            df["size"][name] += 1
        

#display(latlon_country_df)
#display(latlon_province_df)

# Mapping file for auspice
out_path_latlon = os.path.join(augur_dir, "latlon.tsv")

# Countries
with open(out_path_latlon, "w") as outfile:
    for country in latlon_country_df.index:
        lat = str(latlon_country_df["lat"][country])
        lon = str(latlon_country_df["lon"][country])
        outfile.write("country" + "\t" + country + "\t" + lat + "\t" + lon + "\n")
# Provinces
with open(out_path_latlon, "a") as outfile:
    for province in latlon_province_df.index:
        lat = str(latlon_province_df["lat"][province])
        lon = str(latlon_province_df["lon"][province])
        outfile.write("province" + "\t" + province + "\t" + lat + "\t" + lon + "\n")
        
#display(metadata_df[metadata_df["continent"] == "Europe"])

## Continent

In [9]:
continent_dict = {}

for rec in metadata_df.iterrows():
    continent = rec[1]["continent"]
    country = rec[1]["country"]
    continent_dict[country] = continent

## Tree Files

In [10]:
# Construct a dictionary to hold the trees
tree_dict = {}

for branch in BRANCH_LIST:
    for filename in os.listdir(tree_dir):
        if not filename.endswith(".tree"): continue  
        filepath = os.path.join(tree_dir, filename)
        
        # Check file matches branch
        if filename.strip(".tree") == branch:  
        #if filename.split("_")[0] == branch:  
        
            print("Branch:", branch)
            tree_dict[branch] = {}  
            
            # Add tree files to dict
            tree_dict[branch]["tree_file_raw"] = filepath 
            tree_dict[branch]["tree_file_edit"] = os.path.join(tree_dir, branch + ".nex")
            tree_dict[branch]["sample_rename"] = {}
            
            # Read in raw tree to deal with dashes
            with open(tree_dict[branch]["tree_file_raw"],  "r") as infile:                    
                with open(tree_dict[branch]["tree_file_edit"], "w") as outfile:           
                    raw_tree = infile.read()
                    # Remove quotations if they exist
                    raw_tree = raw_tree.replace("'","")
                    
                    # Split into lines to iterate over
                    raw_tree_lines = raw_tree.split("\n")
                    
                    # By default don't parse a line for dashes
                    taxa_line = False
                    
                    for line in raw_tree_lines: 
                                
                        if "TREE" not in line and "-" in line:
                            
                            if len(line.split(" ")) == 1:
                                name_dashes = line.strip()
                                name_no_dashes = name_dashes.replace("-","_")
                                tree_dict[branch]["sample_rename"][name_no_dashes] = name_dashes
                            line = line.replace("-","_")
                                
                        outfile.write(line + "\n")
            
            # Read in edited tree
            trees = Phylo.parse(tree_dict[branch]["tree_file_edit"], "nexus")
            # There should be only 1 tree
            for t in trees:
                tree_dict[branch]["tree"] = t
                tree_dict[branch]["tree"].ladderize(reverse=False)
                break

            # Rename sample names back to with dashes
            for c in tree_dict[branch]["tree"].find_clades():
                if c.name in tree_dict[branch]["sample_rename"]:
                    orig_name = c.name
                    c.name = tree_dict[branch]["sample_rename"][c.name]
                    print("Rename:", orig_name, c.name)
                    
                # Strip the date suffix
                #if c.name:
                #    c.name = "_".join(c.name.split("_")[0:-1])           
                    
            # Rename internal nodes
            node_i = 0
            for c in tree_dict[branch]["tree"].find_clades():
                if not c.name:
                    c.name = "NODE{}".format(node_i)
                    node_i += 1
            
            #Phylo.draw(tree_dict[branch]["tree"])
            

Branch: 1.ORI
Rename: GCA_000324805.2_EV76_CN_genomic GCA_000324805.2_EV76-CN_genomic
Rename: GCA_000986995.1_YPES001_SEQ_2_ASM_1_genomic GCA_000986995.1_YPES001-SEQ-2-ASM-1_genomic
Branch: 1.PRE
Branch: 2.MED
Rename: GCA_001617735.1_Yersinia_pestis_M_1763_genomic GCA_001617735.1_Yersinia_pestis_M-1763_genomic
Rename: GCA_002412305.1_Y.pestis_A_1809_genomic GCA_002412305.1_Y.pestis_A-1809_genomic
Rename: GCA_001617785.1_Yersinia_pestis_M_549_genomic GCA_001617785.1_Yersinia_pestis_M-549_genomic
Rename: GCA_001617725.1_Yersinia_pestis_M_1484_genomic GCA_001617725.1_Yersinia_pestis_M-1484_genomic
Rename: GCA_001617715.1_Yersinia_pestis_M_519_genomic GCA_001617715.1_Yersinia_pestis_M-519_genomic
Rename: GCA_001617815.1_Yersinia_pestis_M_1453_genomic GCA_001617815.1_Yersinia_pestis_M-1453_genomic
Rename: GCA_001617705.1_Yersinia_pestis_C_791_genomic GCA_001617705.1_Yersinia_pestis_C-791_genomic
Branch: 0.ANT4
Branch: 0.PRE


## Add Tree Data to Dataframe

### Get comments from the first root that is not root

In [11]:
TREE_PARAMETERS = None

for branch in tree_dict:
    
    print(branch)
    
    tree = tree_dict[branch]["tree"]
    df = copy.deepcopy(metadata_df[metadata_df["branch_minor"].isin(BRANCH_LIST[branch])])
    
    # Check if a sample was missed :(
    df_samples = df.index
    tree_samples = [c.name for c in tree.find_clades()]
    
    for sample in df_samples:
        if sample not in tree_samples:
            df.drop(sample, inplace=True)
            print("Dropping {} from the dataframe.".format(sample))
    
    root_comment_dict = parse_comment(tree.root.comment)
    
    for c in tree.find_clades():
        if c.is_terminal():
            continue
            
        if c != tree.root:
            comment_dict = parse_comment(c.comment)
            for parameter,value in comment_dict.items():
                # Intialize parameter values
                if "range" in parameter or "95%" in parameter:
                    df[parameter] = [[NO_DATA_CHAR,NO_DATA_CHAR]] * len(df)
                    if parameter not in root_comment_dict:
                        root_comment_dict[parameter] = '{0,0}'
                else:
                    df[parameter] = [NO_DATA_CHAR] * len(df)
                    if parameter not in root_comment_dict:
                        root_comment_dict[parameter] = '0'
                    
            break

    # Update the roots comment to include missing values
    new_comment = "[&"
    new_comment_list = []
    for parameter,value in root_comment_dict.items():
        param_str = "{}={}".format(parameter, value)
        new_comment_list.append(param_str)
    new_comment += ",".join(new_comment_list) + "]"
    tree.root.comment = new_comment
    #print(root_comment_dict)
    #print(new_comment_list)
    
    if not TREE_PARAMETERS:
        TREE_PARAMETERS = [p for p in root_comment_dict]
    
    tree_dict[branch]["df"] = df 
    #display(df)

1.ORI
1.PRE
2.MED
0.ANT4
0.PRE


### Parse Tree Comments

In [12]:
parameters = [
    "branch_length",
    "branch_length_sub",
    "node_type",    
    "branch_support",
    "branch_support_conf_category",
    "branch_support_conf_char",
    "country_date_strain",
    "province_date_strain",
    "rate",
    "rate_hpd",
    "rate_sub",
    "rate_sub_year",
    "state",
    "state_prob",
    "state_conf_category",
    "state_rate",
    "state_rate_hpd",
    "state_continent",
    "state_lat",
    "state_lon",
    "height",
    "height_hpd",
    "timetree_num_date",
    "timetree_num_date_confidence",
]

for branch in tree_dict:
    print(branch)
    
    tree = tree_dict[branch]["tree"]
    df = tree_dict[branch]["df"]
    
    # Add to dataframe
    for param in parameters:
        df[param] = [NO_DATA_CHAR] * len(df)
        
    for c in tree.find_clades():
        # Defaults
        node_type = "internal"
        branch_support = 0
        branch_support_conf_category = "LOW"
        branch_support_conf_char = ""
        branch_length = 0
        branch_length_sub = 0
        country_date_strain = NO_DATA_CHAR
        province_date_strain = NO_DATA_CHAR
        rate_hpd = [0,0]
        rate = 0
        rate_sub_year = 0
        state = NO_DATA_CHAR
        state_conf_category = "LOW"
        state_prob = 0
        state_rate = 0
        state_continent = NO_DATA_CHAR
        state_lat = NO_DATA_CHAR
        state_lon = NO_DATA_CHAR
        
        
        timetree_num_date = 0
        timetree_num_date_confidence = [0,0]
        height = 0
        height_hpd = [0,0]
        
        
        comment_dict = parse_comment(c.comment)
        #print(comment_dict.keys())

        # Branch Length
        if c.branch_length:
            branch_length = c.branch_length

        # Branch support
        if "posterior" in comment_dict:
            branch_support = float(round(float(comment_dict["posterior"]) * 100))
            if branch_support >= 95:
                branch_support_conf_category = "HIGH"
                branch_support_conf_char = "*"

        # Rates
        if "default.rate" in comment_dict:
            rate = float(comment_dict["default.rate"])
            rate_sub = rate

            if c.branch_length:
                rate_sub_year = rate * SEQ_LEN
                branch_length_sub = rate_sub_year * c.branch_length

        if "default.rate_95%_HPD" in comment_dict:
            rate_hpd_split = comment_dict["default.rate_95%_HPD"].strip("{}").split(",")
            rate_hpd = [float(r) for r in rate_hpd_split]
            
        if "state.rate" in comment_dict:
            state_rate = float(comment_dict["state.rate"])   

        if "state.rate_95%_HPD" in comment_dict:
            state_rate_hpd_split = comment_dict["state.rate_95%_HPD"].strip("{}").split(",")
            state_rate_hpd = [float(r) for r in state_rate_hpd_split]            
    
        # States
        if "state" in comment_dict:
            state = comment_dict["state"]
            state_continent = continent_dict[state]
            state_latlon_dict = dict(latlon_country_df.loc[state])
            state_lat = float(state_latlon_dict["lat"])
            state_lon = float(state_latlon_dict["lon"])
            
            
        if "state.prob" in comment_dict:
            state_prob = float(round(float(comment_dict["state.prob"]) * 100))
            if state_prob >= 95:
                state_conf_category = "HIGH"


        # Dates
        
        if "height" in comment_dict:
            height = float(comment_dict["height"])

        if "height_95%_HPD" in comment_dict:
            height_hpd_split = comment_dict["height_95%_HPD"].strip("{}").split(",")
            height_hpd = [float(h) for h in height_hpd_split]
        
        height_hpd_reverse = copy.copy(height_hpd)
        height_hpd_reverse.reverse()
        timetree_num_date = float(round(mrsd_dict[branch] - height))
        timetree_num_date_confidence = [round(mrsd_dict[branch] - h) for h in height_hpd_reverse]  

        if c.is_terminal():
            node_type = "terminal"
            country = metadata_df["country"][c.name]
            province = metadata_df["province"][c.name]
            date = metadata_df["date"][c.name]
            strain = metadata_df["strain"][c.name]

            country_date_strain = "{} {} {}".format(country, date, strain)
            province_date_strain = "{} {} {}".format(province, date, strain)
            
        
        df.at[c.name, "branch_length"] = branch_length
        df.at[c.name, "branch_length_sub"] = branch_length_sub
        df.at[c.name, "node_type"] = node_type
        df.at[c.name, "branch_support"] = branch_support
        df.at[c.name, "branch_support_conf_category"] = branch_support_conf_category
        df.at[c.name, "branch_support_conf_char"] = branch_support_conf_char
        df.at[c.name, "country_date_strain"] = country_date_strain
        df.at[c.name, "province_date_strain"] = province_date_strain

        
        df.at[c.name, "rate"] = rate
        df.at[c.name, "rate_hpd"] = rate_hpd
        df.at[c.name, "rate_sub"] = rate_sub
        df.at[c.name, "rate_sub_year"] = rate_sub_year
        
        df.at[c.name, "state"] = state
        df.at[c.name, "state_prob"] = state_prob
        df.at[c.name, "state_conf_category"] = state_conf_category
        df.at[c.name, "state_continent"] = state_continent
        df.at[c.name, "state_lat"] = state_lat
        df.at[c.name, "state_lon"] = state_lon
        
        df.at[c.name, "state_rate"] = state_rate
        df.at[c.name, "state_rate_hpd"] = state_rate_hpd
        
        df.at[c.name, "height"] = height
        df.at[c.name, "height_hpd"] = height_hpd   
        
        df.at[c.name, "timetree_num_date"] = timetree_num_date
        df.at[c.name, "timetree_num_date_confidence"] = timetree_num_date_confidence        

    # Update internal nodes NA
    df.fillna(NO_DATA_CHAR, inplace=True)
    #display(df)

1.ORI
1.PRE
2.MED
0.ANT4
0.PRE


In [37]:
## State Confidence Summary
print("{}\t{}\t{}\t{}".format("branch", "high", "internal", "% high" ))
for branch in tree_dict:
    
    tree = tree_dict[branch]["tree"]
    df = tree_dict[branch]["df"]
    internal_nodes = 0
    high_state_prob = 0
    
    for rec in df.iterrows():
        sample = rec[0]
        # Focus on internal branches
        if rec[1]["node_type"] == "terminal": continue
        state_prob = rec[1]["state_prob"]
        internal_nodes += 1
        if state_prob >= 95:
            high_state_prob += 1
        perc = round((high_state_prob / internal_nodes) * 100, 0)
    print("{}\t{}\t{}\t{}".format(branch, high_state_prob, internal_nodes, perc))

branch	high	internal	% high
1.ORI	102	116	88.0
1.PRE	17	39	44.0
2.MED	80	115	70.0
0.ANT4	8	11	73.0
0.PRE	3	7	43.0


## Reduced Dataframe for Auspice

In [13]:
# Options
print(tree_dict[branch]["df"].columns)

Index(['strain', 'date', 'date_bp', 'country', 'province', 'country_lat',
       'country_lon', 'province_lat', 'province_lon', 'biovar', 'branch_major',
       'branch_minor', 'biosample_accession', 'biosample_comment',
       'branch_number', 'continent', 'date_mean', 'date_bp_mean', 'date_err',
       'lat', 'lon', 'host_human', 'branch_major_color', 'geometry_size',
       'geometry', 'root_rtt_dist', 'clade_rtt_dist', 'length_range',
       'state.rate_range', 'default.rate_95%_HPD', 'length_95%_HPD',
       'state.rate_95%_HPD', 'default.rate_range', 'state.rate',
       'default.rate', 'length', 'posterior', 'state.prob', 'height_median',
       'height_range', 'height_95%_HPD', 'state.rate_median',
       'default.rate_median', 'length_median', 'state', 'state.set.prob',
       'state.set', 'height', 'branch_length', 'branch_length_sub',
       'node_type', 'branch_support', 'branch_support_conf_category',
       'branch_support_conf_char', 'country_date_strain',
       'provin

In [14]:
for branch in tree_dict:
    print(branch)
    
    columns = [
        # Required
        "branch_length",
        "branch_length_sub",
        "node_type",
        # Time Tree
        "timetree_num_date",
        "timetree_num_date_confidence",
        # Geo
        #"country",
        "province",
        "state",
        "state_prob",
        "state_conf_category",
        "state_rate",
        "state_continent",
        "state_lat",
        "state_lon",
        #"continuous_geo",
        # Colors and Filters 
        "branch_major",
        "branch_minor",
        "branch_support",
        "branch_support_conf_category",  
        "branch_support_conf_char",  
        #"continent",
        "province_lat",
        "province_lon",
        #"country_lat",
        #"country_lon",
        # Text Description
        "biosample_accession",
        "strain",
        "country_date_strain",
        "province_date_strain",
        "host_human",
        # Tip Dates
        "date_mean",
        "date_err",
        "date_bp_mean",
        # Stats
        "root_rtt_dist",
        "clade_rtt_dist",
        "rate_sub",
        "rate_sub_year",
    ]
    
    auspice_df = copy.copy(tree_dict[branch]["df"][columns])

    # Edit df
    auspice_df["node_name"] = list(auspice_df.index)
    auspice_df["blank"] = [" "] * len(auspice_df)       
    
    # Rename
    auspice_df.rename(columns={"state": "country"}, inplace=True)
    auspice_df.rename(columns={"state_continent": "continent"}, inplace=True)
    auspice_df.rename(columns={"state_lat": "country_lat"}, inplace=True)
    auspice_df.rename(columns={"state_lon": "country_lon"}, inplace=True)
    
    # Make states grey if low confidence
    """for rec in auspice_df.iterrows():
        state_country = rec[1]["country"]
        state_continent = rec[1]["continent"]
        state_prob = rec[1]["state_prob"]
        if state_prob < 95:
            auspice_df.at[rec[0], "continent"] = NO_DATA_CHAR
            auspice_df.at[rec[0], "country"] = NO_DATA_CHAR"""
    
    display(auspice_df)
    tree_dict[branch]["auspice_df"] = auspice_df
    #break

1.ORI


Unnamed: 0_level_0,branch_length,branch_length_sub,node_type,timetree_num_date,timetree_num_date_confidence,province,country,state_prob,state_conf_category,state_rate,continent,country_lat,country_lon,branch_major,branch_minor,branch_support,branch_support_conf_category,branch_support_conf_char,province_lat,province_lon,biosample_accession,strain,country_date_strain,province_date_strain,host_human,date_mean,date_err,date_bp_mean,root_rtt_dist,clade_rtt_dist,rate_sub,rate_sub_year,node_name,blank
sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1
Reference,13.502915,22.937615,terminal,1992.0,"[1992, 1992]",Colorado,United States of America,100.0,HIGH,0.005133,North America,39.783730,-100.445882,1.ORI,1.ORI1,0.0,LOW,,38.7252,-105.608,SAMEA1705942,CO92,United States of America 1992 CO92,Colorado 1992 CO92,Human,1992,0,29,7.31686e-05,6.3815e-06,4.016733e-07,1.698716,Reference,
GCA_000834775.1_ASM83477v1_genomic,6.249831,8.900725,terminal,1967.0,"[1967, 1967]",Arizona,United States of America,100.0,HIGH,0.005133,North America,39.783730,-100.445882,1.ORI,1.ORI1,0.0,LOW,,34.3953,-111.763,SAMN03161124,Dodson,United States of America 1967.0 Dodson,Arizona 1967.0 Dodson,Human,1967,0,54,7.05773e-05,3.7902e-06,3.367514e-07,1.424154,GCA_000834775.1_ASM83477v1_genomic,
GCA_000834335.1_ASM83433v1_genomic,4.664658,4.498545,terminal,1954.0,"[1954, 1954]",California,United States of America,100.0,HIGH,0.005133,North America,39.783730,-100.445882,1.ORI,1.ORI1,0.0,LOW,,36.7015,-118.756,SAMN03120838,Shasta,United States of America 1954.0 Shasta,California 1954.0 Shasta,Human,1954,0,67,6.94491e-05,2.662e-06,2.280366e-07,0.964389,GCA_000834335.1_ASM83433v1_genomic,
GCA_000169635.1_ASM16963v1_genomic,23.222619,13.089660,terminal,2005.0,"[2005, 2005]",,Madagascar,100.0,HIGH,0.005133,Africa,-18.924960,46.441642,1.ORI,1.ORI3,0.0,LOW,,,,SAMN02404403,MG05-1020,Madagascar 2005.0 MG05-1020,NA 2005.0 MG05-1020,Human,2005,0,16,7.15927e-05,4.8056e-06,1.332814e-07,0.563660,GCA_000169635.1_ASM16963v1_genomic,
GCA_000170275.1_ASM17027v1_genomic,13.009251,4.460811,terminal,1991.0,"[1991, 1991]",Yunnan,China,100.0,HIGH,0.005133,Asia,35.000074,104.999927,1.ORI,1.ORI2,0.0,LOW,,25,102,SAMN02404399,F1991016,China 1991.0 F1991016,Yunnan 1991.0 F1991016,Non-Human,1991,0,30,7.25252e-05,5.7381e-06,8.108000e-08,0.342895,GCA_000170275.1_ASM17027v1_genomic,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
NODE111,0.418109,0.308126,internal,2005.0,"[2001, 2008]",,Peru,100.0,HIGH,0.005103,South America,-6.869970,-75.045851,,,31.0,LOW,,,,,,,,,,,,,,1.742575e-07,0.736952,NODE111,
NODE112,3.334069,1.793268,internal,2008.0,"[2006, 2010]",,Peru,100.0,HIGH,0.005132,South America,-6.869970,-75.045851,,,96.0,HIGH,*,,,,,,,,,,,,,1.271812e-07,0.537862,NODE112,
NODE113,1.351110,1.145470,internal,2006.0,"[2003, 2009]",,Peru,100.0,HIGH,0.005107,South America,-6.869970,-75.045851,,,52.0,LOW,,,,,,,,,,,,,,2.004681e-07,0.847799,NODE113,
NODE114,1.112510,0.576862,internal,2007.0,"[2005, 2009]",,Peru,100.0,HIGH,0.005129,South America,-6.869970,-75.045851,,,23.0,LOW,,,,,,,,,,,,,,1.226085e-07,0.518523,NODE114,


1.PRE


Unnamed: 0_level_0,branch_length,branch_length_sub,node_type,timetree_num_date,timetree_num_date_confidence,province,country,state_prob,state_conf_category,state_rate,continent,country_lat,country_lon,branch_major,branch_minor,branch_support,branch_support_conf_category,branch_support_conf_char,province_lat,province_lon,biosample_accession,strain,country_date_strain,province_date_strain,host_human,date_mean,date_err,date_bp_mean,root_rtt_dist,clade_rtt_dist,rate_sub,rate_sub_year,node_name,blank
sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1
SAMEA5818830,1.704386,0.257695,terminal,1560.0,"[1560, 1560]",Nidwalden,Switzerland,100.0,HIGH,0.007231,Europe,46.813331,8.444947,1.PRE,1.PRE1,0.0,LOW,,46.9428,8.41198,SAMEA5818830,STN021,Switzerland [1485:1635] STN021,Nidwalden [1485:1635] STN021,Human,1560,75,461,6.30679e-05,9.8052e-06,3.575122e-08,0.151195,SAMEA5818830,
SAMEA5818829,1.067785,0.152720,terminal,1560.0,"[1560, 1560]",Nidwalden,Switzerland,100.0,HIGH,0.007231,Europe,46.813331,8.444947,1.PRE,1.PRE1,0.0,LOW,,46.9428,8.41198,SAMEA5818829,STN020,Switzerland [1485:1635] STN020,Nidwalden [1485:1635] STN020,Human,1560,75,461,6.27851e-05,9.5224e-06,3.381921e-08,0.143025,SAMEA5818829,
SAMEA5818828,1.046214,0.156027,terminal,1560.0,"[1560, 1560]",Nidwalden,Switzerland,100.0,HIGH,0.007231,Europe,46.813331,8.444947,1.PRE,1.PRE1,0.0,LOW,,46.9428,8.41198,SAMEA5818828,STN019,Switzerland [1485:1635] STN019,Nidwalden [1485:1635] STN019,Human,1560,75,461,6.28323e-05,9.5696e-06,3.526404e-08,0.149135,SAMEA5818828,
SAMEA5818826,1.067785,0.161386,terminal,1560.0,"[1560, 1560]",Nidwalden,Switzerland,100.0,HIGH,0.007231,Europe,46.813331,8.444947,1.PRE,1.PRE1,0.0,LOW,,46.9428,8.41198,SAMEA5818826,STN014,Switzerland [1485:1635] STN014,Nidwalden [1485:1635] STN014,Human,1560,75,461,6.2735e-05,9.4723e-06,3.573825e-08,0.151141,SAMEA5818826,
SAMEA5818825,1.704386,0.969636,terminal,1560.0,"[1560, 1560]",Nidwalden,Switzerland,100.0,HIGH,0.007231,Europe,46.813331,8.444947,1.PRE,1.PRE1,0.0,LOW,,46.9428,8.41198,SAMEA5818825,STN013,Switzerland [1485:1635] STN013,Nidwalden [1485:1635] STN013,Human,1560,75,461,6.32799e-05,1.00172e-05,1.345219e-07,0.568906,SAMEA5818825,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
NODE34,2.579463,0.544664,internal,1555.0,"[1550, 1559]",,Switzerland,100.0,HIGH,0.007265,Europe,46.813331,8.444947,,,10.0,LOW,,,,,,,,,,,,,,4.992887e-08,0.211154,NODE34,
NODE35,2.622074,1.066974,internal,1558.0,"[1554, 1560]",,Switzerland,100.0,HIGH,0.007231,Europe,46.813331,8.444947,,,100.0,HIGH,*,,,,,,,,,,,,,9.621906e-08,0.406920,NODE35,
NODE36,3.470437,0.488058,internal,1556.0,"[1552, 1560]",,Switzerland,100.0,HIGH,0.007239,Europe,46.813331,8.444947,,,8.0,LOW,,,,,,,,,,,,,,3.325365e-08,0.140633,NODE36,
NODE37,2.367701,0.346754,internal,1558.0,"[1555, 1560]",,Switzerland,100.0,HIGH,0.007250,Europe,46.813331,8.444947,,,17.0,LOW,,,,,,,,,,,,,,3.462956e-08,0.146452,NODE37,


2.MED


Unnamed: 0_level_0,branch_length,branch_length_sub,node_type,timetree_num_date,timetree_num_date_confidence,province,country,state_prob,state_conf_category,state_rate,continent,country_lat,country_lon,branch_major,branch_minor,branch_support,branch_support_conf_category,branch_support_conf_char,province_lat,province_lon,biosample_accession,strain,country_date_strain,province_date_strain,host_human,date_mean,date_err,date_bp_mean,root_rtt_dist,clade_rtt_dist,rate_sub,rate_sub_year,node_name,blank
sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1
GCA_009909635.1_ASM990963v1_genomic,20.502046,11.345819,terminal,1923.0,"[1923, 1923]",Rostov Oblast,Russia,100.0,HIGH,0.006089,Europe,64.686314,97.745306,2.MED,2.MED1,0.0,LOW,,47.6222,40.7958,SAMN13632815,9_10,Russia 1923.0 9_10,Rostov Oblast 1923.0 9_10,Human,1923,0,98,7.30501e-05,9.6582e-06,1.308552e-07,0.553399,GCA_009909635.1_ASM990963v1_genomic,
GCA_009296005.1_ASM929600v1_genomic,7.976810,2.350051,terminal,1953.0,"[1953, 1953]",Chechnya,Russia,100.0,HIGH,0.006089,Europe,64.686314,97.745306,2.MED,2.MED1,0.0,LOW,,43.3976,45.6985,SAMN12991209,C-25,Russia 1953.0 C-25,Chechnya 1953.0 C-25,Non-Human,1953,0,68,7.1472e-05,8.0801e-06,6.966271e-08,0.294610,GCA_009296005.1_ASM929600v1_genomic,
GCA_008630485.1_ASM863048v1_genomic,2.961652,9.647097,terminal,1997.0,"[1997, 1997]",Kabardino-Balkaria,Russia,100.0,HIGH,0.006089,Europe,64.686314,97.745306,2.MED,2.MED1,0.0,LOW,,43.4428,43.4205,SAMN12721152,C-742,Russia 1997.0 C-742,Kabardino-Balkaria 1997.0 C-742,Non-Human,1997,0,24,7.53973e-05,1.20054e-05,7.702200e-07,3.257336,GCA_008630485.1_ASM863048v1_genomic,
GCA_008630435.1_ASM863043v1_genomic,19.492362,62.164798,terminal,1996.0,"[1996, 1996]",Karachay-Cherkessia,Russia,100.0,HIGH,0.006089,Europe,64.686314,97.745306,2.MED,2.MED0,0.0,LOW,,43.7368,41.7268,SAMN12721146,C-719,Russia 1996.0 C-719,Karachay-Cherkessia 1996.0 C-719,Non-Human,1996,0,25,8.77894e-05,2.43975e-05,7.541059e-07,3.189188,GCA_008630435.1_ASM863043v1_genomic,
GCA_008630395.1_ASM863039v1_genomic,7.962416,11.179134,terminal,1984.0,"[1984, 1984]",Republic of Dagestan,Russia,100.0,HIGH,0.006089,Europe,64.686314,97.745306,2.MED,2.MED1,0.0,LOW,,43.0883,47.1499,SAMN12715009,C-528,Russia 1984.0 C-528,Republic of Dagestan 1984.0 C-528,Non-Human,1984,0,37,7.4455e-05,1.10631e-05,3.319827e-07,1.403988,GCA_008630395.1_ASM863039v1_genomic,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
NODE110,8.165731,3.840846,internal,1989.0,"[1982, 1996]",,Russia,100.0,HIGH,0.006089,Europe,64.686314,97.745306,,,100.0,HIGH,*,,,,,,,,,,,,,1.112203e-07,0.470362,NODE110,
NODE111,4.595163,2.896447,internal,1993.0,"[1989, 1997]",,Russia,100.0,HIGH,0.006090,Europe,64.686314,97.745306,,,100.0,HIGH,*,,,,,,,,,,,,,1.490448e-07,0.630325,NODE111,
NODE112,9.151472,1.017458,internal,1998.0,"[1989, 2008]",,Russia,100.0,HIGH,0.006108,Europe,64.686314,97.745306,,,73.0,LOW,,,,,,,,,,,,,,2.628923e-08,0.111180,NODE112,
NODE113,7.560997,2.498844,internal,2005.0,"[1997, 2009]",,Russia,100.0,HIGH,0.006089,Europe,64.686314,97.745306,,,100.0,HIGH,*,,,,,,,,,,,,,7.814700e-08,0.330491,NODE113,


0.ANT4


Unnamed: 0_level_0,branch_length,branch_length_sub,node_type,timetree_num_date,timetree_num_date_confidence,province,country,state_prob,state_conf_category,state_rate,continent,country_lat,country_lon,branch_major,branch_minor,branch_support,branch_support_conf_category,branch_support_conf_char,province_lat,province_lon,biosample_accession,strain,country_date_strain,province_date_strain,host_human,date_mean,date_err,date_bp_mean,root_rtt_dist,clade_rtt_dist,rate_sub,rate_sub_year,node_name,blank
sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1
SAMEA5661390,4.813590,69.534111,terminal,478.0,"[478, 478]",Bavaria,Germany,100.0,HIGH,0.002454,Europe,51.083420,10.423447,0.ANT,0.ANT4,0.0,LOW,,48.9468,11.4039,SAMEA5661390,UNT004,Germany [419:537] UNT004,Bavaria [419:537] UNT004,Human,478,59,1543,6.24896e-05,1.98615e-05,3.415710e-06,14.445374,SAMEA5661390,
SAMEA5661389,11.613209,0.178004,terminal,516.0,"[516, 516]",Bavaria,Germany,100.0,HIGH,0.002454,Europe,51.083420,10.423447,0.ANT,0.ANT4,0.0,LOW,,48.9468,11.4039,SAMEA5661389,UNT003,Germany [433:599] UNT003,Bavaria [433:599] UNT003,Human,516,83,1505,5.80821e-05,1.5454e-05,3.624344e-09,0.015328,SAMEA5661389,
SAMEA5661385,12.406262,34.354881,terminal,517.0,"[517, 517]",Bavaria,Germany,100.0,HIGH,0.002454,Europe,51.083420,10.423447,0.ANT,0.ANT4,0.0,LOW,,48.9468,11.4039,SAMEA5661385,PET004,Germany [434:600] PET004,Bavaria [434:600] PET004,Human,517,83,1504,6.23395e-05,1.97114e-05,6.547866e-07,2.769157,SAMEA5661385,
SAMEA5661384,79.874756,6.056320,terminal,521.0,"[521, 521]",Valencia Community,Spain,100.0,HIGH,0.002454,Europe,39.326068,-4.837979,0.ANT,0.ANT4,0.0,LOW,,39.682,-0.765441,SAMEA5661384,VAL001,Spain [432:610] VAL001,Valencia Community [432:610] VAL001,Human,521,89,1500,5.88052e-05,1.61771e-05,1.792881e-08,0.075823,SAMEA5661384,
SAMEA5661372,35.988328,19.306661,terminal,765.0,"[765, 765]",Centre-Loire Valley,France,100.0,HIGH,0.002454,Europe,46.603354,1.888334,0.ANT,0.ANT4,0.0,LOW,,47.549,1.73241,SAMEA5661372,LSD020,France [650:880] LSD020,Centre-Loire Valley [650:880] LSD020,Human,765,115,1256,6.41122e-05,2.14841e-05,1.268521e-07,0.536470,SAMEA5661372,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
NODE6,40.658299,0.412774,internal,501.0,"[474, 517]",,Germany,99.0,HIGH,0.002469,Europe,51.083420,10.423447,,,93.0,LOW,,,,,,,,,,,,,,2.400574e-09,0.010152,NODE6,
NODE7,224.417933,19.154782,internal,718.0,"[637, 765]",,France,98.0,HIGH,0.002454,Europe,46.603354,1.888334,,,100.0,HIGH,*,,,,,,,,,,,,,2.018236e-08,0.085353,NODE7,
NODE8,9.250971,4.780357,internal,471.0,"[458, 478]",,Germany,100.0,HIGH,0.002454,Europe,51.083420,10.423447,,,100.0,HIGH,*,,,,,,,,,,,,,1.221871e-07,0.516741,NODE8,
NODE9,17.270125,0.390279,internal,489.0,"[475, 498]",,Germany,100.0,HIGH,0.002485,Europe,51.083420,10.423447,,,85.0,LOW,,,,,,,,,,,,,,5.343581e-09,0.022599,NODE9,


0.PRE


Unnamed: 0_level_0,branch_length,branch_length_sub,node_type,timetree_num_date,timetree_num_date_confidence,province,country,state_prob,state_conf_category,state_rate,continent,country_lat,country_lon,branch_major,branch_minor,branch_support,branch_support_conf_category,branch_support_conf_char,province_lat,province_lon,biosample_accession,strain,country_date_strain,province_date_strain,host_human,date_mean,date_err,date_bp_mean,root_rtt_dist,clade_rtt_dist,rate_sub,rate_sub_year,node_name,blank
sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1
SAMEA104233048,214.525846,38.594663,terminal,-1944.0,"[-1686, -1686]",Bavaria,Germany,100.0,HIGH,0.001178,Europe,51.083420,10.423447,0.PRE,0.PRE1,0.0,LOW,,48.9468,11.4039,SAMEA104233048,Post6,Germany [-2007:-1882] Post6,Bavaria [-2007:-1882] Post6,Human,-1944.5,62.5,3965.5,6.19918e-05,5.85177e-05,4.254023e-08,0.179907,SAMEA104233048,
SAMEA104233049,64.193490,53.214854,terminal,-2272.0,"[-2272, -2272]",Bavaria,Germany,100.0,HIGH,0.001178,Europe,51.083420,10.423447,0.PRE,0.PRE1,0.0,LOW,,48.9468,11.4039,SAMEA104233049,1343UnTal85,Germany [-2396:-2148] 1343UnTal85,Bavaria [-2396:-2148] 1343UnTal85,Human,-2272,124,4293,4.64152e-05,4.29411e-05,1.960172e-07,0.828976,SAMEA104233049,
SAMEA104233047,257.758638,11.121895,terminal,-2457.0,"[-2457, -2457]",Pärnu maakond,Estonia,100.0,HIGH,0.001178,Europe,58.752378,25.331908,0.PRE,0.PRE1,0.0,LOW,,58.3195,24.3026,SAMEA104233047,KunilaII,Estonia [-2574:-2340] KunilaII,Pärnu maakond [-2574:-2340] KunilaII,Human,-2457,117,4478,3.29161e-05,2.9442e-05,1.020276e-08,0.043148,SAMEA104233047,
SAMEA104233046,219.837448,10.901882,terminal,-2546.0,"[-2546, -2546]",Panevezys County,Lithuania,100.0,HIGH,0.001178,Europe,55.350000,23.750000,0.PRE,0.PRE1,0.0,LOW,,55.9156,25.0312,SAMEA104233046,Gyvakarai1,Lithuania [-2621:-2472] Gyvakarai1,Panevezys County [-2621:-2472] Gyvakarai1,Human,-2546.5,74.5,4567.5,2.08194e-05,1.73453e-05,1.172606e-08,0.049591,SAMEA104233046,
SAMEA3541827,64.247120,267.079819,terminal,-2776.0,"[-2776, -2776]",Krasnoyarsk Krai,Russia,100.0,HIGH,0.001178,Europe,64.686314,97.745306,0.PRE,0.PRE2,0.0,LOW,,63.3234,97.098,SAMEA3541827,RISE509,Russia [-2876:-2677] RISE509,Krasnoyarsk Krai [-2876:-2677] RISE509,Human,-2776.5,99.5,4797.5,8.9963e-06,5.5222e-06,9.829686e-07,4.157071,SAMEA3541827,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
NODE2,51.578810,116.556103,internal,-2723.0,"[-2791, -2670]",,Russia,97.0,HIGH,0.001180,Europe,64.686314,97.745306,,,98.0,HIGH,*,,,,,,,,,,,,,5.343379e-07,2.259767,NODE2,
NODE3,29.815516,24.849557,internal,-2690.0,"[-2727, -2667]",,Russia,99.0,HIGH,0.001178,Europe,64.686314,97.745306,,,100.0,HIGH,*,,,,,,,,,,,,,1.970737e-07,0.833444,NODE3,
NODE4,13.108865,27.524550,internal,-2675.0,"[-2692, -2667]",,Russia,100.0,HIGH,0.001178,Europe,64.686314,97.745306,,,100.0,HIGH,*,,,,,,,,,,,,,4.964864e-07,2.099690,NODE4,
NODE5,348.749632,53.019582,internal,-2348.0,"[-2462, -2273]",,Germany,89.0,LOW,0.001178,Europe,51.083420,10.423447,,,100.0,HIGH,*,,,,,,,,,,,,,3.594801e-08,0.152028,NODE5,


---

# Export

## Time Tree

In [15]:
for branch in tree_dict:
    out_timetree = copy.deepcopy(tree_dict[branch]["tree"])

    metadata_to_comment(out_timetree, tree_dict[branch]["df"])    
    out_timetree_nex_path = os.path.join(augur_dir, branch + ".timetree.nex")
    Phylo.write(out_timetree, out_timetree_nex_path, "nexus")

    for c in out_timetree.find_clades():
        c.comment = None

    out_timetree_nwk_path = os.path.join(augur_dir, branch + ".timetree.nwk")
    Phylo.write(out_timetree, out_timetree_nwk_path, "newick")

## Augur

In [16]:
for branch in tree_dict:
    augur_dict = augur_export(
        tree_path=None, 
        aln_path=None,  
        tree=tree_dict[branch]["tree"], 
        tree_df=tree_dict[branch]["auspice_df"], 
        color_keyword_exclude=["geometry"],
        type_convert = {
            "branch_number" : (lambda x : str(x))
        },
    )
    
    tree_dict[branch]["augur_dict"] = augur_dict
    
    first_taxa = list(augur_dict["nodes"].keys())[0]
    print(augur_dict["nodes"][first_taxa])

    out_path_augur_json = os.path.join(augur_dir, branch + "_augur.json" )
    utils.write_json(data=tree_dict[branch]["augur_dict"], file_name=out_path_augur_json, indent=JSON_INDENT)
    tree_dict[branch]["augur_json_path"] = out_path_augur_json
    

{'branch_length': 0.0, 'branch_length_sub': 0.0, 'node_type': 'internal', 'num_date': 1866.0, 'num_date_confidence': [1810, 1907], 'province': 'NA', 'country': 'China', 'state_prob': 59.0, 'state_conf_category': 'LOW', 'state_rate': 0.0, 'continent': 'Asia', 'country_lat': 35.000074, 'country_lon': 104.999927, 'branch_major': 'NA', 'branch_minor': 'NA', 'branch_support': 100.0, 'branch_support_conf_category': 'HIGH', 'branch_support_conf_char': '*', 'province_lat': 'NA', 'province_lon': 'NA', 'biosample_accession': 'NA', 'strain': 'NA', 'country_date_strain': 'NA', 'province_date_strain': 'NA', 'host_human': 'NA', 'date_mean': 'NA', 'date_err': 'NA', 'date_bp_mean': 'NA', 'root_rtt_dist': 'NA', 'clade_rtt_dist': 'NA', 'rate_sub': 0.0, 'rate_sub_year': 0.0, 'node_name': 'NODE0', 'blank': ' '}
{'branch_length': 0.0, 'branch_length_sub': 0.0, 'node_type': 'internal', 'num_date': 1274.0, 'num_date_confidence': [1211, 1318], 'province': 'NA', 'country': 'Germany', 'state_prob': 36.0, 'state

## Auspice

In [17]:
for branch in tree_dict:
    print(branch)
    # Store the color
    if branch == "0.ANT4":
        branch_major_color = colors_dict["branch_major"]["0.ANT"]
    else:
        branch_major_color = colors_dict["branch_major"][branch]

    auspice_dict = auspice_export(
        tree=tree_dict[branch]["tree"],
        augur_json_paths=tree_dict[branch]["augur_json_path"], 
        auspice_config_path=auspice_config_path, 
        auspice_colors_path=out_path_colors,
        auspice_latlons_path=out_path_latlon, 
        auspice_geo_res=["country","province"],
        )


    label_col = list(tree_dict[branch]["auspice_df"])
    #print(label_col)

    # Recursively add branch attrs
    branch_attributes(
        tree_dict=auspice_dict["tree"], 
        sub_dict=auspice_dict["tree"], 
        df=tree_dict[branch]["auspice_df"],
        label_col=label_col,
        )
    
    
    # Last manual changes
    auspice_dict_copy = copy.deepcopy(auspice_dict)
    for i in range(0, len(auspice_dict_copy["meta"]["colorings"])):
        coloring = auspice_dict_copy["meta"]["colorings"][i]
        for key in coloring:
            # Node type as internal or terminal
            if coloring[key] == "node_type":
                auspice_dict["meta"]["colorings"][i]['scale'] = [['internal', '#FFFFFF'], ['terminal', branch_major_color]]
                #print(auspice_dict["meta"]["colorings"][i])
            # Confidence category
            if "conf_category" in coloring[key]:
                auspice_dict["meta"]["colorings"][i]['scale'] = [['LOW', '#FFFFFF'], ['HIGH', branch_major_color]]
                #print(auspice_dict["meta"]["colorings"][i])
            # Host Human binary
            if "host_human" in coloring[key]:
                auspice_dict["meta"]["colorings"][i]['scale'] = [['Human', '#CBB742'], ['Non-Human', "#60B6F2"], ['NA', "#D6D6D6"]]

    # Write outputs - For Local Rendering
    out_path_auspice_local_json = os.path.join(auspice_dir, branch + ".json" )
    utils.write_json(data=auspice_dict, file_name=out_path_auspice_local_json, indent=JSON_INDENT, include_version=False)
    export_v2.validate_data_json(out_path_auspice_local_json)
    print("Validation successful for local JSON.\n")

1.ORI
Validating schema of '/mnt/c/Users/ktmea/Projects/plague-phylogeography-projects/main//config/auspice_config.json'...
Validation success.




















Validating produced JSON
Validating schema of '/mnt/c/Users/ktmea/Projects/plague-phylogeography-projects/main/auspice/all/chromosome/full/filter5/beast/1.ORI.json'...
Validating that the JSON is internally consistent...
Validation successful for local JSON.

1.PRE
Validating schema of '/mnt/c/Users/ktmea/Projects/plague-phylogeography-projects/main//config/auspice_config.json'...
Validation success.




















Validating produced JSON
Validating schema of '/mnt/c/Users/ktmea/Projects/plague-phylogeography-projects/main/auspice/all/chromosome/full/filter5/beast/1.PRE.json'...
Validating that the JSON is internally consistent...
Validation successful for local JSON.

2.MED
Validating schema of '/mnt/c/Users/ktmea/Projects/plague-phylogeography-projects/main//config/auspice_config.json'...
Validation success.




















Validating produced JSON
Validating schema of '/mnt/c/Users/ktmea/Projects/plague-phylogeography-projects/main/auspice/all/chromosome/full/filter5/beast/2.MED.json'...
Validating that the JSON is internally consistent...
Validation successful for local JSON.

0.ANT4
Validating schema of '/mnt/c/Users/ktmea/Projects/plague-phylogeography-projects/main//config/auspice_config.json'...
Validation success.
Validating produced JSON
Validating schema of '/mnt/c/Users/ktmea/Projects/plague-phylogeography-projects/main/auspice/all/chromosome/full/filter5/beast/0.ANT4.json'...
Validating that the JSON is internally consistent...
Validation successful for local JSON.

0.PRE




















Validating schema of '/mnt/c/Users/ktmea/Projects/plague-phylogeography-projects/main//config/auspice_config.json'...
Validation success.
Validating produced JSON
Validating schema of '/mnt/c/Users/ktmea/Projects/plague-phylogeography-projects/main/auspice/all/chromosome/full/filter5/beast/0.PRE.json'...
Validating that the JSON is internally consistent...
Validation successful for local JSON.





















In [18]:

df = tree_dict["1.ORI"]["df"]
df.loc[df.index.str.endswith('NODE0')]

DEPRECATED: `should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.



Unnamed: 0_level_0,strain,date,date_bp,country,province,country_lat,country_lon,province_lat,province_lon,biovar,branch_major,branch_minor,biosample_accession,biosample_comment,branch_number,continent,date_mean,date_bp_mean,date_err,lat,lon,host_human,branch_major_color,geometry_size,geometry,root_rtt_dist,clade_rtt_dist,length_range,state.rate_range,default.rate_95%_HPD,length_95%_HPD,state.rate_95%_HPD,default.rate_range,state.rate,default.rate,length,posterior,state.prob,height_median,height_range,height_95%_HPD,state.rate_median,default.rate_median,length_median,state,state.set.prob,state.set,height,branch_length,branch_length_sub,node_type,branch_support,branch_support_conf_category,branch_support_conf_char,country_date_strain,province_date_strain,rate,rate_hpd,rate_sub,rate_sub_year,state_prob,state_conf_category,state_rate,state_rate_hpd,state_continent,state_lat,state_lon,height_hpd,timetree_num_date,timetree_num_date_confidence
sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1
NODE0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,China,,,149.507456,0.0,0.0,internal,100.0,HIGH,*,,,0.0,"[0.0, 0.0]",0.0,0.0,59.0,LOW,0.0,"[0.0, 0.0]",Asia,35.000074,104.999927,"[108.807767550407, 206.2509600693948]",1866.0,"[1810, 1907]"
