---
# Setup

## Modules

In [1]:
import os
import pandas as pd
import copy
from Bio import Phylo, AlignIO
from functions import *
import subprocess
import matplotlib.pyplot as plt
from matplotlib import colors

## Paths

In [2]:
# Wildcard values identifying this run, in positional order:
# reads origin, locus name, prune setting, missing-data filter value.
WILDCARDS = ["all", "chromosome", "full", "30"]
READS_ORIGIN, LOCUS_NAME, PRUNE, MISSING_DATA = WILDCARDS

# Project root; results are read from and written to the same directory.
# Swap in the commented line to run against the alternative project.
project_dir = "/mnt/c/Users/ktmea/Projects/plague-phylogeography-projects/denmark/"
#project_dir = "/mnt/c/Users/ktmea/Projects/plague-phylogeography-projects/main/"
results_dir = project_dir

In [3]:
# ------------------------------------------
# Input
# All paths are built with os.path.join so they do not depend on whether
# project_dir / results_dir end with a path separator.
metadata_path = os.path.join(
    results_dir,
    "iqtree/all/chromosome/{}/filter{}/filter-taxa/metadata.tsv".format(PRUNE, MISSING_DATA),
)
tree_path = os.path.join(
    results_dir,
    "beast/all/chromosome/{}/filter{}/relaxed_clock/dates/run/beast_mcc.nex".format(PRUNE, MISSING_DATA),
)
auspice_config_path = os.path.join(results_dir, "config/auspice_config.json")

# ------------------------------------------
# Alignment
# NOTE(review): these two paths hard-code "full" and "filter30" rather than
# using PRUNE / MISSING_DATA -- confirm that is intentional.
constant_sites_path = os.path.join(
    project_dir, "snippy_multi/all/chromosome/full/snippy-multi.constant_sites.txt"
)
aln_path = os.path.join(
    project_dir, "iqtree/all/chromosome/full/filter30/filter-sites/snippy-multi.snps.aln"
)

# ------------------------------------------
# Output
# os.makedirs(..., exist_ok=True) creates missing parents, is a no-op when the
# directory already exists, and raises OSError on failure (the previous
# `mkdir -p` subprocess call ignored a non-zero exit status).
auspice_dir = os.path.join(results_dir, "auspice/all/chromosome/{}/filter{}/beast/".format(PRUNE, MISSING_DATA))
os.makedirs(auspice_dir, exist_ok=True)

augur_dir = os.path.join(results_dir, "augur/all/chromosome/{}/filter{}/beast/".format(PRUNE, MISSING_DATA))
os.makedirs(augur_dir, exist_ok=True)

In [4]:
# Placeholder for missing values, placeholder for unknown values,
# JSON indentation for exported files, and the reference year used when
# converting tree heights to calendar dates.
NO_DATA_CHAR = "NA"
UNKNOWN_CHAR = "?"
JSON_INDENT = 2
CURRENT_YEAR = 2021

# ------------------------------------------
# Alignment
# The constant-sites file is a single comma-separated list of integer counts;
# their sum is the number of constant sites.
with open(constant_sites_path) as constant_sites_file:
    constant_site_counts = constant_sites_file.read().strip().split(",")
constant_sites = sum(int(count) for count in constant_site_counts)

# The number of columns in the SNP alignment is the number of variant sites.
aln = AlignIO.read(aln_path, "fasta")
variant_sites = aln.get_alignment_length()

# Total sequence length = constant sites + variant sites.
SEQ_LEN = constant_sites + variant_sites
print(SEQ_LEN)

4290166


## Metadata

In [5]:
# Load the tab-separated metadata, index it by its first column, and replace
# missing values with the NO_DATA_CHAR placeholder.
metadata_df = pd.read_csv(metadata_path, sep="\t")
sample_column = metadata_df.columns[0]
metadata_df = metadata_df.set_index(sample_column).fillna(NO_DATA_CHAR)

display(metadata_df)

Unnamed: 0_level_0,strain,date,date_bp,country,province,country_lat,country_lon,province_lat,province_lon,biovar,...,date_bp_mean,date_err,lat,lon,host_human,branch_major_color,geometry_size,geometry,root_rtt_dist,clade_rtt_dist
sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
SAMEA5818830,STN021,[1485:1635],[-536:-386],Switzerland,Nidwalden,46.798562,8.231974,46.942756,8.411977,Second Pandemic,...,461.0,75.0,46.942756,8.411977,Human,#8000ff,8.0,POINT (8.4119773 46.942756),1.21946e-05,1.21946e-05
SAMEA5818829,STN020,[1485:1635],[-536:-386],Switzerland,Nidwalden,46.798562,8.231974,46.942756,8.411977,Second Pandemic,...,461.0,75.0,46.942756,8.411977,Human,#8000ff,8.0,POINT (8.4119773 46.942756),1.19382e-05,1.19382e-05
SAMEA5818828,STN019,[1485:1635],[-536:-386],Switzerland,Nidwalden,46.798562,8.231974,46.942756,8.411977,Second Pandemic,...,461.0,75.0,46.942756,8.411977,Human,#8000ff,8.0,POINT (8.4119773 46.942756),1.20081e-05,1.20081e-05
SAMEA5818826,STN014,[1485:1635],[-536:-386],Switzerland,Nidwalden,46.798562,8.231974,46.942756,8.411977,Second Pandemic,...,461.0,75.0,46.942756,8.411977,Human,#8000ff,8.0,POINT (8.4119773 46.942756),1.19003e-05,1.19003e-05
SAMEA5818825,STN013,[1485:1635],[-536:-386],Switzerland,Nidwalden,46.798562,8.231974,46.942756,8.411977,Second Pandemic,...,461.0,75.0,46.942756,8.411977,Human,#8000ff,8.0,POINT (8.4119773 46.942756),1.24044e-05,1.24044e-05
SAMEA5818822,STN008,[1485:1635],[-536:-386],Switzerland,Nidwalden,46.798562,8.231974,46.942756,8.411977,Second Pandemic,...,461.0,75.0,46.942756,8.411977,Human,#8000ff,8.0,POINT (8.4119773 46.942756),1.27904e-05,1.27904e-05
SAMEA5818821,STN007,[1485:1635],[-536:-386],Switzerland,Nidwalden,46.798562,8.231974,46.942756,8.411977,Second Pandemic,...,461.0,75.0,46.942756,8.411977,Human,#8000ff,8.0,POINT (8.4119773 46.942756),1.21946e-05,1.21946e-05
SAMEA5818818,STN002,[1485:1635],[-536:-386],Switzerland,Nidwalden,46.798562,8.231974,46.942756,8.411977,Second Pandemic,...,461.0,75.0,46.942756,8.411977,Human,#8000ff,8.0,POINT (8.4119773 46.942756),1.20579e-05,1.20579e-05
SAMEA5818817,STA001,[1420:1630],[-601:-391],Germany,Bavaria,51.08342,10.423447,48.946756,11.403872,Second Pandemic,...,496.0,105.0,48.946756,11.403872,Human,#8000ff,4.0,POINT (11.4038717 48.9467562),5.6598e-06,5.6598e-06
SAMEA5818815,NMS002,[1475:1536],[-546:-485],England,East of England,52.531021,-1.264906,52.219977,0.487578,Second Pandemic,...,515.5,30.5,52.219977,0.487578,Human,#8000ff,1.0,POINT (0.4875777469166293 52.2199774),9.3109e-06,9.3109e-06


## Phylogeny

### Import Tree

In [6]:
# Read the nexus tree from tree_path and ladderize it.
tree = Phylo.read(tree_path, format="nexus")
tree.ladderize(reverse=False)

# Give every unnamed clade a sequential name (NODE0, NODE1, ...) in
# find_clades() traversal order so it can be used as a dataframe index label.
unnamed_clades = (clade for clade in tree.find_clades() if not clade.name)
for node_i, clade in enumerate(unnamed_clades):
    clade.name = "NODE{}".format(node_i)

### Add Tree Metadata to Dataframe

In [7]:
# Columns that this cell adds to metadata_df, one value per tree node.
parameters = [
    "branch_length",
    "branch_length_sub",
    "node_type",
    "branch_support",
    "branch_support_conf_category",
    "branch_support_conf_char",
    # Custom
    "country_date_strain",
    "province_date_strain",
    "rate",
    "rate_hpd",
    "rate_sub",
    "rate_sub_year",
    "height",
    "height_hpd",
    "timetree_num_date",
    "timetree_num_date_confidence",
    "continuous_geo",
    "continuous_geo_coord",
    "continuous_geo_lat",
    "continuous_geo_lon",
]

# Add to dataframe, initialised to the missing-data placeholder
for param in parameters:
    metadata_df[param] = [NO_DATA_CHAR] * len(metadata_df)

# Dummy naming of locations geo: each distinct coordinate pair found in the
# tree gets a generated name ("loc1", "loc2", ...).
locations_geo_dict = {}
locations_geo_i = 1

for c in tree.find_clades():
    # Defaults -- every per-node value is reset here so that nothing leaks
    # from the previous clade when an annotation is absent.
    node_type = "internal"
    branch_support = 0
    branch_support_conf_category = "LOW"
    branch_support_conf_char = ""
    branch_length = 0
    branch_length_sub = 0
    country_date_strain = NO_DATA_CHAR
    province_date_strain = NO_DATA_CHAR
    rate_hpd = [0, 0]
    rate = 0
    # Previously missing: without this default a clade lacking a "rate"
    # annotation raised NameError (first clade) or reused the prior clade's value.
    rate_sub = 0
    rate_sub_year = 0
    # The two timetree_* values are not written by this loop; the
    # "Adjust Dates" cell fills those columns from the heights.
    timetree_num_date = 0
    timetree_num_date_confidence = [0, 0]
    height = 0
    height_hpd = [0, 0]
    locations_geo_coord = [NO_DATA_CHAR, NO_DATA_CHAR]
    locations_geo_name = NO_DATA_CHAR

    # parse_comment comes from the project's `functions` module; it is used
    # here as a mapping from annotation name to its string value.
    comment_dict = parse_comment(c.comment)

    # Branch Length
    if c.branch_length:
        branch_length = c.branch_length

    # Posterior support as a percentage; >= 95 is flagged HIGH with a "*"
    if "posterior" in comment_dict:
        branch_support = float(comment_dict["posterior"]) * 100
        if branch_support >= 95:
            branch_support_conf_category = "HIGH"
            branch_support_conf_char = "*"

    if "rate" in comment_dict:
        rate = float(comment_dict["rate"])
        rate_sub = rate

        # Only branches with a non-zero length get a scaled rate and a
        # branch length expressed via rate * SEQ_LEN.
        if c.branch_length:
            rate_sub_year = rate * SEQ_LEN
            branch_length_sub = rate_sub_year * c.branch_length

    if "rate_95%_HPD" in comment_dict:
        rate_hpd_split = comment_dict["rate_95%_HPD"].strip("{}").split(",")
        rate_hpd = [float(r) for r in rate_hpd_split]

    if "height" in comment_dict:
        height = float(comment_dict["height"])

    if "height_95%_HPD" in comment_dict:
        height_hpd_split = comment_dict["height_95%_HPD"].strip("{}").split(",")
        height_hpd = [float(h) for h in height_hpd_split]

    if "locationsgeo" in comment_dict:
        locations_geo_coord_split = comment_dict["locationsgeo"].strip("{}").split(",")
        locations_geo_coord = [float(l) for l in locations_geo_coord_split]

        # Reuse the name of an already-registered coordinate pair, otherwise
        # register this pair under the next generated name.
        existing_name = next(
            (name for name, coord in locations_geo_dict.items() if coord == locations_geo_coord),
            None,
        )
        if existing_name is None:
            locations_geo_name = "loc{}".format(locations_geo_i)
            locations_geo_dict[locations_geo_name] = locations_geo_coord
            locations_geo_i += 1
        else:
            locations_geo_name = existing_name

    if c.is_terminal():
        node_type = "terminal"
        country = metadata_df.at[c.name, "country"]
        province = metadata_df.at[c.name, "province"]
        date = metadata_df.at[c.name, "date"]
        strain = metadata_df.at[c.name, "strain"]

        country_date_strain = "{} {} {}".format(country, date, strain)
        province_date_strain = "{} {} {}".format(province, date, strain)

    # Write the node's values; for internal nodes (not present in the input
    # metadata) these assignments add a new row labelled with the node name.
    metadata_df.at[c.name, "branch_length"] = branch_length
    metadata_df.at[c.name, "branch_length_sub"] = branch_length_sub
    metadata_df.at[c.name, "node_type"] = node_type
    metadata_df.at[c.name, "branch_support"] = branch_support
    metadata_df.at[c.name, "branch_support_conf_category"] = branch_support_conf_category
    metadata_df.at[c.name, "branch_support_conf_char"] = branch_support_conf_char
    metadata_df.at[c.name, "country_date_strain"] = country_date_strain
    metadata_df.at[c.name, "province_date_strain"] = province_date_strain

    metadata_df.at[c.name, "rate"] = rate
    metadata_df.at[c.name, "rate_hpd"] = rate_hpd
    metadata_df.at[c.name, "rate_sub"] = rate_sub
    metadata_df.at[c.name, "rate_sub_year"] = rate_sub_year
    metadata_df.at[c.name, "height"] = height
    metadata_df.at[c.name, "height_hpd"] = height_hpd

    metadata_df.at[c.name, "continuous_geo_coord"] = locations_geo_coord
    metadata_df.at[c.name, "continuous_geo"] = locations_geo_name
    metadata_df.at[c.name, "continuous_geo_lat"] = locations_geo_coord[0]
    metadata_df.at[c.name, "continuous_geo_lon"] = locations_geo_coord[1]


# Update internal nodes NA: the rows added above have no sample metadata
metadata_df.fillna(NO_DATA_CHAR, inplace=True)
display(metadata_df)

Unnamed: 0_level_0,strain,date,date_bp,country,province,country_lat,country_lon,province_lat,province_lon,biovar,...,rate_sub,rate_sub_year,height,height_hpd,timetree_num_date,timetree_num_date_confidence,continuous_geo,continuous_geo_coord,continuous_geo_lat,continuous_geo_lon
sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
SAMEA5818830,STN021,[1485:1635],[-536:-386],Switzerland,Nidwalden,46.7986,8.23197,46.9428,8.41198,Second Pandemic,...,4.003006e-08,0.171736,237.698983,"[172.2302786764335, 320.8567069106599]",,,,"[NA, NA]",,
SAMEA5818829,STN020,[1485:1635],[-536:-386],Switzerland,Nidwalden,46.7986,8.23197,46.9428,8.41198,Second Pandemic,...,4.027226e-08,0.172775,245.936139,"[180.05505350951182, 327.53466125263174]",,,,"[NA, NA]",,
SAMEA5818828,STN019,[1485:1635],[-536:-386],Switzerland,Nidwalden,46.7986,8.23197,46.9428,8.41198,Second Pandemic,...,4.101832e-08,0.175975,246.115659,"[181.6098172292643, 328.5680990974853]",,,,"[NA, NA]",,
SAMEA5818826,STN014,[1485:1635],[-536:-386],Switzerland,Nidwalden,46.7986,8.23197,46.9428,8.41198,Second Pandemic,...,4.096838e-08,0.175761,247.460674,"[183.03436053134095, 331.7023619277743]",,,,"[NA, NA]",,
SAMEA5818825,STN013,[1485:1635],[-536:-386],Switzerland,Nidwalden,46.7986,8.23197,46.9428,8.41198,Second Pandemic,...,4.612915e-08,0.197902,233.790112,"[170.11192431228056, 319.9122117845398]",,,,"[NA, NA]",,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
NODE43,,,,,,,,,,,...,4.129195e-08,0.177149,105.194996,"[41.46678498233081, 180.9104256514633]",,,,"[NA, NA]",,
NODE44,,,,,,,,,,,...,4.462087e-08,0.191431,102.541757,"[50.866416242647745, 177.87373855065306]",,,,"[NA, NA]",,
NODE45,,,,,,,,,,,...,5.666355e-08,0.243096,92.944694,"[44.92340368976829, 166.37894097195522]",,,,"[NA, NA]",,
NODE46,,,,,,,,,,,...,4.362423e-08,0.187155,90.323229,"[44.19821780401924, 164.99067783500874]",,,,"[NA, NA]",,


## Adjust Dates

In [8]:
# Anchor sample whose date was fixed, and the value it was fixed to.
# NOTE(review): units of fixed_date are not stated here -- presumably years
# before CURRENT_YEAR; confirm.
fixed_date_sample = "SAMN00715800"
fixed_date = 672

# Offset between the fixed date and the tree's height for the anchor sample;
# it is subtracted from every node when converting heights to dates.
estimate_date = metadata_df.loc[fixed_date_sample, "height"]
date_conversion = fixed_date - estimate_date
print(date_conversion)

# Convert heights to calendar dates
for sample, row in metadata_df.iterrows():
    node_height = row["height"]

    # Heights count backwards from the present, so the HPD bounds are
    # reversed to keep the resulting date interval in [earliest, latest] order.
    hpd_descending = copy.copy(row["height_hpd"])
    hpd_descending.reverse()

    num_date = round(CURRENT_YEAR - node_height - date_conversion)
    num_date_confidence = [
        round(CURRENT_YEAR - bound - date_conversion) for bound in hpd_descending
    ]

    metadata_df.at[sample, "timetree_num_date"] = num_date
    metadata_df.at[sample, "timetree_num_date_confidence"] = num_date_confidence

215.32019811860272


### Colors

In [9]:
# Build one colour per distinct value of each attribute in `states` (taken
# from the tree's terminal nodes) and write them to a three-column TSV:
# attribute, value, hex colour.
out_path_colors = os.path.join(augur_dir, "colors.tsv")
states = ["country", "province", "branch_major"]
colors_dict = {}

for attr in states:
    # Create the color map
    attr_key = attr.lower()
    colors_dict[attr_key] = {}

    for t in tree.get_terminals():
        attr_val = metadata_df[attr][t.name]

        # Missing values never get a colour of their own.
        if attr_val == NO_DATA_CHAR:
            continue

        # Remove the letter suffix from branch_minor. The emptiness guard
        # prevents an IndexError when a value consists only of letters.
        if attr == "branch_minor":
            while attr_val and attr_val[-1].isalpha():
                attr_val = attr_val[:-1]

        if attr_val not in colors_dict[attr_key]:
            colors_dict[attr_key][attr_val] = {}

    # Create the custom color map (pyplot) with one entry per distinct value
    cmap = plt.get_cmap("rainbow", len(colors_dict[attr_key]))
    # Convert the color map to a list of RGB values
    cmaplist = [cmap(i) for i in range(cmap.N)]
    # Convert RGB values to hex colors
    attr_hex = [colors.to_hex(col) for col in cmaplist]

    # Assign colors to values in first-seen order
    for attr_val, attr_col in zip(colors_dict[attr_key], attr_hex):
        colors_dict[attr_key][attr_val] = attr_col

    # Add unknown
    colors_dict[attr_key][UNKNOWN_CHAR] = "#969696"

print(colors_dict)

# Values include non-ASCII place names, so the encoding is fixed to UTF-8
# instead of depending on the platform's locale default.
with open(out_path_colors, "w", encoding="utf-8") as outfile:
    for attr_key, value_colors in colors_dict.items():
        for attr_val, attr_col in value_colors.items():
            outfile.write("\t".join([str(attr_key), str(attr_val), str(attr_col)]) + "\n")

{'country': {'Russia': '#8000ff', 'Denmark': '#5148fc', 'France': '#238af5', 'Spain': '#0cc1e8', 'England': '#3ae8d7', 'Germany': '#68fcc1', 'The Netherlands': '#97fca7', 'Norway': '#c5e88a', 'Italy': '#f3c16a', 'Switzerland': '#ff8a48', 'Lithuania': '#ff4824', 'Poland': '#ff0000', '?': '#969696'}, 'province': {'Tatarstan': '#8000ff', 'Region of Southern Denmark': '#632cfe', 'Occitanie': '#4757fb', 'Catalonia': '#2b7ff6', 'Greater London': '#0ea4f0', 'Bavaria': '#0ec3e7', 'North Brabant': '#2adddd', 'Oslo': '#47f0d1', 'Lazio': '#63fbc3', 'Central Denmark Region': '#80ffb4', 'East of England': '#9cfba4', 'Brandenburg': '#b8f092', 'Nidwalden': '#d4dd80', 'Baden-Württemberg': '#f1c36c', 'Vilnius County': '#ffa457', 'Pomeranian Voivodeship': '#ff8042', 'Rostov Oblast': '#ff572c', 'Chechnya': '#ff2c16', "Provence-Alpes-Côte d'Azur": '#ff0000', '?': '#969696'}, 'branch_major': {'1.PRE': '#8000ff', '?': '#969696'}}


### Latitude and Longitude

In [10]:
# Build per-country and per-province tables of (lat, lon, size), where size
# counts the non-internal metadata rows at each location, then write the
# name -> coordinate mapping file.
latlon_country_df = pd.DataFrame()
latlon_province_df = pd.DataFrame()

df_list = [latlon_country_df, latlon_province_df]
attr_list = ["country", "province"]

# Create a mapping of geo name to lat,lon
for df, attr in zip(df_list, attr_list):
    for node_name, rec in metadata_df.iterrows():
        node_type = rec["node_type"]
        name = rec[attr]
        country = rec["country"]

        # Internal nodes carry no sampling location
        if node_type == "internal":
            continue

        if attr == "province" and name == NO_DATA_CHAR and node_type == "terminal" and country != "Russia":
            # Use country instead
            name = rec["country"]
            lat = rec["country_lat"]
            lon = rec["country_lon"]
        else:
            lat = rec[attr + "_lat"]
            lon = rec[attr + "_lon"]

        if name not in df.index:
            df.at[name, "lat"] = lat
            df.at[name, "lon"] = lon
            df.at[name, "size"] = 1
        else:
            # Single-step .at update: the chained form df["size"][name] += 1
            # may operate on a temporary and does not update df under
            # pandas Copy-on-Write.
            df.at[name, "size"] += 1

# Mapping file for auspice
out_path_latlon = os.path.join(augur_dir, "latlon.tsv")
print(out_path_latlon)

# One handle for all three sections; UTF-8 because place names may contain
# non-ASCII characters.
with open(out_path_latlon, "w", encoding="utf-8") as outfile:
    # Countries
    for country in latlon_country_df.index:
        lat = str(latlon_country_df.at[country, "lat"])
        lon = str(latlon_country_df.at[country, "lon"])
        outfile.write("country" + "\t" + country + "\t" + lat + "\t" + lon + "\n")

    # Provinces
    for province in latlon_province_df.index:
        lat = str(latlon_province_df.at[province, "lat"])
        lon = str(latlon_province_df.at[province, "lon"])
        outfile.write("province" + "\t" + province + "\t" + lat + "\t" + lon + "\n")

    # Continuous: generated location names from the tree's coordinate annotations
    for name, coord in locations_geo_dict.items():
        lat = str(coord[0])
        lon = str(coord[1])
        outfile.write("continuous_geo" + "\t" + name + "\t" + lat + "\t" + lon + "\n")

/mnt/c/Users/ktmea/Projects/plague-phylogeography-projects/denmark/augur/all/chromosome/full/filter30/beast/latlon.tsv


---
# Export

## Metadata

In [14]:
# Write the full metadata table (index included) as a tab-separated file
# into the augur output directory, echoing the destination first.
out_path_metadata = os.path.join(augur_dir, "metadata.tsv")
print(out_path_metadata)
metadata_df.to_csv(
    path_or_buf=out_path_metadata,
    sep="\t",
    index=True,
)

/mnt/c/Users/ktmea/Projects/plague-phylogeography-projects/denmark/augur/all/chromosome/full/filter30/beast/metadata.tsv


## Timetree

In [15]:
# Work on a deep copy so the in-memory `tree` keeps its original comments.
out_tree = copy.deepcopy(tree)

out_path_tree_nex = os.path.join(augur_dir, "all.timetree.nex")
out_tree_nwk_path = os.path.join(augur_dir, "all.timetree.nwk")

# Nexus export: metadata_to_comment is a project helper (from `functions`);
# presumably it rewrites each clade's comment from metadata_df -- confirm.
metadata_to_comment(out_tree, metadata_df)
Phylo.write(out_tree, out_path_tree_nex, "nexus")

# Newick export: same tree with every clade comment cleared.
for clade in out_tree.find_clades():
    clade.comment = None

Phylo.write(out_tree, out_tree_nwk_path, "newick")

1

## Create Sub Dataframe

In [16]:
# Remember, order matters when dealing with confidence!
# (each "*_confidence" column is listed directly after the column it belongs to)

columns = [
    # Required
    "branch_length",
    "branch_length_sub",
    "node_type",
    # Time Tree
    "timetree_num_date",
    "timetree_num_date_confidence",
    # Geo
    "country",
    "province",
    "continuous_geo",
    # Colors and Filters
    "branch_support",
    "branch_support_conf_category",
    "branch_support_conf_char",
    "continent",
    "province_lat",
    "province_lon",
    "country_lat",
    "country_lon",
    # Text Description
    "biosample_accession",
    "strain",
    "country_date_strain",
    "province_date_strain",
    "host_human",
    # Tip Dates
    "date_mean",
    "date_err",
    "date_bp_mean",
    # Stats
    "root_rtt_dist",
    "clade_rtt_dist",
    "rate_sub",
    "rate_sub_year",
]

# Independent copy of the selected columns, plus two helper columns:
# the node name (duplicated from the index) and a single-space filler.
auspice_df = metadata_df[columns].copy()
auspice_df["node_name"] = list(auspice_df.index)
auspice_df["blank"] = " "

display(auspice_df)

Unnamed: 0_level_0,branch_length,branch_length_sub,node_type,timetree_num_date,timetree_num_date_confidence,country,province,continuous_geo,branch_support,branch_support_conf_category,...,host_human,date_mean,date_err,date_bp_mean,root_rtt_dist,clade_rtt_dist,rate_sub,rate_sub_year,node_name,blank
sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
SAMEA5818830,4.699374,0.807050,terminal,1568,"[1485, 1633]",Switzerland,Nidwalden,,0.000000,LOW,...,Human,1560,75,461,1.21946e-05,1.21946e-05,4.003006e-08,0.171736,SAMEA5818830,
SAMEA5818829,11.803908,2.039416,terminal,1560,"[1478, 1626]",Switzerland,Nidwalden,,0.000000,LOW,...,Human,1560,75,461,1.19382e-05,1.19382e-05,4.027226e-08,0.172775,SAMEA5818829,
SAMEA5818828,11.597622,2.040896,terminal,1560,"[1477, 1624]",Switzerland,Nidwalden,,0.000000,LOW,...,Human,1560,75,461,1.20081e-05,1.20081e-05,4.101832e-08,0.175975,SAMEA5818828,
SAMEA5818826,10.279374,1.806714,terminal,1558,"[1474, 1623]",Switzerland,Nidwalden,,0.000000,LOW,...,Human,1560,75,461,1.19003e-05,1.19003e-05,4.096838e-08,0.175761,SAMEA5818826,
SAMEA5818825,8.608244,1.703586,terminal,1572,"[1486, 1636]",Switzerland,Nidwalden,,0.000000,LOW,...,Human,1560,75,461,1.24044e-05,1.24044e-05,4.612915e-08,0.197902,SAMEA5818825,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
NODE43,26.096277,4.622938,internal,1700,"[1625, 1764]",,,,100.000000,HIGH,...,,,,,,,4.129195e-08,0.177149,NODE43,
NODE44,28.749516,5.503547,internal,1703,"[1628, 1755]",,,,100.000000,HIGH,...,,,,,,,4.462087e-08,0.191431,NODE44,
NODE45,9.597063,2.333008,internal,1713,"[1639, 1761]",,,,100.000000,HIGH,...,,,,,,,5.666355e-08,0.243096,NODE45,
NODE46,1.009244,0.188885,internal,1715,"[1641, 1761]",,,,31.029886,LOW,...,,,,,,,4.362423e-08,0.187155,NODE46,


## Augur JSON

In [17]:
# Build the augur node-data dictionary from the in-memory tree and the
# auspice dataframe (no tree/alignment files are passed), print the first
# node's entry as a sanity check, and write the result as JSON.
# augur_export and utils come from the project's `functions` module.
type_converters = {
    "branch_number": (lambda value: str(value)),
}

augur_dict = augur_export(
    tree_path=None,
    aln_path=None,
    tree=tree,
    tree_df=auspice_df,
    color_keyword_exclude=["geometry"],
    type_convert=type_converters,
)

node_names = list(augur_dict["nodes"].keys())
first_node = node_names[0]

print(augur_dict["nodes"][first_node])

out_path_augur_json = os.path.join(augur_dir, "all.json")
utils.write_json(
    data=augur_dict,
    file_name=out_path_augur_json,
    indent=JSON_INDENT,
)

{'branch_length': 0.0, 'branch_length_sub': 0.0, 'node_type': 'internal', 'num_date': 1295, 'num_date_confidence': [1215, 1361], 'country': 'NA', 'province': 'NA', 'continuous_geo': 'NA', 'branch_support': 100.0, 'branch_support_conf_category': 'HIGH', 'branch_support_conf_char': '*', 'continent': 'NA', 'province_lat': 'NA', 'province_lon': 'NA', 'country_lat': 'NA', 'country_lon': 'NA', 'biosample_accession': 'NA', 'strain': 'NA', 'country_date_strain': 'NA', 'province_date_strain': 'NA', 'host_human': 'NA', 'date_mean': 'NA', 'date_err': 'NA', 'date_bp_mean': 'NA', 'root_rtt_dist': 'NA', 'clade_rtt_dist': 'NA', 'rate_sub': 1.0, 'rate_sub_year': 0.0, 'node_name': 'NODE0', 'blank': ' '}


## Auspice JSON

In [18]:
# Development aid: uncomment to pick up edits to the `functions` module
# without restarting the kernel.
#import sys, importlib
#importlib.reload(sys.modules['functions'])
#from functions import auspice_export, branch_attributes

# Assemble the auspice dataset from the tree, the augur JSON written above,
# the auspice config, and the colour / lat-lon mapping files.
auspice_dict = auspice_export(
    tree=tree,
    augur_json_paths=[out_path_augur_json],
    auspice_config_path=auspice_config_path,
    auspice_colors_path=out_path_colors,
    auspice_latlons_path=out_path_latlon,
    auspice_geo_res=["country", "province", "continuous_geo"],
)

label_col = list(auspice_df.columns)

# Recursively add branch attrs
branch_attributes(
    tree_dict=auspice_dict["tree"],
    sub_dict=auspice_dict["tree"],
    df=auspice_df,
    label_col=label_col,
)

# Last manual changes
# Iterate over a deep copy so that adding a 'scale' entry to a coloring in
# auspice_dict does not modify a dict that is currently being iterated.
auspice_dict_copy = copy.deepcopy(auspice_dict)
for i, coloring in enumerate(auspice_dict_copy["meta"]["colorings"]):
    for value in coloring.values():
        # Node type as internal or terminal
        if value == "node_type":
            auspice_dict["meta"]["colorings"][i]["scale"] = [
                ["internal", "#FFFFFF"],
                ["terminal", "#000000"],
            ]
        # Confidence category
        if "conf_category" in value:
            auspice_dict["meta"]["colorings"][i]["scale"] = [
                ["LOW", "#FFFFFF"],
                ["HIGH", "#000000"],
            ]
        # Host Human binary
        if "host_human" in value:
            auspice_dict["meta"]["colorings"][i]["scale"] = [
                ["Human", "#CBB742"],
                ["Non-Human", "#60B6F2"],
                ["NA", "#D6D6D6"],
            ]

# Write outputs - For Local Rendering
out_path_auspice_local_json = os.path.join(auspice_dir, "all.json")
utils.write_json(
    data=auspice_dict,
    file_name=out_path_auspice_local_json,
    indent=JSON_INDENT,
    include_version=False,
)
export_v2.validate_data_json(out_path_auspice_local_json)
print("Validation successful for local JSON.\n")

# Disabled remote export, kept for reference.
# NOTE(review): AUSPICE_PREFIX is not defined in this notebook, and the lines
# below write/validate the *local* path rather than the remote one.
#out_path_auspice_remote_json = os.path.join(auspice_dir, "{}_all.json".format(AUSPICE_PREFIX))
#utils.write_json(data=auspice_dict, file_name=out_path_auspice_local_json, indent=JSON_INDENT, include_version=False)
#export_v2.validate_data_json(out_path_auspice_local_json)
#print("Validation successful for local JSON.\n")

Validating schema of '/mnt/c/Users/ktmea/Projects/plague-phylogeography-projects/denmark/config/auspice_config.json'...
Validation success.
























Validating produced JSON
Validating schema of '/mnt/c/Users/ktmea/Projects/plague-phylogeography-projects/denmark/auspice/all/chromosome/full/filter30/beast/all.json'...
Validating that the JSON is internally consistent...
Validation successful for local JSON.



