# Treetime Analysis

## Setup

---

### Module Imports

In [45]:
from Bio import Phylo
from treetime.utils import parse_dates
from treetime import wrappers
import pandas
import os

### Input File Paths

In [3]:
# Snakemake-provided paths. They live inside a bare string literal, so they
# are never executed while the notebook is run interactively.
'''
tree_path = str(snakemake.input.tree)
aln_path = str(snakemake.input.snp_aln)
metadata_path = str(snakemake.input.metadata)
treetime_path = str(snakemake.output.treetime)
'''
# Hard-coded, notebook-relative paths used for interactive runs.
metadata_path = "../../results/metadata/all/metadata.txt"
tree_path = "../../docs/results/2020-11-09/iqtree/iqtree-core_chromosome.filter5.treefile"

# Echo both paths (tree first, then metadata), one per line.
print(tree_path, metadata_path, sep="\n")

../../docs/results/2020-11-09/iqtree/iqtree-core_chromosome.filter5.treefile
../../results/metadata/all/metadata.txt


### Constants and Variables

In [38]:
# --- Metadata table layout -------------------------------------------------
# Column holding the sample / taxon identifier.
NAME_COL = "Sample"
# Column holding the sampling date.
DATES_COL = "Date"
# Column holding the discrete trait to reconstruct.
ATTRIBUTE = "Biovar"
# Marker for a missing value in the metadata.
NO_DATA_CHAR = "NA"

# --- Output naming ---------------------------------------------------------
# The output tree file is named "<file_prefix>_<tree_type>.nexus".
tree_type = "divtree"
file_prefix = "mugration-biovar"

### Fix Tree Parsing

In [59]:
def _parse_support_label(label):
    """Interpret a clade label as an IQ-TREE 'aLRT/UFBoot' support pair.

    Parameters
    ----------
    label : str
        The clade name as read from the newick file.

    Returns
    -------
    tuple of (float, float) or None
        ``(alrt, ufboot)`` when the label is exactly two numbers separated by
        a single "/" and both lie in the range 0-100; ``None`` otherwise
        (e.g. a genuine taxon name, or a malformed/non-numeric label).
    """
    parts = label.split("/")
    if len(parts) != 2:
        return None
    try:
        alrt, ufboot = float(parts[0]), float(parts[1])
    except ValueError:
        # Not numeric (e.g. a taxon name that happens to contain "/").
        return None
    # Both values must be valid percentages.
    if 0 <= alrt <= 100 and 0 <= ufboot <= 100:
        return alrt, ufboot
    return None


tree = Phylo.read(tree_path, "newick")

# Running counter used to give each renamed internal node a unique name.
node_i = 0

for c in tree.find_clades():
    if not c.name:
        continue

    # Assuming IQTREE was run to have format aLRT/UFBoot.
    # Check if the name has 'accidentally' become the confidence vals.
    support = _parse_support_label(c.name)
    if support is None:
        continue

    # Name the internal node.
    # NOTE(review): the trailing ":" becomes part of the node name
    # (e.g. "NODE0:") -- confirm downstream tools expect that.
    c.name = "NODE" + str(node_i) + ":"

    # Assign the UFBoot value (second field) as the node confidence.
    c.confidence = support[1]

    # Increment node counter
    node_i += 1

In [60]:
# Output path: the input tree path with its extension replaced by "_rename.nexus".
out_tree = os.path.splitext(tree_path)[0] + "_rename.nexus"

# Show the renamed tree, then the path it is about to be written to.
print(tree, out_tree, sep="\n")

# Write the tree as nexus with fixed 10-decimal branch lengths. The call is
# the last expression of the cell, so its return value is displayed.
Phylo.write(tree, out_tree, "nexus", format_branch_length='%1.10f')

Tree(rooted=False, weight=1.0)
    Clade(confidence=96.0, name='NODE0:')
        Clade(branch_length=3.0272e-06, confidence=100.0, name='NODE1:')
            Clade(branch_length=9.6807e-06, confidence=100.0, name='NODE2:')
                Clade(branch_length=7.4258e-06, confidence=100.0, name='NODE3:')
                    Clade(branch_length=1.1729e-06, confidence=100.0, name='NODE4:')
                        Clade(branch_length=4.4892e-06, confidence=100.0, name='NODE5:')
                            Clade(branch_length=1.4074e-06, confidence=100.0, name='NODE6:')
                                Clade(branch_length=3.7661e-06, confidence=73.0, name='NODE7:')
                                    Clade(branch_length=2.35e-08, confidence=62.0, name='NODE8:')
                                        Clade(branch_length=4.691e-07, confidence=73.0, name='NODE9:')
                                            Clade(branch_length=7.037e-07, confidence=73.0, name='NODE10:')
                        

1

### Parse Metadata

In [36]:
# Parse the metadata (tab-separated table)
states = pandas.read_csv(metadata_path, sep='\t')

# Fail fast when a required column is absent. Assigning the names only when
# the column exists would leave `taxon_name` / `attr` undefined (or stale from
# an earlier kernel state) and the error would surface much later.
missing_cols = [col for col in (NAME_COL, ATTRIBUTE) if col not in states.columns]
if missing_cols:
    raise KeyError(
        "Metadata file {} is missing required column(s): {}".format(
            metadata_path, ", ".join(missing_cols)
        )
    )

# Tell treetime which column is the taxon name
taxon_name = NAME_COL

# Tell treetime which column is the attribute for mugration
attr = ATTRIBUTE

# Get tips names
# NOTE(review): this rebinds `tree` to a fresh read of tree_path, so any
# in-memory edits made to a previously loaded `tree` object are discarded.
tree = Phylo.read(tree_path, "newick")
tree_tip_names = [t.name for t in tree.get_terminals()]

## Analysis

### Mugration Estimation

In [39]:
# Build the {taxon name -> trait value (as string)} lookup from the metadata.
# Rows whose trait equals NO_DATA_CHAR, or whose trait is falsy, are left out.
leaf_to_attr = {
    record[taxon_name]: str(record[attr])
    for row_label, record in states.iterrows()
    if record[attr] != NO_DATA_CHAR and record[attr]
}

# Reconstruct the discrete trait on the tree stored at tree_path.
# - missing_data="nan": presumably matches str() of values pandas parsed as
#   NaN, which pass the filter above -- TODO confirm.
# - pc, sampling_bias_correction and weights are not passed, so treetime's
#   defaults apply.
# NOTE(review): tree_path is the original treefile, so the clade names in
# mug.tree are whatever that file contains.
mug, letter_to_state, reverse_alphabet = wrappers.reconstruct_discrete_traits(
    tree_path,
    leaf_to_attr,
    missing_data="nan",
    verbose=4,
)

# Collect the trait-state labels (the values of letter_to_state) in sorted order.
unique_states = list(letter_to_state.values())
unique_states.sort()


0.00	-TreeAnc: set-up
Assigned discrete traits to 327 out of 540 taxa.


0.18	-SequenceData: loaded alignment.

0.18	-SeqData: making compressed alignment...

0.18	-SequenceData: constructed compressed alignment...

0.20	-TreeAnc.infer_ancestral_sequences with method: ml, marginal
0.20	--TreeAnc._ml_anc_marginal: type of reconstruction: Marginal
0.20	---Attaching sequence profiles to leafs...
0.21	---Postorder: computing likelihoods...
0.26	---Computing root node sequence and total tree likelihood...
0.26	---Preorder: computing marginal profiles...
0.32	---TreeAnc._ml_anc_marginal: ...done
0.37	--TreeAnc.infer_gtr: counting mutations...
0.46	---TreeAnc.infer_gtr: counting mutations...done

0.46	-GTR: with alphabet: ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J',
    	 'K', 'L', 'M', 'N']
0.46	--GTR: ambiguous character: O
0.46	----GTR: no gap symbol!
0.46	---GTR: init with dummy values!

0.46	-GTR: model inference
0.46	---GTR inference iteration 0 change: 0.2672612419124244
0.47	--

3.23	---Computing root node sequence and total tree likelihood...
3.23	---Attaching sequence profiles to leafs...
3.23	---Postorder: computing likelihoods...
3.28	---Computing root node sequence and total tree likelihood...
3.28	---Attaching sequence profiles to leafs...
3.29	---Postorder: computing likelihoods...
3.35	---Computing root node sequence and total tree likelihood...
3.35	---Attaching sequence profiles to leafs...
3.36	---Postorder: computing likelihoods...
3.42	---Computing root node sequence and total tree likelihood...
3.42	---Attaching sequence profiles to leafs...
3.43	---Postorder: computing likelihoods...
3.51	---Computing root node sequence and total tree likelihood...
3.51	---Attaching sequence profiles to leafs...
3.51	---Postorder: computing likelihoods...
3.57	---Computing root node sequence and total tree likelihood...
3.57	---Attaching sequence profiles to leafs...
3.58	---Postorder: computing likelihoods...
3.64	---Computing root node sequence and total tree 

6.87	---Computing root node sequence and total tree likelihood...
6.87	---Attaching sequence profiles to leafs...
6.88	---Postorder: computing likelihoods...
6.93	---Computing root node sequence and total tree likelihood...
6.93	---Attaching sequence profiles to leafs...
6.94	---Postorder: computing likelihoods...
6.99	---Computing root node sequence and total tree likelihood...
6.99	---Attaching sequence profiles to leafs...
7.00	---Postorder: computing likelihoods...
7.05	---Computing root node sequence and total tree likelihood...
7.05	---Attaching sequence profiles to leafs...
7.05	---Postorder: computing likelihoods...
7.11	---Computing root node sequence and total tree likelihood...
7.11	---Attaching sequence profiles to leafs...
7.11	---Postorder: computing likelihoods...
7.17	---Computing root node sequence and total tree likelihood...
7.17	---Attaching sequence profiles to leafs...
7.18	---Postorder: computing likelihoods...
7.23	---Computing root node sequence and total tree 

In [40]:
# Display the sorted list of discrete trait states built from letter_to_state.
print(unique_states)

['Altaica', 'Antiqua', 'Bronze Age Plague', 'Caucasica', 'First Pandemic', 'Hissarica', 'Medievalis', 'Microtus', 'Orientalis', 'Pre-First Pandemic', 'Second Pandemic', 'Talassica', 'Ulegeica', 'Xilingolensis', 'nan']


In [42]:
# List the name of every clade in the mugration tree, one per line
# (the traversal always yields at least the root clade).
print(*(clade.name for clade in mug.tree.find_clades()), sep="\n")

77.8/96
SAMEA3541827
85.7/99
SAMEA104233046
98.2/100
SAMEA104233050
87.2/100
SAMEA104233047
99.6/100
SAMEA104233049
NODE_0000005
SAMEA104233048
SAMEA3541826
98.2/100
100/100
GCA_000323485.1_ASM32348v1_genomic
GCA_000323845.1_ASM32384v1_genomic
100/100
100/100
100/100
GCA_001601675.1_ASM160167v1_genomic
0/38
GCA_002127375.1_ASM212737v1_genomic
0/26
GCA_006376585.1_ASM637658v1_genomic
0/21
GCA_001294825.1_ASM129482v1_genomic
0/39
78.8/100
GCA_008630415.1_ASM863041v1_genomic
0/90
GCA_008630455.1_ASM863045v1_genomic
90.6/99
GCA_001972395.1_ASM197239v1_genomic
GCA_002165475.1_ASM216547v1_genomic
0/54
89.5/65
GCA_002127355.1_ASM212735v1_genomic
GCA_002127365.1_ASM212736v1_genomic
73.2/65
GCA_001972465.1_ASM197246v1_genomic
46.8/85
GCA_001972385.1_ASM197238v1_genomic
GCA_001972405.1_ASM197240v1_genomic
100/100
100/100
85.9/100
GCA_001295055.1_ASM129505v1_genomic
79.9/100
GCA_003074435.1_ASM307443v1_genomic
0/39
GCA_001294985.1_ASM129498v1_genomic
0/40
GCA_001972415.1_ASM197241v1_genomic
87.1/

### Add Node Comments

In [104]:
# Save the mugration tree as nexus in the current working directory, named
# "<file_prefix>_<tree_type>.nexus", with 10-decimal branch lengths.
# Kept as the cell's last expression so the return value is displayed.
Phylo.write(
    mug.tree,
    "{}_{}.nexus".format(file_prefix, tree_type),
    "nexus",
    format_branch_length='%1.10f',
)

1