# Phylogenetic Tree Analysis

In [1]:
combine_tree = './trees/combine.ph'

In [5]:
!perl ./tools/topd_v4.6.pl


###########################################
#                                         #
#      TOPD-FMTS version 4.6              #
#      (August 2012)                      #
#                                         #
#      Author: Pere Puigbo, PhD           #
#      http://genomes.urv.cat/topd        #
#      http://ppuigbo.me/programs/topd    #
#                                         #
###########################################



####################################################################################################################

Parameters to run TOPD/fMtS:

	$/topd_v* -f [file] -m [nodal/split/i/triplets/all] -r [yes/no] -n [1-1000] -s [10-10000/all/relative] -c [single/multiple/reference] -p [yes/no] -help

	INPUT FILE NAME			-f	<file_name>
	OUTPUT FILE NAME			-out	<file_name>
	METHOD				-m	<bsd/nodal/split/quartets/triplets/disagree/all>	default: split
	NUMBER OF TRIPLETS&QUARTETS	-u	<all/random/relative>	default: all
	MAXIMUM LEVEL IN THE DISAGREE METHOD 

In [12]:
!perl ./tools/topd_v4.6.pl -f {combine_tree} -r random -n 100 -m split


###########################################
#                                         #
#      TOPD-FMTS version 4.6              #
#      (August 2012)                      #
#                                         #
#      Author: Pere Puigbo, PhD           #
#      http://genomes.urv.cat/topd        #
#      http://ppuigbo.me/programs/topd    #
#                                         #
###########################################


Reading parameters ...

-f ./trees/combine.ph .......................................... ok
-m split .......................................... ok
-l 1 .......................................... default
-th no .......................................... default
-th 100 .......................................... default
-u all .......................................... default
-r random .......................................... ok
-n 100 .......................................... ok
-c single .......................................... default
-s relati

---

In [97]:
from dendropy import Tree, TaxonNamespace
from dendropy.calculate import treecompare

def disagree(newick_file1, newick_file2):
    tns = TaxonNamespace()
    
    tree1 = Tree.get(path=newick_file1, schema="newick", taxon_namespace=tns)
    tree2 = Tree.get(path=newick_file2, schema="newick", taxon_namespace=tns)

    tree1.encode_bipartitions()
    tree2.encode_bipartitions()

    
    symmetric_difference = treecompare.symmetric_difference(tree1, tree2)
    false_positives_and_negatives = treecompare.false_positives_and_negatives(tree1, tree2)
    
    return {
        "symmetric_difference" : symmetric_difference,
        "false_positives_and_negatives": false_positives_and_negatives
    }

In [99]:
from dendropy import Tree, TaxonNamespace
from dendropy.calculate import treecompare

def unweighted_commpare(newick_file1, newick_file2):
    tns = TaxonNamespace()
    
    tree1 = Tree.get(path=newick_file1, schema="newick", taxon_namespace=tns)
    tree2 = Tree.get(path=newick_file2, schema="newick", taxon_namespace=tns)

    tree1.encode_bipartitions()
    tree2.encode_bipartitions()

    symmetric_difference = treecompare.symmetric_difference(tree1, tree2)
    false_positives_and_negatives = treecompare.false_positives_and_negatives(tree1, tree2)
    
    disagree_count = 0

    for node1, node2 in zip(tree1.postorder_internal_node_iter(), tree2.postorder_internal_node_iter()):
        if node1.split_bitmask != node2.split_bitmask:
            disagree_count += 1

    return {
        "symmetric_difference" : symmetric_difference,
        "false_positives_and_negatives": false_positives_and_negatives,
        "disagree_count": disagree_count
    }

In [22]:
from dendropy import Tree, TaxonNamespace
from dendropy.calculate import treecompare

def weighted_commpare(newick_file1, newick_file2):
    tns = TaxonNamespace()
    
    tree1 = Tree.get(path=newick_file1, schema="newick", taxon_namespace=tns)
    tree2 = Tree.get(path=newick_file2, schema="newick", taxon_namespace=tns)

    tree1.encode_bipartitions()
    tree2.encode_bipartitions()

    euclidean_distance = treecompare.euclidean_distance(tree1, tree2)
    weighted_robinson_foulds_distance = treecompare.weighted_robinson_foulds_distance(tree1, tree2)
    return {
        "euclidean_distance": euclidean_distance,
        "weighted_robinson_foulds_distance" : weighted_robinson_foulds_distance
    }

In [23]:
baseline_tree = './trees/baseline_consensus.ph'
fastgzip_tree = './trees/fastgzip_consensus.ph'
bzip3_tree = './trees/bzip3_consensus.ph'
genozip_tree = './trees/genozip_consensus.ph'

In [100]:
unweighted_commpare(baseline_tree, baseline_tree)

{'symmetric_difference': 0,
 'false_positives_and_negatives': (0, 0),
 'disagree_count': 0}

In [101]:
unweighted_commpare(baseline_tree, fastgzip_tree)

{'symmetric_difference': 33,
 'false_positives_and_negatives': (17, 16),
 'disagree_count': 17}

In [102]:
unweighted_commpare(baseline_tree, bzip3_tree)

{'symmetric_difference': 25,
 'false_positives_and_negatives': (13, 12),
 'disagree_count': 17}

In [103]:
unweighted_commpare(baseline_tree, genozip_tree)

{'symmetric_difference': 23,
 'false_positives_and_negatives': (12, 11),
 'disagree_count': 17}

In [74]:
from dendropy import Tree
from dendropy.calculate.treemeasure import B1, N_bar, patristic_distance

def unary_tree_statistics(newick_file):
    tree = Tree.get(path=newick_file, schema="newick")
    pdm = tree.phylogenetic_distance_matrix()
    
    return {
        "B1": B1(tree),
        "N_bar": N_bar(tree),
        "pdm": pdm.mean_pairwise_distance()
    }

In [75]:
unary_tree_statistics(baseline_tree)

{'B1': 8.162301587301588, 'N_bar': 6.894736842105263, 'pdm': 0.0}

In [76]:
unary_tree_statistics(fastgzip_tree)

{'B1': 10.176190476190476, 'N_bar': 5.7368421052631575, 'pdm': 0.0}

In [77]:
unary_tree_statistics(bzip3_tree)

{'B1': 8.801190476190474, 'N_bar': 5.052631578947368, 'pdm': 0.0}

In [78]:
unary_tree_statistics(genozip_tree)

{'B1': 9.001190476190475, 'N_bar': 6.0, 'pdm': 0.0}

---

In [90]:
import dendropy

def boot_split_distance(newick_file1, newick_file2):
    tns = TaxonNamespace()
    
    tree1 = Tree.get(path=newick_file1, schema="newick", taxon_namespace=tns)
    tree2 = Tree.get(path=newick_file2, schema="newick", taxon_namespace=tns)

    # Set the number of bootstrap replicates
    num_replicates = 100

    # Initialize dictionaries to store the bipartitions and their frequencies in each tree
    bs1_counts = {}
    bs2_counts = {}

    # Perform bootstrap resampling on both trees and count bipartition frequencies
    for i in range(num_replicates):
        # Clone the original trees
        bs_tree1 = tree1.clone(depth=1)
        bs_tree2 = tree2.clone(depth=1)

        # Randomly resolve any unresolved nodes
        bs_tree1.resolve_polytomies()
        bs_tree2.resolve_polytomies()

        # Encode the bipartitions of each bootstrap tree and count their frequencies
        bs_tree1.encode_bipartitions()
        bs_tree2.encode_bipartitions()

        for bipartition in bs_tree1.bipartition_encoding:
            bs1_counts[bipartition] = bs1_counts.get(bipartition, 0) + 1

        for bipartition in bs_tree2.bipartition_encoding:
            bs2_counts[bipartition] = bs2_counts.get(bipartition, 0) + 1

    # Calculate the number of shared and non-shared bipartitions
    shared_bipartitions = 0
    nonshared_bipartitions = 0
    for bipartition, freq1 in bs1_counts.items():
        freq2 = bs2_counts.get(bipartition, 0)
        if freq2 > 0:
            shared_bipartitions += min(freq1, freq2)
        nonshared_bipartitions += abs(freq1 - freq2)

    # Calculate the Boot-Split distance
    boot_split_distance = 1 - (shared_bipartitions / (shared_bipartitions + nonshared_bipartitions))

    # Print the result
    print('Boot-Split distance:', boot_split_distance)

In [91]:
boot_split_distance(baseline_tree, baseline_tree)

Boot-Split distance: 0.0


In [92]:
boot_split_distance(baseline_tree, fastgzip_tree)

Boot-Split distance: 0.4444444444444444


In [93]:
boot_split_distance(baseline_tree, bzip3_tree)

Boot-Split distance: 0.33333333333333337


In [94]:
boot_split_distance(baseline_tree, genozip_tree)

Boot-Split distance: 0.3055555555555556
