In [None]:
class cd:
    """
    Temporarily switch the process working directory.

    Intended for use in a ``with`` statement: on entry the current directory
    is remembered and the process moves to ``newPath``; on exit (normal or
    via an exception) the remembered directory is restored.

    Attributes:
        newPath:   target directory, with a leading ``~`` already expanded.
        savedPath: directory that was current when the block was entered
                   (only set once ``__enter__`` has run).
    """
    def __init__(self, newPath):
        # "~" is resolved once, when the object is built.
        target = os.path.expanduser(newPath)
        self.newPath = target

    def __enter__(self):
        # Remember where we are *before* moving, so __exit__ can go back.
        # Nothing is returned, so ``with cd(p) as x`` binds ``x`` to None.
        origin = os.getcwd()
        self.savedPath = origin
        os.chdir(self.newPath)

    def __exit__(self, exc_type, exc_value, exc_traceback):
        # Implicitly returns None: exceptions raised inside the block are
        # not suppressed.
        os.chdir(self.savedPath)

In [None]:
def merge_polyphyletic_clades(clades, udag):
    """
    Repeatedly collapse pairs of nearby clades into their shared ancestor.

    A pair of clades is a merge candidate when either
      * the number of vertices strictly between them on their shortest path
        is at most 1% (rounded up) of the non-leaf vertices of ``udag``, or
      * one clade lies on the other's shortest path to vertex 0.
    For a candidate pair, the vertices shared by both paths to vertex 0 are
    collected and the one with the largest index is taken as the common
    ancestor. If that is vertex 0 itself the pair is skipped; otherwise both
    clades are replaced by it and the scan restarts. The process stops when a
    full scan finds nothing to merge.

    Args:
        clades: set of vertex identifiers usable as ``udag.vs[...]`` keys
            (presumably integer vertex indices, since they are compared with
            the index lists returned by ``get_shortest_paths`` -- TODO
            confirm with callers). The input is deep-copied, not modified.
        udag: igraph graph whose vertices carry an ``is_leaf`` attribute.
            Vertex 0 is used as the reference (root) vertex.

    Returns:
        A new set holding the clades that remain after merging.
    """
    clades = deepcopy(clades)

    # Distance tolerance: 1% of the internal (non-leaf) vertices, rounded up.
    merge_clades_threshold = ceil(len(udag.vs.select(is_leaf=False)) * 0.01)

    root = udag.vs[0]
    root_paths = {}  # clade -> vertex indices on its shortest path to vertex 0

    def path_to_root(clade):
        # The graph is never modified here, so each path is computed once
        # and reused across pairs and across rescans.
        if clade not in root_paths:
            root_paths[clade] = udag.vs[clade].get_shortest_paths(root)[0]
        return root_paths[clade]

    while True:
        should_merge = False
        for clade1, clade2 in itertools.combinations(clades, 2):
            node1_ancestors = path_to_root(clade1)
            node2_ancestors = path_to_root(clade2)

            # Hop count minus one = vertices strictly between the two clades.
            bipartitions_in_between = \
                udag.vs[clade1].shortest_paths(udag.vs[clade2])[0][0] - 1

            if bipartitions_in_between <= merge_clades_threshold or \
               clade1 in node2_ancestors or \
               clade2 in node1_ancestors:

                common_ancestors = set(node1_ancestors).intersection(node2_ancestors)
                # Largest shared index is treated as the most recent common
                # ancestor (presumably indices grow away from vertex 0).
                lca = sorted(common_ancestors)[-1]
                if lca != 0:
                    should_merge = True
                    break
                # lca == 0: merging would collapse into the root; try next pair.

        if not should_merge:
            break

        # clade1 / clade2 / lca still hold the pair that triggered the break.
        clades.remove(clade1)
        clades.remove(clade2)
        clades.add(lca)

    return clades

In [None]:
def get_leaf_names(node):
    """
    Collect the ``name`` of every leaf reachable from ``node``.

    ``node`` must support ``node['is_leaf']``, ``node['name']`` and
    ``node.successors()``. If ``node`` itself is flagged as a leaf its own
    name comes first. Successors flagged as leaves contribute their name
    directly (their own successors are not visited); all other successors
    are expanded recursively, depth-first, in successor order.

    Returns:
        list of leaf names.
    """
    collected = [node['name']] if node['is_leaf'] else []

    for child in node.successors():
        if not child['is_leaf']:
            # Internal vertex: descend and splice in whatever it yields.
            collected += get_leaf_names(child)
            continue
        collected.append(child['name'])

    return collected

In [None]:
def get_descendant_names(node, leaves=False):
    """
    List the names of the vertices below ``node`` in depth-first pre-order.

    Args:
        node: vertex supporting ``successors()`` whose successors support
            ``['is_leaf']`` and ``['name']``.
        leaves: when true, successors flagged as leaves are included in the
            result; when false (default) only non-leaf vertices are listed.

    Returns:
        list of names; ``node`` itself is never included. Successors flagged
        as leaves are never expanded further.
    """
    names = []

    for child in node.successors():
        internal = not child['is_leaf']

        if internal or leaves:
            names.append(child['name'])
        if internal:
            # Only non-leaf vertices are descended into.
            names.extend(get_descendant_names(child, leaves))

    return names

In [None]:
def get_descendant_indices(node, leaves=False):
    """
    List the ``.index`` of the vertices below ``node`` in depth-first pre-order.

    Args:
        node: vertex supporting ``successors()`` whose successors support
            ``['is_leaf']`` and expose an ``index`` attribute.
        leaves: when true, successors flagged as leaves are included; when
            false (default) only non-leaf vertices are reported.

    Returns:
        list of vertex indices; ``node`` itself is never included.
    """
    indices = []

    for child in node.successors():
        if child['is_leaf'] and not leaves:
            # Leaves were not requested: skip without recording.
            continue

        indices.append(child.index)

        if not child['is_leaf']:
            # Non-leaf vertex: its own descendants follow it immediately.
            indices += get_descendant_indices(child, leaves)

    return indices

In [None]:
def tree_to_dag(tree):
    """
    Convert a tree into a directed igraph graph with parent -> child edges.

    Side effect: every non-leaf node of ``tree`` is renamed in place to
    ``'node_<i>'``, where ``<i>`` is its position in ``tree.traverse()``.

    Args:
        tree: tree object exposing ``traverse()``, and whose nodes expose
            ``is_leaf()``, ``get_children()``, ``name``, ``dist`` and
            ``support`` (the ete3 node interface is assumed).

    Returns:
        ``ig.Graph`` (directed) whose vertices are named after the tree
        nodes. Each edge carries ``weight`` (the child's ``dist``) and
        ``support`` (the child's ``support``); each vertex carries a boolean
        ``is_leaf`` attribute.
    """
    # Pass 1: give every internal node a unique name. This has to finish
    # before edges are collected, because an edge refers to its child's
    # (possibly internal) name.
    for count, node in enumerate(tree.traverse()):
        if not node.is_leaf():
            node.name = 'node_%i' % count

    # Pass 2: one (parent, child, length, support) tuple per branch.
    edges = [
        (node.name, child.name, child.dist, child.support)
        for node in tree.traverse()
        if not node.is_leaf()
        for child in node.get_children()
    ]

    dag = ig.Graph.TupleList(edges=tuple(edges),
                             directed=True,
                             edge_attrs=['weight', 'support'])

    # A vertex is a leaf exactly when it has no outgoing edge. Using the
    # topology instead of the 'node_' name prefix keeps genuine leaves whose
    # label happens to start with 'node_' from being flagged as internal.
    dag.vs['is_leaf'] = [degree == 0 for degree in dag.outdegree()]
    return dag

In [None]:
def get_pairwise_distances_from_dag(dag, leaf_names):
    """
    Build a leaf-by-leaf matrix combining path length and hop count.

    Two all-pairs queries are issued through ``dag.shortest_paths``:
    an unweighted one (hop counts) and one weighted by the ``'weight'``
    edge attribute. Each reported value is

        weighted_distance ** (hop_count - 1)

    except on the diagonal, where the exponent is forced to 0 (so a diagonal
    entry evaluates to 1 whenever the weighted self-distance is finite).

    Args:
        dag: graph object exposing ``shortest_paths(source, target, weights)``.
        leaf_names: vertex identifiers used both as sources and targets and
            as the row/column labels of the result.

    Returns:
        ``pandas.DataFrame`` indexed and labelled by ``leaf_names``.
    """
    endpoints = dict(source=leaf_names, target=leaf_names)

    # Hop count minus one = number of vertices strictly between two leaves.
    hops_between = np.array(dag.shortest_paths(weights=None, **endpoints)) - 1
    branch_sums = np.array(dag.shortest_paths(weights='weight', **endpoints))

    # The subtraction above leaves -1 on the diagonal; reset it to 0.
    np.fill_diagonal(hops_between, 0.0)

    return pd.DataFrame(data=branch_sums ** hops_between,
                        index=leaf_names,
                        columns=leaf_names)

In [None]:
def create_taxa_graph(dist_matrix):
    """
    Turn a square distance matrix into an undirected, weighted igraph graph.

    Every unordered pair of distinct labels (strict upper triangle of
    ``dist_matrix``) becomes one edge whose ``weight`` is ``e ** (-distance)``,
    so smaller distances produce heavier edges.

    Args:
        dist_matrix: ``pandas.DataFrame`` whose index labels name the
            vertices (assumed square, with rows and columns in the same
            order -- the column labels are not consulted).

    Returns:
        ``ig.Graph`` (undirected) with a ``weight`` edge attribute.
    """
    # k=1 excludes the diagonal, so no self-loops are produced.
    rows, cols = np.triu_indices_from(dist_matrix, k=1)
    labels = dist_matrix.index

    pairs = pd.DataFrame({
        'sequence1': labels[rows],
        'sequence2': labels[cols],
        'distance':  dist_matrix.values[rows, cols],
    })
    pairs['inverse_dist'] = np.e**np.negative(pairs.distance)

    weighted_edges = pairs[['sequence1',
                            'sequence2',
                            'inverse_dist']].itertuples(index=False)

    return ig.Graph.TupleList(edges=weighted_edges,
                              directed=False,
                              weights=True)

In [3]:
def match_rooting(reference_root, tree_to_root):
    """
    Root a copy of ``tree_to_root`` the way ``reference_root`` is rooted.

    Only the smallest child of ``reference_root`` (by ``len``; the first one
    on ties) is used as the outgroup definition:

    * if that child is a leaf, the leaf with the same name in the copy
      becomes the outgroup;
    * otherwise the copy is asked whether the child's leaf set is
      monophyletic. If it is, the common ancestor of those leaves becomes the
      outgroup. If it is not, the copy is first re-rooted on one of the
      leaves reported by ``check_monophyly`` as breaking monophyly, and then
      on the common ancestor of the child's leaf set.

    Args:
        reference_root: root node of the already-rooted reference tree.
        tree_to_root: tree to be re-rooted; it is copied, not modified.

    Returns:
        The re-rooted copy, or ``None`` when ``reference_root`` has no
        children.
    """
    rooted_copy = tree_to_root.copy()

    candidates = sorted(reference_root.children, key=len)
    if not candidates:
        return None
    smallest = candidates[0]

    if smallest.is_leaf():
        matching_leaf = rooted_copy.get_leaves_by_name(smallest.name)[0]
        rooted_copy.set_outgroup(matching_leaf)
        return rooted_copy

    monophyletic, _clade_type, offending_leaves = rooted_copy.check_monophyly(
        smallest.get_leaf_names(),
        'name',
        unrooted=False
    )
    if not monophyletic:
        # Move the root onto an offending leaf first, then look up the
        # common ancestor of the target leaves under that new rooting.
        rooted_copy.set_outgroup(offending_leaves.pop())

    outgroup = rooted_copy.get_common_ancestor(smallest.get_leaf_names())
    rooted_copy.set_outgroup(outgroup)
    return rooted_copy

In [25]:
def get_pairwise_distances(group_id):
    """
    Return the leaf-to-leaf branch-length distance matrix of a stored tree.

    The tree is parsed with ``ete3.Tree`` from the ``'tree'`` column of the
    module-level ``working_trees`` table (row ``group_id``; presumably a
    Newick string -- confirm against where ``working_trees`` is built). The
    actual distance computation is shared with
    ``get_pairwise_distances_from_tree`` rather than duplicated here.

    Args:
        group_id: row label in ``working_trees``.

    Returns:
        ``pandas.DataFrame`` indexed and labelled by leaf names, holding the
        weighted shortest-path distance between every pair of leaves.
    """
    tree = ete3.Tree(working_trees.loc[group_id, 'tree'])
    return get_pairwise_distances_from_tree(tree)

In [25]:
def get_pairwise_distances_from_tree(tree):
    """
    Compute the leaf-to-leaf branch-length distance matrix of ``tree``.

    The tree is re-expressed as an undirected igraph graph (one edge per
    branch, weighted by the child's ``dist``) and all leaf-pair distances
    are read off as weighted shortest paths.

    Side effect: every non-leaf node of ``tree`` is renamed in place to
    ``'node_<i>'``, ``<i>`` being its position in ``tree.traverse()``.

    Args:
        tree: tree exposing ``traverse()``, with nodes exposing
            ``is_leaf()``, ``get_children()``, ``name`` and ``dist``.

    Returns:
        ``pandas.DataFrame`` indexed and labelled by leaf names (traversal
        order).
    """
    # Label internal nodes and remember the leaves, in traversal order.
    tip_labels = []
    for position, node in enumerate(tree.traverse()):
        if not node.is_leaf():
            node.name = 'node_%i' % position
            continue
        tip_labels.append(node.name)
    tip_labels = np.array(tip_labels)

    # One (parent, child, length) record per branch. This second traversal
    # must come after the renaming so that internal children carry their
    # final names.
    records = [
        (parent.name, child.name, child.dist)
        for parent in tree.traverse()
        if not parent.is_leaf()
        for child in parent.get_children()
    ]
    edge_table = pd.DataFrame(records, columns=['node', 'child', 'branch_length'])

    graph = ig.Graph.TupleList(edges=edge_table[['node',
                                                 'child',
                                                 'branch_length']].itertuples(index=False),
                               directed=False,
                               weights=True)

    distances = np.array(graph.shortest_paths(source=tip_labels,
                                              target=tip_labels,
                                              weights='weight'))

    return pd.DataFrame(index=tip_labels,
                        columns=tip_labels,
                        data=distances)

In [34]:
def extract_cluster(clusterID):
    """
    Write the sequences of one tree-derived cluster to a FASTA file.

    Steps:
      1. Split ``clusterID`` (``'<group_id>#<cluster_num>'``).
      2. Build the leaf distance matrix for ``group_id``, convert it to a
         weighted graph and partition it with ``community_multilevel``
         (``random.seed(12345)`` is set right before, presumably so the
         partition is reproducible).
      3. Make sure ``alignments/<group_id>`` exists: if not, it is fetched
         with ``curl`` as ``.gz`` and unpacked with ``gzip -d``; when gzip
         exits non-zero the downloaded file is copied to the plain name
         instead (presumably because the payload was not compressed).
      4. Copy every FASTA record whose first header token belongs to the
         requested cluster into
         ``alignments/<group_id>-cluster<cluster_num>.faa``.

    Args:
        clusterID: string of the form ``'<group_id>#<cluster_num>'``.

    Returns:
        ``True`` when the records were written; ``False`` when the alignment
        file does not start with ``'>'`` (the output file has already been
        created/truncated at that point and is left empty).
    """
    group_id, cluster_num = clusterID.split('#')
    dist_matrix = get_pairwise_distances(group_id)

    # Leaf labels are expected to look like '<taxid>.<locus>'.
    taxids = [int(leaf.split('.')[0]) for leaf in dist_matrix.index]
    phyla  = eggNOG_taxonomy.loc[taxids, 'phylum'].values.astype(int)

    graph  = create_taxa_graph(dist_matrix)

    random.seed(12345)
    clusters = graph.community_multilevel(weights='weight')

    node_data = pd.DataFrame(columns=['name', 'phylum', 'cluster'],
                             data   =zip(dist_matrix.index,
                                         phyla,
                                         clusters.membership)
                            )

    # A set gives constant-time lookups in the per-record loop below
    # (testing membership against the raw array rescans it every time).
    cluster_seqs = set(node_data.loc[node_data.cluster==int(cluster_num), 'name'].values)

    alignment_path = 'alignments/%s' % group_id
    compressed_path = 'alignments/%s.gz' % group_id

    if not os.path.isfile(alignment_path):
        subprocess.call(['curl',
                         'http://eggnogapi5.embl.de/nog_data/text/raw_alg/%s' % group_id,
                         '--output', compressed_path])

        # Non-zero exit from gzip: fall back to using the download as-is.
        if subprocess.call(['gzip',
                            '-d',
                            compressed_path]):
            subprocess.call(['cp',
                             compressed_path,
                             alignment_path])

    with open(alignment_path) as fasta_handle,\
         open('alignments/%s-cluster%s.faa' % (group_id, cluster_num), 'w') as out:

        fasta = fasta_handle.read()
        if not fasta.startswith('>'):
            return False

        for entry in fasta.split('>'):
            # First whitespace-delimited token of the record is its name;
            # empty or whitespace-only chunks yield no token and are skipped.
            header = entry.split(None, 1)
            if header and header[0] in cluster_seqs:
                out.write('>%s' % entry)

    return True

In [2]:
def prepare_candidate_for_tree(group_cluster):
    """
    Extract a cluster's sequences and align them with MAFFT.

    Runs ``extract_cluster`` and, if that succeeds, aligns
    ``alignments/<group_id>-cluster<cluster_num>.faa`` with
    ``mafft --auto --reorder``, capturing MAFFT's standard output in
    ``alignments/<group_id>-cluster<cluster_num>.aln``.

    Args:
        group_cluster: string of the form ``'<group_id>#<cluster_num>'``.

    Returns:
        ``True`` when extraction succeeded and MAFFT exited with status 0;
        ``False`` otherwise, so callers do not build trees from a missing or
        truncated alignment.
    """
    if not extract_cluster(group_cluster):
        return False

    group_id, cluster_num = group_cluster.split('#')

    with open('alignments/%s-cluster%s.aln' % (group_id, cluster_num), 'w') as out:
        mafft_status = subprocess.call(
            ['/cm/shared/engaging/mafft/7.245-with-extensions/bin/mafft',
             '--auto',
             '--reorder',
             'alignments/%s-cluster%s.faa' % (group_id, cluster_num)],
            stdout=out)

    # subprocess.call returns the child's exit code; 0 means success.
    return mafft_status == 0

In [2]:
def reconstruct_candidate_fastTree(group_cluster):
    """
    Build a FastTree phylogeny for a cluster and root it with ``mad.py``.

    Pipeline: ``prepare_candidate_for_tree`` (extraction + alignment), then
    FastTree (``-wag -gamma``) writing
    ``trees/<group_id>-cluster<cluster_num>.fastTree``, then the ``mad.py``
    script invoked on that tree file with ``-t``.

    Args:
        group_cluster: string of the form ``'<group_id>#<cluster_num>'``.

    Returns:
        ``False`` when the alignment could not be prepared or FastTree exited
        with a non-zero status (rooting is skipped in both cases); ``True``
        once the rooting script has been invoked. The rooting script's own
        exit status is not inspected.
    """
    if not prepare_candidate_for_tree(group_cluster):
        return False

    group_id, cluster_num = group_cluster.split('#')
    tree_path = 'trees/%s-cluster%s.fastTree' % (group_id, cluster_num)

    fasttree_status = subprocess.call(
        ['/cm/shared/engaging/FastTree/2.1.8/bin/FastTree',
         '-out', tree_path,
         '-quiet',
         '-wag',
         '-gamma',
         'alignments/%s-cluster%s.aln' % (group_id, cluster_num)])
    if fasttree_status != 0:
        # No (fresh) tree was produced, so there is nothing valid to root.
        return False

    subprocess.call(['/home/thiberio/.conda/envs/py37/bin/python3',
                     '/nobackup1b/users/thiberio/mad.py',
                     tree_path,
                     '-t'])
    return True

In [2]:
def reconstruct_candidate(group_cluster):
    """
    Build an IQ-TREE phylogeny for a cluster and root it with ``mad.py``.

    Pipeline: ``prepare_candidate_for_tree`` (cluster extraction + MAFFT
    alignment, the same commands this function used to run inline), then
    IQ-TREE (``-m LG+G -nt 2 -bb 1000 -safe``) with output prefix
    ``trees/<group_id>-cluster<cluster_num>``, then the ``mad.py`` script
    invoked on the resulting ``.treefile`` with ``-t``.

    Args:
        group_cluster: string of the form ``'<group_id>#<cluster_num>'``.

    Returns:
        ``False`` when the alignment could not be prepared (no tree is
        attempted); ``True`` once IQ-TREE and the rooting script have been
        invoked. Their exit statuses are not inspected.
    """
    # Stop early instead of feeding an empty/missing FASTA to the aligner
    # and tree builder.
    if not prepare_candidate_for_tree(group_cluster):
        return False

    group_id, cluster_num = group_cluster.split('#')

    subprocess.call(['/cm/shared/engaging/iqtree/1.6.3/bin/iqtree',
                     '-m', 'LG+G',
                     '-nt', '2',
                     '-s', 'alignments/%s-cluster%s.aln' % (group_id, cluster_num),
                     '-pre', 'trees/%s-cluster%s' % (group_id, cluster_num),
                     '-bb', '1000',
                     '-quiet',
                     '-safe'])

    subprocess.call(['/home/thiberio/.conda/envs/py37/bin/python3',
                     '/nobackup1b/users/thiberio/mad.py',
                     'trees/%s-cluster%s.treefile' % (group_id, cluster_num),
                     '-t'])
    return True

In [45]:
# def visualize_reconstruct_candidate(group_cluster):

#     group_id, cluster_num = group_cluster.split('#')
    
#     with cd('trees'):
#         tree   = match_rooting(ete3.Tree('%s-cluster%s.treefile.rooted' % (group_id, cluster_num)),
#                                ete3.Tree('%s-cluster%s.treefile'        % (group_id, cluster_num)))

#     out  = open('%s-cluster%s.figTree' % (group_id, cluster_num), 'w')
#     out.write("#NEXUS\nbegin taxa;\n\tdimensions ntax=%i;\n\ttaxlabels\n" %len(tree))

#     for node in tree.traverse():
#         if node.is_leaf():
#             taxid, locus_tag = node.name.split('.')
#             try:
#                 lineage = {j: i for i, j in ncbi.get_rank(ncbi.get_lineage(taxid)).items()}
#             except ValueError:
#                 out.write('\t%s\n' %(node.name))
#                 continue
#             else:
#                 lineage_names = ncbi.get_taxid_translator(lineage.values())

#             out.write('\t%s ' %(node.name))
#             comment = []
#             for rank in ['class', 'phylum', 'order', 'family', 'species']:
#                 if rank in lineage:
#                     comment.append('tax_%s="%s"' %(rank, lineage_names[lineage[rank]]))
#             out.write('[&%s]\n' %' '.join(comment))

#     newick_text = tree.write(format=0)
#     out.write(';\nend;\n')
#     out.write('begin trees;\n\ttree tree_1 = [&R] %s\nend;' %newick_text)
#     out.close()

In [1]:
def visualize_reconstructed_candidate(tree):
    """
    Render ``tree`` as NEXUS text with taxonomy annotations on the leaves.

    The ``taxa`` block lists every leaf. The part of a leaf name before the
    first ``'.'`` is looked up through the module-level ``ncbi`` object; when
    the lookup succeeds the label is followed by a ``[&...]`` comment holding
    ``tax_<rank>="<name>"`` entries for class, phylum, order, family and
    species (those that exist in the lineage). Leaves whose phylum is
    ``Cyanobacteria`` or ``Chlorobi`` additionally receive a ``!color``
    entry. When the lineage lookup raises ``ValueError`` the bare label is
    written instead.

    The ``trees`` block contains the tree written with ``format=1``, where
    each internal node label is replaced by
    ``[&node_name=<original name>,support=<support, 2 decimals>]``.

    Side effect: internal nodes of ``tree`` are renamed in place to
    ``'node_<k>_'`` placeholders and are not restored.

    Args:
        tree: tree exposing ``len()``, ``traverse()`` and
            ``write(format=...)``, with nodes exposing ``is_leaf()``,
            ``name`` and ``support``.

    Returns:
        The NEXUS document as a string.
    """
    pieces = ["#NEXUS\nbegin taxa;\n\tdimensions ntax=%i;\n\ttaxlabels\n" % len(tree)]

    count               = 0
    internal_node_names = {}
    for node in tree.traverse():
        if not node.is_leaf():
            # Swap in a unique placeholder now; it is substituted by the
            # annotated label after the Newick string has been produced.
            placeholder = 'node_%i_' % count
            internal_node_names[placeholder] = \
                '[&node_name=%s,support=%.2f]' % (node.name, node.support)
            node.name = placeholder
            count += 1
            continue

        # Only the prefix before the first dot is the taxid; the remainder
        # may itself contain dots, so never unpack a full split.
        taxid = node.name.split('.', 1)[0]
        try:
            lineage = {rank: lineage_taxid
                       for lineage_taxid, rank
                       in ncbi.get_rank(ncbi.get_lineage(taxid)).items()}
        except ValueError:
            # Unknown taxid: emit the label without annotations.
            pieces.append('\t%s\n' % (node.name))
            continue

        lineage_names = ncbi.get_taxid_translator(lineage.values())

        comment = ['tax_%s="%s"' % (rank, lineage_names[lineage[rank]])
                   for rank in ['class', 'phylum', 'order', 'family', 'species']
                   if rank in lineage]
        if 'tax_phylum="Cyanobacteria"' in comment:
            comment.append('!color=#00ff00')
        elif 'tax_phylum="Chlorobi"' in comment:
            comment.append('!color=#ff0000')

        pieces.append('\t%s ' % (node.name))
        pieces.append('[&%s]\n' % ' '.join(comment))

    newick_text = tree.write(format=1)
    for tmp_name, full_name in internal_node_names.items():
        newick_text = newick_text.replace(tmp_name, full_name)

    pieces.append(';\nend;\n')
    pieces.append('begin trees;\n\ttree tree_1 = [&R] %s\nend;' % newick_text)

    return ''.join(pieces)

In [45]:
def visualize_reconstruct_candidate_fastTree(group_cluster):
    """
    Write a taxonomy-annotated NEXUS file for a cluster's FastTree tree.

    Inside the ``trees`` directory, ``<group_id>-cluster<n>.fastTree`` is
    re-rooted (via ``match_rooting``) to agree with
    ``<group_id>-cluster<n>.fastTree.rooted``. The result is written to
    ``<group_id>-cluster<n>.fastFigTree``.

    NOTE(review): the output file is opened after the ``cd('trees')`` block
    has been left, so it lands in the directory that was current when this
    function was called, not in ``trees`` -- confirm this is intended.

    Each leaf is listed in the ``taxa`` block; the part of its name before
    the first ``'.'`` is looked up through the module-level ``ncbi`` object
    and, on success, ``tax_<rank>="<name>"`` entries for class, phylum,
    order, family and species are attached. Leaves whose lookup raises
    ``ValueError`` are listed without annotations.

    Args:
        group_cluster: string of the form ``'<group_id>#<cluster_num>'``.

    Returns:
        None.
    """
    group_id, cluster_num = group_cluster.split('#')

    with cd('trees'):
        tree   = match_rooting(ete3.Tree('%s-cluster%s.fastTree.rooted' % (group_id, cluster_num)),
                               ete3.Tree('%s-cluster%s.fastTree'        % (group_id, cluster_num)))

    # `with` guarantees the handle is closed even if a lookup or write fails.
    with open('%s-cluster%s.fastFigTree' % (group_id, cluster_num), 'w') as out:
        out.write("#NEXUS\nbegin taxa;\n\tdimensions ntax=%i;\n\ttaxlabels\n" %len(tree))

        for node in tree.traverse():
            if not node.is_leaf():
                continue

            # Only the prefix before the first dot is the taxid; the rest of
            # the label may contain further dots.
            taxid = node.name.split('.', 1)[0]
            try:
                lineage = {j: i for i, j in ncbi.get_rank(ncbi.get_lineage(taxid)).items()}
            except ValueError:
                out.write('\t%s\n' %(node.name))
                continue

            lineage_names = ncbi.get_taxid_translator(lineage.values())

            out.write('\t%s ' %(node.name))
            comment = []
            for rank in ['class', 'phylum', 'order', 'family', 'species']:
                if rank in lineage:
                    comment.append('tax_%s="%s"' %(rank, lineage_names[lineage[rank]]))
            out.write('[&%s]\n' %' '.join(comment))

        newick_text = tree.write(format=0)
        out.write(';\nend;\n')
        out.write('begin trees;\n\ttree tree_1 = [&R] %s\nend;' %newick_text)

In [45]:
def visualize_candidates(group_cluster):
    """
    Extract, align and tree a cluster, then write an annotated NEXUS file.

    Pipeline: ``extract_cluster`` -> MAFFT (``--auto --reorder``) ->
    FastTree (``-gamma -wag``) writing
    ``alignments/<group_id>-cluster<n>.tree`` -> NEXUS output in
    ``chlorobi_to_cyano/<group_id>-cluster<n>.fastFigTree``.

    Each leaf is listed in the ``taxa`` block; the part of its name before
    the first ``'.'`` is looked up through the module-level ``ncbi`` object
    and, on success, ``tax_<rank>="<name>"`` entries for class, phylum,
    order, family and species are attached. Leaves whose lookup raises
    ``ValueError`` are listed without annotations.

    NOTE(review): the result of ``extract_cluster`` and the exit statuses of
    the external tools are not checked here; a failure surfaces later, when
    the tree file is parsed.

    Args:
        group_cluster: string of the form ``'<group_id>#<cluster_num>'``.

    Returns:
        None.
    """
    extract_cluster(group_cluster)

    group_id, cluster_num = group_cluster.split('#')

    with open('alignments/%s-cluster%s.aln' % (group_id, cluster_num), 'w') as out:
        subprocess.call(['/cm/shared/engaging/mafft/7.245-with-extensions/bin/mafft',
                         '--auto',
                         '--reorder',
                         'alignments/%s-cluster%s.faa' % (group_id, cluster_num)],
                        stdout=out)

    subprocess.call(['/cm/shared/engaging/FastTree/2.1.8/bin/FastTree',
                     '-gamma',
                     '-wag',
                     '-out', 'alignments/%s-cluster%s.tree' % (group_id, cluster_num),
                     'alignments/%s-cluster%s.aln' % (group_id, cluster_num)])

    tree = ete3.Tree('alignments/%s-cluster%s.tree' % (group_id, cluster_num), format=0)

    # `with` guarantees the handle is closed even if a lookup or write fails.
    with open('chlorobi_to_cyano/%s-cluster%s.fastFigTree' % (group_id, cluster_num), 'w') as out:
        out.write("#NEXUS\nbegin taxa;\n\tdimensions ntax=%i;\n\ttaxlabels\n" %len(tree))

        for node in tree.traverse():
            if not node.is_leaf():
                continue

            # Only the prefix before the first dot is the taxid; the rest of
            # the label may contain further dots.
            taxid = node.name.split('.', 1)[0]
            try:
                lineage = {j: i for i, j in ncbi.get_rank(ncbi.get_lineage(taxid)).items()}
            except ValueError:
                out.write('\t%s\n' %(node.name))
                continue

            lineage_names = ncbi.get_taxid_translator(lineage.values())

            out.write('\t%s ' %(node.name))
            comment = []
            for rank in ['class', 'phylum', 'order', 'family', 'species']:
                if rank in lineage:
                    comment.append('tax_%s="%s"' %(rank, lineage_names[lineage[rank]]))
            out.write('[&%s]\n' %' '.join(comment))

        newick_text = tree.write(format=0)
        out.write(';\nend;\n')
        out.write('begin trees;\n\ttree tree_1 = [&R] %s\nend;' %newick_text)