# Parse trees, adding information from the `bactaxR` clusters and any other table(s) to the leaf names

The aim is to add useful information to the tip labels (leaf names) of a tree. Here, we add the cluster assignments according to `bactaxR` and the taxonomic classification according to `GTDB-tk`.

In [4]:
import pandas as pd
import numpy as np
from pathlib import Path
import re
from collections import defaultdict
from ete3 import Tree

## Read a table with relevant information

This table contains the species classification according to `GTDB-tk`

In [10]:
# Tab-separated table holding, for every genome, the QC metrics and the
# `GTDB-tk` lineage string (column `classification`)
table = '/run/media/mibu/LR-orico_mini/Recovery_ADATA/Umayor/Work/Akker/Verrucomicrobia/QC/merged_akk_hand_edited.tsv'
merged_df = pd.read_csv(table, sep='\t')
# New column with the GTDB species classification: the part of the lineage
# that starts at the species rank (the `;` separator before it is dropped).
# With a single capture group, `expand=False` returns the match as a Series.
merged_df['GTDB_specie'] = merged_df['classification'].str.extract(
    ';(s_.+)', expand=False
)
# Quick look at the result: column summary and the first two rows
merged_df.info()
merged_df.head(2)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 201 entries, 0 to 200
Data columns (total 15 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   Genome                       201 non-null    object 
 1   Completeness                 201 non-null    float64
 2   Contamination                201 non-null    float64
 3   classification               201 non-null    object 
 4   classification_method        201 non-null    object 
 5   sum_len                      201 non-null    int64  
 6   N50                          201 non-null    int64  
 7   16S ribosomal RNA > 1400 nt  151 non-null    float64
 8   Total_tRNAs                  201 non-null    int64  
 9   host                         198 non-null    object 
 10  NCBI_organism_name           147 non-null    object 
 11  NCBI_infraspecific_name      140 non-null    object 
 12  relation_to_type_material    4 non-null      object 
 13  type_of_sample      

Unnamed: 0,Genome,Completeness,Contamination,classification,classification_method,sum_len,N50,16S ribosomal RNA > 1400 nt,Total_tRNAs,host,NCBI_organism_name,NCBI_infraspecific_name,relation_to_type_material,type_of_sample,GTDB_specie
0,DGYMR06203__metabat2_low_PE.047.contigs,91.12,0.0,d__Bacteria;p__Verrucomicrobiota;c__Verrucomic...,taxonomic classification defined by topology a...,2417897,26278,,44,gallus gallus,,,,metagenome,s__Akkermansia sp900548895
1,F157a_European_Toad__metabat2_high_PE.005.contigs,89.32,2.3,d__Bacteria;p__Verrucomicrobiota;c__Verrucomic...,taxonomic classification defined by topology a...,2029501,8457,,40,bufo bufo,,,,metagenome,s__


## Create the dictionary to add the species name

In [9]:
# Map every assembly name (column `Genome`) to its GTDB species label
# (column `GTDB_specie`).
# A plain dict is used on purpose: `defaultdict()` without a default factory
# behaves exactly like a dict (missing keys still raise KeyError), so it only
# hinted at a fallback that never existed.
# zip() over the two columns avoids building one Series per row, which is
# what `iterrows()` does. If a genome appears more than once, the last row
# wins, as before.
asm_to_sp = dict(zip(merged_df['Genome'], merged_df['GTDB_specie']))
len(asm_to_sp)

201

## Parse the trees
We have more than one `bactaxR` table, each one created with a particular ANI threshold. For each of those tables, we create a parsed tree.

In [114]:
# Root path where the `bactaxR` tables are located
bac_path = Path("FastANI/bactaxR_related/")
# Our tables. It's important that the names of the tables contain the ANI
# threshold used to create them, because we use it in the output name (parsed tree)
cluster_f = ['g_akk_clusters_96.5.tsv', 'g_akk_clusters_97.tsv']

# Location of the OrthoFinder species tree. It does not depend on the table
# being processed, so the paths are built once, outside the loop.
root = Path("/Species_Tree")
tree_f = root.joinpath('SpeciesTree_rooted_node_labels.txt')

# ANI threshold embedded in the table name, e.g. '96.5' in
# 'g_akk_clusters_96.5.tsv'. Change it if you have another pattern in the
# input files' names.
ani_pattern = re.compile(r'_([0-9]{2}\.?[0-9]?)\.tsv')

for f in cluster_f:
    # Get the ANI threshold first, so that a badly named table fails right
    # away with a clear message instead of an AttributeError on `None`
    ani_match = ani_pattern.search(f)
    if ani_match is None:
        raise ValueError(
            f"Cannot extract the ANI threshold from the table name {f!r}; "
            "expected something like 'name_96.5.tsv'"
        )
    suf = ani_match.group(1)

    # Read the table with cluster assignments from `bactaxR`
    clusters = pd.read_csv(bac_path.joinpath(f), sep='\t')
    # Mapping from species (assembly name) to cluster assignment
    sp_to_clust = dict(zip(clusters['Genome'], clusters['Cluster']))
    print(len(sp_to_clust))

    # Read the OrthoFinder tree again for every table: the leaf names are
    # edited in place below, so each table needs a fresh copy
    tree = Tree(tree_f.as_posix(), format=1)

    for node in tree.traverse():
        # Only leaves whose name is in asm_to_sp are renamed; sometimes the
        # table doesn't have all the information, and those leaves keep
        # their original name
        if not node.is_leaf() or node.name not in asm_to_sp:
            continue
        # A genome may also be missing from the cluster table; label it 'NA'
        # instead of aborting the whole run with a KeyError
        cluster = sp_to_clust.get(node.name, 'NA')
        # Build the new leaf name from the previous 2 dictionaries
        node.name = f'{asm_to_sp[node.name]} | {cluster} | {node.name}'

    # The output tree, one for each bactaxR table
    tree_outf = root.joinpath(f'species_tree_{suf}.nwk')
    tree.write(format=1, outfile=tree_outf.as_posix())

# Print the last tree
print(tree)

201
201

   /-GCF_000172155.1_ASM17215v1_genomic
  |
  |               /-s__Akkermansia sp900545155 | 1 | GCA_905197035.1_ERR2013654-mag-bin.36_genomic
  |            /-|
  |           |   \-s__Akkermansia sp900545155 | 1 | GCA_900545155.1_UMGS750_genomic
  |         /-|
  |        |  |   /-s__Akkermansia sp900545155 | 2 | GCA_900765745.1_SRS294966_17_genomic
  |      /-|   \-|
  |     |  |      \-s__Akkermansia sp900545155 | 3 | MGYG-HGUT-02584
  |     |  |
  |     |   \-s__ | 1 | F157a_European_Toad__metabat2_high_PE.005.contigs
  |     |
  |     |         /-s__Akkermansia sp004167605 | 4 | GCA_900759685.1_ERS608607_23_genomic
  |     |        |
  |     |      /-|      /-s__Akkermansia sp004167605 | 4 | GCA_905210405.1_ERR321569-mag-bin.26_genomic
  |     |     |  |   /-|
  |     |     |  |  |   \-s__Akkermansia sp004167605 | 4 | GCF_008422275.1_ASM842227v1_genomic
  |     |     |   \-|
  |     |     |     |   /-s__Akkermansia sp004167605 | 4 | GCA_900539575.1_UMGS141_genomic
  |    