# Assign genus IDs based on pyANI analysis, using piecewise linear regression boundaries and other coverage thresholds

**Set Up**

In [1]:
import pandas as pd
from pathlib import Path
import numpy as np
import networkx as nx
import dwave_networkx as dnx
import ete3
from ete3 import Tree, TreeStyle, faces
from ete3 import PhyloNode
from ete3 import NodeStyle
from collections import defaultdict

**Loading data**
Here we are interested in coverage and identity matrices from pyANI analysis.

In [2]:
# pyANI identity and coverage matrices; the unnamed first column (row genome) is renamed to 'genome1'.
identity = (
    pd.read_csv(
        Path("../../supplementary_file_3/output/pyani_matrices/matrix_identity_1.tab"),
        sep='\t',
    )
    .rename(columns={'Unnamed: 0': 'genome1'})
)
coverage = (
    pd.read_csv(
        Path("../../supplementary_file_3/output/pyani_matrices/matrix_coverage_1.tab"),
        sep='\t',
    )
    .rename(columns={'Unnamed: 0': 'genome1'})
)

**Assigning species and genus ID**
Here, we:
- write a function that removes suffix provided in the pyANI matrices
- write a function that builds a graph from the provided dataframe and removes edges between genomes whose value for the chosen attribute is lower than the provided threshold. Where a connected component is not a clique, it keeps removing the lowest-weight edges until the components are cliques. 

In [3]:
def remove_suffix(row, col):
    """Strip the last ':'-delimited field from the value stored at ``row[col]``.

    The value is split on ':', the final piece is discarded and the remaining
    pieces are joined back together with a single space. A value containing
    no ':' therefore yields an empty string.

    :param row: record supporting ``row[col]`` (e.g. a dataframe row)
    :param col: key of the field to clean
    :return: the cleaned string, or 'NA' if an IndexError is raised while
        reading or splitting the value
    """
    try:
        value = row[col]
        kept_pieces = value.split(':')[:-1]
    except IndexError:
        return 'NA'

    return ' '.join(kept_pieces)



In [4]:
def assign_ANI_tax_ID_one_attribute(df, thereshold, attribute, ANI_ID):
        """Assign cluster (taxon) IDs to genomes from pairwise pyANI values of one attribute.

        A NetworkX graph is built with one edge per row of ``df``. Edges whose
        ``attribute`` value is below ``thereshold`` are dropped, components that
        are not cliques then lose their lowest-valued edges, and finally every
        connected component of the remaining graph receives one integer ID.

        :param df: dataframe with columns 'genome1', 'genome2', 'identity' and
            'coverage', one row per pair of genomes
        :param thereshold: edges whose ``attribute`` value is below this number are removed
        :param attribute: edge attribute used for filtering, 'identity' or 'coverage'
        :param ANI_ID: first ID to hand out; increased by one for each component
        :return: dict mapping each graph node (genome) to its assigned ID
        """
        
        # Build the graph; 'identity' and 'coverage' are stored as edge attributes
        G_comp=nx.from_pandas_edgelist(df, 'genome1', 'genome2', ['identity', 'coverage'])
        
        current_assignments = {} # node -> assigned ID, filled in at the end

        ANI_ID = ANI_ID # no-op self-assignment

        # Remove every edge whose value for the chosen attribute is below the threshold
        edges_to_remove = [(n1,n2) for n1, n2, attrs in G_comp.edges(data=True) if attrs[attribute] < thereshold]
        G_comp.remove_edges_from(edges_to_remove)
        
        # Check whether the components are cliques; if not, remove edges until a clique is achieved.
        # NOTE(review): the outer loop iterates over the list built on the next line; rebinding
        # `components` inside the while loop does not change what this loop iterates over.
        components = [_ for _ in list(nx.connected_components(G_comp))]
        for component in components:
            while dnx.is_clique(G_comp, component) == False:
                # Distinct attribute values on edges with both ends inside `component`, ascending
                weights = sorted(list(set([attrs[attribute] for n1, n2, attrs in G_comp.edges(data=True) if n1 in component and n2 in component])))
                # Everything below the second-smallest value, i.e. the edges carrying the current minimum.
                # NOTE(review): weights[1] raises IndexError when fewer than two distinct values
                # are present - confirm this cannot happen for the input data.
                edges_to_remove = [(n1,n2) for n1, n2, attrs in G_comp.edges(data=True) if attrs[attribute] < (weights)[1] and n1 in component and n2 in component]
                G_comp.remove_edges_from(edges_to_remove)
                # No lasting effect: `weights` is rebuilt at the top of the next iteration
                weights.remove(weights[0])

                # This inner loop rebinds `component`: afterwards it is the first clique component
                # found, or the last component if none is a clique. The while condition is then
                # re-tested against that component.
                components = [_ for _ in list(nx.connected_components(G_comp))]
                for component in components:
                    if dnx.is_clique(G_comp, component) == True:
                        break
        # Assign IDs: one consecutive ID per connected component of the pruned graph
        components = [_ for _ in list(nx.connected_components(G_comp))]
        for component in components:
            current_assignments.update({_:ANI_ID for _ in component})
            ANI_ID += 1

    
        return current_assignments

In [5]:
# Starting ID counters and (initially empty) result holders used by later cells.
current_species_ID = 1
current_genus_ID = 1
genome_genus_ID = {}
genome_species_ID = {}

# Melt each square matrix into long format: one row per (genome1, genome2) pair.
# value_vars is left at its default (every column except the id column), so each
# matrix is melted over its own columns rather than over the identity matrix's.
identity_melt = pd.melt(identity, id_vars=['genome1'], var_name='genome2', value_name='identity')
coverage_melt = pd.melt(coverage, id_vars=['genome1'], var_name='genome2', value_name='coverage')

# Combine identity and coverage for each ordered pair.
combined = pd.merge(identity_melt, coverage_melt, how='left', on=['genome1', 'genome2'])

# Remove self-to-self comparisons. The explicit .copy() makes the filtered frame
# independent, so the column assignment below does not trigger SettingWithCopyWarning.
combined = combined[combined['genome1'] != combined['genome2']].copy()

# Collapse the two directions of each comparison into one row: sort the pair so
# (A, B) and (B, A) share a key, then keep the mean identity and minimum coverage.
combined[['genome1', 'genome2']] = np.sort(combined[['genome1', 'genome2']].to_numpy(), axis=1)
fixed = (
    combined
    .groupby(['genome1', 'genome2'])
    .agg(identity=('identity', 'mean'), coverage=('coverage', 'min'))
    .reset_index()
)

# Strip the ':'-delimited suffix from the genome labels.
fixed["genome1"] = fixed.apply(remove_suffix, col='genome1', axis=1)
fixed['genome2'] = fixed.apply(remove_suffix, col='genome2', axis=1)


**Generate dataframe to which we will append information**

In [6]:
# Tab-separated label table; column names are supplied explicitly.
df = pd.read_csv(
    Path("../../supplementary_file_3/input/custom_labels.txt"),
    sep='\t',
    names=['MD5_hash', 'FILE', 'label'],
)

In [7]:
def get_accession(row):
    """Build an accession from the first two '_'-separated fields of ``row['FILE']``.

    :param row: record supporting ``row['FILE']`` (e.g. a dataframe row)
    :return: the first two fields joined by '_' (the whole value if it has
        fewer than two fields), or 'NA' if an IndexError is raised
    """
    try:
        fields = row['FILE'].split('_')
        return '_'.join(fields[:2])
    except IndexError:
        return 'NA'

# Derive the accession for every row from its 'FILE' value.
df["accession"] = df.apply(get_accession, axis="columns")

In [8]:
# 'MD5_hash' and 'FILE' are not needed once the accession has been derived.
df.drop(columns=['MD5_hash', 'FILE'], inplace=True)

**Assigning genus IDs using boundaries from 3-segment piecewise regression**

Here, we only consider one attribute; genome coverage. 

In [9]:
#Getting Genus ID
groups = assign_ANI_tax_ID_one_attribute(fixed, 0.459, 'coverage', current_genus_ID)
labels_to_accession = df.set_index('label').to_dict()['accession']
genome_genus_ID = {labels_to_accession[label]:genus_ID for label,genus_ID in groups.items()}

In [10]:
# Attach the genus IDs from the 3-segment boundary to the label table, keyed by accession.
df['genus_ID_pc_3'] = df['accession'].map(genome_genus_ID)

**Assigning genus IDs using boundaries from 2-segment piecewise regression**

In [11]:
# Genus IDs at the 2-segment piecewise-regression coverage boundary (0.498)
groups = assign_ANI_tax_ID_one_attribute(
    df=fixed,
    thereshold=0.498,
    attribute='coverage',
    ANI_ID=current_genus_ID,
)
# Translate graph node labels into genome accessions
labels_to_accession = df.set_index('label')['accession'].to_dict()
genome_genus_ID = {
    labels_to_accession[node_label]: assigned_ID
    for node_label, assigned_ID in groups.items()
}



In [12]:
# Attach the genus IDs from the 2-segment boundary to the label table, keyed by accession.
df['genus_ID_pc_2'] = df['accession'].map(genome_genus_ID)

**Assigning genus/species IDs based on two attributes.**

Here, we will write a function that will assign genomes to candidate genus and/or species by considering both genome coverage and genome identity. 

The function will work in the following steps:
- Create a complete graph, where each genome/node is connected to every other genome/node
- Assign genome coverage and genome identity as edge attributes
- Remove an edge if its genome coverage is lower than the provided coverage threshold, or if its genome identity is lower than the provided identity threshold. 
- Where a connected component is not a clique, keep removing the edges with the lowest weight (taken from the first attribute, here genome coverage) until the components are cliques. 



In [13]:
def assign_ANI_tax_ID_two_attribute(df, threshold_1, attribute_1, ANI_ID, attribute_2, threshold_2):
        """Assign cluster (taxon) IDs to genomes from pairwise pyANI values of two attributes.

        A NetworkX graph is built with one edge per row of ``df``. An edge is
        dropped when either attribute is below its threshold, components that
        are not cliques then lose their lowest-valued edges (judged on
        ``attribute_1`` only), and finally every connected component of the
        remaining graph receives one integer ID.

        :param df: dataframe with columns 'genome1', 'genome2', 'identity' and
            'coverage', one row per pair of genomes
        :param threshold_1: edges whose ``attribute_1`` value is below this number are removed
        :param attribute_1: first edge attribute, 'identity' or 'coverage'; also used
            for the clique refinement
        :param ANI_ID: first ID to hand out; increased by one for each component
        :param attribute_2: second edge attribute, 'identity' or 'coverage'
        :param threshold_2: edges whose ``attribute_2`` value is below this number are removed
        :return: dict mapping each graph node (genome) to its assigned ID
        """
        
        # Build the graph; 'identity' and 'coverage' are stored as edge attributes
        G_comp=nx.from_pandas_edgelist(df, 'genome1', 'genome2', ['identity', 'coverage'])
        
        current_assignments = {} # node -> assigned ID, filled in at the end

        ANI_ID = ANI_ID # no-op self-assignment

        
        # Drop an edge when either attribute falls below its own threshold
        edges_to_remove = [(n1,n2) for n1, n2, attrs in G_comp.edges(data=True) if attrs[attribute_1] < threshold_1 or attrs[attribute_2] < threshold_2]
        G_comp.remove_edges_from(edges_to_remove)
        
        
        # Check whether the components are cliques; if not, remove edges until a clique is achieved.
        # NOTE(review): the outer loop iterates over the list built on the next line; rebinding
        # `components` inside the while loop does not change what this loop iterates over.
        components = [_ for _ in list(nx.connected_components(G_comp))]
        for component in components:
            while dnx.is_clique(G_comp, component) == False:
                # Distinct attribute_1 values on edges with both ends inside `component`, ascending
                weights = sorted(list(set([attrs[attribute_1] for n1, n2, attrs in G_comp.edges(data=True) if n1 in component and n2 in component])))
                # Everything below the second-smallest value, i.e. the edges carrying the current minimum.
                # NOTE(review): weights[1] raises IndexError when fewer than two distinct values
                # are present - confirm this cannot happen for the input data.
                edges_to_remove = [(n1,n2) for n1, n2, attrs in G_comp.edges(data=True) if attrs[attribute_1] < (weights)[1] and n1 in component and n2 in component]
                G_comp.remove_edges_from(edges_to_remove)
                # No lasting effect: `weights` is rebuilt at the top of the next iteration
                weights.remove(weights[0])

                # This inner loop rebinds `component`: afterwards it is the first clique component
                # found, or the last component if none is a clique. The while condition is then
                # re-tested against that component.
                components = [_ for _ in list(nx.connected_components(G_comp))]
                for component in components:
                    if dnx.is_clique(G_comp, component) == True:
                        break
        # Assign IDs: one consecutive ID per connected component of the pruned graph
        components = [_ for _ in list(nx.connected_components(G_comp))]
        for component in components:
            current_assignments.update({_:ANI_ID for _ in component})
            ANI_ID += 1

    
        return current_assignments

In [14]:
# First five pairwise rows, kept as a small sample.
# NOTE(review): `test` is not referenced by any other visible cell - looks like a debugging leftover; confirm before removing.
test = fixed.head(5)

In [15]:
# Genus IDs using both a coverage cut-off (0.4590) and an identity cut-off (0.868)
groups = assign_ANI_tax_ID_two_attribute(
    df=fixed,
    threshold_1=0.4590,
    attribute_1='coverage',
    ANI_ID=current_genus_ID,
    attribute_2='identity',
    threshold_2=0.868,
)
# Translate graph node labels into genome accessions
labels_to_accession = df.set_index('label')['accession'].to_dict()
genome_genus_ID = {
    labels_to_accession[node_label]: assigned_ID
    for node_label, assigned_ID in groups.items()
}

In [16]:
# Attach the genus IDs from the combined coverage + identity filtering to the label table, keyed by accession.
df['genus_ID_pc_3_with_ID'] = df['accession'].map(genome_genus_ID)

**Checking if the groups are monophyletic in SCO tree.**

One of the main objectives of carrying out this analysis was to identify biologically meaningful groups for pangenomic analysis. 

After mapping the genus IDs assigned at the genome coverage thresholds identified by piecewise regression, it was found that the groups do not form monophyletic groups on the SCO tree. 

Therefore, to find groupings in which that problem does not occur, the analysis will be run at starting genome coverage thresholds between 40% and 85% in steps of 0.1%.

**Check monophyly**

Writing a function that checks whether a given set of genomes forms a monophyletic clade in the SCO phylogenetic tree.

In [17]:
def check_monophyly(tree, group):
    """Report whether ``group`` is monophyletic in ``tree``.

    Delegates to ``tree.check_monophyly``, matching on the "name" attribute
    and passing ``ignore_missing=True``, and returns the first element of
    its result.

    :param tree: tree object exposing a ``check_monophyly`` method
    :param group: collection of names to test
    :return: first element of the ``tree.check_monophyly`` result
    """
    outcome = tree.check_monophyly(
        values=group,
        target_attr="name",
        ignore_missing=True,
    )
    return outcome[0]

In [18]:
# Load the SCO tree from the RAxML support file.
SCO_tree = Tree('../../supplementary_file_5/output/tree/04_tbe.raxml.support', format=1)

# Find the midpoint outgroup node
R = SCO_tree.get_midpoint_outgroup()
# and set it as tree outgroup
SCO_tree.set_outgroup(R)

**Assigning genus IDs with starting genome coverage between 40% and 85% in steps of 0.1%**

In [19]:
# Sweep the coverage threshold and, for each value, count the resulting clusters,
# the singletons, and how many multi-genome clusters are / are not monophyletic
# in the SCO tree.

# The label -> accession lookup does not depend on the threshold, so build it once.
labels_to_accession = df.set_index('label')['accession'].to_dict()

# One record per threshold; the dataframe is built once after the loop instead of
# being grown row by row (which is slow and coerces the integer counts to floats).
sweep_records = []

for i in np.arange(0.400, 0.851, 0.001):
    # round() removes the floating-point drift that accumulates in np.arange
    groups = assign_ANI_tax_ID_one_attribute(fixed, round(i, 3), 'coverage', current_genus_ID) #Assigning genus IDs
    genome_genus_ID = {labels_to_accession[label]: genus_ID for label, genus_ID in groups.items()} #Mapping genome accessions

    # Group genome accessions by their assigned genus ID
    cluster_members = defaultdict(list)
    for genome, genus_ID in genome_genus_ID.items():
        cluster_members[genus_ID].append(genome)

    no_of_monophyletic_groups = 0
    no_of_non_monophyletic_groups = 0
    singleton = 0
    # Monophyly is only checked for groups that are not singletons
    for members in cluster_members.values():
        if len(members) == 1:
            singleton += 1
        elif check_monophyly(SCO_tree, members):
            no_of_monophyletic_groups += 1
        else:
            no_of_non_monophyletic_groups += 1

    sweep_records.append({
        'coverage_threshold': round(i*100, 3),  # reported as a percentage
        'clusters': len(cluster_members),
        'singletons': singleton,
        'monophyletic': no_of_monophyletic_groups,
        'non_monophyletic': no_of_non_monophyletic_groups,
    })

genus_df = pd.DataFrame(
    sweep_records,
    columns=['coverage_threshold', 'clusters', 'singletons', 'monophyletic', 'non_monophyletic'],
)
    

In [20]:
# Store the count columns as 64-bit integers. astype() replaces the element-wise
# DataFrame.applymap call, which is deprecated in recent pandas releases.
genus_df[['clusters', 'singletons', 'monophyletic', 'non_monophyletic']] = (
    genus_df[['clusters', 'singletons', 'monophyletic', 'non_monophyletic']].astype(np.int64)
)

In [21]:
# Save the per-threshold monophyly summary (row index omitted).
genus_df.to_csv(
    Path("../output/monophyly_status.csv"),
    index=False,
)

**Reorder df**

Here, we will reorder the dataframe so that the accessions match the ordering of the leaf nodes in the SCO tree. 

In [22]:
# Leaf order of the SCO tree, one accession per row.
data2 = pd.read_csv(
    Path("../../supplementary_file_5/output/SCOG_tree_node_order.csv")
)

In [23]:
# Join the genus ID columns onto the tree-ordered accessions.
data2 = data2.merge(df, on='accession')

In [24]:
# The label column is not part of the exported table.
data2.drop(columns=['label'], inplace=True)

In [25]:
# Display the merged table for a visual check.
data2

Unnamed: 0,accession,genus_ID_pc_3,genus_ID_pc_2,genus_ID_pc_3_with_ID
0,GCF_000709915.1,5,5,5
1,GCF_018966745.1,5,5,5
2,GCF_900079415.1,5,5,5
3,GCF_000719785.1,5,5,5
4,GCF_002899455.1,5,5,5
...,...,...,...,...
290,GCF_000813365.1,28,29,28
291,GCF_000745345.1,27,27,27
292,GCF_900105395.1,42,47,43
293,GCF_000717725.1,19,19,19


In [26]:
# Save the tree-ordered genus ID table (row index omitted).
data2.to_csv(
    Path("../output/pyANI_genus_IDs.csv"),
    index=False,
)