# MLST phylogenetic tree analysis
This notebook was used to check which and how many connected groups of STs are congurent with clades present in MLST phylogenetic tree.

**Set up**
Loading all necessary libraries

In [1]:
import ete3
from ete3 import Tree, TreeStyle, faces
from ete3 import PhyloNode
from ete3 import NodeStyle
import pandas as pd
from collections import defaultdict
from pylab import *
from Bio import SeqIO
import seaborn as sns
from pathlib import Path

**Load data**

- `mlsa_tree` - phylogenetic tree inferred from concatenated sequences of all 6 marker alleles
- `mlst_data` - csv data which contains information (eg. connected component_id) about each genome used in MLST analysis
- `raxml_log` - RAxML log file which contains infomration about the sequences which were found to identical and collapsed

In [3]:
mlsa_tree = Tree("../output/tree/04_tbe_mlst.raxml.support")
R = mlsa_tree.get_midpoint_outgroup()
# and set it as tree outgroup
mlsa_tree.set_outgroup(R)

In [4]:
mlst_data = pd.read_csv(Path("../../supplementary_file_8/output/Genome_ST_info.csv").expanduser())

In [5]:
with open(Path("../output/tree/01_check.raxml.log").expanduser()) as raxml_log:
    collapsed = {_.split('and')[0].split('Sequences')[-1].strip():_.split('and')[-1].split('are')[0].strip() for _ in raxml_log.readlines() if 'identical' in _}

First we need to check if the collapsed sequences come from the same connected component!
If all collapsed sequences are indeed from the same connected component, there is no need to do anything else.

In [6]:
for genome1, genome2 in collapsed.items():
    group_1 = mlst_data.loc[mlst_data['accession'] == genome1]['pyani_group_ID'].values[0]
    group_2 = mlst_data.loc[mlst_data['accession'] == genome2]['pyani_group_ID'].values[0]
    if group_1!=group_2:
        print(group_1, group_2)
        

Next we can create dictionary with accession number as the key, and the group as the value.

In [7]:
accession_group = mlst_data.set_index('accession').to_dict()['pyani_group_ID']

In [8]:
group_accession = defaultdict(list)
for k, v in accession_group.items():
    group_accession[v].append(k)

Next we can get a dictionary with the connected component id, and wether it is congruent or not

In [9]:
cntd_comp_congruent = {}
for k, v in group_accession.items():
    if len(v) == 1:
        cntd_comp_congruent[k] = 'NA'
    else:
        cntd_comp_congruent[k] = mlsa_tree.check_monophyly(values=v, target_attr="name", ignore_missing=True)[0]

Next we can get dictionary with number of genomes found in that connected comonent 

In [10]:
cntd_comp_genoms = {k:len(v) for k, v in group_accession.items()}

Next we can get the number of STs per that connected components

In [11]:
import xml.dom.minidom
import networkx as nx
file=("../../supplementary_file_16/output/mlst_mst_fixed_node_positions.xml")
positions ={}
xmldoc=xml.dom.minidom.parse(file)
STNode = xmldoc.getElementsByTagName('node')
for STChild in STNode:
    ST = STChild.attributes['label'].value
    for parts in STChild.childNodes:
        if parts.nodeType == xml.dom.Node.ELEMENT_NODE: 
            if parts.tagName =='graphics':
                x = float(parts.attributes['x'].value)
                y = float(parts.attributes['y'].value)
                positions[int(ST)] = (x,y)

In [12]:
edges =[]
xmldoc=xml.dom.minidom.parse(file)
edgesNode = xmldoc.getElementsByTagName('edge')
for _ in edgesNode:
    source = str(_.attributes['label'].value).split(' (-) ')
    edges.append([int(source[0]), int(source[1])])

In [13]:
G = nx.Graph()
pos = nx.spring_layout(G)
for k, v in positions.items():
    G.add_node(k)
    pos[k] = v

In [14]:
for _ in edges:
    G.add_edge(_[0], _[1])

In [15]:
cntd_comp_size = {}
for _ in nx.connected_components(G):
    for x in _:
        if len(mlst_data.loc[mlst_data['ST'] == x]['pyani_group_ID'].values) >= 1:
            cntd_comp_size[mlst_data.loc[mlst_data['ST'] == x]['pyani_group_ID'].values[0]] = len(_)

In [16]:
congruent_dta = pd.DataFrame.from_dict(cntd_comp_size, orient='index', columns=['cntd_comp_size']).reset_index().rename(columns={'index':'cntd_comp_id'})

In [17]:
congruent_dta['cntd_comp_num_genomes'] = congruent_dta['cntd_comp_id'].map(cntd_comp_genoms)

In [18]:
congruent_dta['congruent'] = congruent_dta['cntd_comp_id'].map(cntd_comp_congruent)

In [19]:
congruent_dta = congruent_dta.sort_values(by=['cntd_comp_id'])

In [24]:
congruent_dta.to_csv(Path("../output/congruence/MLSA_MLST_congruence.csv").expanduser(), index=False)

**Create df for annotation in R**

Here we want a dataframe which will contain name/accession in the first columns, and the congruence information in another.

We know that the conected comonent 3 is the biggest monophletic connected component. Therefore, we will have it assigned a diffrent value/color for representation

In [28]:
leafs = [leaf.name for leaf in mlsa_tree]

In [50]:
cntd_comp_congruent = {k:'Biggest Monophyletic' if k==3 else v for k, v in cntd_comp_congruent.items()}

In [51]:
annotations = {genome:cntd_comp_congruent[group_id] for genome, group_id in accession_group.items() if genome in leafs}

In [52]:
df = pd.DataFrame(list(annotations.items()), columns = ['Name','Congruence'])

In [53]:
df.to_csv(Path('../output/congruence/MLSA_tree_annotations.csv').expanduser(), index=False)

In [33]:
from collections import Counter
print(Counter(cntd_comp_congruent.values()))

Counter({'NA': 122, True: 59, False: 57})


In [54]:
print(mlsa_tree)


   /-GCF_001418545.1
  |
  |      /-GCF_001418125.1
  |   /-|
  |  |   \-GCF_000813365.1
  |  |
--|  |               /-GCF_000411495.1
  |  |            /-|
  |  |         /-|   \-GCF_000720885.1
  |  |        |  |
  |  |      /-|   \-GCF_004768505.1
  |  |     |  |
  |  |   /-|   \-GCF_011006355.1
   \-|  |  |
     |  |  |   /-GCF_000719425.1
     |  |   \-|
     |  |      \-GCF_000719735.1
     |  |
     |  |      /-GCF_000297635.1
     |  |   /-|
     |  |  |   \-GCF_013364095.1
     |  |  |
     |  |  |   /-GCF_000380165.1
      \-|  |  |
        |  |  |               /-GCF_003947265.2
        |  |  |            /-|
        |  |  |         /-|   \-GCF_000220705.2
        |  |  |        |  |
        |  |  |      /-|   \-GCF_002966105.1
        |  |  |     |  |
        |  |  |   /-|   \-GCF_000719865.1
        |  |  |  |  |
        |  |  |  |   \-GCF_000745345.1
        |  |  |  |
        |  |  |  |         /-GCF_000092385.1
         \-|  |  |      /-|
           |  |  |     |   \-G

In [35]:
# Define conditions for retaining rows
condition1 = congruent_dta['congruent'].isin([True])  # Retain rows with these IDs
condition2 = congruent_dta['cntd_comp_num_genomes'] >= 2  # Retain rows with Value greater than or equal to 20

# Apply conditions using logical AND (&) or OR (|) operators
filtered_df = congruent_dta[(condition1)]

In [36]:
filtered_df

Unnamed: 0,cntd_comp_id,cntd_comp_size,cntd_comp_num_genomes,congruent
3,3,61,57,True
1,7,15,28,True
70,11,11,19,True
23,16,6,7,True
143,18,6,6,True
11,21,5,4,True
37,24,5,4,True
147,25,5,6,True
16,26,4,10,True
29,27,4,2,True


In [37]:
mlst_data

Unnamed: 0,ST,accession,organism,16S_copies,strain,assembly_status,status_tax,Type_Strain,pyani_group_ID,pyani_label,degree,pyani_genus_ID,pyani_species_ID
0,2,GCF_016906245.1,Streptomyces californicus,6,FDAARGOS_1213,Complete Genome,correct name,No,7,GCF_016906245.1 | Streptomyces californicus FD...,6,96,133
1,2,GCF_000715745.1,Streptomyces californicus,1,NRRL B-3320,Scaffold,correct name,No,7,GCF_000715745.1 | Streptomyces californicus NR...,6,96,133
2,2,GCF_000717645.1,Streptomyces californicus,1,NRRL B-2098,Contig,correct name,Yes,7,GCF_000717645.1 | Streptomyces californicus NR...,6,96,133
3,2,GCF_000717965.1,Streptomyces purpeochromogenes,1,NRRL B-3012,Contig,no record in LPSN entry,No,7,GCF_000717965.1 | Streptomyces purpeochromogen...,6,96,133
4,2,GCF_000718245.1,Streptomyces californicus,1,NRRL B-2988,Contig,correct name,No,7,GCF_000718245.1 | Streptomyces californicus NR...,6,96,133
...,...,...,...,...,...,...,...,...,...,...,...,...,...
868,801,GCF_900119365.1,Streptomyces atratus,1,OK807,Scaffold,correct name,No,277,GCF_900119365.1 | Streptomyces atratus OK807 -...,0,248,294
869,802,GCF_900171555.1,Streptomyces albidoflavus,1,R-53649,Scaffold,correct name,No,3,GCF_900171555.1 | Streptomyces albidoflavus R-...,1,57,79
870,803,GCF_900206255.1,Streptomyces sp.,4,TLI_55,Contig,not validly described,No,278,GCF_900206255.1 | Streptomyces sp. TLI_55 - ST...,0,249,295
871,804,GCF_900215595.1,Streptomyces sp.,6,1222.2,Contig,not validly described,No,80,GCF_900215595.1 | Streptomyces sp. 1222.2 - ST...,1,107,146


In [38]:
for _ in filtered_df['cntd_comp_id']:
    condition1 = mlst_data['pyani_group_ID'].isin([_])
    filtered_df2 = mlst_data[(condition1)]
    print((([_ for _ in filtered_df2['pyani_species_ID']])))
    

[79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79]
[133, 133, 133, 133, 133, 133, 133, 133, 133, 133, 133, 133, 133, 133, 133, 133, 133, 133, 133, 133, 133, 133, 133, 133, 133, 133, 133, 133]
[18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18]
[39, 39, 39, 39, 39, 39, 39]
[45, 45, 45, 45, 45, 45]
[63, 63, 63, 63]
[70, 70, 70, 70]
[73, 73, 73, 73, 73, 73]
[74, 74, 74, 74, 74, 74, 74, 74, 74, 74]
[76, 76]
[77, 77, 77, 77]
[80, 80, 80]
[81, 81, 81, 81]
[84, 84, 84, 84, 84]
[88, 88, 88, 88]
[92, 92, 92, 92]
[93, 93, 93, 93]
[94, 94, 95]
[104, 104]
[105, 105, 105]
[106, 106]
[108, 108, 108]
[109, 109]
[111, 111, 111, 111, 111]
[116, 116, 116, 116]
[120, 120, 120, 120, 120, 120]
[124, 124]
[128, 128]
[129, 129]
[130, 130]
[131, 131]
[134, 134]
[135, 135]
[138, 138]
[139, 139, 139]
[140, 140]


In [71]:
counter = 0
x = []
mlsa_tree = Tree("../output/tree/04_tbe_mlst.raxml.support")
val = []
for node in mlsa_tree.traverse():
    if node.support <= 0.7:
        x.append(node.support)
print(counter)
        

0


In [72]:
min(x)

0.08