# MLST phylogenetic tree analysis
This notebook was used to check which and how many connected groups of STs are congruent with clades present in the MLST phylogenetic tree.

**Set up**
Loading all necessary libraries

In [1]:
import ete3
from ete3 import Tree, TreeStyle, faces
from ete3 import PhyloNode
from ete3 import NodeStyle
import pandas as pd
from collections import defaultdict
from pylab import *
from Bio import SeqIO
import seaborn as sns
from pathlib import Path

**Load data**

- `mlsa_tree` - phylogenetic tree inferred from concatenated sequences of all 6 marker alleles
- `mlst_data` - csv data which contains information (eg. connected component_id) about each genome used in MLST analysis
- `raxml_log` - RAxML log file which contains information about the sequences which were found to be identical and collapsed

In [3]:
# Load the phylogenetic tree from the RAxML output file.
mlsa_tree = Tree("../output/tree/04_tbe_mlst.raxml.support")
# Ask the tree for its midpoint outgroup node ...
R = mlsa_tree.get_midpoint_outgroup()
# ... and set it as tree outgroup, so the tree is rooted at that node.
mlsa_tree.set_outgroup(R)

In [4]:
# Per-genome MLST metadata; later cells read its `accession`, `ST`,
# `pyani_group_ID` and `pyani_species_ID` columns.
genome_info_csv = Path("../../supplementary_file_8/output/Genome_ST_info.csv").expanduser()
mlst_data = pd.read_csv(genome_info_csv)

In [5]:
# Build `collapsed`: for every log line mentioning 'identical', map the first
# named sequence to the second one.
with open(Path("../output/tree/01_check.raxml.log").expanduser()) as raxml_log:
    collapsed = {}
    for log_line in raxml_log.readlines():
        if 'identical' not in log_line:
            continue
        and_parts = log_line.split('and')
        # First name: text after 'Sequences' and before the first 'and'.
        first_accession = and_parts[0].split('Sequences')[-1].strip()
        # Second name: text after the last 'and' and before 'are'.
        second_accession = and_parts[-1].split('are')[0].strip()
        # NOTE(review): a dict keeps only one partner per first accession, so if
        # the log lists the same first sequence in several lines only the last
        # pair survives - confirm this cannot happen for this log.
        collapsed[first_accession] = second_accession

First we need to check if the collapsed sequences come from the same connected component!
If all collapsed sequences are indeed from the same connected component, there is no need to do anything else.

In [6]:
# Print the two group IDs of every collapsed pair whose genomes belong to
# different groups; no output means every pair shares one group.
for genome1, genome2 in collapsed.items():
    is_genome1 = mlst_data['accession'] == genome1
    is_genome2 = mlst_data['accession'] == genome2
    # `.values[0]` takes the first matching row of the metadata table.
    group_1 = mlst_data.loc[is_genome1, 'pyani_group_ID'].values[0]
    group_2 = mlst_data.loc[is_genome2, 'pyani_group_ID'].values[0]
    if group_1 != group_2:
        print(group_1, group_2)
        

Next we can create a dictionary with the accession number as the key, and the group as the value.

In [7]:
# Lookup table: accession -> pyani_group_ID.
# Only the needed column is converted instead of the whole frame.
group_id_by_accession = mlst_data.set_index('accession')['pyani_group_ID']
accession_group = group_id_by_accession.to_dict()

In [8]:
# Invert the accession -> group lookup into group -> list of accessions.
group_accession = defaultdict(list)
for accession, group_id in accession_group.items():
    group_accession[group_id].append(accession)

Next we can get a dictionary with the connected component id, and whether it is congruent or not

In [9]:
# Group id -> congruence flag.
# Groups holding a single genome get the string 'NA'; for all others the flag
# is the first element returned by `check_monophyly`, matched on leaf names,
# with accessions absent from the tree ignored (`ignore_missing=True`).
cntd_comp_congruent = {
    group_id: (
        'NA'
        if len(accessions) == 1
        else mlsa_tree.check_monophyly(values=accessions, target_attr="name", ignore_missing=True)[0]
    )
    for group_id, accessions in group_accession.items()
}

Next we can get a dictionary with the number of genomes found in each connected component

In [10]:
# Group id -> number of accessions recorded for that group.
cntd_comp_genoms = {
    group_id: len(accessions)
    for group_id, accessions in group_accession.items()
}

Next we can get the number of STs per connected component

In [10]:
import xml.dom.minidom
import networkx as nx

# Read the (x, y) coordinates stored for each ST node of the XML graph file.
file = "../../data/MLST_scheme_revision/mlst_mst.xml"
positions = {}
xmldoc = xml.dom.minidom.parse(file)
STNode = xmldoc.getElementsByTagName('node')
for STChild in STNode:
    # The node's `label` attribute is converted to int and used as the key.
    ST = STChild.attributes['label'].value
    for parts in STChild.childNodes:
        # Only element children are inspected; other child nodes are skipped.
        if parts.nodeType != xml.dom.Node.ELEMENT_NODE:
            continue
        if parts.tagName != 'graphics':
            continue
        x = float(parts.attributes['x'].value)
        y = float(parts.attributes['y'].value)
        positions[int(ST)] = (x, y)

In [11]:
# Collect the graph edges as [ST, ST] pairs of ints.
edges = []
xmldoc = xml.dom.minidom.parse(file)
edgesNode = xmldoc.getElementsByTagName('edge')
for edge_element in edgesNode:
    # An edge label holds both endpoints separated by ' (-) '.
    endpoints = str(edge_element.attributes['label'].value).split(' (-) ')
    first_st, second_st = int(endpoints[0]), int(endpoints[1])
    edges.append([first_st, second_st])

In [12]:
# Create the ST graph and the node -> coordinate mapping used for drawing.
G = nx.Graph()
# NOTE(review): G has no nodes at this point, so this layout presumably
# contributes no coordinates of its own; every entry of `pos` comes from
# `positions` below - confirm, and consider starting from an empty dict.
pos = nx.spring_layout(G)
# Iterating the dict yields its keys, i.e. one graph node per ST.
G.add_nodes_from(positions)
pos.update(positions)

In [13]:
# Connect the STs: every entry of `edges` is a two-element [source, target] pair.
G.add_edges_from(edges)

In [14]:
# Group id -> number of STs in the graph component that contains the ST.
cntd_comp_size = {}
for component in nx.connected_components(G):
    component_size = len(component)
    for st in component:
        group_ids = mlst_data.loc[mlst_data['ST'] == st, 'pyani_group_ID'].values
        # STs without any row in `mlst_data` are skipped; otherwise the group id
        # of the first matching row is used as the key.
        if len(group_ids) >= 1:
            cntd_comp_size[group_ids[0]] = component_size

In [15]:
# Two-column table: `cntd_comp_id` (the dict keys) and `cntd_comp_size` (the values).
congruent_dta = (
    pd.DataFrame.from_dict(cntd_comp_size, orient='index', columns=['cntd_comp_size'])
    .reset_index()
    .rename(columns={'index': 'cntd_comp_id'})
)

In [16]:
# Attach the genome count of each component, looked up by component id.
genomes_per_component = congruent_dta['cntd_comp_id'].map(cntd_comp_genoms)
congruent_dta['cntd_comp_num_genomes'] = genomes_per_component

In [17]:
# Attach the congruence flag ('NA', True or False) of each component.
congruence_per_component = congruent_dta['cntd_comp_id'].map(cntd_comp_congruent)
congruent_dta['congruent'] = congruence_per_component

In [18]:
# Order the table by component id (ascending, the pandas default).
congruent_dta = congruent_dta.sort_values(by='cntd_comp_id')

In [19]:
# Write the congruence table to disk without the DataFrame index.
# NOTE(review): this is a home-directory path while the inputs above are read
# relative to the notebook - consider a configurable output directory.
congruence_csv = Path("~/Desktop/Kiepas_et_al_2023_MLST/data/phylogeny/congruence/MLSA_MLST_congruence.csv").expanduser()
# `to_csv` does not create missing folders, so make sure the target exists;
# otherwise a fresh checkout / Restart & Run All fails at this cell.
congruence_csv.parent.mkdir(parents=True, exist_ok=True)
congruent_dta.to_csv(congruence_csv, index=False)

**Create df for annotation in R**

Here we want a dataframe which will contain the name/accession in the first column, and the congruence information in another.

In [20]:
# Names of all leaves of the tree, as a list.
leafs = mlsa_tree.get_leaf_names()

In [21]:
# Accession -> congruence flag of its group, restricted to genomes that are
# leaves of the tree. A set gives constant-time membership checks.
leaf_names = set(leafs)
annotations = {
    genome: cntd_comp_congruent[group_id]
    for genome, group_id in accession_group.items()
    if genome in leaf_names
}

In [22]:
# One row per annotated genome: its name and the congruence flag.
annotation_rows = list(annotations.items())
df = pd.DataFrame(annotation_rows, columns=['Name', 'Congruence'])

In [23]:
# Write the tree annotation table to disk without the DataFrame index.
# NOTE(review): this is a home-directory path while the inputs above are read
# relative to the notebook - consider a configurable output directory.
annotations_csv = Path('~/Desktop/Kiepas_et_al_2023_MLST/data/phylogeny/congruence/MLSA_tree_annotations.csv').expanduser()
# `to_csv` does not create missing folders, so make sure the target exists;
# otherwise a fresh checkout / Restart & Run All fails at this cell.
annotations_csv.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(annotations_csv, index=False)

In [24]:
from collections import Counter

# Tally how many groups are 'NA', congruent (True) or not congruent (False).
congruence_counts = Counter(cntd_comp_congruent.values())
print(congruence_counts)

Counter({'NA': 125, False: 60, True: 57})


In [25]:
# Text rendering of the whole rooted tree (one line per leaf, so the output is long).
print(mlsa_tree)


      /-GCF_001418545.1
   /-|
  |   \-GCF_013371965.1
  |
--|   /-GCF_010550625.1
  |  |
  |  |   /-GCF_000725635.1
   \-|  |
     |  |   /-GCF_016613335.1
     |  |  |
      \-|  |      /-GCF_009864985.1
        |  |   /-|
        |  |  |  |   /-GCF_004195745.1
        |  |  |   \-|
        |  |  |      \-GCF_004195735.1
         \-|  |
           |  |         /-GCF_000359525.1
           |  |      /-|
           |  |     |   \-GCF_900171555.1
           |  |     |
           |  |   /-|   /-GCF_004195855.1
           |  |  |  |  |
            \-|  |  |  |         /-GCF_009865275.1
              |  |  |  |      /-|
              |  |   \-|     |   \-GCF_002910815.1
              |  |     |   /-|
              |  |     |  |  |   /-GCF_018619195.1
              |  |     |  |   \-|
              |  |     |  |      \-GCF_004195755.1
              |  |      \-|
              |  |        |      /-GCF_004195885.1
              |  |        |   /-|
              |  |        |  |  |   /-GCF_00

In [46]:
# Define conditions for retaining rows
condition1 = congruent_dta['congruent'].isin([True])  # Retain rows with these IDs
condition2 = congruent_dta['cntd_comp_num_genomes'] >= 2  # Retain rows with Value greater than or equal to 20

# Apply conditions using logical AND (&) or OR (|) operators
filtered_df = congruent_dta[(condition1)]

In [47]:
# Display the components whose `congruent` flag is True.
filtered_df

Unnamed: 0,cntd_comp_id,cntd_comp_size,cntd_comp_num_genomes,congruent
117,5,19,32,True
114,10,11,20,True
48,16,7,8,True
2,17,6,7,True
4,18,6,5,True
76,22,5,4,True
109,25,5,6,True
122,26,5,6,True
1,28,4,4,True
14,29,4,4,True


In [48]:
# Display the per-genome metadata table loaded from Genome_ST_info.csv.
mlst_data

Unnamed: 0,ST,accession,organism,16S_copies,strain,status_tax,Type_Strain,pyani_group_ID,pyani_label,pyani_genus_ID,pyani_species_ID,degree
0,2,GCF_000715745.1,Streptomyces californicus,1,NRRL B-3320,correct name,No,5,Streptomyces californicus NRRL B-3320 - ST 2,79,114,6
1,2,GCF_016906245.1,Streptomyces californicus,6,FDAARGOS_1213,correct name,No,5,Streptomyces californicus FDAARGOS_1213 - ST 2,79,114,6
2,2,GCF_016906225.1,Streptomyces californicus,5,FDAARGOS_1211,correct name,No,5,Streptomyces californicus FDAARGOS_1211 - ST 2,79,114,6
3,2,GCF_016906205.1,Streptomyces californicus,6,FDAARGOS_1212,correct name,No,5,Streptomyces californicus FDAARGOS_1212 - ST 2,79,114,6
4,2,GCF_016906185.1,Streptomyces californicus,5,FDAARGOS_1209,correct name,No,5,Streptomyces californicus FDAARGOS_1209 - ST 2,79,114,6
...,...,...,...,...,...,...,...,...,...,...,...,...
904,821,GCF_000721435.1,Streptomyces sp.,1,NRRL F-5527,not validly described,No,18,Streptomyces sp. NRRL F-5527 - ST 821,20,35,1
905,822,GCF_001014595.1,Streptomyces sp.,1,KE1,not validly described,No,3,Streptomyces sp. KE1 - ST 822,56,80,1
906,823,GCF_001652895.1,Streptomyces sp.,1,NBRC 109436,not validly described,No,17,Streptomyces sp. NBRC 109436 - ST 823,18,33,1
907,824,GCF_003363195.1,Streptomyces sp.,2,M7,not validly described,No,28,Streptomyces sp. M7 - ST 824,52,76,2


In [58]:
# For every congruent component, print the pyani_species_ID of each genome
# assigned to it (one list per component).
for group_id in filtered_df['cntd_comp_id']:
    # NOTE: `condition1` is also the name of the mask used to build `filtered_df`;
    # here it is overwritten with a mask over `mlst_data`.
    condition1 = mlst_data['pyani_group_ID'].isin([group_id])
    filtered_df2 = mlst_data[condition1]
    species_ids = list(filtered_df2['pyani_species_ID'])
    print(species_ids)
    

[114, 114, 114, 114, 114, 114, 114, 114, 114, 114, 114, 114, 114, 114, 114, 114, 114, 114, 114, 114, 114, 114, 114, 114, 114, 114, 114, 114, 114, 114, 114, 114]
[12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12]
[31, 31, 31, 31, 31, 31, 31, 31]
[33, 33, 33, 33, 33, 33, 33]
[35, 35, 35, 35, 35]
[56, 56, 56, 56]
[67, 67, 67, 67, 67, 67]
[72, 72, 72, 72, 72, 72]
[76, 76, 76, 76]
[79, 79, 79, 79]
[84, 84, 84, 84]
[85, 85, 85, 85, 85, 85, 85, 85, 85, 85]
[86, 86]
[87, 87, 87, 87, 87]
[91, 91, 91, 91]
[92, 92, 92, 92]
[93, 93, 93]
[103, 103, 103, 103]
[104, 104]
[111, 111]
[115, 115, 115, 115, 115]
[116, 116, 116, 116, 116, 116]
[117, 117, 117]
[118, 118]
[119, 119, 119]
[123, 123, 123]
[124, 124]
[127, 127]
[128, 128, 128]
[129, 129]
[130, 130]
[131, 131]
[132, 132]
[133, 133]
[134, 134, 134]
[135, 135, 135]
[139, 139]
[140, 140]
[141, 141, 141]
[142, 142]
[143, 143]
[144, 144]
[145, 145, 145]
[146, 146]
[148, 148]
[153, 153]
[154, 154]
[155, 155]
[156, 156, 15