Purpose: NleG protein-family encoding genes are very similar in protein sequence, and many are very similar in coding sequence. This notebook looks for unique sequences from the nelG genes to ascertain whether a specific sub-types are present in our genomes.

For this, we have used representatives from clades with different nleG profiles based on protein BLAST results.

Rather than create a spreadsheet, this program will simply spit out a report into the jupyter environment.

It will use data from both the protein and nucleotide queries.

Strains to use (representative of their clade, 2 per clade):

This is a follow up set to make the original more robust. More from herd P, EDL933-like strains, clade towards the bottom of the phylogenetic tree, and others close to those with differences in NleG profile.

In [2]:
import os
import xml.etree.ElementTree as ET

In [16]:
strains = ['S88','S58','S42','S45','S76','S75','S71','S68','S05','S74','S23','S29','S08']

In [17]:
nucdir = './../VFDB_NucB_hits/'
prodir = './../VFDB_ProB_hits/'

nucfiles = [i for i in os.listdir(nucdir) if 'nleG' in i]
profiles = [i for i in os.listdir(prodir) if 'nleG' in i]
nucfiles.append('nleF.xml')
profiles.append('nleF.xml')


genes  = [i.split('.')[0] for i in profiles]

print(nucfiles, '\n\n', profiles, '\n\n', genes)

['nleG-1.xml', 'nleG-2.xml', 'nleG-3.xml', 'nleG.xml', 'nleG2-2.xml', 'nleG2-3.xml', 'nleG2-4.xml', 'nleG5-1.xml', 'nleG5-2.xml', 'nleG6-1.xml', 'nleG6-2.xml', 'nleG6-3.xml', 'nleG7.xml', 'nleG8-2.xml', 'nleF.xml'] 

 ['nleG-1.xml', 'nleG-2.xml', 'nleG-3.xml', 'nleG.xml', 'nleG2-2.xml', 'nleG2-3.xml', 'nleG2-4.xml', 'nleG5-1.xml', 'nleG5-2.xml', 'nleG6-1.xml', 'nleG6-2.xml', 'nleG6-3.xml', 'nleG7.xml', 'nleG8-2.xml', 'nleF.xml'] 

 ['nleG-1', 'nleG-2', 'nleG-3', 'nleG', 'nleG2-2', 'nleG2-3', 'nleG2-4', 'nleG5-1', 'nleG5-2', 'nleG6-1', 'nleG6-2', 'nleG6-3', 'nleG7', 'nleG8-2', 'nleF']


In [18]:
#Set up data structure. Top dictionary - hitdict. Primary subdictionary - Strain. Lists - Gene.
#Data stored as a truple (contig, contig start position, contig stop position, query start, query stop, ids) where start < stop.
#different dicts for nuc and pro - will have many of the same hits, but not all...

nuchitdict = {}
prohitdict = {}
for st in strains:
    nuchitdict[st] = {}
    prohitdict[st] = {}
    for gn in genes:
        nuchitdict[st][gn] = []
        prohitdict[st][gn] = []
    

#fill nuchitdict, then prohitdict

#list of query lengths for reference later (gene, length)
nucquerylens = []
proquerylens = []
nuchitlens = {}


for file in nucfiles:
    if os.path.getsize(nucdir+file)<1:
        continue
    else:
        pass
    gn = file.split('.')[0]
    queryf = nucdir+file
    tree = ET.parse(queryf)
    root = tree.getroot()
    for iteration in root.findall(".//Iteration"):
            qlen = iteration.find('Iteration_query-len').text
    nucquerylens.append((gn,int(qlen)))
    for hit in root.findall(".//Hit"):
            contig = hit.find('Hit_id').text
            st = contig.split('(')[0]
            if st not in strains:
                continue
            else:
                pass
            hitlen = int(hit.find('Hit_len').text)
            nuchitlens[contig]=hitlen
            counter = 1 #needed! some contigs have multiple hits for some genes
            for hsp in hit.findall('.//Hit_hsps/Hsp'):
                qstart = int(hsp.find('Hsp_query-from').text)
                qstop = int(hsp.find('Hsp_query-to').text)
                hstart = int(hsp.find('Hsp_hit-from').text)
                hstop = int(hsp.find('Hsp_hit-to').text)
                ids = int(hsp.find('Hsp_identity').text)
                if hstart>hstop:
                    hstart = int(hsp.find('Hsp_hit-to').text)
                    hstop = int(hsp.find('Hsp_hit-from').text)
                else:
                    pass
                nuchitdict[st][gn].append((contig,hstart,hstop,qstart,qstop,ids))
                counter+=1

for file in profiles:
    if os.path.getsize(prodir+file)<1:
        continue
    else:
        pass
    gn = file.split('.')[0]
    queryf = prodir+file
    tree = ET.parse(queryf)
    root = tree.getroot()
    for iteration in root.findall(".//Iteration"):
            qlen = iteration.find('Iteration_query-len').text
    proquerylens.append((gn,int(qlen)))
    for hit in root.findall(".//Hit"):
            contig = hit.find('Hit_id').text
            st = contig.split('(')[0]
            if st not in strains:
                continue
            else:
                pass
            counter = 1 #needed! some contigs have multiple hits for some genes
            for hsp in hit.findall('.//Hit_hsps/Hsp'):
                qstart = int(hsp.find('Hsp_query-from').text)
                qstop = int(hsp.find('Hsp_query-to').text)
                hstart = int(hsp.find('Hsp_hit-from').text)
                hstop = int(hsp.find('Hsp_hit-to').text)
                ids = int(hsp.find('Hsp_identity').text)
                if hstart>hstop:
                    hstart = int(hsp.find('Hsp_hit-to').text)
                    hstop = int(hsp.find('Hsp_hit-from').text)
                else:
                    pass
                prohitdict[st][gn].append((contig,hstart,hstop,qstart,qstop,ids))
                counter+=1

In [19]:
#Some test code to organize my thoughts

for st in strains:
    #List of contigs
    contigs = []
    for gn in list(nuchitdict[st].keys()):
        for i in range(len(nuchitdict[st][gn])):
            contigs.append(nuchitdict[st][gn][i][0])
    scontigs = set(contigs)
    for gn in list(sorted(list(nuchitdict[st].keys()))):
        for contig in scontigs:
            for i in range(len(nuchitdict[st][gn])):
                if nuchitdict[st][gn][i][0] == contig:
                    print(gn, contig, nuchitdict[st][gn][i][1], nuchitdict[st][gn][i][2], nuchitdict[st][gn][i][3], nuchitdict[st][gn][i][4])
    print('\n\n')
        

nleF S88(NODE_56) 1056 1625 1 570
nleG S88(NODE_76) 2034 2579 34 579
nleG-1 S88(NODE_16) 1697 2344 1 648
nleG-2 S88(NODE_106) 1294 1520 353 579
nleG-2 S88(NODE_76) 1615 1749 1 135
nleG-3 S88(NODE_106) 1294 1543 324 573
nleG-3 S88(NODE_76) 1615 1743 1 129
nleG2-2 S88(NODE_125) 1197 1246 1 50
nleG2-2 S88(NODE_205) 502 551 1 50
nleG2-2 S88(NODE_106) 1294 1869 1 576
nleG2-2 S88(NODE_56) 6104 6578 92 566
nleG2-3 S88(NODE_106) 1294 1629 1 336
nleG2-3 S88(NODE_56) 6104 6429 1 326
nleG2-4 S88(NODE_76) 2071 2564 1 495
nleG5-1 S88(NODE_96) 2207 2465 384 642
nleG5-1 S88(NODE_106) 1 510 1 510
nleG5-1 S88(NODE_23) 73856 74114 384 642
nleG5-2 S88(NODE_96) 2207 2465 384 642
nleG5-2 S88(NODE_106) 1 510 1 510
nleG5-2 S88(NODE_23) 73856 74114 384 642
nleG6-1 S88(NODE_37) 28385 28832 182 630
nleG6-1 S88(NODE_106) 592 1221 1 630
nleG6-2 S88(NODE_37) 28385 28832 182 630
nleG6-2 S88(NODE_106) 592 1221 1 630
nleG6-3 S88(NODE_37) 28378 28857 1 480
nleG6-3 S88(NODE_106) 591 1040 26 474
nleG7 S88(NODE_76) 673 1

In [20]:
for item in nuchitdict:
    for jtem in nuchitdict[item]:
        print(jtem)

nleG8-2
nleG5-2
nleG6-2
nleG2-2
nleG2-4
nleG5-1
nleG7
nleF
nleG
nleG-1
nleG-3
nleG2-3
nleG6-1
nleG6-3
nleG-2
nleG8-2
nleG5-2
nleG6-2
nleG2-2
nleG2-4
nleG5-1
nleG7
nleF
nleG
nleG-1
nleG-3
nleG2-3
nleG6-1
nleG6-3
nleG-2
nleG8-2
nleG5-2
nleG6-2
nleG2-2
nleG2-4
nleG5-1
nleG7
nleF
nleG
nleG-1
nleG-3
nleG2-3
nleG6-1
nleG6-3
nleG-2
nleG8-2
nleG5-2
nleG6-2
nleG2-2
nleG2-4
nleG5-1
nleG7
nleF
nleG
nleG-1
nleG-3
nleG2-3
nleG6-1
nleG6-3
nleG-2
nleG8-2
nleG5-2
nleG6-2
nleG2-2
nleG2-4
nleG5-1
nleG7
nleF
nleG
nleG-1
nleG-3
nleG2-3
nleG6-1
nleG6-3
nleG-2
nleG8-2
nleG5-2
nleG6-2
nleG2-2
nleG2-4
nleG5-1
nleG7
nleF
nleG
nleG-1
nleG-3
nleG2-3
nleG6-1
nleG6-3
nleG-2
nleG8-2
nleG5-2
nleG6-2
nleG2-2
nleG2-4
nleG5-1
nleG7
nleF
nleG
nleG-1
nleG-3
nleG2-3
nleG6-1
nleG6-3
nleG-2
nleG8-2
nleG5-2
nleG6-2
nleG2-2
nleG2-4
nleG5-1
nleG7
nleF
nleG
nleG-1
nleG-3
nleG2-3
nleG6-1
nleG6-3
nleG-2
nleG8-2
nleG5-2
nleG6-2
nleG2-2
nleG2-4
nleG5-1
nleG7
nleF
nleG
nleG-1
nleG-3
nleG2-3
nleG6-1
nleG6-3
nleG-2
nleG8-2
nleG5-2
nle

In [21]:
for st in strains:
    #List of contigs
    contigs = []
    for gn in list(nuchitdict[st].keys()):
        for i in range(len(nuchitdict[st][gn])):
            contigs.append(nuchitdict[st][gn][i][0])
    scontigs = set(contigs)
    for ct in scontigs:
        print('\n', ct,'len:'+str(nuchitlens[ct])+'| h0 hx | q0 qx | len ids')
        for gn in genes:
            for i in range(len(nuchitdict[st][gn])):
                if nuchitdict[st][gn][i][0]==ct:
                    fgn = gn
                    if len(fgn)<7:
                        sp = 7-len(fgn)
                        for j in range(sp):
                            fgn+=' '
                    print('\t', fgn, '|', nuchitdict[st][gn][i][1], nuchitdict[st][gn][i][2], '|', nuchitdict[st][gn][i][3], nuchitdict[st][gn][i][4], '|',(nuchitdict[st][gn][i][2]-nuchitdict[st][gn][i][1]), nuchitdict[st][gn][i][5])


 S88(NODE_125) len:1246| h0 hx | q0 qx | len ids
	 nleG2-2 | 1197 1246 | 1 50 | 49 50

 S88(NODE_37) len:29536| h0 hx | q0 qx | len ids
	 nleG6-1 | 28385 28832 | 182 630 | 447 430
	 nleG6-2 | 28385 28832 | 182 630 | 447 432
	 nleG6-3 | 28378 28857 | 1 480 | 479 480
	 nleG8-2 | 26461 27111 | 1 651 | 650 651

 S88(NODE_205) len:551| h0 hx | q0 qx | len ids
	 nleG2-2 | 502 551 | 1 50 | 49 50

 S88(NODE_96) len:2465| h0 hx | q0 qx | len ids
	 nleG5-1 | 2207 2465 | 384 642 | 258 253
	 nleG5-2 | 2207 2465 | 384 642 | 258 258

 S88(NODE_16) len:100768| h0 hx | q0 qx | len ids
	 nleG-1  | 1697 2344 | 1 648 | 647 648

 S88(NODE_106) len:1946| h0 hx | q0 qx | len ids
	 nleG-2  | 1294 1520 | 353 579 | 226 216
	 nleG-3  | 1294 1543 | 324 573 | 249 239
	 nleG2-2 | 1294 1869 | 1 576 | 575 575
	 nleG2-3 | 1294 1629 | 1 336 | 335 334
	 nleG5-1 | 1 510 | 1 510 | 509 507
	 nleG5-2 | 1 510 | 1 510 | 509 507
	 nleG6-1 | 592 1221 | 1 630 | 629 630
	 nleG6-2 | 592 1221 | 1 630 | 629 616
	 nleG6-3 | 591 104

In [30]:
from Bio import SeqIO
import pandas as pd

In [31]:
#Calculate the number of contig ends that Nle genes are directly adjacent to.

#First, calculate the number of contigs in each genome

contigdict = {}
for genome in [i for i in os.listdir('./../../Genomes') if i.endswith('.fasta')]:
    gn = genome.split('_')[0]
    i = 0
    for contig in SeqIO.parse('./../../Genomes/'+genome,'fasta'):
        i+=1
    contigdict[gn]=i


In [41]:
#Strains in the final table:
table = pd.read_excel('./Polished_table.xlsx', skiprows=1)
strains = []
for i in range(table.shape[0]):
    strains.append(table.ix[i][0])
print(len(strains), strains)

83 ['S36', 'S35', 'S88', 'S83', 'S82', 'S84', 'S73', 'S58', 'S61', 'S60', 'S57', 'S42', 'S80', 'S79', 'S48', 'S43', 'S62', 'S52', 'S53', 'S47', 'S63', 'S59', 'S44', 'S65', 'S64', 'S50', 'S51', 'S72', 'S56', 'S46', 'S49', 'S45', 'S55', 'S76', 'S75', 'S38', 'S39', 'S77', 'S78', 'S81', 'S41', 'S40', 'S87', 'S71', 'S68', 'S67', 'S15', 'S13', 'S16', 'S05', 'S20', 'S74', 'S27', 'S22', 'S06', 'S86', 'S85', 'S12', 'S37', 'S07', 'S14', 'S34', 'S01', 'S31', 'S10', 'S23', 'S02', 'S21', 'S32', 'S26', 'S29', 'S03', 'S19', 'S30', 'S18', 'S24', 'S25', 'S28', 'S11', 'S04', 'S08', 'S33', 'S09']


In [44]:
nucdir = './../VFDB_NucB_hits/'
nucfiles = [i for i in os.listdir(nucdir) if 'nleG' in i]
genes  = [i.split('.')[0] for i in nucfiles]

print(nucfiles, '\n\n', profiles, '\n\n', genes)

['nleG-1.xml', 'nleG-2.xml', 'nleG-3.xml', 'nleG.xml', 'nleG2-2.xml', 'nleG2-3.xml', 'nleG2-4.xml', 'nleG5-1.xml', 'nleG5-2.xml', 'nleG6-1.xml', 'nleG6-2.xml', 'nleG6-3.xml', 'nleG7.xml', 'nleG8-2.xml'] 

 ['nleG-1.xml', 'nleG-2.xml', 'nleG-3.xml', 'nleG.xml', 'nleG2-2.xml', 'nleG2-3.xml', 'nleG2-4.xml', 'nleG5-1.xml', 'nleG5-2.xml', 'nleG6-1.xml', 'nleG6-2.xml', 'nleG6-3.xml', 'nleG7.xml', 'nleG8-2.xml', 'nleF.xml'] 

 ['nleG-1', 'nleG-2', 'nleG-3', 'nleG', 'nleG2-2', 'nleG2-3', 'nleG2-4', 'nleG5-1', 'nleG5-2', 'nleG6-1', 'nleG6-2', 'nleG6-3', 'nleG7', 'nleG8-2']


In [45]:
#Set up data structure. Top dictionary - hitdict. Primary subdictionary - Strain. Lists - Gene.
#Data stored as a truple (contig, contig start position, contig stop position, query start, query stop, ids) where start < stop.

#borrowed from code above...

nuchitdict = {}

for st in strains:
    nuchitdict[st] = {}
    for gn in genes:
        nuchitdict[st][gn] = []

    

#fill nuchitdict, then prohitdict

#list of query lengths for reference later (gene, length)
nucquerylens = []
nuchitlens = {}


for file in nucfiles:
    if os.path.getsize(nucdir+file)<1:
        continue
    else:
        pass
    gn = file.split('.')[0]
    queryf = nucdir+file
    tree = ET.parse(queryf)
    root = tree.getroot()
    for iteration in root.findall(".//Iteration"):
            qlen = iteration.find('Iteration_query-len').text
    nucquerylens.append((gn,int(qlen)))
    for hit in root.findall(".//Hit"):
            contig = hit.find('Hit_id').text
            st = contig.split('(')[0]
            if st not in strains:
                continue
            else:
                pass
            hitlen = int(hit.find('Hit_len').text)
            nuchitlens[contig]=hitlen
            counter = 1 #needed! some contigs have multiple hits for some genes
            for hsp in hit.findall('.//Hit_hsps/Hsp'):
                qstart = int(hsp.find('Hsp_query-from').text)
                qstop = int(hsp.find('Hsp_query-to').text)
                hstart = int(hsp.find('Hsp_hit-from').text)
                hstop = int(hsp.find('Hsp_hit-to').text)
                ids = int(hsp.find('Hsp_identity').text)
                if hstart>hstop:
                    hstart = int(hsp.find('Hsp_hit-to').text)
                    hstop = int(hsp.find('Hsp_hit-from').text)
                else:
                    pass
                nuchitdict[st][gn].append((contig,hstart,hstop,qstart,qstop,ids))
                counter+=1

In [58]:
totalends = []
totalcontigs = []
for st in strains:
    #ends are the number of ends of contigs hit
    ends = 0
    #List of contigs
    contigs = []
    for gn in list(nuchitdict[st].keys()):
        for i in range(len(nuchitdict[st][gn])):
            contigs.append(nuchitdict[st][gn][i][0])
    scontigs = set(contigs)
    for ct in scontigs:
        #nuc hit lens has contig lens. if either index 1 or 2 have 1 or length, then it hits the end.
        #collect the location of ends of the genes in a list here to match with contigs
        endlist = []
        for gn in genes:
            for i in range(len(nuchitdict[st][gn])):
                if nuchitdict[st][gn][i][0]==ct:
                    endlist.append(nuchitdict[st][gn][i][1])
                    endlist.append(nuchitdict[st][gn][i][2])
        sendlist = set(endlist)
        if 1 in sendlist:
            ends+=1
        if nuchitlens[ct] in sendlist:
            ends+=1
    print(st, ends*100/(contigdict[st]*2))
    totalends.append(ends)
    totalcontigs.append(contigdict[st])
print('Final total:',100*sum(totalends)/(sum(totalcontigs)*2),'% of contigs ends are in nleG genes')

S36 0.15479876160990713
S35 0.14534883720930233
S88 0.8741258741258742
S83 1.7605633802816902
S82 1.3513513513513513
S84 1.6778523489932886
S73 0.1488095238095238
S58 0.15151515151515152
S61 0.45180722891566266
S60 0.1457725947521866
S57 0.12195121951219512
S42 0.646551724137931
S80 4.338842975206612
S79 4.117647058823529
S48 0.8333333333333334
S43 0.390625
S62 2.0114942528735633
S52 0.967741935483871
S53 1.5714285714285714
S47 1.794871794871795
S63 1.4905149051490514
S59 0.6738544474393531
S44 2.691218130311615
S65 0.8264462809917356
S64 1.2987012987012987
S50 1.9287833827893175
S51 1.2162162162162162
S72 1.1267605633802817
S56 1.2893982808022924
S46 1.4473684210526316
S49 1.8840579710144927
S45 2.293577981651376
S55 1.5463917525773196
S76 0.4297994269340974
S75 0.4087193460490463
S38 0.30181086519114686
S39 0.5252100840336135
S77 0.625
S78 0.25839793281653745
S81 0.6476683937823834
S41 1.2672811059907834
S40 0.9433962264150944
S87 0.8802816901408451
S71 0.6006006006006006
S68 0.40650