Purpose: NleG protein-family encoding genes are very similar in protein sequence, and many are very similar in coding sequence. This notebook looks for unique sequences from the nelG genes to ascertain whether a specific sub-types are present in our genomes.

For this, we have used representatives from clades with different nleG profiles based on protein BLAST results.

Rather than create a spreadsheet, this program will simply spit out a report into the jupyter environment.

It will use data from both the protein and nucleotide queries.

Strains to use (representative of their clade, 2 per clade):

S35, S82 - missing NleG-2, -3

S44, S65 - missing NleF

S39, S81 - missing NleG

    S78 - in same clade as 2 above, but has NleG2
    
S87 - missing NleG2-4

S85, S27 - none missing

S01, S21 - Partial NleG2-4

S79, S80 - Small difference in NleG6-1 and 6-2


In [41]:
import os
import xml.etree.ElementTree as ET

In [42]:
strains = ['S35', 'S82', 'S44', 'S65', 'S39', 'S81', 'S78', 'S87', 'S85', 'S27', 'S01', 'S21', 'S79', 'S80']

In [163]:
nucdir = './../VFDB_NucB_hits/'
prodir = './../VFDB_ProB_hits/'

nucfiles = [i for i in os.listdir(nucdir) if 'nleG' in i]
profiles = [i for i in os.listdir(prodir) if 'nleG' in i]
nucfiles.append('nleF.xml')
profiles.append('nleF.xml')


genes  = [i.split('.')[0] for i in profiles]

print(nucfiles, '\n\n', profiles, '\n\n', genes)

['nleG-1.xml', 'nleG-2.xml', 'nleG-3.xml', 'nleG.xml', 'nleG2-2.xml', 'nleG2-3.xml', 'nleG2-4.xml', 'nleG5-1.xml', 'nleG5-2.xml', 'nleG6-1.xml', 'nleG6-2.xml', 'nleG6-3.xml', 'nleG7.xml', 'nleG8-2.xml', 'nleF.xml'] 

 ['nleG-1.xml', 'nleG-2.xml', 'nleG-3.xml', 'nleG.xml', 'nleG2-2.xml', 'nleG2-3.xml', 'nleG2-4.xml', 'nleG5-1.xml', 'nleG5-2.xml', 'nleG6-1.xml', 'nleG6-2.xml', 'nleG6-3.xml', 'nleG7.xml', 'nleG8-2.xml', 'nleF.xml'] 

 ['nleG-1', 'nleG-2', 'nleG-3', 'nleG', 'nleG2-2', 'nleG2-3', 'nleG2-4', 'nleG5-1', 'nleG5-2', 'nleG6-1', 'nleG6-2', 'nleG6-3', 'nleG7', 'nleG8-2', 'nleF']


In [164]:
#Set up data structure. Top dictionary - hitdict. Primary subdictionary - Strain. Lists - Gene.
#Data stored as a truple (contig, contig start position, contig stop position, query start, query stop, ids) where start < stop.
#different dicts for nuc and pro - will have many of the same hits, but not all...

nuchitdict = {}
prohitdict = {}
for st in strains:
    nuchitdict[st] = {}
    prohitdict[st] = {}
    for gn in genes:
        nuchitdict[st][gn] = []
        prohitdict[st][gn] = []
    

#fill nuchitdict, then prohitdict

#list of query lengths for reference later (gene, length)
nucquerylens = []
proquerylens = []
nuchitlens = {}


for file in nucfiles:
    if os.path.getsize(nucdir+file)<1:
        continue
    else:
        pass
    gn = file.split('.')[0]
    queryf = nucdir+file
    tree = ET.parse(queryf)
    root = tree.getroot()
    for iteration in root.findall(".//Iteration"):
            qlen = iteration.find('Iteration_query-len').text
    nucquerylens.append((gn,int(qlen)))
    for hit in root.findall(".//Hit"):
            contig = hit.find('Hit_id').text
            st = contig.split('(')[0]
            if st not in strains:
                continue
            else:
                pass
            hitlen = int(hit.find('Hit_len').text)
            nuchitlens[contig]=hitlen
            counter = 1 #needed! some contigs have multiple hits for some genes
            for hsp in hit.findall('.//Hit_hsps/Hsp'):
                qstart = int(hsp.find('Hsp_query-from').text)
                qstop = int(hsp.find('Hsp_query-to').text)
                hstart = int(hsp.find('Hsp_hit-from').text)
                hstop = int(hsp.find('Hsp_hit-to').text)
                ids = int(hsp.find('Hsp_identity').text)
                if hstart>hstop:
                    hstart = int(hsp.find('Hsp_hit-to').text)
                    hstop = int(hsp.find('Hsp_hit-from').text)
                else:
                    pass
                nuchitdict[st][gn].append((contig,hstart,hstop,qstart,qstop,ids))
                counter+=1

for file in profiles:
    if os.path.getsize(prodir+file)<1:
        continue
    else:
        pass
    gn = file.split('.')[0]
    queryf = prodir+file
    tree = ET.parse(queryf)
    root = tree.getroot()
    for iteration in root.findall(".//Iteration"):
            qlen = iteration.find('Iteration_query-len').text
    proquerylens.append((gn,int(qlen)))
    for hit in root.findall(".//Hit"):
            contig = hit.find('Hit_id').text
            st = contig.split('(')[0]
            if st not in strains:
                continue
            else:
                pass
            counter = 1 #needed! some contigs have multiple hits for some genes
            for hsp in hit.findall('.//Hit_hsps/Hsp'):
                qstart = int(hsp.find('Hsp_query-from').text)
                qstop = int(hsp.find('Hsp_query-to').text)
                hstart = int(hsp.find('Hsp_hit-from').text)
                hstop = int(hsp.find('Hsp_hit-to').text)
                ids = int(hsp.find('Hsp_identity').text)
                if hstart>hstop:
                    hstart = int(hsp.find('Hsp_hit-to').text)
                    hstop = int(hsp.find('Hsp_hit-from').text)
                else:
                    pass
                prohitdict[st][gn].append((contig,hstart,hstop,qstart,qstop,ids))
                counter+=1

In [165]:
#Some test code to organize my thoughts

for st in strains:
    #List of contigs
    contigs = []
    for gn in list(nuchitdict[st].keys()):
        for i in range(len(nuchitdict[st][gn])):
            contigs.append(nuchitdict[st][gn][i][0])
    scontigs = set(contigs)
    for gn in list(sorted(list(nuchitdict[st].keys()))):
        for contig in scontigs:
            for i in range(len(nuchitdict[st][gn])):
                if nuchitdict[st][gn][i][0] == contig:
                    print(gn, contig, nuchitdict[st][gn][i][1], nuchitdict[st][gn][i][2], nuchitdict[st][gn][i][3], nuchitdict[st][gn][i][4])
    print('\n\n')
        

nleF S35(NODE_85) 1056 1625 1 570
nleG S35(NODE_113) 2032 2577 34 579
nleG-1 S35(NODE_106) 1376 2023 1 648
nleG-2 S35(NODE_113) 1613 1747 1 135
nleG-2 S35(NODE_93) 1452 1678 353 579
nleG-3 S35(NODE_113) 1613 1741 1 129
nleG-3 S35(NODE_93) 1429 1678 324 573
nleG2-2 S35(NODE_303) 245 294 1 50
nleG2-2 S35(NODE_85) 6104 6578 92 566
nleG2-2 S35(NODE_93) 1103 1678 1 576
nleG2-3 S35(NODE_85) 6104 6429 1 326
nleG2-3 S35(NODE_93) 1343 1678 1 336
nleG2-4 S35(NODE_113) 2069 2562 1 495
nleG5-1 S35(NODE_93) 2462 3103 1 642
nleG5-2 S35(NODE_93) 2462 3103 1 642
nleG6-1 S35(NODE_50) 28385 28832 182 630
nleG6-1 S35(NODE_93) 1751 2380 1 630
nleG6-2 S35(NODE_50) 28385 28832 182 630
nleG6-2 S35(NODE_93) 1751 2380 1 630
nleG6-3 S35(NODE_50) 28378 28857 1 480
nleG6-3 S35(NODE_93) 1932 2381 26 474
nleG7 S35(NODE_113) 671 1321 1 651
nleG8-2 S35(NODE_50) 26461 27111 1 651



nleF S82(NODE_57) 1056 1625 1 570
nleG S82(NODE_82) 2034 2579 34 579
nleG-1 S82(NODE_17) 1697 2344 1 648
nleG-2 S82(NODE_82) 1615 1749 1 

In [166]:
for item in nuchitdict:
    for jtem in nuchitdict[item]:
        print(jtem)

nleG7
nleG2-2
nleG6-1
nleG6-3
nleG
nleG2-4
nleG5-1
nleG-2
nleG6-2
nleG-3
nleG8-2
nleG5-2
nleG-1
nleG2-3
nleF
nleG7
nleG2-2
nleG6-1
nleG6-3
nleG
nleG2-4
nleG5-1
nleG-2
nleG6-2
nleG-3
nleG8-2
nleG5-2
nleG-1
nleG2-3
nleF
nleG7
nleG2-2
nleG6-1
nleG6-3
nleG
nleG2-4
nleG5-1
nleG-2
nleG6-2
nleG-3
nleG8-2
nleG5-2
nleG-1
nleG2-3
nleF
nleG7
nleG2-2
nleG6-1
nleG6-3
nleG
nleG2-4
nleG5-1
nleG-2
nleG6-2
nleG-3
nleG8-2
nleG5-2
nleG-1
nleG2-3
nleF
nleG7
nleG2-2
nleG6-1
nleG6-3
nleG
nleG2-4
nleG5-1
nleG-2
nleG6-2
nleG-3
nleG8-2
nleG5-2
nleG-1
nleG2-3
nleF
nleG7
nleG2-2
nleG6-1
nleG6-3
nleG
nleG2-4
nleG5-1
nleG-2
nleG6-2
nleG-3
nleG8-2
nleG5-2
nleG-1
nleG2-3
nleF
nleG7
nleG2-2
nleG6-1
nleG6-3
nleG
nleG2-4
nleG5-1
nleG-2
nleG6-2
nleG-3
nleG8-2
nleG5-2
nleG-1
nleG2-3
nleF
nleG7
nleG2-2
nleG6-1
nleG6-3
nleG
nleG2-4
nleG5-1
nleG-2
nleG6-2
nleG-3
nleG8-2
nleG5-2
nleG-1
nleG2-3
nleF
nleG7
nleG2-2
nleG6-1
nleG6-3
nleG
nleG2-4
nleG5-1
nleG-2
nleG6-2
nleG-3
nleG8-2
nleG5-2
nleG-1
nleG2-3
nleF
nleG7
nleG2-2
nleG6

In [167]:
for st in strains:
    #List of contigs
    contigs = []
    for gn in list(nuchitdict[st].keys()):
        for i in range(len(nuchitdict[st][gn])):
            contigs.append(nuchitdict[st][gn][i][0])
    scontigs = set(contigs)
    for ct in scontigs:
        print('\n', ct,'len:'+str(nuchitlens[ct])+'| h0 hx | q0 qx | len ids')
        for gn in genes:
            for i in range(len(nuchitdict[st][gn])):
                if nuchitdict[st][gn][i][0]==ct:
                    fgn = gn
                    if len(fgn)<7:
                        sp = 7-len(fgn)
                        for j in range(sp):
                            fgn+=' '
                    print('\t', fgn, '|', nuchitdict[st][gn][i][1], nuchitdict[st][gn][i][2], '|', nuchitdict[st][gn][i][3], nuchitdict[st][gn][i][4], '|',(nuchitdict[st][gn][i][2]-nuchitdict[st][gn][i][1]), nuchitdict[st][gn][i][5])


 S35(NODE_113) len:3620| h0 hx | q0 qx | len ids
	 nleG-2  | 1613 1747 | 1 135 | 134 125
	 nleG-3  | 1613 1741 | 1 129 | 128 119
	 nleG    | 2032 2577 | 34 579 | 545 530
	 nleG2-4 | 2069 2562 | 1 495 | 493 494
	 nleG7   | 671 1321 | 1 651 | 650 651

 S35(NODE_303) len:294| h0 hx | q0 qx | len ids
	 nleG2-2 | 245 294 | 1 50 | 49 50

 S35(NODE_85) len:7218| h0 hx | q0 qx | len ids
	 nleG2-2 | 6104 6578 | 92 566 | 474 387
	 nleG2-3 | 6104 6429 | 1 326 | 325 273
	 nleF    | 1056 1625 | 1 570 | 569 570

 S35(NODE_50) len:29045| h0 hx | q0 qx | len ids
	 nleG6-1 | 28385 28832 | 182 630 | 447 430
	 nleG6-2 | 28385 28832 | 182 630 | 447 432
	 nleG6-3 | 28378 28857 | 1 480 | 479 480
	 nleG8-2 | 26461 27111 | 1 651 | 650 651

 S35(NODE_93) len:5586| h0 hx | q0 qx | len ids
	 nleG-2  | 1452 1678 | 353 579 | 226 216
	 nleG-3  | 1429 1678 | 324 573 | 249 239
	 nleG2-2 | 1103 1678 | 1 576 | 575 575
	 nleG2-3 | 1343 1678 | 1 336 | 335 334
	 nleG5-1 | 2462 3103 | 1 642 | 641 637
	 nleG5-2 | 2462 3103

In [142]:
print(st, gn, i)

S35 nleG 2


In [141]:
print(nuchitdict['S35']['nleG'])

[('S35(NODE_113)', 2032, 2577, 34, 579, 530)]


In [168]:
print(nucquerylens)

[('nleG-1', 648), ('nleG-2', 579), ('nleG-3', 573), ('nleG', 579), ('nleG2-2', 576), ('nleG2-3', 336), ('nleG2-4', 495), ('nleG5-1', 642), ('nleG5-2', 642), ('nleG6-1', 630), ('nleG6-2', 630), ('nleG6-3', 480), ('nleG7', 651), ('nleG8-2', 651), ('nleF', 570)]
