In [33]:
import os
import Bio.SeqIO as SeqIO
import pandas as pd

In [34]:
fastadir = './FASTA/'
blastfile = './BLASTn-hits-genomesub/VFDB_setB_nt.blastn'
gene_name_xl = './Comparative_tables_from_VFDB/Salmonella_VFs_comparsion.xls'

In [35]:
#Build a dictionary of gene names.
#Subunit names:
subunits = ['cdtB','pltA','pltB']

gd = {} #gene dictionary

for subunit in subunits:
    if subunit not in gd:
        gd[subunit] = []

#build dictionary using pandas reference

#pulled from older code
descdict = {}
for seq in SeqIO.parse('./BLASTDB/VFDB_setB_nt.fas', 'fasta'):
    descdict[seq.name] = seq.description

    
#pull out the gene names (VFG Tag) for searching blast hits
for ref_gene in list(descdict.keys()):
    for tox_gene in subunits:
        if tox_gene in descdict[ref_gene]:
            gd[tox_gene].append(ref_gene)
            
sd = {} #subunit dictionary
for tox_gene in gd:
    for vfg in gd[tox_gene]:
        sd[vfg]=tox_gene

In [36]:
#Extract the blast hits, were created in blast format
hits = []
for line in open(blastfile,'r'):
    for subunit in gd:
        if any([True if i in line else False for i in gd[subunit]]):
            hits.append(line)
print(len(hits))

1562


In [46]:
genome_hits = {}
subunit_hits = {}
for hit in hits:
    splits = hit.split('\t')
    vfg = splits[0]
    contig = splits[1]
    pct = splits[2]
    start = splits[-4]
    stop = splits[-3]
    genome = contig.split('_')[0]
    subunit = sd[vfg]
    if float(pct)<80:
        continue
    if genome not in genome_hits:
        genome_hits[genome] = []
    if genome not in subunit_hits:
        subunit_hits[genome] = []    
    genome_hits[genome].append((genome, contig, subunit, pct, start, stop))
    subunit_hits[genome].append(subunit)
    
for genome in subunit_hits:
    subunit_hits[genome] = list(set(subunit_hits[genome]))

positive_genomes = [genome for genome in subunit_hits.keys() if sorted(subunit_hits[genome]) == sorted(subunits)]

#get contigs with all 3 genes only -> seem to sequence fine. Compare to number of positive genomes
#added Oct. 24th, 2017

contig_hits = {}
for hit in hits:
    splits = hit.split('\t')
    vfg = splits[0]
    contig = splits[1]
    pct = splits[2]
    start = splits[-4]
    stop = splits[-3]
    genome = contig.split('_')[0]
    subunit = sd[vfg]
    if contig not in contig_hits:
        contig_hits[contig] = []
    contig_hits[contig].append((genome, contig, subunit, pct, start, stop))
positive_contigs = [contig for contig in contig_hits.keys() if sorted(set([i[2] for i in contig_hits[contig]])) == sorted(subunits)]

print(len(positive_genomes))
print(len(positive_contigs))

#look for multiple hits in same genome
for genome in positive_genomes:
    if len([i for i in positive_contigs if genome in i])>1:
        contigs = [i for i in contig_hits.keys() if genome in i]
        for contig in contigs:
            print(contig, contig_hits[contig],'\n')
    if genome not in [i.split('_')[0] for i in contig_hits.keys()]:
        print('\n\n',genome,'\n\n')

162
165
SEQ000031518_NODE_64_length_18658_cov_24.7356 [('SEQ000031518', 'SEQ000031518_NODE_64_length_18658_cov_24.7356', 'cdtB', '93.462', '6480', '5701'), ('SEQ000031518', 'SEQ000031518_NODE_64_length_18658_cov_24.7356', 'cdtB', '93.205', '6480', '5701'), ('SEQ000031518', 'SEQ000031518_NODE_64_length_18658_cov_24.7356', 'cdtB', '95.309', '6500', '5691'), ('SEQ000031518', 'SEQ000031518_NODE_64_length_18658_cov_24.7356', 'pltA', '96.448', '3513', '4244'), ('SEQ000031518', 'SEQ000031518_NODE_64_length_18658_cov_24.7356', 'pltB', '76.650', '3104', '3485'), ('SEQ000031518', 'SEQ000031518_NODE_64_length_18658_cov_24.7356', 'cdtB', '93.333', '6480', '5701')] 

SEQ000031518_NODE_1_length_57489_cov_23.7477 [('SEQ000031518', 'SEQ000031518_NODE_1_length_57489_cov_23.7477', 'cdtB', '99.753', '30473', '29664'), ('SEQ000031518', 'SEQ000031518_NODE_1_length_57489_cov_23.7477', 'cdtB', '99.259', '30473', '29664'), ('SEQ000031518', 'SEQ000031518_NODE_1_length_57489_cov_23.7477', 'cdtB', '94.007', '304

In [47]:
#hits format (genome, contig, subunit, pct, start, stop)
#updated Oct. 24th to put in individual contig hits
file_lines = []
for contig in positive_contigs:
    seqs = []
    best_hits = []
    for subunit in subunits:
        #first, find the best hit for the subunit
        hits = [i for i in contig_hits[contig] if i[2] == subunit]
        pcts = sorted([float(i[3]) for i in hits])
        best_index = pcts.index(max(pcts))
        hit = hits[best_index]#pull out the first hit that has the best match
        genome, contig, subunit, pct, start, stop = hit
        best_hits.append([contig, start, stop])
    records = SeqIO.to_dict(SeqIO.parse(fastadir+genome+'.fasta', 'fasta'))
    for hit in best_hits:
        contig, start, stop = hit
        start = int(start)
        stop = int(stop)
        if stop < start:
            first = stop-1
            last = start
            seqs.append(records[contig].seq[first:last].reverse_complement())
        else:
            first = start-1
            last = stop
            seqs.append(records[contig].seq[first:last])        
    seqs = ''.join([str(i) for i in seqs])
    file_lines.append('>'+'_'.join(contig.split('_')[0:3]))
    file_lines.append(seqs)
with open('./Typhoid_tox_concatenations/unaligned.fasta','w') as f:
    f.write('\n'.join(file_lines))

In [150]:
#muscle -in unaligned.fasta -out aligned.fasta
#was run in command line before proceeding

In [167]:
seqs = []
for record in SeqIO.parse('./Typhoid_tox_concatenations/unaligned.fasta', 'fasta'):
    seqs.append(len(record.seq))
print(len(set(seqs)))

10


In [31]:
#A select few genomes seem to have more than one copy
file_lines = []
for genome in positive_genomes:
    seqs = []
    best_hits = []
    for subunit in subunits:
        #first, find the best hit for the subunit
        hits = [i for i in genome_hits[genome] if i[2] == subunit]
        contig_list = set([i[1] for i in hits])
        if len(contig_list)>1:
            print('\n'.join([','.join(hit[1:4]) for hit in sorted(hits, key = lambda x: float(x[3]))])+'\n\n')
        pcts = sorted([float(i[3]) for i in hits])
        best_index = pcts.index(max(pcts))
        hit = hits[best_index]#pull out the first hit that has the best match
        genome, contig, subunit, pct, start, stop = hit
        best_hits.append([contig, start, stop])

SAMN02844600_NODE_49_length_30708_cov_33.2201,cdtB,93.854
SAMN02844600_NODE_40_length_46449_cov_31.6221,cdtB,93.974
SAMN02844600_NODE_49_length_30708_cov_33.2201,cdtB,93.982
SAMN02844600_NODE_40_length_46449_cov_31.6221,cdtB,94.103
SAMN02844600_NODE_49_length_30708_cov_33.2201,cdtB,94.110
SAMN02844600_NODE_40_length_46449_cov_31.6221,cdtB,94.231
SAMN02844600_NODE_49_length_30708_cov_33.2201,cdtB,94.424
SAMN02844600_NODE_40_length_46449_cov_31.6221,cdtB,95.309


SAMN02844600_NODE_40_length_46449_cov_31.6221,pltA,95.492
SAMN02844600_NODE_49_length_30708_cov_33.2201,pltA,96.585


SEQ000031518_NODE_64_length_18658_cov_24.7356,cdtB,93.205
SEQ000031518_NODE_64_length_18658_cov_24.7356,cdtB,93.333
SEQ000031518_NODE_64_length_18658_cov_24.7356,cdtB,93.462
SEQ000031518_NODE_1_length_57489_cov_23.7477,cdtB,94.007
SEQ000031518_NODE_64_length_18658_cov_24.7356,cdtB,95.309
SEQ000031518_NODE_1_length_57489_cov_23.7477,cdtB,99.259
SEQ000031518_NODE_1_length_57489_cov_23.7477,cdtB,99.753
SEQ000031518_