In [1]:
import os
from Bio import SeqIO, SeqRecord
from Bio.SeqRecord import SeqRecord
from Bio.Seq import Seq
from Bio.Alphabet import generic_dna

In [2]:
#Determine lengths for each sequence:
genelens = {}
gigo = [] #genes in genome order:
for record in SeqIO.parse('./../core_genome/reference_genome/LT2.pro.fasta', 'fasta'):
    genelens[record.id] = len(record)
    gigo.append(record.id)

In [3]:
fileloc = './../BLAST/2016-12-15_LT2_coreBLAST/'
fileloc = '/Users/jay.worley/Desktop/BLAST/2016-12-15_LT2_coreBLAST/'

hitdict = {}
for file in [i for i in os.listdir(fileloc) if i.startswith('pro') and i.endswith('.blast')]:
    print(file)
    for line in open(fileloc+file):
        (qID, hID, pctID, alnlen, mismatches, opengaps, qstart, qstop, hstart, hstop, eVal, bitScore) = line.split('\t')
        #Create a dictionary structure where the key is the gene.
        #This will allow us to make a list of all hits, then see if the hit has enough contigs in each
        #of several classes in order to be called a hit
        if qID not in hitdict:
            hitdict[qID] = []
        #query length - from input file:
        qlen = genelens[qID]
        #look for hits within 50% of querylen - to ignore partial hits, allows for insertions
        if (0.5*qlen)<int(alnlen)<(1.5*qlen):
            pass
        else:
            continue
            
        #percent ID cutoff - 75
        if float(pctID) < 75:
            continue
        else:
            pass
        
        #E value cutoff at 0.005
        if 'e' in eVal:
            pass
        elif float(eVal) > 0.005:
            continue
        else:
            pass
        hitdict[qID].append((hID, pctID, alnlen, mismatches, opengaps, hstart, hstop, eVal, bitScore.strip()))
#prune based on alignment length. Determine the longest alignment length

pro_SalDB1.blast
pro_SalDB2.blast
pro_SalDB3.blast
pro_SalDB4.blast
pro_SalDB5.blast


In [4]:
#non-Waycross exclude list (other subspecies of S. enterica)
#These genomes were excluded mostly/all for redundancy with other genomes
nwe = ['SAMN02844917','SEQ000004791','SEQ000014373','SEQ000026116','SEQ000026115',
       'SEQ000004795','SEQ000010998','SEQ000027150','SEQ000008922','SEQ000027158',
       'SEQ000018161','SEQ000015395','SEQ000041121','SAMN02844600']
waycross = ['SEQ000009893','SEQ000010060','SEQ000013419','SEQ000008927']
#combined exclusions.
ce = waycross + nwe
print(len(ce))

18


In [5]:
#show the number of times the gene was hit. Displays number of hits, then number of genes hit.
lens=[]
for item in list(hitdict.keys()):
    lens.append(len(hitdict[item]))
for item in sorted(set(lens)):
    print(item,'\t',lens.count(item))
#(Based on result) need to do some serious sorting/pruning!

0 	 12
1 	 1
2 	 1
5 	 3
7 	 1
9 	 1
11 	 4
12 	 3
13 	 9
14 	 3
15 	 1
16 	 2
17 	 1
18 	 4
23 	 1
24 	 1
25 	 1
26 	 4
27 	 1
29 	 1
31 	 2
33 	 2
34 	 2
35 	 3
37 	 2
38 	 2
39 	 3
40 	 3
41 	 1
42 	 1
43 	 7
44 	 1
45 	 2
46 	 2
47 	 2
48 	 4
49 	 2
50 	 2
51 	 2
52 	 1
55 	 1
57 	 4
58 	 1
61 	 1
62 	 2
64 	 4
66 	 2
68 	 1
69 	 2
71 	 1
72 	 2
75 	 1
76 	 5
77 	 2
78 	 1
80 	 1
81 	 3
82 	 2
83 	 2
84 	 1
85 	 1
86 	 3
91 	 1
92 	 1
93 	 2
96 	 1
97 	 1
99 	 1
101 	 1
102 	 2
103 	 1
107 	 2
109 	 1
110 	 1
111 	 4
112 	 3
113 	 5
114 	 8
115 	 2
116 	 2
117 	 2
118 	 3
119 	 1
120 	 1
121 	 1
122 	 1
123 	 3
124 	 1
125 	 1
126 	 2
127 	 3
128 	 5
130 	 2
131 	 3
132 	 1
134 	 3
135 	 2
136 	 1
137 	 2
139 	 4
140 	 8
142 	 3
143 	 2
144 	 6
145 	 2
147 	 1
148 	 1
149 	 6
150 	 1
151 	 1
152 	 1
157 	 1
159 	 1
160 	 1
162 	 2
163 	 2
164 	 3
165 	 1
166 	 1
167 	 1
168 	 9
176 	 1
177 	 1
179 	 6
181 	 2
182 	 2
183 	 3
184 	 2
185 	 2
186 	 7
187 	 3
188 	 5
189 	 2
190 	 12


In [6]:
#check that all genomes are accounted for
#Several genomes were cut later because of redundancy
genomes = []
for item in hitdict:
    for jtem in hitdict[item]:
        genomes.append(jtem[0].split('_')[0])
genomes = sorted(set(genomes))
print(len(genomes))

449


In [8]:
#pruned genomes list - ce is from combined waycross exclusion above

pg = [i for i in genomes if i not in ce]
print(len(pg))
#expect 431, returns 431!

431


In [9]:
#to prune and assign hits from individual genomes to individual genes.

#First, build a list of all possible genes (all ids in hitdict)

with open('./../core_genome/genes.txt', 'w') as f:
    f.write(','.join(list(hitdict.keys())))

with open('./../core_genome/genes.txt', 'r') as f:
    ft = f.read()
    genes = ft.split(',')    

In [10]:
#next, we build a new hitdict with each genome as the key
#hitdict2...
#this will include genomes in the excluded list, will ignore later
hitdict2 = {}
for item in hitdict:
    for jtem in hitdict[item]:
        genome = jtem[0].split('_')[0]
        if genome not in hitdict2:
            hitdict2[genome] = []
        #new entry - insert gene name at front of tuple
        ne = (item,)+jtem
        hitdict2[genome].append(ne)

In [11]:
#just checking how  often each gene is found
samplelist = [i[0] for i in hitdict2['SEQ000004769']]
counts = []
for i in set(samplelist):
    counts.append(samplelist.count(i))
for i in sorted(set(counts)):
    print(i, counts.count(i))

1 3825
2 54
3 5
6 1


In [12]:
#Investigating the one with six hits for normalcy
gig = [] #genes in genome
for i in hitdict2['SEQ000004769']:
    gig.append(i[0])
gigs = []
for i in set(gig):
    if gig.count(i) == 6:
        gigs.append(i)
for i in gigs:
    for j in hitdict2['SEQ000004769']:
        if j[0]==i:
            print(j)
    print('\n\n')

('lcl|NC_003197.2_prot_NP_447008.1_544', 'SEQ000004769_NODE_15_length_103438_cov_35.3778', '81.667', '60', '11', '0', '83705', '83884', '3.99e-24', '100')
('lcl|NC_003197.2_prot_NP_447008.1_544', 'SEQ000004769_NODE_15_length_103438_cov_35.3778', '78.333', '60', '13', '0', '187', '8', '6.40e-23', '96.7')
('lcl|NC_003197.2_prot_NP_447008.1_544', 'SEQ000004769_NODE_2_length_414205_cov_54.6362', '81.667', '60', '11', '0', '135974', '135795', '4.02e-24', '100')
('lcl|NC_003197.2_prot_NP_447008.1_544', 'SEQ000004769_NODE_28_length_4169_cov_44.2382', '80.000', '60', '12', '0', '3983', '4162', '8.40e-24', '98.6')
('lcl|NC_003197.2_prot_NP_447008.1_544', 'SEQ000004769_NODE_26_length_9688_cov_50.6962', '80.000', '60', '12', '0', '9502', '9681', '1.06e-23', '98.6')
('lcl|NC_003197.2_prot_NP_447008.1_544', 'SEQ000004769_NODE_33_length_1042_cov_380.043', '75.000', '40', '10', '0', '1', '120', '1.06e-10', '59.7')





In [13]:
#check which genes pass which threshold for presence
coregenes50 = []
coregenes70 = []
coregenes80 = []
coregenes85 = []
coregenes90 = []
coregenes95 = []
coregenes96 = []
coregenes97 = []
coregenes98 = []
coregenes99 = []
coregenes995 = []
coregenes = []
print(len(hitdict.keys()), 'genes total')
for item in hitdict:

    #genomes in hit
    hg = sorted(set([i[0].split('_')[0] for i in hitdict[item]]))
    hg = [i for i in hg if i not in ce]
    if len(hg)/len(pg) >= 0.50:
        coregenes50.append(item)
    if len(hg)/len(pg) >= 0.70:
        coregenes70.append(item)
    if len(hg)/len(pg) >= 0.80:
        coregenes80.append(item)
    if len(hg)/len(pg) >= 0.85:
        coregenes85.append(item)
    if len(hg)/len(pg) >= 0.90:
        coregenes90.append(item)
    if len(hg)/len(pg) >= 0.95:
        coregenes95.append(item)
    if len(hg)/len(pg) >= 0.96:
        coregenes96.append(item)
    if len(hg)/len(pg) >= 0.97:
        coregenes97.append(item)
    if len(hg)/len(pg) >= 0.98:
        coregenes98.append(item)
    if len(hg)/len(pg) >= 0.99:
        coregenes99.append(item)
    if len(hg)/len(pg) >= 0.995:
        coregenes995.append(item)
    if len(hg)/len(pg) == 1:
        coregenes.append(item)
print('Coregenes at various thresholds:')
print('\t50%:',len(coregenes50))
print('\t70%:',len(coregenes70))
print('\t80%:',len(coregenes80))
print('\t85%:',len(coregenes85))
print('\t90%:',len(coregenes90))
print('\t95%:',len(coregenes95))
print('\t96%:',len(coregenes96))
print('\t97%:',len(coregenes97))
print('\t98%:',len(coregenes98))
print('\t99%:',len(coregenes99))
print('\t99.5%:',len(coregenes995))
print('\t100%:',len(coregenes))
    
    

4448 genes total
Coregenes at various thresholds:
	50%: 4072
	70%: 3890
	80%: 3796
	85%: 3758
	90%: 3716
	95%: 3664
	96%: 3651
	97%: 3619
	98%: 3541
	99%: 3385
	99.5%: 3226
	100%: 2729


In [14]:
#let's take a look at 95% threshold. See if the missing genes share a phylogenetic pattern
#order list - parses nexus tree input as string\n",
order = '((SEQ000008927:0.042795,(SEQ000013419:0.027481,(SEQ000009893:2.42E-4,SEQ000010060:4.45E-4):0.02479):0.040451):0.741237,(SAMN02844600:0.477603,(((SEQ000041121:7.41E-4,SEQ000015395:0.001582):0.024773,(SEQ000018161:0.01975,(SEQ000027158:0.018809,SEQ000008922:0.02101):0.005022):0.01221):0.573179,((SEQ000027150:0.232638,(SEQ000010998:0.047887,(SEQ000004795:0.046963,((SEQ000026115:3.24E-4,SEQ000026116:2.38E-4):0.043733,(SEQ000004791:0.005934,SEQ000014373:0.004123):0.036956):0.008209):0.00943):0.102474):0.112433,(SAMN02844917:0.556757,(((SEQ000011837:0.026834,(SEQ000011867:0.025283,(SEQ000035782:0.024686,SEQ000014823:0.023438):0.008616):0.00474):0.02259,((SEQ000014568:0.012911,SEQ000014053:0.015004):0.008312,(SEQ000012000:1.25E-4,SEQ000011996:2.51E-4):0.013689):0.040617):0.087659,((SEQ000005500:0.01761,(SEQ000010784:0.017221,(SEQ000004769:0.016027,SEQ000019495:0.017239):0.003574):0.004394):0.12157,(((SEQ000005053:0.123041,(SEQ000008903:0.053293,(SEQ000013963:0.050561,((SEQ000014500:2.32E-4,SEQ000014510:3.18E-4):0.048506,(SEQ000026565:0.045167,SEQ000012077:0.043333):0.009252):0.007195):0.011353):0.024121):0.008499,(SEQ000014275:0.078046,((((SEQ000038392:3.5E-4,SEQ000008662:5.07E-4):0.062642,(SEQ000005035:0.038854,SEQ000038289:0.041373):0.027481):0.012055,((SEQ000022441:0.064814,SEQ000011827:0.066687):0.011666,((SEQ000008910:0.061392,SEQ000005044:0.063105):0.011504,((SEQ000011991:0.060173,SEQ000036076:0.061705):0.01048,((SEQ000015087:0.058048,SEQ000014282:0.059232):0.011553,(SEQ000027816:0.057965,(SEQ000028565:0.002996,SEQ000014119:0.002593):0.058742):0.011941):0.005113):0.007623):0.0085):0.006158):0.011968,(SEQ000022438:0.061689,((SEQ000029384:0.057488,SEQ000014122:0.055629):0.009793,(((SEQ000013904:0.058993,(SEQ000015506:4.36E-4,SEQ000004780:6.7E-4):0.060763):0.011384,((SEQ000008880:0.058986,(SEQ000006979:2.62E-4,SEQ000030732:2.11E-4):0.056526):0.01171,((SEQ000011998:0.059964,SEQ000036044:0.053892):0.010141,(SEQ000014532:0.075655,SEQ000036042:0.055952):0.008439):0.006556):0.004224):0.005674,(((SEQ000010268:0.05905,(SEQ000030731:0.017999,SEQ000013908:0.019554):0.036133):0.008975,((SEQ000010797:0.003131,SEQ000014930:0.002262):0.054036,(SEQ000011807:0.018515,SEQ000027455:0.018423):0.038312):0.010554):0.005357,((((SEQ000008887:0.049612,(SEQ000015955:7.8E-4,SEQ000015887:9.53E-4):0.052922):0.011591,((SEQ000003035:0.003665,SEQ000014793:0.00512):0.048992,(SEQ000015331:0.004098,SEQ000004450:0.00365):0.049538):0.010364):0.006605,((SEQ000034223:0.058581,(SEQ000037151:4.04E-4,SEQ000010684:6.5E-4):0.053878):0.010326,((SEQ000031518:0.057344,(SEQ000005666:0.001051,SEQ000036182:0.001763):0.059337):0.009582,((SEQ000008893:2.42E-4,SEQ000012327:1.57E-4):0.057038,(SEQ000004956:0.039926,((SEQ000038437:0.002719,SEQ000012080:0.00202):0.025841,(SEQ000036062:0.002246,SEQ000034133:0.002893):0.010695):0.019357):0.024471):0.007812):0.004378):0.004385):0.003978,((((SEQ000037534:6.84E-4,SEQ000038081:5.79E-4):0.055655,(SEQ000038372:0.040693,(SEQ000015057:2.59E-4,SEQ000015056:2.31E-4):0.040528):0.017712):0.007019,(((SEQ000004954:0.035791,SEQ000038065:0.035989):0.019238,((SEQ000037542:2.83E-4,SEQ000032573:6.62E-4):0.021961,(SEQ000027650:0.017298,(SEQ000034623:0.002406,(SEQ000035880:5.12E-4,SEQ000036053:5.82E-4):0.010056):0.01156):0.013138):0.028038):0.007713,((SEQ000027790:3.06E-4,(SEQ000013953:2.81E-4,SEQ000028559:3.13E-4):1.8E-5):0.030448,(SEQ000034185:0.011325,((SEQ000038350:0.001267,SEQ000014756:9.73E-4):0.007955,((SEQ000015186:0.002572,SEQ000015224:0.001896):7.2E-5,(SEQ000014395:0.001445,SEQ000014785:0.002127):5.99E-4):0.001976):0.011653):0.021792):0.027104):0.006076):0.005714,(((SEQ000012073:0.051103,(SEQ000010519:0.001254,SEQ000006821:9.37E-4):0.056176):0.010623,(SEQ000016319:0.038922,((SEQ000037835:2.59E-4,SEQ000038888:2.0E-4):0.039433,(SEQ000016315:0.031962,(SEQ000038085:0.001402,SEQ000004469:0.002687):0.027917):0.008439):0.009071):0.020958):0.006716,((((SEQ000014355:2.73E-4,SEQ000036689:2.07E-4):0.047864,(SEQ000034189:0.005654,(SEQ000034184:9.57E-4,SEQ000034762:5.95E-4):0.005341):0.047434):0.010731,((SEQ000007959:0.047271,(SEQ000014520:0.009668,SEQ000031524:0.010344):0.038806):0.010583,(SEQ000014002:0.050466,(SEQ000027175:7.2E-4,SEQ000027176:5.58E-4):0.049402):0.008587):0.006719):0.005041,(((SEQ000008884:0.049984,((SEQ000034074:6.58E-4,SEQ000014153:2.28E-4):3.96E-4,(SEQ000038382:1.19E-4,SEQ000038380:2.24E-4):4.8E-4):0.049251):0.011921,((SEQ000037109:0.029381,SEQ000036178:0.029541):0.028119,((SEQ000014862:2.81E-4,SEQ000014861:2.83E-4):0.054903,((SEQ000034621:0.016157,SEQ000033561:0.024541):0.009906,(SEQ000036733:0.006707,(SEQ000036069:1.77E-4,SEQ000036687:1.66E-4):0.022116):0.012196):0.031744):0.008423):0.005627):0.004457,(((SEQ000037522:7.26E-4,SEQ000038079:5.17E-4):0.051419,(SEQ000008876:0.024132,(SEQ000034313:0.02514,(SEQ000014191:0.009034,SEQ000032063:0.016432):0.005887):0.011182):0.029731):0.009086,((SEQ000008905:7.0E-5,SEQ000008906:3.68E-4):0.047101,(SEQ000034771:0.026842,((SEQ000017775:0.029736,(SEQ000031188:0.001344,SEQ000032617:0.0027):0.007131):0.010838,((SEQ000028739:2.37E-4,SEQ000030806:4.4E-4):0.028898,(SEQ000035866:0.025365,(SEQ000026764:0.014228,(SEQ000036157:0.007044,SEQ000014292:0.020306):0.007162):0.004413):0.012341):0.004335):0.009649):0.025859):0.011269):0.004873):0.003546):0.003579):0.003071):0.005698):0.003428):0.004625):0.004318):0.009244):0.036465):0.01325):0.010769):0.008545,((((SEQ000015181:0.070466,SEQ000037069:0.068805):0.009236,((SEQ000008886:2.96E-4,SEQ000028558:5.94E-4):0.06907,((SEQ000035003:2.92E-4,SEQ000016318:3.66E-4):0.065483,(SEQ000012412:5.05E-4,SEQ000005834:3.46E-4):0.06308):0.010155):0.004984):0.005379,((((SEQ000014151:0.064499,SEQ000011868:0.065339):0.009033,((SEQ000038348:5.76E-4,SEQ000039005:6.75E-4):0.065441,(SEQ000010415:2.21E-4,SEQ000010414:3.08E-4):0.066056):0.008688):0.004829,(((SEQ000007934:2.0E-6,SEQ000007933:2.0E-6):0.069757,((SEQ000025014:2.8E-4,SEQ000025013:3.65E-4):0.061305,(SEQ000005651:2.0E-6,SEQ000005650:2.0E-6):0.062123):0.009359):0.006568,(((SEQ000012012:8.55E-4,SEQ000028570:4.63E-4):0.065074,(SEQ000038406:4.64E-4,SEQ000038403:3.53E-4):0.061389):0.007951,(SEQ000004718:0.077749,((SEQ000036073:9.19E-4,SEQ000037273:4.93E-4):0.033071,((SEQ000036075:0.026945,(SEQ000036074:0.002575,SEQ000023412:0.001555):0.025339):0.013789,(SEQ000013945:5.96E-4,(SEQ000037331:8.35E-4,SEQ000014048:3.49E-4):3.19E-4):0.032954):0.009028):0.037374):0.005825):0.003831):0.005008):0.005261,(((SEQ000015610:0.066231,(SEQ000014929:8.48E-4,SEQ000014585:8.21E-4):0.065118):0.011543,(((SEQ000025666:0.066741,SEQ000011835:0.063297):0.008445,(SEQ000037078:0.065257,(SEQ000010241:7.02E-4,SEQ000011214:5.97E-4):0.061737):0.011183):0.004352,((SEQ000012078:0.063543,SEQ000011188:0.067407):0.009192,((SEQ000010649:0.061904,(SEQ000005212:5.48E-4,SEQ000038077:4.2E-4):0.061848):0.009237,(SEQ000010421:0.033691,(SEQ000034739:4.31E-4,SEQ000036527:9.5E-4):0.03443):0.035418):0.005591):0.004277):0.00418):0.004092,(((SEQ000014563:0.067619,SEQ000039203:0.067935):0.009131,(SEQ000035910:0.068964,SEQ000015235:0.064456):0.007935):0.00422,(((SEQ000039172:0.066595,SEQ000032618:0.068428):0.00811,(SEQ000005241:5.37E-4,(SEQ000037135:2.01E-4,SEQ000034876:1.16E-4):1.62E-4):0.072876):0.004358,((SEQ000015184:0.050917,SEQ000014576:0.050661):0.027102,(((SEQ000031739:1.57E-4,SEQ000014852:5.01E-4):0.069141,(SEQ000039004:0.054215,SEQ000036067:0.051721):0.018172):0.007547,((SEQ000008925:0.063677,SEQ000038376:0.062866):0.0118,(SEQ000014214:0.063869,(SEQ000035760:4.11E-4,SEQ000036788:2.94E-4):0.062331):0.010531):0.004252):0.005798):0.005755):0.008382):0.003847):0.003841):0.005203):0.007433,(SEQ000012954:0.08203,(((SEQ000034183:0.076212,SEQ000034510:0.080396):0.008891,((SEQ000005573:0.078962,(((SEQ000021752:0.07326,SEQ000032899:0.071187):0.012545,(SEQ000024893:0.069095,SEQ000014335:0.06653):0.013637):0.007924,((SEQ000039242:0.001292,SEQ000039232:0.001168):0.089255,(SEQ000004719:0.060401,(SEQ000008928:0.060047,(SEQ000006276:0.045339,SEQ000006284:0.046506):0.017757):0.013042):0.032635):0.027265):0.008831):0.014246,(((SEQ000013926:0.066916,SEQ000014038:0.070045):0.013612,(SEQ000008901:0.072836,(SEQ000016281:5.35E-4,SEQ000036065:1.17E-4):0.07006):0.012968):0.006677,((SEQ000033572:0.050827,(SEQ000036737:8.53E-4,SEQ000037525:0.001207):0.051863):0.02974,(SEQ000008885:0.072321,(SEQ000013880:5.41E-4,SEQ000015538:3.17E-4):0.070499):0.010636):0.006803):0.009726):0.010849):0.007278,(((SEQ000038351:0.070091,(SEQ000011888:2.06E-4,SEQ000016368:3.92E-4):0.065179):0.014804,(SEQ000013914:0.073458,(SEQ000013987:0.070408,(SEQ000031805:0.05506,(SEQ000016278:6.14E-4,SEQ000032890:4.63E-4):0.054197):0.019214):0.009111):0.007798):0.004096,(((SEQ000016231:0.069062,SEQ000008874:0.06976):0.011322,(((SEQ000012059:0.025121,SEQ000008890:0.020637):0.012069,(SEQ000032388:0.022862,(SEQ000036048:0.009831,SEQ000035037:0.003382):0.006164):0.017212):0.049358,((SEQ000023411:0.043202,(SEQ000007950:1.99E-4,SEQ000005648:5.2E-5):0.040341):0.028265,((SEQ000014198:0.004125,(SEQ000014737:0.001622,SEQ000034999:0.001148):0.00287):0.056476,((SEQ000037836:3.03E-4,SEQ000021594:5.11E-4):0.021349,(SEQ000008921:0.025619,(SEQ000034547:3.88E-4,SEQ000034548:3.21E-4):0.018764):0.010635):0.034085):0.013629):0.008057):0.00649):0.007139,(((((SEQ000015337:2.99E-4,SEQ000029449:3.38E-4):0.06589,(SEQ000017196:0.002453,SEQ000012291:8.04E-4):0.081962):0.011586,((SEQ000028561:0.066142,SEQ000014436:0.087181):0.008291,(SEQ000015355:0.068353,(SEQ000035396:3.96E-4,SEQ000035391:5.03E-4):0.06658):0.012092):0.005669):0.004134,((SEQ000000407:0.073053,(SEQ000014777:6.2E-5,SEQ000014778:9.6E-5):0.073803):0.010163,((SEQ000017189:0.083509,(SEQ000005604:0.002278,(SEQ000017191:5.62E-4,SEQ000017190:3.5E-4):0.010417):0.048338):0.016199,((SEQ000015496:7.0E-4,SEQ000031933:0.001351):0.01271,(SEQ000003055:3.96E-4,SEQ000008898:3.44E-4):0.011992):0.056365):0.008227):0.00647):0.005305,((((SEQ000007024:0.050527,(SEQ000014727:3.68E-4,SEQ000014726:3.85E-4):0.047913):0.025884,(SEQ000005575:0.072564,(SEQ000016371:0.032115,SEQ000028560:0.031103):0.039291):0.007905):0.005765,(((SEQ000037204:6.49E-4,SEQ000033224:4.27E-4):0.048747,(SEQ000034126:0.031574,(SEQ000017195:0.017802,(SEQ000038347:9.78E-4,SEQ000036900:0.001004):0.002564):0.023523):0.0269):0.026921,((SEQ000012956:2.6E-4,SEQ000002858:2.21E-4):0.068701,(((SEQ000031278:0.004213,SEQ000033556:0.006182):0.040409,(SEQ000009073:3.03E-4,SEQ000009069:1.97E-4):0.039867):0.021347,((SEQ000029932:0.00729,(SEQ000015310:5.31E-4,SEQ000015018:6.48E-4):0.007877):0.046608,((SEQ000030793:0.00231,SEQ000015531:0.002703):0.021203,(SEQ000005084:0.003937,(SEQ000015427:0.002659,SEQ000015544:0.003488):6.5E-5):0.016204):0.026587):0.012631):0.011426):0.0106):0.005414):0.003506,((((SEQ000033581:0.056976,(SEQ000014724:0.020357,SEQ000033571:0.025535):0.039198):0.017724,((SEQ000037242:8.74E-4,SEQ000035906:8.34E-4):0.054926,(SEQ000015055:4.23E-4,SEQ000029488:6.95E-4):0.057047):0.017381):0.006154,((SEQ000011773:0.065938,(SEQ000003858:0.006096,SEQ000014645:0.003288):0.068568):0.011216,((SEQ000034187:0.002463,SEQ000037511:0.001936):0.060125,((SEQ000015178:0.040132,(SEQ000008923:2.98E-4,SEQ000016351:5.45E-4):0.049955):0.012633,((SEQ000034332:2.91E-4,SEQ000034319:1.28E-4):0.003765,(SEQ000035386:4.55E-4,(SEQ000037105:2.27E-4,(SEQ000015352:2.03E-4,SEQ000015351:1.32E-4):8.0E-5):1.6E-4):0.004485):0.050607):0.01389):0.015054):0.004245):0.004092,(((((SEQ000010914:2.62E-4,SEQ000026583:8.55E-4):0.068005,(SEQ000015445:0.058785,(SEQ000026078:0.046468,(SEQ000012060:7.41E-4,SEQ000014230:1.77E-4):0.047038):0.018158):0.009675):0.007173,(SEQ000014948:0.071521,((SEQ000000373:7.73E-4,(SEQ000029765:1.77E-4,SEQ000031667:6.45E-4):1.34E-4):0.026311,(SEQ000036170:0.027495,((SEQ000011765:7.32E-4,SEQ000015407:0.003647):0.003301,(SEQ000000272:2.49E-4,SEQ000037533:2.7E-4):0.001878):0.009934):0.012659):0.040123):0.010895):0.004747,(((SEQ000012123:0.063372,SEQ000005034:0.063957):0.012229,(SEQ000038527:0.061754,SEQ000021125:0.060965):0.013565):0.004415,(((SEQ000012065:7.9E-4,SEQ000019515:8.93E-4):0.053502,(SEQ000028929:0.001352,SEQ000037523:4.45E-4):0.05699):0.014541,((SEQ000011909:1.45E-4,SEQ000026580:3.45E-4):0.064912,((SEQ000039170:5.71E-4,SEQ000038371:4.73E-4):0.042189,(SEQ000014837:0.019636,(SEQ000007957:0.015185,SEQ000007942:0.017219):0.004668):0.025649):0.025229):0.008064):0.00652):0.004378):0.00406,((((SEQ000036159:0.058445,SEQ000038650:0.067343):0.009818,((SEQ000011876:3.11E-4,SEQ000011877:2.52E-4):0.05894,(SEQ000012924:0.025218,(SEQ000015291:4.18E-4,SEQ000015293:8.95E-4):0.025578):0.03792):0.011653):0.005208,((SEQ000015070:0.061734,SEQ000029933:0.061057):0.011677,((SEQ000039007:5.56E-4,SEQ000034996:0.001346):0.06186,(SEQ000037531:0.028386,(SEQ000010891:6.7E-4,SEQ000014042:9.47E-4):0.030544):0.032371):0.010202):0.005639):0.004908,((((SEQ000014552:0.067971,(SEQ000015577:2.82E-4,SEQ000015576:4.88E-4):0.060677):0.010133,((SEQ000038396:0.006889,SEQ000010807:0.007773):0.054093,((SEQ000011949:2.07E-4,SEQ000011950:1.58E-4):0.010951,(SEQ000015266:8.64E-4,SEQ000027445:4.83E-4):0.011122):0.051458):0.012725):0.004661,(((SEQ000014470:0.001673,SEQ000034752:0.002081):0.05986,(SEQ000015353:0.001801,(SEQ000039071:0.001953,SEQ000001163:0.002626):2.0E-6):0.05937):0.014306,(SEQ000014784:0.0624,((SEQ000005639:0.007397,SEQ000027241:0.005193):0.035447,((SEQ000027789:2.23E-4,SEQ000027788:1.72E-4):2.28E-4,(SEQ000005318:5.6E-5,SEQ000005315:1.41E-4):2.71E-4):0.039555):0.027682):0.008329):0.004321):0.00405,((((SEQ000004109:9.2E-5,SEQ000004101:1.38E-4):0.048131,(SEQ000038395:0.036134,(SEQ000014630:4.3E-4,SEQ000027174:4.96E-4):0.035956):0.01769):0.021639,(SEQ000038071:0.060535,((SEQ000014967:0.006723,SEQ000034857:0.007226):0.047843,((SEQ000037526:0.001945,SEQ000011904:0.002192):0.026158,(SEQ000038074:0.012811,(SEQ000037516:0.001624,SEQ000015498:0.001026):0.035192):0.016158):0.024859):0.01354):0.011665):0.004854,((SEQ000008878:0.008131,SEQ000030783:0.007987):0.05237,(((SEQ000005043:0.001381,(SEQ000030792:5.82E-4,SEQ000013962:1.14E-4):6.02E-4):0.016588,(SEQ000039008:0.001526,(SEQ000027234:0.001688,(SEQ000023836:6.91E-4,SEQ000032666:6.33E-4):6.35E-4):5.28E-4):0.017804):0.018587,((SEQ000029096:0.01215,(SEQ000038353:0.001052,SEQ000013145:5.56E-4):0.005056):0.023118,((SEQ000005608:3.43E-4,SEQ000008889:3.22E-4):0.033308,(SEQ000030500:0.00671,SEQ000008897:0.002055):0.027806):0.004504):0.009515):0.032502):0.012244):0.005528):0.003269):0.00403):0.003781):0.003968):0.004435):0.005396):0.005847):0.007935):0.009052):0.008756):0.011072):0.01973):0.023143):0.255158):0.047633):0.041966):0.10853):0.741237)'
#order list - uses the tree order.
ol = []
for item in order.split('S'):
    if  item[0:2]=='EQ':
        ol.append('S'+item[0:11])
    elif item[0:3] =='AMN':
        ol.append('S'+item[0:11])
diffs = [i for i in coregenes995 if i not in coregenes]
diffdict = {}
for i in diffs:
    diffdict[i]=[j[0].split('_')[0] for j in hitdict[i]]
for i in ol:
    hitlist = []
    for j in diffs:
        if i in diffdict[j]:
            hitlist.append(' ')
        else:
            hitlist.append('m')
    print(i, '\t',''.join(hitlist))
    
    

SEQ000008927 	     mm       m        m            m   m  m    m  m           m m     m    m    m     m      m  mm         m                    m  m                       m  mm           m m                          m  m     m         m   m                  m  m    m          m      m     m   mm          m    m   m m             m       m            m   m      m      m     m  m          m     m   m     m m  m             m     m m              m m  m      m   m                      m     m m       mm      m 
SEQ000013419 	     mm       m        m            m   m  m    m  m           m m     m    m    m     m      m  m          m                       m                       m  mm           m              m             m  m     m         m   m                  m  m    m          m      m         mm          m    m   m m             m       m            m   m      m      m   m m  m m        m     m   m     m m  m  m          m     m m              m m         m   m                      m

In [15]:
#make a list of finals
fastas = []
for item in os.listdir('./../FASTA-final'):
    fastas.append(item.split('.')[0])

In [16]:
#should be 450 for our set - just files, not pruned set
print(len(fastas))

450


In [17]:
for item in fastas:
    if item not in ol:
        print(item)

renamed_contigs


In [18]:
#checking that the genomes in our set are those in the kSNP derived tree
with open('./../2016-10-10_kSNP/in_list', 'r') as f:
    ft = f.read()
inlist = [i.split('\t')[1] for i in ft.split('\n') if '\t' in i]
print(inlist)

['SEQ000039170', 'SEQ000010807', 'SEQ000036044', 'SEQ000014777', 'SEQ000026565', 'SEQ000015955', 'SEQ000032617', 'SEQ000038353', 'SEQ000014191', 'SEQ000016231', 'SEQ000033581', 'SEQ000037533', 'SEQ000011765', 'SEQ000031518', 'SEQ000026115', 'SEQ000036067', 'SEQ000026116', 'SEQ000029933', 'SEQ000015224', 'SEQ000038371', 'SEQ000008898', 'SEQ000009893', 'SEQ000014500', 'SEQ000012000', 'SEQ000036062', 'SEQ000021594', 'SEQ000033224', 'SEQ000017195', 'SEQ000015506', 'SEQ000019495', 'SEQ000012080', 'SEQ000014861', 'SEQ000015235', 'SEQ000011909', 'SEQ000004956', 'SEQ000014948', 'SEQ000036737', 'SEQ000014852', 'SEQ000011807', 'SEQ000004954', 'SEQ000011950', 'SEQ000036170', 'SEQ000011837', 'SEQ000037273', 'SEQ000037204', 'SEQ000008928', 'SEQ000034621', 'SEQ000009069', 'SEQ000000407', 'SEQ000014151', 'SEQ000027445', 'SEQ000028559', 'SEQ000039071', 'SEQ000006821', 'SEQ000014737', 'SEQ000001163', 'SAMN02844917', 'SEQ000034876', 'SEQ000030793', 'SEQ000014724', 'SEQ000010268', 'SEQ000030806', 'SEQ000

In [19]:
#should be 449
print(len(inlist))

449


In [20]:
for i in inlist:
    if i not in ol:
        print(i)

In [21]:
#make a matrix in genome order from phylo tree, ignoring non-enterica subsp. entericas.
hitmatrix = []
for i in [j for j in ol if j not in ce]:
    #present boolean
    pb = []
    #hits
    hits = [l[0] for l in hitdict2[i]]
    for k in gigo:
        if k in hits:
            pb.append(1)
        else:
            pb.append(0)
    hitmatrix.append(pb)

In [22]:
#core gene exclusion indices:
cgei = []
for i in range(len(hitmatrix[0])):
    for j in range(len(hitmatrix)-1):
        if ((hitmatrix[j][i] == 0) and (hitmatrix[j+1][i] == 0)):
            cgei.append(i)
            break
            

In [23]:
print(len(cgei)-len(hitmatrix[0]))

-3334


In [None]:
for i in hitmatrix:
    strung = [str(j) for j in i]
    string = ''.join(strung)

In [28]:
#get gene locations. Limit to the first genome at first

indices = [i for i in range(len(genes)) if i not in cgei]
for i in indices:
    print(genes[i])

lcl|NC_003197.2_prot_NP_461661.1_2656
lcl|NC_003197.2_prot_NP_462955.1_3951
lcl|NC_003197.2_prot_NP_459684.1_679
lcl|NC_003197.2_prot_NP_461277.1_2272
lcl|NC_003197.2_prot_NP_461538.1_2533
lcl|NC_003197.2_prot_NP_461312.1_2307
lcl|NC_003197.2_prot_NP_463238.3_4234
lcl|NC_003197.2_prot_NP_461635.1_2630
lcl|NC_003197.2_prot_NP_462013.1_3008
lcl|NC_003197.2_prot_NP_459354.1_349
lcl|NC_003197.2_prot_NP_461422.1_2417
lcl|NC_003197.2_prot_NP_462071.1_3066
lcl|NC_003197.2_prot_NP_462813.1_3809
lcl|NC_003197.2_prot_NP_460848.3_1843
lcl|NC_003197.2_prot_NP_459541.1_536
lcl|NC_003197.2_prot_NP_460787.1_1782
lcl|NC_003197.2_prot_NP_459184.1_179
lcl|NC_003197.2_prot_NP_460819.1_1814
lcl|NC_003197.2_prot_NP_462078.1_3073
lcl|NC_003197.2_prot_NP_460949.1_1944
lcl|NC_003197.2_prot_NP_462891.1_3887
lcl|NC_003197.2_prot_NP_462918.1_3914
lcl|NC_003197.2_prot_NP_460441.1_1436
lcl|NC_003197.2_prot_NP_462885.1_3881
lcl|NC_003197.2_prot_NP_459923.1_918
lcl|NC_003197.2_prot_NP_462393.1_3388
lcl|NC_003197.2_p

In [81]:
#get gene locations. Limit to the first genome at first
#this is to make the most permissive pass. Using the phylogeny and 
#genomes for extraction

#destination folder -> Darwin is nickname
dest = './../core_genome/FASTA_extractions_set_Darwin/'

#gene indices in list genes for extraction
indices = [i for i in range(len(genes)) if i not in cgei]

#genes to extract
gte = [genes[i] for i in indices]

#hitcountlist:
hcl = []
gfe = [i for i in genomes if i not in ce]
for i in gfe:
    #first, load up the genome from the project folder
    fileloc = './../FASTA-final/renamed_contigs/'+i+'.fasta'
    
    #genome records
    gr = SeqIO.parse(fileloc, 'fasta')
    
    #genome dictionary
    gd = SeqIO.to_dict(gr)
    
    #new seq record entry
    seq_records = []
    
    for k in hitdict2[i]:
            if k[0] in gte:
                gene = k[0]
                contig = k[1]
                start = int(k[6])
                stop = int(k[7])
                
                if start < stop:
                    start = start-1
                    stop = stop+3
                    newseq = Seq(str(gd[contig].seq)[start:stop], generic_dna)
                    record = SeqRecord(newseq, id = i+' '+gene, description = contig+'|'+k[6]+':'+k[7], name = gene)
                    seq_records.append(record)

                elif start > stop:
                    stop = stop-4
                    start = start
                    newseq = Seq(str(gd[contig].seq)[stop:start], generic_dna).reverse_complement()
                    record = SeqRecord(newseq, id = i+' '+gene, description = contig+'|'+k[6]+':'+k[7], name = gene)
                    seq_records.append(record)

                else:
                    print('ERROR FOUND', k, start, stop)
    #We've made a list(array) of seq_records, now it's time to spit out a fasta file!
    with open(dest+i+'.fasta', 'w') as f:
        SeqIO.write(seq_records, f, "fasta")

In [None]:
for i in gfe:
    #first, load up the genome from the project folder
    fileloc = './../FASTA-final/renamed_contigs/'+i+'.fasta'
    if not os.path.isfile(fileloc):
        print(i)

In [34]:
#This is to find the conservative core genome for Salmonella

#first, define the conserved core genome. Some genes are represented more than once, and we have highly homologus
#duplicates in at least one strain.

#This will output with each genome having the same core genes.

dest = './../core_genome/FASTA_extractions_set_CCG/'

#conserved core genes
ccg = []

for gene in genes:
    #genomes in hitdict:
    gh = [i[0] for i in hitdict[gene]]
    counts = [gh.count(i) for i in set(gh)]
    if len(gh) != len(genomes):
        continue
    elif list(set(counts)) != [1]:
        continue
    else:
        ccg.append(gene)

#use list genomes which includes all genomes, including waycross and non-enterica

for genome in genomes:
    
    #first, load up the genome from the project folder
    fileloc = './../FASTA-final/renamed_contigs/'+genome+'.fasta'
    
    #genome records
    gr = SeqIO.parse(fileloc, 'fasta')
    
    #genome dictionary
    gd = SeqIO.to_dict(gr)
    
    #new seq record entry
    seq_records = []
    for k in hitdict2[genome]:
        if k[0] in ccg:
            gene = k[0]
            contig = k[1]
            start = int(k[6])
            stop = int(k[7])
                
            if start < stop:
                start = start-1
                stop = stop+3
                newseq = Seq(str(gd[contig].seq)[start:stop], generic_dna)
                record = SeqRecord(newseq, id = genome+' '+gene, description = contig+'|'+k[6]+':'+k[7], name = gene)
                seq_records.append(record)

            elif start > stop:
                stop = stop-4
                start = start
                newseq = Seq(str(gd[contig].seq)[stop:start], generic_dna).reverse_complement()
                record = SeqRecord(newseq, id = genome+' '+gene, description = contig+'|'+k[6]+':'+k[7], name = gene)
                seq_records.append(record)

            else:
                print('ERROR FOUND', k, start, stop)
    #We've made a list(array) of seq_records, now it's time to spit out a fasta file!
    with open(dest+genome+'_ccg.fasta', 'w') as f:
        SeqIO.write(seq_records, f, "fasta")
print(len(ccg), 'conserved core single genes in ', len(genomes), 'genomes.')
        
        

2421 conserved core single genes in  449 genomes.


In [29]:
#conserved core genes list maker (code from above cell)
ccg = []

for gene in genes:
    #genomes in hitdict:
    gh = [i[0] for i in hitdict[gene]]
    counts = [gh.count(i) for i in set(gh)]
    if len(gh) != len(genomes):
        continue
    elif list(set(counts)) != [1]:
        continue
    else:
        ccg.append(gene)

In [69]:
#BY GENE - for aligning!

#This uses the previously extracted core genomes and rearranges them:
#instead of each file being from a single genome, now they will be from a single gene.

#This is to find the conservative core genome for Salmonella

#first, define the conserved core genome. Some genes are represented more than once, and we have highly homologus
#duplicates in at least one strain. (see above)

#This will output having a sequence from each genome placed into a file for each gene.

indir = './../core_genome/FASTA_extractions_set_CCG/'
outdir = './../core_genome/FASTA_extractions_by_gene_CCG/'

#make a dictionary - for each gene, include a genome name (in ordered list - ol)

#by gene dictionary

bgd = {}

#gene exclusion list - some non-S.e.entericas have duplicate genes hit. Need to skip to avoid
#allelic redundancy

gel = []

for genome in ol:
    infile = indir+genome+'_ccg.fasta'
    
    #genome records
    gr = list(SeqIO.parse(infile, 'fasta'))
    
    #find and flag genes where more than one hit exists (dupes in some non-sal.enter.enters.)
    #total gene list
    tgl = [i.description.split(' ')[1] for i in gr]
    #genes to exclude
    gte = list(set([i for i in tgl if tgl.count(i) > 1]))
    #rgr = refined genome records
    rgr = [i for i in gr if i.description.split(' ')[1] not in gte]
    
    gd = None
    
    #genome dictionary
    gd = SeqIO.to_dict(rgr, key_function = lambda rec : rec.description.split(' ')[1])    
    
    for item in gd:
        
        record = gd[item]
        record.description = record.name
        if item not in bgd:
            bgd[item] = []
        bgd[item].append(record)



In [71]:
outdir = './../core_genome/FASTA_extractions_by_gene_CCG2/'
for item in bgd:
    outfile = outdir+item+'.fasta'
    #check if every genome is in the gene collection
    #genomes in dictionary
    gid = [i.description.split(' ')[0] for i in bgd[item]]
    if sorted(gid) != sorted(ol):
        print('uh oh', item)
        continue
    else:
        with open(outfile, 'w') as f:    
            SeqIO.write(bgd[item], f, "fasta")

uh oh lcl|NC_003197.2_prot_NP_463359.1_4355
uh oh lcl|NC_003197.2_prot_NP_459753.1_748
uh oh lcl|NC_003197.2_prot_NP_461280.1_2275
uh oh lcl|NC_003197.2_prot_NP_461052.2_2047
uh oh lcl|NC_003197.2_prot_NP_461703.1_2698
uh oh lcl|NC_003197.2_prot_NP_461701.1_2696
uh oh lcl|NC_003197.2_prot_NP_462833.1_3829
uh oh lcl|NC_003197.2_prot_NP_462526.1_3521


In [None]:
with open(outfile, 'w') as f:    
    SeqIO.write(bgd[item], f, "fasta")

In [49]:
#length of final conserved core genome
print(len(ccg))

2421


In [7]:
outdir = './../core_genome/FASTA_extractions_by_gene_CCG/'
filelist = list(os.listdir(outdir))
protlist=[]
for i in filelist:
    try:
        protlist.append(i.split('NP_')[1].split('.')[0])
    except:
        (print(i,'not found'))
        continue
print(set([protlist.count(i) for i in protlist]))
    

2017-01-03_muscle.sh not found
NC_003197.2_prot_YP_009325922.1_3692.fasta not found
{1}


In [5]:
print(len(protlist))

2420


In [72]:
#rename files for compatibility with Raven HPC

outdir = './../core_genome/FASTA_extractions_by_gene_CCG/'
for file in list(os.listdir(outdir)):
    if 'lcl|' in file:
        os.rename(outdir+file, outdir+file.split('|')[1])

In [84]:
#align with muscle - creates shell scripts that can be run in parallel.
linelist1 = []
linelist2 = []
linelist3 = []
linelist4 = []
counter = 1
for file in filelist:
    outfile = './../muscle_alignments/'+file.split('.fast')[0]+'.afa'
    newline = 'muscle -in \''+file+'\' -fastaout \''+outfile+'\''
    if counter%4==0:
        linelist4.append(newline)
    elif counter%4==1:
        linelist3.append(newline)
    elif counter%4==2:
        linelist2.append(newline)
    else:
        linelist1.append(newline)
    counter += 1
with open('2017-01-03_muscle1.sh', 'w') as f:
    f.write('\n'.join(linelist1))
with open('2017-01-03_muscle2.sh', 'w') as f:
    f.write('\n'.join(linelist2))
with open('2017-01-03_muscle3.sh', 'w') as f:
    f.write('\n'.join(linelist3))
with open('2017-01-03_muscle4.sh', 'w') as f:
    f.write('\n'.join(linelist4))

In [11]:
linelist = []
for file in [i for i in filelist if i.endswith('.fasta')]:
    outfile = './../muscle_alignments/'+file.split('.fast')[0]+'.afa'
    newline = file+' '+outfile
    linelist.append(newline)
with open('musclefiles.txt', 'w') as f:
    f.write('\n'.join(linelist))
print(len(linelist))

2421
