In [1]:
import os
import openpyxl as op
import numpy as np
import scipy.misc as smp
from Bio import SeqIO

Makes heatmaps from 3 sources of data: PHASTER, MvirDB blast, and Plasmid Finder

Columns correspond to the different classes.

Rows correspond to the different taxa.

For this pass, we'll just classify everything and put some white space between columns.
Once we have lists of the different clades, we can potentially give them different colors.

Some alternate color scheme should be added later.

In [2]:
##Tree order
#Read the whole NEXUS tree file as text; the next cell pulls the taxon order out of it.
with open('./../RAxML Trees/RAxML_bipartitions.RAxML-concat-ccg_2-nd.nex', 'r') as tree_file:
    nexus = tree_file.read()

In [3]:
#Text between 'begin trees;' and the following 'end;' of the NEXUS file
tree = nexus.split('begin trees;')[1].split('end;')[0].strip()

#Cut the tree text at every capital 'S' and keep what precedes the first ':' of each
#piece, giving the labels in the order they are written in the tree.
#(assumes every taxon label starts with 'S' and contains no other capital 'S')
splits = []
for chunk in tree.split('S')[1:]:
    splits.append('S' + chunk.partition(':')[0])
go = splits #genome order: row order used by the heat-map cells

#Genomes after which the heat-map cells place a clade divider line
gos = [
    'SEQ000010060', 'SAMN02844600', 'SEQ000008922', 'SEQ000004791',
    'SAMN02844917', 'SEQ000011996', 'SEQ000019495', 'SEQ000015224',
    'SEQ000014214', 'SEQ000012954', 'SEQ000034510', 'SEQ000037525',
]

#Save the tree order, one label per line
with open('2017-05-31_tree_order.txt','w') as order_file:
    order_file.write('\n'.join(go))

In [4]:
##PHASTER OUTPUT##

#settings for phage calling
cutoff = True #when True, hits scoring below `co` are dropped
co = 50 #% of genes of query region found in virus
end_tolerance = 100 #a hit this close to either contig end counts as "near the end"
#(gives some leeway in calling CDSs)

viruses_near_ends = [] #phage name of every kept hit lying near a contig end
intact_viruses = 0 #all 'intact' rows seen, counted before the cutoff is applied

##first, scrape the data
phages = [] #phage name of every kept hit (duplicates within a genome included)
phd = {} #phage dictionary: phd[genome][phage] = highest pct seen
phasterd = '/Users/jay.worley/Desktop/PHASTER/unzipped_results/'
result_dirs = [d for d in os.listdir(phasterd) if os.path.isdir(phasterd+d) and d in go]
for i in result_dirs:
    resultsf = phasterd+i+'/summary.txt'
    with open(resultsf, 'r') as f:
        rawresults = f.read()
    #the table is the second chunk separated by five newlines; its first 3 lines are skipped
    results = rawresults.split('\n\n\n\n\n')[1].strip().split('\n')[3:]

    for result in results:
        rl = [field.strip() for field in result.split(' ') if len(field)>0]
        if not rl[2].startswith('intact'):
            continue #only rows flagged as intact are used

        region = rl[4]
        genome = region.split('_')[0]
        phage = '_'.join(rl[13].split('(')[0].split('_')[1:])
        pct = float(rl[15].split('%')[0])

        intact_viruses += 1
        #cutoff if called for
        if cutoff and pct < co:
            continue

        #detect hits split by ends of contig or near end of contig
        contig_len = int(region.split('_')[4])
        span = region.split(':')[1].split('-')
        hit_start = int(span[0])
        hit_stop = int(span[1])
        near_start = min(hit_start, hit_stop) < end_tolerance
        near_stop = max(hit_start, hit_stop) > (contig_len-end_tolerance)
        if near_start or near_stop:
            viruses_near_ends.append(phage)

        phages.append(phage)
        best = phd.setdefault(genome, {})
        if phage in best:
            #same phage reported again for this genome: keep only the higher score
            if pct > best[phage]:
                best[phage] = pct
            print(genome,phage,'dupe')
        else:
            best[phage] = pct

print('intact viruses found:', intact_viruses)
print('phages total:',len(phages),'near ends:',len(viruses_near_ends))

#per-phage totals, reported only for phages with more than 10 kept hits
classes = sorted(set(phages))
tot_clas = []
tot_ends = []
for name in classes:
    n_hits = phages.count(name)
    if n_hits <= 10:
        continue
    n_near_end = viruses_near_ends.count(name)
    print(name, n_hits, n_near_end)
    tot_clas.append(n_hits)
    tot_ends.append(n_near_end)
print(sum(tot_clas), sum(tot_ends))

#from here on `phages` is the sorted list of distinct phage names
phages = list(classes)
print(len(phages), 'different viruses putatively identified')
print(phages)

SEQ000007959 Salmon_RE_2010_NC_019488 dupe
SEQ000015055 Salmon_Fels_2_NC_010463 dupe
SEQ000030500 Phage_Gifsy_1_NC_010392 dupe
SEQ000033581 Phage_Gifsy_1_NC_010392 dupe
SEQ000038289 Salmon_SSU5_NC_018843 dupe
SEQ000038380 Salmon_Fels_2_NC_010463 dupe
SEQ000038382 Salmon_Fels_2_NC_010463 dupe
SEQ000039232 Salmon_Fels_2_NC_010463 dupe
SEQ000039242 Salmon_Fels_2_NC_010463 dupe
intact viruses found: 1122
phages total: 586 near ends: 144
Entero_186_NC_001317 23 2
Entero_P88_NC_026014 12 1
Entero_PsP3_NC_005340 20 1
Escher_D108_NC_013594 13 1
Haemop_HP2_NC_003315 18 3
Phage_Gifsy_1_NC_010392 102 36
Phage_Gifsy_2_NC_010393 47 14
Salmon_118970_sal3_NC_031940 37 16
Salmon_Fels_2_NC_010463 95 10
Salmon_RE_2010_NC_019488 20 2
Salmon_SEN34_NC_028699 50 18
Salmon_SP_004_NC_021774 21 0
458 104
46 different viruses putatively identified
['Edward_GF_2_NC_026611', 'Entero_186_NC_001317', 'Entero_Arya_NC_031048', 'Entero_ES18_NC_006949', 'Entero_Mu_NC_000929', 'Entero_P1_NC_005856', 'Entero_P22_NC_00237

In [5]:
#Number of distinct phages recorded for each genome that has at least one
num_viruses = [len(phd[genome]) for genome in phd]
print(sum(num_viruses)) #total genome/phage pairs
print(len(phd)) #genomes with at least one phage
print(max(num_viruses)) #most phages found in a single genome

577
320
6


In [6]:
##NOW CREATE THE IMAGE##
#Draws one box per (genome, phage) hit: rows follow `go`, columns follow `phages`.
#NOTE: scipy.misc.toimage was removed in SciPy 1.2, so this cell needs an older SciPy
#(or a port to Pillow's Image.fromarray).

#phage count cutoff: only phages recorded in more than `co` genomes get a column
cutoff = True
if cutoff:
    co = 10 #cutoff value
    phagedump = [phage for genome in phd for phage in phd[genome]]
    phages = [i for i in phages if phagedump.count(i) > co]

##COLOR VALUES
#set what percent value should be red.
#the colour then runs linearly red -> yellow -> green up to 100%
redpct = 50
half_span = (100-redpct)/2 #each half of the ramp covers this many percent


phc = {} #Phage counts: phc[phage] = number of genomes in `go` showing it
gphc = {genome: 0 for genome in go} #displayed phages per genome, zeros included
#create a matrix of the image size
#((row, col), pct) -> ((genome,phage),match)

##pixel dimensions
bh = 10 #box height
bw = 40 #box width
lw = 3 #line width
ow = 5 #outline width
black = [0,0,0]
white = [255,255,255]
grey = [100,100,100]

height = (len(go)*(bh+lw))-lw+(2*ow)
width = (len(phages)*(bw+lw))-lw+(2*ow)

data = np.zeros((height,width,3), np.uint8)

##grid lines and outline
#(the canvas starts out black, so black lines leave it unchanged; switch `color` to see them)
color_my_lines = True
if color_my_lines:
    color = black
    lxs = [range(ow-lw+((bw+lw)*i),(ow+((bw+lw)*i))) for i in range(1,len(phages))]#line coloring values x axis
    lys = [range(ow-lw+((bh+lw)*i),(ow+((bh+lw)*i))) for i in range(1,len(go))]#line coloring values y axis

    #divider rows directly below each genome listed in gos (subspecies/clade boundaries)
    lysw = [range(ow-lw+((bh+lw)*(go.index(i)+1)),(ow+((bh+lw)*(go.index(i)+1)))) for i in gos]

    for x in lxs:
        data[:, x.start:x.stop] = color
    for y in lys:
        data[y.start:y.stop, :] = color
    for y in lysw:#dividers for clades (drawn in black in this figure)
        data[y.start:y.stop, :] = black
    ##color in outlines
    data[:, :ow] = color
    data[:, width-ow:] = color
    data[:ow, :] = color
    data[height-ow:, :] = color


for row, genome in enumerate(go):
    if genome not in phd:
        continue #no phage recorded for this genome; its count stays 0
    top = ow+(row*(bh+lw))
    for col, phage in enumerate(phages):
        if phage not in phd[genome]:
            continue
        pct = phd[genome][phage]
        #[255,0,0] is red, [0,255,0] is green
        if pct <= redpct+half_span:
            #lower half of the ramp: red fixed, green rising
            red = 255
            green = round(((pct-redpct)/half_span)*(255))
        else:
            #upper half of the ramp: green fixed, red falling
            red = round(((100-pct)/half_span)*(255))
            green = 255
        #clamp so that a pct outside redpct..100 cannot wrap around in the uint8 image
        color = [max(0, min(255, red)), max(0, min(255, green)), 0]
        left = ow+(col*(bw+lw))
        data[top:top+bh, left:left+bw] = color
        phc[phage] = phc.get(phage, 0) + 1
        gphc[genome] += 1

img = smp.toimage(data)
img.save('2017-08-15_PHASTER_image_10PhageCutoff.png')
img.show()

for number, i in enumerate(phages, 1):
    count = phc.get(i, 0) #0 when none of the genomes in `go` shows this phage
    print(number, i, ' '*(30-len(i)), 'count:', count, '\tproportion:', count/len(go))
print('Min:', min(gphc.values()), '\tMax:', max(gphc.values()),'\tAvg:', \
      sum(gphc.values())/len(go))

1 Entero_186_NC_001317            count: 23 	proportion: 0.051685393258426963
2 Entero_P88_NC_026014            count: 12 	proportion: 0.02696629213483146
3 Entero_PsP3_NC_005340           count: 20 	proportion: 0.0449438202247191
4 Escher_D108_NC_013594           count: 13 	proportion: 0.029213483146067417
5 Haemop_HP2_NC_003315            count: 18 	proportion: 0.04044943820224719
6 Phage_Gifsy_1_NC_010392         count: 100 	proportion: 0.2247191011235955
7 Phage_Gifsy_2_NC_010393         count: 47 	proportion: 0.10561797752808989
8 Salmon_118970_sal3_NC_031940    count: 37 	proportion: 0.08314606741573034
9 Salmon_Fels_2_NC_010463         count: 90 	proportion: 0.20224719101123595
10 Salmon_RE_2010_NC_019488        count: 19 	proportion: 0.04269662921348315
11 Salmon_SEN34_NC_028699          count: 50 	proportion: 0.11235955056179775
12 Salmon_SP_004_NC_021774         count: 21 	proportion: 0.04719101123595506
Min: 0 	Max: 5 	Avg: 1.0112359550561798


In [14]:
#Phage Stats

#structure of the phage dictionary
#phd[genome] = {}
#phd[genome][phage] = pct match

#cutoff for phage calling
cutoff = True
co = 50
#NOTE(review): with cutoff = True every hit below `co` is already dropped while scraping,
#so "Number below cutoff" and "Percent unknown" further down always come out as 0.
#Set cutoff = False if those figures are meant to count the low-similarity phages.


##first, scrape the data
phages = []
phd = {} #phage dictionary
phasterd = '/Users/jay.worley/Desktop/PHASTER/unzipped_results/'
for i in [i for i in os.listdir(phasterd) if os.path.isdir(phasterd+i) and i in go]:
    resultsf = phasterd+i+'/summary.txt'
    with open(resultsf, 'r') as f:
        rawresults = f.read()
    #the table is the second chunk separated by five newlines; its first 3 lines are skipped
    results = rawresults.split('\n\n\n\n\n')[1].strip().split('\n')[3:]

    for result in results:
        rl = [field.strip() for field in result.split(' ') if len(field)>0]
        if not rl[2].startswith('intact'):
            continue
        genome = rl[4].split('_')[0]
        phage = '_'.join(rl[13].split('(')[0].split('_')[1:])
        pct = float(rl[15].split('%')[0])

        #cutoff if called for
        if cutoff and pct < co:
            continue
        phages.append(phage)
        #keep the highest score when a phage is reported more than once for a genome
        best = phd.setdefault(genome, {})
        if phage not in best or pct > best[phage]:
            best[phage] = pct
phages = sorted(set(phages))

#Average number of phages per genome:
num_genomes = len(go) #every genome in the tree, including those without any phage
print(len(phd))
phage_counts = [len(phd[key]) for key in phd]
print('Average number of complete phages per genome:',round(sum(phage_counts)/num_genomes, 2))

#total number of phages
total_phages = sum(phage_counts)
print('Total complete phages found:', total_phages)

#number of phages of each type scoring at least 50% similarity

cutoff_phages = [phage for key in phd for phage in phd[key] if phd[key][phage] >= 50]
print('Total phages above 50% similarity to known phage in database:', len(cutoff_phages))
num_below = total_phages - len(cutoff_phages)
#guard against an empty result set so the percentage cannot divide by zero
pct_unknown = round(100*num_below/total_phages,1) if total_phages else 0.0
print('Number below cutoff:', num_below, '\nPercent unknown:',\
     pct_unknown, '%\n')

#Breakdown numbers of genomes containing x number of complete phages
phage_number = list(phage_counts)
print('0 phages:',num_genomes-len(phage_number))
for i in sorted(set(phage_number)):
    print(i, 'phages:', phage_number.count(i))

set_cutoff_phages = set(cutoff_phages)
count_set_cutoff_phages = [(i, cutoff_phages.count(i)) for i in set_cutoff_phages]
#most frequent first; ties are broken by name so the table is the same on every run
count_set_cutoff_phages = sorted(count_set_cutoff_phages, key = lambda x: (-x[1], x[0]))
print('\nCutoff phage counts:')
file_lines = []
for hit in count_set_cutoff_phages:
    print(hit[0]+':',hit[1])
    file_lines.append(hit[0]+'\t'+str(hit[1]))
with open('2017-10-25_Phage_table.tab', 'w') as f:
    f.write('\n'.join(file_lines))

320
Average number of complete phages per genome: 1.3
Total complete phages found: 577
Total phages above 50% similarity to known phage in database: 577
Number below cutoff: 0 
Percent unknown: 0.0 %

0 phages: 125
1 phages: 157
2 phages: 93
3 phages: 55
4 phages: 9
5 phages: 3
6 phages: 3

Cutoff phage counts:
Phage_Gifsy_1_NC_010392: 100
Salmon_Fels_2_NC_010463: 90
Salmon_SEN34_NC_028699: 50
Phage_Gifsy_2_NC_010393: 47
Salmon_118970_sal3_NC_031940: 37
Entero_186_NC_001317: 23
Salmon_SP_004_NC_021774: 21
Entero_PsP3_NC_005340: 20
Salmon_RE_2010_NC_019488: 19
Haemop_HP2_NC_003315: 18
Escher_D108_NC_013594: 13
Entero_P88_NC_026014: 12
Salmon_SEN1_NC_029003: 10
Klebsi_phiKO2_NC_005857: 10
Salmon_Fels_1_NC_010391: 9
Salmon_epsilon15_NC_004775: 8
Salmon_SJ46_NC_031129: 8
Entero_P1_NC_005856: 7
Shigel_SfII_NC_021857: 7
Salmon_ST64B_NC_004313: 5
Entero_fiAA91_ss_NC_022750: 5
Escher_phiV10_NC_007804: 5
Entero_Mu_NC_000929: 5
Edward_GF_2_NC_026611: 4
Salmon_SSU5_NC_018843: 4
Entero_UAB_Phi20_N

In [4]:
##PLASMID FINDER OUTPUT##
#Plasmid names are cut at the first underscore, so hits differing only after it are pooled.
#Parenthesised subtypes such as IncFIB(K) are kept as separate plasmids.

##first, scrape the data
plasmids = []
pld = {} #plasmid dictionary: pld[genome][plasmid] = highest pct seen
pff = './../LLNL-MvirDB/Plasmid_finder_hits_after_review.tab'#plasmid finder file
with open(pff,'r') as f:
    pft = f.read() #pftabs
#skip the first (header) line and any blank lines
pfr = [line.strip() for line in pft.split('\n')[1:] if len(line.strip())>0]#plasmid finder results
for hit in pfr:
    fields = hit.split('\t')
    genome = fields[1].split('_')[0]
    plasmid = fields[0].split('_')[0]
    pct = float(fields[2]) #remember, pcts start at 80%
    plasmids.append(plasmid)
    per_genome = pld.setdefault(genome, {})
    #keep only the best-scoring hit of each plasmid within a genome
    if plasmid not in per_genome or pct > per_genome[plasmid]:
        per_genome[plasmid] = pct
plasmids = sorted(set(plasmids))

In [5]:
##NOW CREATE THE IMAGE##
#This includes all plasmids
#Draws one box per (genome, plasmid) hit: rows follow `go`, columns follow `plasmids`.
#NOTE: scipy.misc.toimage was removed in SciPy 1.2, so this cell needs an older SciPy
#(or a port to Pillow's Image.fromarray).

plc = {} #plasmid counts: plc[plasmid] = number of genomes in `go` showing it
gplc = {genome: 0 for genome in go} #plasmids per genome, zeros included
#create a matrix of the image size
#((row, col), pct) -> ((genome,plasmid),match)

##pixel dimensions
bh = 10 #box height
bw = 40 #box width
lw = 3 #line width
ow = 5 #outline width
black = [0,0,0]
white = [255,255,255]
grey = [100,100,100]

height = (len(go)*(bh+lw))-lw+(2*ow)
width = (len(plasmids)*(bw+lw))-lw+(2*ow)

data = np.zeros((height,width,3), np.uint8)

##grid lines and outline
#(the canvas starts out black, so black lines leave it unchanged; switch `color` to see them)
color_my_lines = True
if color_my_lines:
    color = black
    lxs = [range(ow-lw+((bw+lw)*i),(ow+((bw+lw)*i))) for i in range(1,len(plasmids))]#line coloring values x axis
    lys = [range(ow-lw+((bh+lw)*i),(ow+((bh+lw)*i))) for i in range(1,len(go))]#line coloring values y axis

    #divider rows directly below each genome listed in gos (subspecies/clade boundaries)
    lysw = [range(ow-lw+((bh+lw)*(go.index(i)+1)),(ow+((bh+lw)*(go.index(i)+1)))) for i in gos]

    for x in lxs:
        data[:, x.start:x.stop] = color
    for y in lys:
        data[y.start:y.stop, :] = color
    for y in lysw:#dividers for clades (same colour as the other lines in this figure)
        data[y.start:y.stop, :] = color

    ##color in outlines
    data[:, :ow] = color
    data[:, width-ow:] = color
    data[:ow, :] = color
    data[height-ow:, :] = color

for row, genome in enumerate(go):
    if genome not in pld:
        continue #no plasmid hit for this genome; its count stays 0
    top = ow+(row*(bh+lw))
    for col, plasmid in enumerate(plasmids):
        if plasmid not in pld[genome]:
            continue
        pct = pld[genome][plasmid]
        #using yellow to green since percentage hits start at 80, so we have some sorting already by score
        #[255,0,0] is red, [0,255,0] is green
        #clamp so that a pct outside 80..100 cannot wrap around in the uint8 image
        red = max(0, min(255, round(255*((100-pct)/20))))
        green = 255
        color = [red, green, 0]
        left = ow+(col*(bw+lw))
        data[top:top+bh, left:left+bw] = color
        plc[plasmid] = plc.get(plasmid, 0) + 1
        gplc[genome] += 1

img = smp.toimage(data)
img.save('2018-08-06_PLASMIDFINDER_image_un_compressed.png')
img.show()

for number, i in enumerate(plasmids, 1):
    count = plc.get(i, 0) #0 when none of the genomes in `go` carries this plasmid
    print(number, i, ' '*(15-len(i)), 'count:', count, '\tproportion:', count/len(go))
print('Min:', min(gplc.values()), '\tMax:', max(gplc.values()),'\tAvg:', \
      sum(gplc.values())/len(go))

#Breakdown numbers of genomes containing x number of plasmids
#NOTE(review): pld is not restricted to genomes in `go`; the zero-plasmid figure assumes
#every genome in pld is also in `go` - confirm.
plasmid_number = [len(pld[key]) for key in pld]
print('\n0 plasmids:',len(go)-len(plasmid_number))
for i in sorted(set(plasmid_number)):
    print(i,'plasmids',plasmid_number.count(i))

1 Col(Ye4449)      count: 1 	proportion: 0.0022471910112359553
2 Col156           count: 2 	proportion: 0.0044943820224719105
3 ColRNAI          count: 2 	proportion: 0.0044943820224719105
4 ColpVC           count: 6 	proportion: 0.01348314606741573
5 IncA/C           count: 4 	proportion: 0.008988764044943821
6 IncB/O/K/Z       count: 7 	proportion: 0.015730337078651686
7 IncFIB(K)        count: 1 	proportion: 0.0022471910112359553
8 IncFIB(pB171)    count: 17 	proportion: 0.038202247191011236
9 IncFIB(pLF82)    count: 2 	proportion: 0.0044943820224719105
10 IncFII           count: 28 	proportion: 0.06292134831460675
11 IncFII(29)       count: 56 	proportion: 0.1258426966292135
12 IncFII(Yp)       count: 3 	proportion: 0.006741573033707865
13 IncFII(p14)      count: 14 	proportion: 0.03146067415730337
14 IncFII(pCRY)     count: 6 	proportion: 0.01348314606741573
15 IncHI2           count: 1 	proportion: 0.0022471910112359553
16 IncHI2A          count: 1 	proportion: 0.0022471910112359

In [10]:
##NOW CREATE THE IMAGE##
#This has ONLY INC F MEMBERS
#Draws one box per (genome, plasmid) hit: rows follow `go`, columns follow `Fplasmids`.
#NOTE: scipy.misc.toimage was removed in SciPy 1.2, so this cell needs an older SciPy
#(or a port to Pillow's Image.fromarray).

pldF = {} #plasmid dictionary with only incF or incFII members
Fplasmids = [] #plasmids in incF or incFII

for genome in pld:
    incF_plasmids = [i for i in pld[genome] if i.startswith('IncF')]
    if not incF_plasmids:
        continue #pass by those with no incF or incFII plasmids
    pldF[genome] = {plasmid: pld[genome][plasmid] for plasmid in incF_plasmids}
    Fplasmids.extend(incF_plasmids)
Fplasmids = sorted(set(Fplasmids)) #one column per distinct plasmid, to get the right width

plc = {} #plasmid counts: plc[plasmid] = number of genomes in `go` showing it
gplc = {genome: 0 for genome in go} #IncF plasmids per genome, zeros included
#create a matrix of the image size
#((row, col), pct) -> ((genome,plasmid),match)

##pixel dimensions
bh = 10 #box height
bw = 40 #box width
lw = 0 #line width (0: no grid or divider lines are drawn)
ow = 0 #outline width (0: no outline is drawn)
black = [0,0,0]
white = [255,255,255]
grey = [100,100,100]

height = (len(go)*(bh+lw))-lw+(2*ow)
width = (len(Fplasmids)*(bw+lw))-lw+(2*ow)

data = np.zeros((height,width,3), np.uint8)

##grid lines and outline (all of these are empty while lw and ow are 0)
color_my_lines = True
if color_my_lines:
    color = grey
    lxs = [range(ow-lw+((bw+lw)*i),(ow+((bw+lw)*i))) for i in range(1,len(Fplasmids))]#line coloring values x axis
    lys = [range(ow-lw+((bh+lw)*i),(ow+((bh+lw)*i))) for i in range(1,len(go))]#line coloring values y axis

    #white line dividers directly below each genome listed in gos (subspecies/clade boundaries)
    lysw = [range(ow-lw+((bh+lw)*(go.index(i)+1)),(ow+((bh+lw)*(go.index(i)+1)))) for i in gos]

    for x in lxs:
        data[:, x.start:x.stop] = color
    for y in lys:
        data[y.start:y.stop, :] = color
    for y in lysw:#dividers for clades
        data[y.start:y.stop, :] = white

    ##color in outlines
    data[:, :ow] = color
    data[:, width-ow:] = color
    data[:ow, :] = color
    data[height-ow:, :] = color

for row, genome in enumerate(go):
    if genome not in pldF:
        continue #no IncF plasmid for this genome; its count stays 0
    top = ow+(row*(bh+lw))
    for col, plasmid in enumerate(Fplasmids):
        if plasmid not in pldF[genome]:
            continue
        pct = pldF[genome][plasmid]
        #using yellow to green since percentage hits start at 80, so we have some sorting already by score
        #[255,0,0] is red, [0,255,0] is green
        #clamp so that a pct outside 80..100 cannot wrap around in the uint8 image
        red = max(0, min(255, round(255*((100-pct)/20))))
        green = 255
        color = [red, green, 0]
        left = ow+(col*(bw+lw))
        data[top:top+bh, left:left+bw] = color
        plc[plasmid] = plc.get(plasmid, 0) + 1
        gplc[genome] += 1

img = smp.toimage(data)
img.save('2017-08-23_PLASMIDFINDER_IncF_and_FII_Only.png')
img.show()

for number, i in enumerate(Fplasmids, 1):
    count = plc.get(i, 0) #0 when none of the genomes in `go` carries this plasmid
    print(number, i, ' '*(15-len(i)), 'count:', count, '\tproportion:', count/len(go))
print('Min:', min(gplc.values()), '\tMax:', max(gplc.values()),'\tAvg:', \
      sum(gplc.values())/len(go))

#Breakdown numbers of genomes containing x number of IncF plasmids
#NOTE(review): pldF is not restricted to genomes in `go`; the zero-plasmid figure assumes
#every genome in pldF is also in `go` - confirm.
plasmid_number = [len(pldF[key]) for key in pldF]
print('\n0 plasmids:',len(go)-len(plasmid_number))
for i in sorted(set(plasmid_number)):
    print(i,'plasmids',plasmid_number.count(i))

1 IncFIB(K)        count: 1 	proportion: 0.0022471910112359553
2 IncFIB(pB171)    count: 18 	proportion: 0.04044943820224719
3 IncFIB(pLF82)    count: 2 	proportion: 0.0044943820224719105
4 IncFII           count: 28 	proportion: 0.06292134831460675
5 IncFII(29)       count: 56 	proportion: 0.1258426966292135
6 IncFII(Yp)       count: 3 	proportion: 0.006741573033707865
7 IncFII(p14)      count: 14 	proportion: 0.03146067415730337
8 IncFII(pCRY)     count: 7 	proportion: 0.015730337078651686
Min: 0 	Max: 2 	Avg: 0.2898876404494382

0 plasmids: 328
1 plasmids 104
2 plasmids 13


In [83]:
#Rebuild the IncF-only view of the plasmid dictionary and print it for inspection
pldF = {} #plasmid dictionary with only incF or incFII members
Fplasmids = [] #plasmids in incF or incFII

for genome, hits in pld.items():
    incF_plasmids = [name for name in hits if name.startswith('IncF')]
    if not incF_plasmids:
        continue #pass by those with no incF or incFII plasmids
    pldF[genome] = {name: hits[name] for name in incF_plasmids}
    Fplasmids.extend(incF_plasmids)
Fplasmids = list(set(Fplasmids)) #distinct names, in no particular order
print(Fplasmids)
print(pldF)


['IncFII(pCRY)', 'IncFIB(K)', 'IncFII(Yp)', 'IncFII(29)', 'IncFIB(pLF82)', 'IncFII(p14)', 'IncFII', 'IncFIB(pB171)']
{'SEQ000031518': {'IncFII(pCRY)': 95.262, 'IncFII': 82.625}, 'SEQ000009073': {'IncFII': 82.996}, 'SEQ000041121': {'IncFII(29)': 81.985}, 'SEQ000014948': {'IncFII': 88.213}, 'SEQ000014048': {'IncFII(29)': 83.206}, 'SEQ000007942': {'IncFII(Yp)': 80.973}, 'SEQ000029384': {'IncFII(29)': 83.206, 'IncFII': 81.203}, 'SEQ000018161': {'IncFII(p14)': 87.55}, 'SEQ000014053': {'IncFII(29)': 83.206}, 'SEQ000004780': {'IncFII(29)': 83.206}, 'SEQ000034189': {'IncFII(29)': 83.206}, 'SEQ000015178': {'IncFIB(pB171)': 85.047}, 'SEQ000031933': {'IncFII': 82.996}, 'SEQ000036900': {'IncFII(29)': 83.206, 'IncFIB(pB171)': 90.669}, 'SEQ000015331': {'IncFII(29)': 83.784}, 'SEQ000015070': {'IncFII(p14)': 93.927}, 'SEQ000015057': {'IncFII(29)': 84.556}, 'SEQ000008898': {'IncFII(29)': 83.784}, 'SEQ000007959': {'IncFII(29)': 84.231}, 'SEQ000005604': {'IncFIB(pB171)': 92.702}, 'SEQ000003858': {'IncFIB

In [5]:
##VFDB BLAST HITS##

#For this, we pull from another analysis that eliminated duplicates

#Open the workbook holding the hit table and select its 'Sheet' worksheet.
#The image cells below read class names from row 1, gene classes from row 2,
#gene names from row 3, genome names from column 1 (row 4 onward) and
#percentage values from the remaining cells.
wb = op.load_workbook('/Users/jay.worley/Desktop/LLNL-MvirDB/testbook.xlsx')
ws = wb['Sheet']

In [6]:
#Writes one heat-map PNG per class found in row 1 of the worksheet.
#NOTE: scipy.misc.toimage was removed in SciPy 1.2, so this cell needs an older SciPy
#(or a port to Pillow's Image.fromarray).

##pixel dimensions
bh = 10 #box height
bw = 40 #box width
lw = 0 #line width (0: no grid or divider lines are drawn)
ow = 0 #outline width (0: no outline is drawn)
black = [0,0,0]
white = [255,255,255]
grey = [100,100,100]

#vg class indices in excel file: the 1-based columns whose row-1 cell is filled
class_inds = [i+1 for i in range(ws.max_column) if ws.cell(column = i+1, row = 1).value is not None]
#sentinel marking where the last class stops
#NOTE(review): max_column+2 gives the last class one column past the sheet's last column;
#max_column+1 may have been intended - confirm.
class_inds.append(ws.max_column+2)

color_my_lines = True

vgc = {} #vgc[gene] = number of genomes with a value for that gene
gvgc =  {} #gvgc[genome] = number of genes with a value, summed over all classes
for ind, stop in zip(class_inds, class_inds[1:]):
    class_name = ws.cell(column=ind, row=1).value
    outfile = '2017-05-17_VFDB_'+class_name+'_image.png'
    start = ind
    num_rows = ws.max_row-3 #genome rows begin at row 4

    height = (num_rows*(bh+lw))-lw+(2*ow)
    width = ((stop-start)*(bw+lw))-lw+(2*ow)

    data = np.zeros((height,width,3), np.uint8)

    if color_my_lines:
        color = grey
        lxs = [range(ow-lw+((bw+lw)*i),(ow+((bw+lw)*i))) for i in range(1,stop-start)]#line coloring values x axis
        lys = [range(ow-lw+((bh+lw)*i),(ow+((bh+lw)*i))) for i in range(1,num_rows)]#line coloring values y axis

        #white line dividers directly below each genome listed in gos (subspecies/clade boundaries)
        lysw = [range(ow-lw+((bh+lw)*(go.index(i)+1)),(ow+((bh+lw)*(go.index(i)+1)))) for i in gos]

        for x in lxs:
            for xx in x:
                for yy in range(height):
                    data[yy,xx]=color
        for y in lys:
            for yy in y:
                for xx in range(width):
                    data[yy,xx]=color
        for y in lysw:#Dividers for clades
            for yy in y:
                for xx in range(width):
                    data[yy,xx]=white

        ##color in outlines
        for x in range(ow):
            for y in range(height):
                data[y,x]=color
        for x in range(width-ow,width):
            for y in range(height):
                data[y,x]=color
        for y in range(ow):
            for x in range(width):
                data[y,x]=color
        for y in range(height-ow,height):
            for x in range(width):
                data[y,x]=color

    for i in range(stop-start):
        gene = ws.cell(column=i+start,row=3).value
        geneclass = ws.cell(column=i+start, row=2).value
        vgc[gene] = 0

        ##if new class and not the first, color in lines to the left
        if (i != 0) and (geneclass is not None):
            for xx in range(ow-lw+((bw+lw)*i),(ow+((bw+lw)*i))):
                for yy in range(height):
                    data[yy,xx]=white


        for j in range(num_rows):
            genome = ws.cell(column=1,row=j+4).value
            if genome not in gvgc:
                gvgc[genome] = 0
            value = ws.cell(column=i+start,row=j+4).value
            if value is not None:
                pct = float(value)
                #yellow (pct 80) to green (pct 100); clamp so values outside that range
                #cannot overflow the uint8 image
                red = max(0, min(255, round(255*((100-pct)/20))))
                green = 255
                color =[red, green, 0]
                #the row comes from the tree order in `go`, not from the worksheet row
                for h in range(ow+(go.index(genome)*(bh+lw)),ow+(go.index(genome)*(bh+lw))+bh):
                    for w in range(ow+(i*(bw+lw)),ow+(i*(bw+lw))+bw):
                        data[h,w] = color
                vgc[gene] += 1
                gvgc[genome] += 1
    img = smp.toimage(data)
    img.save(outfile)



In [42]:
#Specific to figure of specifically compared adherence genes
#
# Builds ONE heatmap PNG: one column per gene of the selected adherence gene
# classes (classes ordered alphabetically), one row per genome positioned by
# its index in the tree order `go`. Relies on `ws` (worksheet), `go` and `gos`
# from earlier cells. Sheet layout as used below: row 2 holds the class name in
# the first column of each class, row 3 the gene name, rows 4+ one genome per
# row with the genome id in column 1 and a percentage in the gene columns.

##pixel dimensions
bh = 5 #box height
bw = 40 #box width
lw = 3 #line width
ow = 10 #outline width
black = [0,0,0]
white = [255,255,255]
grey = [100,100,100]

select_genes = sorted(['Lpf','Peg','Saf','Sta','Stc','Ste','Stj','Stk','Tcf'])

# Read the sheet extent once so every loop below works against fixed bounds.
max_col = ws.max_column
n_rows = ws.max_row-3 #genome rows start at row 4

#vg class indices in excel file
# First column (searching from column 2) whose row-2 header equals each
# selected class name; classes that are not found are silently skipped.
class_starts = []
for gene in select_genes:
    for col in range(2, max_col+1):
        if ws.cell(column=col, row=2).value == gene:
            class_starts.append(col)
            break

# Each class spans [first, end) where `end` is the column of the next non-empty
# row-2 header. The scan starts at the column immediately after `first` (so a
# one-column class is not merged with its neighbour) and is bounded by the
# sheet width (so a class with no header to its right ends at the last column
# instead of searching forever).
class_inds = []
for first in class_starts:
    end = first+1
    while end <= max_col and ws.cell(column=end, row=2).value is None:
        end += 1
    class_inds.append((first, end))
color_my_lines = True

vgc = {} #gene -> number of genomes with a value for that gene
gvgc =  {} #genome -> number of genes with a value for that genome
print(class_inds)

#DON'T KICK OFF LOOP - ONE LOOP, ONE IMAGE!
class_name = 'Specific_adherence_genes_independent_evo'
outfile = '2017-05-17_VFDB_'+class_name+'_image.png'
total_cols = sum(end-first for first, end in class_inds)

height = (n_rows*(bh+lw))-lw+(2*ow)
width = ((total_cols)*(bw+lw))-lw+(2*ow)
print(height, width, total_cols)

data = np.zeros((height,width,3), np.uint8) #black canvas, RGB

if color_my_lines == True:
    color = grey
    # Each separator is the `lw` pixels immediately before the k-th box.
    for k in range(1,total_cols): #vertical grid lines
        edge = ow+((bw+lw)*k)
        data[:, edge-lw:edge] = color
    for k in range(1,n_rows): #horizontal grid lines
        edge = ow+((bh+lw)*k)
        data[edge-lw:edge, :] = color

    #white line dividers for different subspecies/clades
    # Drawn directly below the row of every genome listed in `gos`.
    for boundary_genome in gos:
        edge = ow+((bh+lw)*(go.index(boundary_genome)+1))
        data[edge-lw:edge, :] = white

    ##color in outlines
    # Explicit `width-ow` / `height-ow` bounds keep these slices empty when ow == 0.
    data[:, :ow] = color
    data[:, width-ow:] = color
    data[:ow, :] = color
    data[height-ow:, :] = color

# Flatten the class spans into the ordered list of sheet columns to draw.
gene_inds = []
for first, end in class_inds:
    gene_inds.extend(range(first, end))

for column_num, col in enumerate(gene_inds):
    gene = ws.cell(column=col,row=3).value
    geneclass = ws.cell(column=col, row=2).value
    vgc[gene] = 0

    ##if new class and not the first, color in lines to the left
    if (column_num != 0) and (geneclass is not None):
        edge = ow+((bw+lw)*column_num)
        data[:, edge-lw:edge] = white

    left = ow+(column_num*(bw+lw))
    for j in range(n_rows):
        genome = ws.cell(column=1,row=j+4).value
        if genome not in gvgc:
            gvgc[genome] = 0
        value = ws.cell(column=col,row=j+4).value
        if value is None:
            continue
        pct = float(value)
        # 100% -> pure green, 80% -> yellow. Anything below 80% would give
        # red > 255, which does not fit the uint8 image, so clamp to [0, 255]
        # (the original `red == 255` compared instead of assigning).
        red = max(0, min(255, round(255*((100-pct)/20))))
        top = ow+(go.index(genome)*(bh+lw))
        data[top:top+bh, left:left+bw] = [red, 255, 0]
        vgc[gene] += 1
        gvgc[genome] += 1
# NOTE(review): scipy.misc.toimage is not available in recent SciPy releases;
# this cell needs the SciPy version the notebook was written against.
img = smp.toimage(data).save(outfile)

[(37, 42), (58, 62), (26, 30), (82, 89), (33, 37), (96, 102), (111, 116), (62, 69), (18, 22)]
3577 1995 46


In [None]:
# Draws one heatmap PNG per top-level class: columns are the sheet columns
# between two consecutive entries of `class_inds`, rows are genomes placed by
# their index in the tree order `go`.
# Relies on kernel state from other cells: ws, go, gos, bh, bw, lw, ow, white,
# grey, color_my_lines, vgc, gvgc and class_inds.
# NOTE(review): `class_inds` must be the list of integer column indices (as
# built from row 1 of the sheet); the adherence-gene cell rebinds it to a list
# of (start, end) tuples, after which this loop cannot run.
for start, stop in zip(class_inds[:-1], class_inds[1:]):
    class_name = ws.cell(column=start, row=1).value
    outfile = '2017-05-17_VFDB_'+class_name+'_image.png'
    n_cols = stop-start
    n_rows = ws.max_row-3 #genome rows start at row 4

    height = (n_rows*(bh+lw))-lw+(2*ow)
    width = (n_cols*(bw+lw))-lw+(2*ow)

    data = np.zeros((height,width,3), np.uint8) #black canvas, RGB

    if color_my_lines == True:
        color = grey
        # Each separator is the `lw` pixels immediately before the k-th box.
        for k in range(1,n_cols): #vertical grid lines
            edge = ow+((bw+lw)*k)
            data[:, edge-lw:edge] = color
        for k in range(1,n_rows): #horizontal grid lines
            edge = ow+((bh+lw)*k)
            data[edge-lw:edge, :] = color

        #white line dividers for different subspecies/clades
        # Drawn directly below the row of every genome listed in `gos`.
        for boundary_genome in gos:
            edge = ow+((bh+lw)*(go.index(boundary_genome)+1))
            data[edge-lw:edge, :] = white

        ##color in outlines
        # Explicit `width-ow` / `height-ow` bounds keep these slices empty when ow == 0.
        data[:, :ow] = color
        data[:, width-ow:] = color
        data[:ow, :] = color
        data[height-ow:, :] = color

    for i in range(n_cols):
        gene = ws.cell(column=i+start,row=3).value
        geneclass = ws.cell(column=i+start, row=2).value
        vgc[gene] = 0

        ##if new class and not the first, color in lines to the left
        if (i != 0) and (geneclass is not None):
            edge = ow+((bw+lw)*i)
            data[:, edge-lw:edge] = white

        left = ow+(i*(bw+lw))
        for j in range(n_rows):
            genome = ws.cell(column=1,row=j+4).value
            if genome not in gvgc:
                gvgc[genome] = 0
            value = ws.cell(column=i+start,row=j+4).value
            if value is None:
                continue
            pct = float(value)
            # 100% -> pure green, 80% -> yellow. Anything below 80% would give
            # red > 255, which does not fit the uint8 image, so clamp to
            # [0, 255] (the original `red == 255` compared instead of assigning).
            red = max(0, min(255, round(255*((100-pct)/20))))
            top = ow+(go.index(genome)*(bh+lw))
            data[top:top+bh, left:left+bw] = [red, 255, 0]
            vgc[gene] += 1
            gvgc[genome] += 1
    # NOTE(review): scipy.misc.toimage is not available in recent SciPy
    # releases; this cell needs the SciPy version the notebook was written against.
    img = smp.toimage(data).save(outfile)




In [12]:
# Summary table of the counts gathered while drawing the heatmaps:
# one line per gene (how many genomes carry it, and that count as a fraction of
# the genomes in the tree order `go`), then min / max / average number of genes
# per genome. Relies on vgc, gvgc and go from earlier cells.
vgenes = sorted(gene for gene in vgc if gene is not None)
n_genomes = len(go)
for rank, gene in enumerate(vgenes, start=1):
    try:
        print(rank, gene, ' '*(15-len(gene)), 'count:', vgc[gene], '\tproportion:', vgc[gene]/n_genomes)
    except (TypeError, ZeroDivisionError):
        # Skip rows that cannot be formatted (gene label without a length, or
        # an empty `go`) as before, but let every other error surface instead
        # of being swallowed by a bare except.
        continue
genes_per_genome = list(gvgc.values())
print('Min:', min(genes_per_genome), '\tMax:', max(genes_per_genome),'\tAvg:', \
      sum(genes_per_genome)/n_genomes)

1 -                count: 0 	proportion: 0.0
2 avrA             count: 343 	proportion: 0.7707865168539326
3 bcfA             count: 223 	proportion: 0.501123595505618
4 bcfB             count: 216 	proportion: 0.4853932584269663
5 bcfC             count: 437 	proportion: 0.9820224719101124
6 bcfD             count: 437 	proportion: 0.9820224719101124
7 bcfE             count: 437 	proportion: 0.9820224719101124
8 bcfF             count: 217 	proportion: 0.48764044943820223
9 bcfG             count: 436 	proportion: 0.9797752808988764
10 cdtB             count: 176 	proportion: 0.3955056179775281
11 csgA             count: 445 	proportion: 1.0
12 csgB             count: 445 	proportion: 1.0
13 csgC             count: 445 	proportion: 1.0
14 csgD             count: 445 	proportion: 1.0
15 csgE             count: 444 	proportion: 0.9977528089887641
16 csgF             count: 445 	proportion: 1.0
17 csgG             count: 445 	proportion: 1.0
18 fimA             count: 444 	proportion: 0

In [None]:
##Virulence Gene Image - refined a bit
##Include adherence operons, effector genes, and toxin genes
# Draws one heatmap PNG per top-level class (row-1 header of the sheet):
# columns are the genes of that class, rows are genomes placed by their index
# in the tree order `go`. With lw = 0 and ow = 0 no grid lines or outline are
# actually drawn. Relies on ws, go and gos from earlier cells.
##pixel dimensions
bh = 10 #box height
bw = 40 #box width
lw = 0 #line width
ow = 0 #outline width
black = [0,0,0]
white = [255,255,255]
grey = [100,100,100]

#vg class indices in excel file
# Columns whose row-1 cell is non-empty mark the first column of each class.
class_inds = [col for col in range(1, ws.max_column+1) if ws.cell(column=col, row=1).value is not None]
# Sentinel end boundary for the last class.
# NOTE(review): max_column+2 makes the last class span one column past the
# last sheet column (an empty trailing column in the last image) - confirm
# whether max_column+1 was intended.
class_inds.append(ws.max_column+2)

color_my_lines = True

vgc = {} #gene -> number of genomes with a value for that gene
gvgc =  {} #genome -> number of genes with a value for that genome
for start, stop in zip(class_inds[:-1], class_inds[1:]):
    class_name = ws.cell(column=start, row=1).value
    outfile = '2017-05-17_VFDB_'+class_name+'_image.png'
    n_cols = stop-start
    n_rows = ws.max_row-3 #genome rows start at row 4

    height = (n_rows*(bh+lw))-lw+(2*ow)
    width = (n_cols*(bw+lw))-lw+(2*ow)

    data = np.zeros((height,width,3), np.uint8) #black canvas, RGB

    if color_my_lines == True:
        color = grey
        # Each separator is the `lw` pixels immediately before the k-th box.
        for k in range(1,n_cols): #vertical grid lines
            edge = ow+((bw+lw)*k)
            data[:, edge-lw:edge] = color
        for k in range(1,n_rows): #horizontal grid lines
            edge = ow+((bh+lw)*k)
            data[edge-lw:edge, :] = color

        #white line dividers for different subspecies/clades
        # Drawn directly below the row of every genome listed in `gos`.
        for boundary_genome in gos:
            edge = ow+((bh+lw)*(go.index(boundary_genome)+1))
            data[edge-lw:edge, :] = white

        ##color in outlines
        # Explicit `width-ow` / `height-ow` bounds keep these slices empty when ow == 0.
        data[:, :ow] = color
        data[:, width-ow:] = color
        data[:ow, :] = color
        data[height-ow:, :] = color

    for i in range(n_cols):
        gene = ws.cell(column=i+start,row=3).value
        geneclass = ws.cell(column=i+start, row=2).value
        vgc[gene] = 0

        ##if new class and not the first, color in lines to the left
        if (i != 0) and (geneclass is not None):
            edge = ow+((bw+lw)*i)
            data[:, edge-lw:edge] = white

        left = ow+(i*(bw+lw))
        for j in range(n_rows):
            genome = ws.cell(column=1,row=j+4).value
            if genome not in gvgc:
                gvgc[genome] = 0
            value = ws.cell(column=i+start,row=j+4).value
            if value is None:
                continue
            pct = float(value)
            # 100% -> pure green, 80% -> yellow. Anything below 80% would give
            # red > 255, which does not fit the uint8 image, so clamp to
            # [0, 255] (the original `red == 255` compared instead of assigning).
            red = max(0, min(255, round(255*((100-pct)/20))))
            top = ow+(go.index(genome)*(bh+lw))
            data[top:top+bh, left:left+bw] = [red, 255, 0]
            vgc[gene] += 1
            gvgc[genome] += 1
    # NOTE(review): scipy.misc.toimage is not available in recent SciPy
    # releases; this cell needs the SciPy version the notebook was written against.
    img = smp.toimage(data).save(outfile)


