In [1]:
#libraries needed for extraction and for formatting spreadsheet

import os
import Bio
from Bio import SeqIO
from Bio.Blast.Applications import NcbiblastnCommandline
from Bio.Seq import Seq
from Bio.Alphabet import IUPAC
from Bio.SeqRecord import SeqRecord
import xml.etree.ElementTree as ET
from openpyxl import Workbook
from openpyxl.styles import Color, PatternFill, Style


In [2]:
#BLAST every record of every '.fa' file in './VGDB' against the genome database.
#Genomes have the simplified name S## where ## are integers.
#Each record is written to a temporary query file, which is deleted at the end.
#(A temp file is used so the in-sequence can be reformatted in the future.)
#Output - xml files (outfmt=5) to './VG_BLAST_HITS/', named after the FASTA file.
#NOTE: because the output name comes from the file name, a multi-record FASTA
#file keeps only the result of its last record.
#Folders must be made in advance if not already existing.

q = './VGDB/tempfile.fasta'
d = './BLAST_DB/DavisDB.fasta'

for file in os.listdir('./VGDB'):
    if not file.endswith('.fa'):
        continue
    o = './VG_BLAST_HITS/'+os.path.split(file)[1].split('.')[0]+'.xml'
    for item in SeqIO.parse('./VGDB/'+file,'fasta'):
        #SeqIO.write opens and closes the path itself, so no separate
        #file handle is needed (the old one was never closed).
        SeqIO.write(item, q, 'fasta')
        status = os.system(str(NcbiblastnCommandline(query = q, db = d, out = o, outfmt=5)))
        if status != 0:
            #Do not let a failed search pass silently: the xml would be missing or stale.
            print('blastn failed for '+file+' (exit status '+str(status)+')')

#The temp file only exists if at least one record was written.
if os.path.exists(q):
    os.remove(q)
     

In [3]:
#Extract BLAST hits

#hitdict is a dictionary of all blast hits, keyed by '<Hit_id>|<xml file stem>'.
#It stores a list of important data for each hit for calculation.
#Only one HSP can be stored per key: a later HSP replaces an earlier one and
#an 'overwrite at' line is printed so that the loss is visible.
#hitdict list order:
#query start, query stop, query length, alignment length,
#number of identical positions, number of 'positive' positions
#number of gaps.
#[qstart,qstop,qlen,alen,ids,pos,gaps]  (all kept as the strings found in the xml)

hitdict = {}

#retrieves xml file, parses with element tree, root = top node
for file in os.listdir('./VG_BLAST_HITS'):
    if not file.endswith('.xml'):
        continue
    queryg = os.path.splitext(file)[0]
    queryf = os.path.join('./VG_BLAST_HITS', file)
    root = ET.parse(queryf).getroot()

    #Iteration - query information
    #hit- hits within genome
    #hsp- alignment info within a hit
    for iteration in root.findall(".//Iteration"):
        qlen = iteration.find('Iteration_query-len').text
        for hit in iteration.findall(".//Hit"):
            contig = hit.find('Hit_id').text
            key = contig+'|'+queryg
            #Store inside the HSP loop: a hit without any HSP then adds nothing,
            #instead of silently re-using the values of the previous hit.
            for hsp in hit.findall('.//Hit_hsps/Hsp'):
                qstart = hsp.find('Hsp_query-from').text
                qstop = hsp.find('Hsp_query-to').text
                alen = hsp.find('Hsp_align-len').text
                ids = hsp.find('Hsp_identity').text
                pos = hsp.find('Hsp_positive').text
                gaps = hsp.find('Hsp_gaps').text

                #dict membership is a direct lookup; no need to copy the keys
                if key in hitdict:
                    print('overwrite at '+key)

                hitdict[key]=[qstart,qstop,qlen,alen,ids,pos,gaps]

In [8]:
#Create lists of all genomes and genes - used to index information by item name!
#Requires hitdict from the "Extract BLAST hits" cell.

#Output is an Excel file with the relevant information.
#index/title are passed by keyword: the positional order of create_sheet
#differs between openpyxl versions, and a bare create_sheet(0) is read as a
#title by current versions.
wb = Workbook()
ws1 = wb.create_sheet(title='hit locations', index=0)
ws2 = wb.create_sheet(title='hit count', index=1)
ws3 = wb.create_sheet(title='%ID, hit len, hit range', index=2)
ws4 = wb.create_sheet(title='%ID, %POS, Gaps%', index=3)
ws5 = wb.create_sheet(title='total query coverage & gaps', index=4)
keys = hitdict.keys()

#Key layout (example): S87(NODE_65)|EH250_2b_A
#  text before '|' = hit id, text before '(' = genome, text after '|' = gene
#Genomes go in the columns, genes in the rows.
hnames = set()
gnames = set()
for item in keys:
    hnames.add(item.split('|')[0].split('(')[0])
    gnames.add(item.split('|')[1])

#Set removes duplicates, sorted sorts the names.
#Two placeholders are put in front so that a name's list index is also its
#sheet row/column (row 1 and column 1 hold the labels). They are prepended
#after sorting so the offset does not depend on how the names happen to sort.
gn = ['0','1'] + sorted(gnames)
hn = ['0','1'] + sorted(hnames)
grow = dict((name, n) for n, name in enumerate(gn) if n >= 2)
hcol = dict((name, n) for n, name in enumerate(hn) if n >= 2)

#Labels, labels in the first row and column are important...
sheets = (ws1, ws2, ws3, ws4, ws5)
for n in range(2, len(gn)):
    for ws in sheets:
        ws.cell(row = n, column = 1).value = gn[n]
for n in range(2, len(hn)):
    for ws in sheets:
        ws.cell(row = 1, column = n).value = hn[n]

#makes the file
for item in hitdict:
    hitid = item.split('|')[0]
    genome = hitid.split('(')[0]
    gene = item.split('|')[1]
    qstart, qstop, qlen, alen, ids, pos, gaps = hitdict[item]
    hi = hcol[genome]
    gi = grow[gene]

    #ws1: comma separated list of hit ids for this gene/genome
    cell = ws1.cell(row = gi, column = hi)
    cell.value = hitid if cell.value is None else cell.value+','+hitid

    #ws3: identity (percent), lengths and query range of every hit, ' || ' separated.
    #(The query length used to be glued directly onto the %ID number here.)
    entry = 'ID: '+hitid+', {0:.2f}'.format(int(ids)*100/int(alen))+'%ID, Query len: '+qlen+' Aln len: '+alen+', range: '+qstart+'-'+qstop
    cell = ws3.cell(row = gi, column = hi)
    cell.value = entry if cell.value is None else cell.value+' || '+entry

    #ws4: identity/alignment length, positives/query length, gaps/query length.
    #These are written as fractions (0-1), not multiplied by 100.
    entry = '{0:.2f}'.format(int(ids)/int(alen))+', '+'{0:.2f}'.format(int(pos)/int(qlen))+', '+'{0:.2f}'.format(int(gaps)/int(qlen))
    cell = ws4.cell(row = gi, column = hi)
    cell.value = entry if cell.value is None else cell.value+' || '+entry

#ws2: number of hits = number of comma separated ids in ws1
for gi in range(2, len(gn)):
    for hi in range(2,len(hn)):
        located = ws1.cell(row = gi, column = hi).value
        if located is not None:
            ws2.cell(row = gi, column = hi).value = len(located.split(','))

wb.save('VG_gene_locations_WpSS17.xlsx')

TypeError: expected string or bytes-like object

In [5]:
#Create a table for in-text presentation
#Must run the hitdict creator first (the "Extract BLAST hits" cell)
#One row per genome, one column per gene; a cell is filled blue when the
#accepted hits cover at least 60% of the query, grey otherwise.

wb = Workbook()
#title/index by keyword: a bare create_sheet(0) is read as a title by current openpyxl
ws1 = wb.create_sheet(title='In Text Figure', index=0)

fillyes = PatternFill(fill_type='solid', start_color='0000FF', end_color='0000FF')
fillno = PatternFill(fill_type='solid', start_color='CCCCCC', end_color='CCCCCC')


#Retrieve keys from hitdict for creating lists of genes and genomes
keys = hitdict.keys()

hn=[]
gn=[]
for item in keys:
    hn.append(item.split('|')[0].split('(')[0])
    gn.append(item.split('|')[1])

#Create sorted sets of the genes here.
gn = sorted(set(gn))
hn = sorted(set(hn))

#Exclude lists: lists of things that should be excluded from the table
#gnx - genes to exclude
#hnx - genomes (hits) to exclude

#S54 - Low quality.
#S17 - Returns very fragmented blast hits. Unknown reason - other quality metrics look fine
#NOTE(review): hits used to be selected with a substring test, and 'S17' is a
#substring of every 'SS17_*' gene name, so S17 collected the hits of all
#genomes for those genes - possibly the cause of the fragmentation. Confirm.

gnx = ['EDL933_etpoperonpartialhlyC','Sakai_hlyA', 'Sakai_hlyB', 'Sakai_hlyC', 'Sakai_hlyD', 'pO157_ecf1', 'pO157_ecf1to4partial4', 'pO157_ecf2']
hnx = ['S54', 'S17']
#Filtering (instead of list.remove) also works when an excluded name is absent.
gn = [name for name in gn if name not in gnx]
hn = [name for name in hn if name not in hnx]
print(gn)
print(hn)

#Label columns and rows
for n in range(len(hn)):
    ws1.cell(row = n+2, column = 1).value = hn[n]
for n in range(len(gn)):
    ws1.cell(row = 1, column = n+2).value = gn[n]

#hitdict[key]=[qstart,qstop,qlen,alen,ids,pos,gaps]
tempgn = []
temphn = []

#Group the accepted query ranges by exact (genome, gene).
#Exact comparison matters: with a substring test 'SS17_nleG' also picks up
#the 'SS17_nleG7' hits.
hitranges = {}
qlens = {}
for item in hitdict:
    genome = item.split('|')[0].split('(')[0]
    genename = item.split('|')[1]
    qlens[genename] = int(hitdict[item][2])
    #if %ID < 95%, ignore. O157 is very homogenous, high ID cutoff
    if (int(hitdict[item][4])/int(hitdict[item][3]))<.95:
        continue
    hitranges.setdefault((genome, genename), []).append((int(hitdict[item][0]),int(hitdict[item][1])))

for row, hit in enumerate(hn, 2):
    for col, gene in enumerate(gn, 2):
        #Merge overlapping or touching ranges (coordinates are inclusive).
        #Taking the larger end keeps a range from shrinking when a hit lies
        #completely inside the previous one.
        hitpss = []
        for start, stop in sorted(hitranges.get((hit, gene), [])):
            if hitpss and start <= hitpss[-1][1]:
                hitpss[-1] = (hitpss[-1][0], max(hitpss[-1][1], stop))
            else:
                hitpss.append((start, stop))

        lens = [(i[1]-i[0]+1) for i in hitpss]
        slens = sum(lens)
        #every gene in gn came from a hitdict key, so its query length is known
        qlen = qlens[gene]
        if (slens/qlen) >= 0.6:
            ws1.cell(row = row, column = col).fill = fillyes
        else:
            ws1.cell(row = row, column = col).fill = fillno

wb.save('VG_gene_in-text_WpSS17.xlsx')


['EDL933_eae', 'EDL933_espB', 'EDL933_espF', 'EDL933_tir', 'EDL933_vgrG', 'SS17_1721', 'SS17_2051', 'SS17_2052', 'SS17_3604', 'SS17_4737', 'SS17_7017', 'SS17_7023', 'SS17_espD', 'SS17_espG', 'SS17_espH', 'SS17_espJ', 'SS17_espL1', 'SS17_espM2', 'SS17_espN', 'SS17_espP', 'SS17_espR1', 'SS17_espW', 'SS17_espX1', 'SS17_espX4', 'SS17_espX5', 'SS17_espX6', 'SS17_espX7', 'SS17_espY1', 'SS17_espZ', 'SS17_katP', 'SS17_nleA', 'SS17_nleB1', 'SS17_nleB2', 'SS17_nleC', 'SS17_nleD', 'SS17_nleE', 'SS17_nleF', 'SS17_nleG', 'SS17_nleG7', 'SS17_nleH1', 'SS17_nleH2', 'SS17_toxB', 'Sakai_hlyoperon', 'Sakai_stcE', 'pSS17']
['S01', 'S02', 'S03', 'S04', 'S05', 'S06', 'S07', 'S08', 'S09', 'S10', 'S11', 'S12', 'S13', 'S14', 'S15', 'S16', 'S18', 'S19', 'S20', 'S21', 'S22', 'S23', 'S24', 'S25', 'S26', 'S27', 'S28', 'S29', 'S30', 'S31', 'S32', 'S33', 'S34', 'S35', 'S36', 'S37', 'S38', 'S39', 'S40', 'S41', 'S42', 'S43', 'S44', 'S45', 'S46', 'S47', 'S48', 'S49', 'S50', 'S51', 'S52', 'S53', 'S55', 'S56', 'S57', 'S5

In [14]:
#REORDERED GENOME LIST TO MATCH PHYLOGENETIC TREE IN FIGURE

#Create a table for in-text presentation
#Must run the hitdict creator first (the "Extract BLAST hits" cell)
#One row per genome, one column per gene; a cell is filled blue when the
#accepted hits cover at least 60% of the query, grey otherwise.

wb = Workbook()
#title/index by keyword: a bare create_sheet(0) is read as a title by current openpyxl
ws1 = wb.create_sheet(title='In Text Figure', index=0)

fillyes = PatternFill(fill_type='solid', start_color='0000FF', end_color='0000FF')
fillno = PatternFill(fill_type='solid', start_color='CCCCCC', end_color='CCCCCC')


#Retrieve keys from hitdict for creating lists of genes and genomes
keys = hitdict.keys()

hn=[]
gn=[]
for item in keys:
    hn.append(item.split('|')[0].split('(')[0])
    gn.append(item.split('|')[1])

#Create sorted sets of the genes here.
gn = sorted(set(gn))
hn = sorted(set(hn))

#Exclude lists: lists of things that should be excluded from the table
#gnx - genes to exclude
#hnx - genomes (hits) to exclude

#S54 - Low quality.
#S17 - Returns very fragmented blast hits. Unknown reason - other quality metrics look fine
#NOTE(review): hits used to be selected with a substring test, and 'S17' is a
#substring of every 'SS17_*' gene name, so S17 collected the hits of all
#genomes for those genes - possibly the cause of the fragmentation. Confirm.

gnx = ['pSS17','EDL933_etpoperonpartialhlyC','Sakai_hlyA', 'Sakai_hlyB', 'Sakai_hlyC', 'Sakai_hlyD', 'pO157_ecf1', 'pO157_ecf1to4partial4', 'pO157_ecf2']
hnx = ['S54', 'S17','S66','S69','S70']
#Filtering (instead of list.remove) also works when an excluded name is absent.
gn = [name for name in gn if name not in gnx]
hn = [name for name in hn if name not in hnx]

#Row order taken from the phylogenetic tree
hnorder=['S36','S35','S88','S83','S82','S84','S73','S58','S61','S60','S57','S42','S80','S79','S48',
        'S43','S62','S52','S53','S47','S63','S59','S44','S65','S64',
        'S50','S51','S72','S56','S46','S49','S45','S55','S76','S75','S38','S39','S77','S78','S81','S41',
        'S40','S87','S71','S68','S67','S15','S13','S16','S05','S20',
        'S74','S27','S22','S06','S86','S85','S12','S37','S07','S14','S34','S01','S31','S10','S23',
        'S02','S21','S32','S26','S29','S03','S19','S30','S18','S24',
        'S25','S28','S11','S04','S08','S33','S09']
#The tree order is only used when it names exactly the genomes that are left;
#otherwise the sorted order stays and the genomes missing from hnorder are printed.
if sorted(set(hn)) == sorted(set(hnorder)):
    print('set check OK')
    hn = hnorder
for item in hn:
    if item not in hnorder:
        print(item)

print(gn)
print(hn)

#Label columns and rows
for n in range(len(hn)):
    ws1.cell(row = n+2, column = 1).value = hn[n]
for n in range(len(gn)):
    ws1.cell(row = 1, column = n+2).value = gn[n]

#hitdict[key]=[qstart,qstop,qlen,alen,ids,pos,gaps]
tempgn = []
temphn = []

#Group the accepted query ranges by exact (genome, gene).
#Exact comparison matters: with a substring test 'SS17_nleG' also picks up
#the 'SS17_nleG7' hits.
hitranges = {}
qlens = {}
for item in hitdict:
    genome = item.split('|')[0].split('(')[0]
    genename = item.split('|')[1]
    qlens[genename] = int(hitdict[item][2])
    #if %ID < 95%, ignore. O157 is very homogenous, high ID cutoff
    if (int(hitdict[item][4])/int(hitdict[item][3]))<.95:
        continue
    hitranges.setdefault((genome, genename), []).append((int(hitdict[item][0]),int(hitdict[item][1])))

for row, hit in enumerate(hn, 2):
    for col, gene in enumerate(gn, 2):
        #Merge overlapping or touching ranges (coordinates are inclusive).
        #Taking the larger end keeps a range from shrinking when a hit lies
        #completely inside the previous one.
        hitpss = []
        for start, stop in sorted(hitranges.get((hit, gene), [])):
            if hitpss and start <= hitpss[-1][1]:
                hitpss[-1] = (hitpss[-1][0], max(hitpss[-1][1], stop))
            else:
                hitpss.append((start, stop))

        lens = [(i[1]-i[0]+1) for i in hitpss]
        slens = sum(lens)
        #every gene in gn came from a hitdict key, so its query length is known
        qlen = qlens[gene]
        if (slens/qlen) >= 0.6:
            ws1.cell(row = row, column = col).fill = fillyes
        else:
            ws1.cell(row = row, column = col).fill = fillno
        #Only reached when 'pSS17' is taken out of gnx above: then the covered
        #fraction is written into the cell as well.
        if gene == 'pSS17':
            ws1.cell(row = row, column = col).value = (slens/qlen)


wb.save('VG_gene_in-text_reordered2phylo_WpSS17.xlsx')


set check OK
['EDL933_eae', 'EDL933_espB', 'EDL933_espF', 'EDL933_tir', 'EDL933_vgrG', 'SS17_1721', 'SS17_2051', 'SS17_2052', 'SS17_3604', 'SS17_4737', 'SS17_7017', 'SS17_7023', 'SS17_espD', 'SS17_espG', 'SS17_espH', 'SS17_espJ', 'SS17_espL1', 'SS17_espM2', 'SS17_espN', 'SS17_espP', 'SS17_espR1', 'SS17_espW', 'SS17_espX1', 'SS17_espX4', 'SS17_espX5', 'SS17_espX6', 'SS17_espX7', 'SS17_espY1', 'SS17_espZ', 'SS17_katP', 'SS17_nleA', 'SS17_nleB1', 'SS17_nleB2', 'SS17_nleC', 'SS17_nleD', 'SS17_nleE', 'SS17_nleF', 'SS17_nleG', 'SS17_nleG7', 'SS17_nleH1', 'SS17_nleH2', 'SS17_toxB', 'Sakai_hlyoperon', 'Sakai_stcE']
['S36', 'S35', 'S88', 'S83', 'S82', 'S84', 'S73', 'S58', 'S61', 'S60', 'S57', 'S42', 'S80', 'S79', 'S48', 'S43', 'S62', 'S52', 'S53', 'S47', 'S63', 'S59', 'S44', 'S65', 'S64', 'S50', 'S51', 'S72', 'S56', 'S46', 'S49', 'S45', 'S55', 'S76', 'S75', 'S38', 'S39', 'S77', 'S78', 'S81', 'S41', 'S40', 'S87', 'S71', 'S68', 'S67', 'S15', 'S13', 'S16', 'S05', 'S20', 'S74', 'S27', 'S22', 'S06',

In [9]:
#Debug check: `lens` is the list of merged range lengths left over from the
#last gene/genome pair handled by whichever in-text table cell ran last.
print(lens)

[]


In [None]:
#Print every distinct name in temphn, then in tempgn, with its number of occurrences.
temphns = sorted(set(temphn))
tempgns = sorted(set(tempgn))
for distinct, recorded in ((temphns, temphn), (tempgns, tempgn)):
    for name in distinct:
        print(name, recorded.count(name))

In [7]:
#Fill the 'total query coverage & gaps' sheet (ws5) from the hit ranges in ws3.
#Depends on gn, hn, ws3, ws5 and wb from the cell that builds the
#'hit locations' workbook: run it directly after that cell, because the
#in-text table cells rebind wb, gn and hn.
fill1= PatternFill(fill_type='solid', start_color='FF9999', end_color='FF9999')
fill2= PatternFill(fill_type='solid', start_color='BB0000', end_color='BB0000')
for gi in range(2, len(gn)):
    for hi in range(2,len(hn)):
        hits = ws3.cell(row = gi, column = hi).value
        target = ws5.cell(row = gi, column = hi)
        if hits is not None:
            #query length as written in the first entry of the cell
            qlen = hits.split('Query len: ')[1].split(' Aln')[0]
            #every entry ends in 'range: <start>-<stop>', entries are ' || ' separated
            ranges = [thing.split(' ||')[0] for thing in hits.split('range: ')][1:]
            ranges = sorted((int(thing.split('-')[0]), int(thing.split('-')[1])) for thing in ranges)
            #Merge overlapping ranges in one pass over the sorted list; a range
            #lying inside the previous one leaves that one unchanged.
            merged = []
            for start, stop in ranges:
                if merged and start <= merged[-1][1]:
                    merged[-1] = (merged[-1][0], max(merged[-1][1], stop))
                else:
                    merged.append((start, stop))
            #coordinates are inclusive, so a range start-stop covers stop-start+1 bases
            totcoverage = sum(stop-start+1 for start, stop in merged)
            ranges = [str(start)+'-'+str(stop) for start, stop in merged]
            rangesstring = ', '.join(ranges)
            target.value = rangesstring+' ('+str(len(ranges)-1)+' gaps, '+str(totcoverage)+' of '+qlen+' bases)'
            #light red: coverage is split over more than one range
            if len(ranges) > 1:
                target.fill = fill1
        #dark red: nothing recorded for this gene/genome
        if target.value is None:
            target.fill = fill2
wb.save('VG_gene_locations_WpSS17.xlsx')

NameError: name 'ws3' is not defined

In [None]:
#Blasts all ORFs in './VGDB/pSS17.fa' against the genomes.
#Genomes have the simplified name S## where ## are integers
#Each ORF is written to a temporary query file, which is deleted at the end.
#(A temp file is used so the in-sequence can be reformatted in the future.)
#Output - one xml file (outfmt=5) per ORF in './VG_BLAST_HITS/', named
#'pSS17_<record id>.xml'.
#The name used to be built from `file`, a loop variable left over from
#another cell, so every ORF overwrote one and the same unrelated output file.
#Folders must be made in advance if not already existing.

q = './VGDB/tempfile.fasta'
d = './BLAST_DB/DavisDB.fasta'

for item in SeqIO.parse('./VGDB/pSS17.fa','fasta'):
    #SeqIO.write opens and closes the path itself (the old extra handle was never closed)
    SeqIO.write(item, q, 'fasta')
    #Keep the file name safe: '|' and '(' are separators in the hitdict keys
    #that are later built from the xml file name.
    orfname = ''.join(ch if (ch.isalnum() or ch in '._-') else '_' for ch in item.id)
    o = './VG_BLAST_HITS/pSS17_'+orfname+'.xml'
    status = os.system(str(NcbiblastnCommandline(query = q, db = d, out = o, outfmt=5)))
    if status != 0:
        #Do not let a failed search pass silently: the xml would be missing or stale.
        print('blastn failed for '+item.id+' (exit status '+str(status)+')')

#The temp file only exists if at least one ORF was written.
if os.path.exists(q):
    os.remove(q)

#Extract BLAST hits

#hitdict is a dictionary of all blast hits, keyed by '<Hit_id>|<xml file stem>'.
#It stores a list of important data for each hit for calculation.
#Only one HSP can be stored per key: a later HSP replaces an earlier one and
#an 'overwrite at' line is printed so that the loss is visible.
#hitdict list order:
#query start, query stop, query length, alignment length,
#number of identical positions, number of 'positive' positions
#number of gaps.
#[qstart,qstop,qlen,alen,ids,pos,gaps]  (all kept as the strings found in the xml)

hitdict = {}

#retrieves xml file, parses with element tree, root = top node
for file in os.listdir('./VG_BLAST_HITS'):
    if not file.endswith('.xml'):
        continue
    queryg = os.path.splitext(file)[0]
    queryf = os.path.join('./VG_BLAST_HITS', file)
    root = ET.parse(queryf).getroot()

    #Iteration - query information
    #hit- hits within genome
    #hsp- alignment info within a hit
    for iteration in root.findall(".//Iteration"):
        qlen = iteration.find('Iteration_query-len').text
        for hit in iteration.findall(".//Hit"):
            contig = hit.find('Hit_id').text
            key = contig+'|'+queryg
            #Store inside the HSP loop: a hit without any HSP then adds nothing,
            #instead of silently re-using the values of the previous hit.
            for hsp in hit.findall('.//Hit_hsps/Hsp'):
                qstart = hsp.find('Hsp_query-from').text
                qstop = hsp.find('Hsp_query-to').text
                alen = hsp.find('Hsp_align-len').text
                ids = hsp.find('Hsp_identity').text
                pos = hsp.find('Hsp_positive').text
                gaps = hsp.find('Hsp_gaps').text

                #dict membership is a direct lookup; no need to copy the keys
                if key in hitdict:
                    print('overwrite at '+key)

                hitdict[key]=[qstart,qstop,qlen,alen,ids,pos,gaps]
     

#REORDERED GENOME LIST TO MATCH PHYLOGENETIC TREE IN FIGURE

#Create a table for in-text presentation
#Uses the hitdict built directly above.
#One row per genome, one column per gene; a cell is filled blue when the
#accepted hits cover at least 60% of the query, grey otherwise.

wb = Workbook()
#title/index by keyword: a bare create_sheet(0) is read as a title by current openpyxl
ws1 = wb.create_sheet(title='In Text Figure', index=0)

fillyes = PatternFill(fill_type='solid', start_color='0000FF', end_color='0000FF')
fillno = PatternFill(fill_type='solid', start_color='CCCCCC', end_color='CCCCCC')


#Retrieve keys from hitdict for creating lists of genes and genomes
keys = hitdict.keys()

hn=[]
gn=[]
for item in keys:
    hn.append(item.split('|')[0].split('(')[0])
    gn.append(item.split('|')[1])

#Create sorted sets of the genes here.
gn = sorted(set(gn))
hn = sorted(set(hn))

#Exclude lists: lists of things that should be excluded from the table
#gnx - genes to exclude
#hnx - genomes (hits) to exclude

#S54 - Low quality.
#S17 - Returns very fragmented blast hits. Unknown reason - other quality metrics look fine
#NOTE(review): hits used to be selected with a substring test, and 'S17' is a
#substring of every 'SS17_*' gene name, so S17 collected the hits of all
#genomes for those genes - possibly the cause of the fragmentation. Confirm.

gnx = ['pSS17','EDL933_etpoperonpartialhlyC','Sakai_hlyA', 'Sakai_hlyB', 'Sakai_hlyC', 'Sakai_hlyD', 'pO157_ecf1', 'pO157_ecf1to4partial4', 'pO157_ecf2']
hnx = ['S54', 'S17','S66','S69','S70']
#Filtering (instead of list.remove) also works when an excluded name is absent.
gn = [name for name in gn if name not in gnx]
hn = [name for name in hn if name not in hnx]

#Row order taken from the phylogenetic tree
hnorder=['S36','S35','S88','S83','S82','S84','S73','S58','S61','S60','S57','S42','S80','S79','S48',
        'S43','S62','S52','S53','S47','S63','S59','S44','S65','S64',
        'S50','S51','S72','S56','S46','S49','S45','S55','S76','S75','S38','S39','S77','S78','S81','S41',
        'S40','S87','S71','S68','S67','S15','S13','S16','S05','S20',
        'S74','S27','S22','S06','S86','S85','S12','S37','S07','S14','S34','S01','S31','S10','S23',
        'S02','S21','S32','S26','S29','S03','S19','S30','S18','S24',
        'S25','S28','S11','S04','S08','S33','S09']
#The tree order is only used when it names exactly the genomes that are left;
#otherwise the sorted order stays and the genomes missing from hnorder are printed.
if sorted(set(hn)) == sorted(set(hnorder)):
    print('set check OK')
    hn = hnorder
for item in hn:
    if item not in hnorder:
        print(item)

print(gn)
print(hn)

#Label columns and rows
for n in range(len(hn)):
    ws1.cell(row = n+2, column = 1).value = hn[n]
for n in range(len(gn)):
    ws1.cell(row = 1, column = n+2).value = gn[n]

#hitdict[key]=[qstart,qstop,qlen,alen,ids,pos,gaps]
tempgn = []
temphn = []

#Group the accepted query ranges by exact (genome, gene).
#Exact comparison matters: with a substring test 'SS17_nleG' also picks up
#the 'SS17_nleG7' hits.
hitranges = {}
qlens = {}
for item in hitdict:
    genome = item.split('|')[0].split('(')[0]
    genename = item.split('|')[1]
    qlens[genename] = int(hitdict[item][2])
    #if %ID < 95%, ignore. O157 is very homogenous, high ID cutoff
    if (int(hitdict[item][4])/int(hitdict[item][3]))<.95:
        continue
    hitranges.setdefault((genome, genename), []).append((int(hitdict[item][0]),int(hitdict[item][1])))

for row, hit in enumerate(hn, 2):
    for col, gene in enumerate(gn, 2):
        #Merge overlapping or touching ranges (coordinates are inclusive).
        #Taking the larger end keeps a range from shrinking when a hit lies
        #completely inside the previous one.
        hitpss = []
        for start, stop in sorted(hitranges.get((hit, gene), [])):
            if hitpss and start <= hitpss[-1][1]:
                hitpss[-1] = (hitpss[-1][0], max(hitpss[-1][1], stop))
            else:
                hitpss.append((start, stop))

        lens = [(i[1]-i[0]+1) for i in hitpss]
        slens = sum(lens)
        #every gene in gn came from a hitdict key, so its query length is known
        qlen = qlens[gene]
        if (slens/qlen) >= 0.6:
            ws1.cell(row = row, column = col).fill = fillyes
        else:
            ws1.cell(row = row, column = col).fill = fillno
        #Only reached when 'pSS17' is taken out of gnx above: then the covered
        #fraction is written into the cell as well.
        if gene == 'pSS17':
            ws1.cell(row = row, column = col).value = (slens/qlen)


wb.save('pSS17_Check.xlsx')