##Protocol for annotation of Nahant phage genomes

No more waiting for other folks to get pipelines up and running... let's figure this out.

>UBLAST of prodigal-identified proteins against several sequence databases:
>>COG

>>PFam

>>ACLAME

>>Camera Viral Proteins (CVP)

>>KEGG

>>TARA Oceans Database--TBD

>>EggNOG--TBD

>UBLAST run via run_ublasts.py script in ./scripts; implemented on server using ./scripts/ublast_runs.slurm

In [1]:
import subprocess
import os

def run_prodigal_phage(inputfasta, out_gene, out_prot):
    """Run ``prodigal`` on ``inputfasta`` with ``-p meta``.

    ``out_gene`` is passed to prodigal's ``-o`` option and ``out_prot`` to
    its ``-a`` option.  The exit status of the subprocess is not checked.
    """
    # Build the argv list directly instead of splitting a command string on
    # spaces, so that paths containing spaces stay single arguments.
    args=["prodigal", "-i", inputfasta, "-o", out_gene, "-a", out_prot, "-p", "meta"]
    subprocess.call(args)
    
def run_formatudb(fastafile, databasefile="db.udb", ublast_path="/home/sbiller/usearch7.0.1090_i86linux64"):
    """Call the usearch binary with ``-makeudb_ublast`` on ``fastafile``.

    ``databasefile`` is passed to the ``-output`` option and ``ublast_path``
    is the executable to run.  The exit status of the subprocess is not
    checked.
    """
    # Build the argv list directly instead of splitting a command string on
    # spaces, so that paths containing spaces stay single arguments.
    args=[ublast_path, "-makeudb_ublast", fastafile, "-output", databasefile]
    subprocess.call(args)
    
def run_ublastp(fastafile, out_file, udb, evalue, ublast_path="/home/sbiller/usearch7.0.1090_i86linux64"):
    """Call the usearch binary with ``-ublast`` for ``fastafile`` against ``udb``.

    The command always includes ``-accel 0.5`` and ``-top_hit_only``;
    ``out_file`` is passed to ``-blast6out`` and ``evalue`` to ``-evalue``.
    ``evalue`` may be a string or a number.  The exit status of the
    subprocess is not checked.
    """
    # Build the argv list directly instead of splitting a command string on
    # spaces, so that paths containing spaces stay single arguments.
    # str() lets callers pass the e-value as a float (e.g. 1e-5) as well as
    # a string; the string concatenation used before raised TypeError.
    args=[
        ublast_path,
        "-ublast", fastafile,
        "-db", udb,
        "-evalue", str(evalue),
        "-accel", "0.5",
        "-blast6out", out_file,
        "-top_hit_only",
    ]
    subprocess.call(args)
    
def run_trna_scan(input_file, output):
    """Run ``tRNAscan-SE`` on ``input_file`` and write its results to ``output``.

    A file already present at ``output`` is deleted before the run, and a
    completion message is printed once the subprocess call returns.
    """
    stale_output_present=os.path.exists(output)
    if stale_output_present:
        os.remove(output)

    command=["tRNAscan-SE"]
    command+=["-o", output]
    command+=["-G", "-D", "-N"]
    command.append(input_file)
    subprocess.call(command)

    print("".join(["tRNA scan of ", input_file, " is done!"]))


Below: Creating DB dicts for quick access to blast DB sequence codes, annotations and OGs

In [49]:
#create DB dicts for BLAST file analysis:

#ACLAME:
# Read inside a ``with`` block so the file handle is closed right away.
with open("./databases/DB_Info/aclame/aclame_proteins_all_0.4.tab") as handle:
    aclame=handle.readlines()
aclame_dict={}

# Maps column 0 (protein ID) to column 2 (annotation) of each tab-separated row.
# The first two lines and the final line of the file are skipped
# (presumably header/footer rows -- confirm against the file).
for line in aclame[2:-1]:
    fields=line.split("\t")
    protein=fields[0]
    ncbi_ann=fields[2]
    aclame_dict[protein]=ncbi_ann

##COG needs two dbs:
# Both files are read inside ``with`` blocks so the handles are closed right away.
with open("./databases/DB_Info/COG/cog2003-2014.csv") as handle:
    cogs=handle.readlines()         #all COG sequences and COG groups
with open("./databases/DB_Info/COG/cognames2003-2014.tab") as handle:
    cogs2=handle.readlines()   #COG group definitions/functions

cog_dict={}
cog_defs={}

#dict from gi to COG: column 0 -> column 6 of each comma-separated row
for line in cogs:
    fields=line.split(",")
    gi=fields[0]
    cog=fields[6]
    cog_dict[gi]=cog
#dict from COG to function definition: column 0 -> column 2 of each tab-separated row
#(the value keeps any trailing newline; create_blast_dict strips it on use)
for line in cogs2:
    fields=line.split("\t")
    cog=fields[0]
    func=fields[2]
    cog_defs[cog]=func

##Pfam needs two DBs as well:
# Both files are read inside ``with`` blocks so the handles are closed right away.
with open("./databases/DB_Info/PFam/Pfam-A.titles.txt") as handle:
    pfams=handle.readlines()   #ID all pfam sequences in BLAST db
with open("./databases/DB_Info/PFam/Pfam-A.clans.tsv") as handle:
    pfams2=handle.readlines()   #Matches IDs to "clans" with functions
pfam_dict={}
pfam_defs={}

#sequence to pfam:
#key = first space-separated token without ">", value = third token up to its first ";"
for line in pfams:
    tokens=line.split(" ")
    seq=tokens[0].replace(">","")
    pfam=tokens[2].split(";")[0]
    pfam_dict[seq]=pfam
#pfam to function: column 0 -> column 4 of each tab-separated row
#(the value keeps any trailing newline; create_blast_dict strips it on use)
for line in pfams2:
    fields=line.split("\t")
    pfam=fields[0]
    function=fields[4]
    pfam_defs[pfam]=function
    
##CAMERA Viral Proteins annotations are complicated; extracting definitions from sequence titles:
# Read inside a ``with`` block so the file handle is closed right away.
with open("./databases/DB_Info/CVP/CVP_titles.txt") as handle:
    cvp=handle.readlines()

cvp_dict={}

headers=[]

# Each title is split on "/": the first piece (spaces removed) is the sequence
# ID, the remaining pieces are scanned for key=value pairs.
for line in cvp:
    defs={}
    line=line.replace(">","")
    annotation=""
    pieces=line.split("/")
    ID=pieces[0].replace(" ","")
    info=pieces[1:]
    for i in info:
        if "=" in i:
            # Only the text between the first and second "=" is kept as the value.
            parts=i.split("=")
            defs[parts[0]]=parts[1]
    # "DESCRIPTION" takes precedence over "definition"; with neither, the
    # annotation stays empty.
    if "DESCRIPTION" in defs:
        annotation=defs["DESCRIPTION"]
    elif "definition" in defs:
        annotation=defs["definition"]

    cvp_dict[ID]=annotation.replace('"','')


###Now to dig into the BLAST files:

In [50]:
#general info

#get number of genes in genome to know how many digits to use in locus tag
def get_digits(prodfile):
    """Return the number of digits needed to number the genes in ``prodfile``.

    The gene count is estimated as half the number of lines in the file
    (rounded down); the gff3-writing loop in this file reads each gene as a
    pair of lines.  The result is used as the zero-padding width for locus
    tags.
    """
    with open(prodfile) as handle:
        n_lines=sum(1 for _ in handle)
    # Floor division keeps the estimate an integer.  With true division
    # (Python 3, or after ``from __future__ import division``) the old
    # ``len(prod)/2`` produced e.g. ``10.0`` and the ".0" was counted as two
    # extra digits.
    return len(str(n_lines//2))

#create locus tag from protein sequence names:
def get_locus_tag(line, digits, phage):
    """Build a locus tag of the form ``NVP<phage without dots>_<padded number>``.

    The gene number is the text after the last underscore of the query label,
    where the label is everything in ``line`` before the first tab and before
    the first space.  The number is left-padded with zeros to ``digits``
    characters (no padding is added if it is already that long).
    """
    label=line.partition("\t")[0].partition(" ")[0]
    gene_number=label.rpartition("_")[2]
    zero_pad="0"*(digits-len(gene_number))
    prefix="NVP"+phage.replace(".","")
    return "_".join([prefix, zero_pad+gene_number])

In [57]:
from pyfaidx import Fasta

# Path to the protein FASTA (.faa) file for the phage being worked on.
faa="./1.161.O.faa"
# Phage name prefix, including its trailing dot.  get_prot_lens reads this
# module-level name when building locus tags, and the gff3 cell appends
# suffixes such as "gene" and "faa" to the same kind of prefix to build file names.
phage="1.161.O."

def get_prot_lens(faa_file, gene_file, phage_name=None):
    """Map each protein's locus tag to the length of its sequence.

    Parameters:
        faa_file:   protein FASTA file, opened with ``pyfaidx.Fasta``.
        gene_file:  prodigal gene file, used only to derive the zero-padding
                    width via ``get_digits``.
        phage_name: phage prefix used to build locus tags.  When omitted, the
                    module-level ``phage`` variable is used, which is what
                    this function always did before the parameter existed.

    Returns:
        dict of locus tag -> sequence length (``len(str(record))``).
    """
    if phage_name is None:
        # Backward-compatible fallback for existing two-argument callers.
        phage_name=phage
    digits=get_digits(gene_file)
    records=Fasta(faa_file)
    len_dict={}
    for key in records.keys():
        name=get_locus_tag(key, digits=digits, phage=phage_name)
        len_dict[name]=len(str(records[key]))
    return len_dict

#prot_lens=get_prot_lens(faa_file=faa, gene_file=phage+"gene")

In [61]:
from __future__ import division
from Bio.KEGG import REST

def create_blast_dict(blast, len_dict, phage, digits, db):
    """Parse a tab-separated BLAST-style hit table into a per-gene hit dict.

    Parameters:
        blast:    path to the hit table.  Columns used: 1 (hit ID),
                  2 (percent identity), 3 (alignment length) and the
                  second-to-last column (stored as the e-value string).
        len_dict: locus tag -> protein length, used to compute coverage.
        phage:    phage prefix passed to ``get_locus_tag``.
        digits:   zero-padding width passed to ``get_locus_tag``.
        db:       one of "kegg", "cog", "pfam", "aclame", "cvp"; selects which
                  two extra fields are appended.  Any other value leaves the
                  entries with the four base fields only.

    Returns:
        dict of locus tag -> [hit, pct_cov, pct_id, evalue, group_or_hit, annotation].
        Only the first row per locus tag with identity > 35 and coverage > 75
        is kept.  For "kegg", each hit is looked up through
        ``REST.kegg_find``; hits whose description contains no K number are
        dropped from the result (previously this case raised AttributeError).

    Raises:
        KeyError if a locus tag or hit ID is missing from the lookup dicts.
    """
    # Imported locally: elsewhere in this file ``re`` is only imported in a
    # later cell, so relying on the global made this function order-dependent.
    import re

    blast_dict={}

    with open(blast) as handle:
        for line in handle:
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            fields=line.split("\t")
            name=line.split(" ")[0]
            hit=fields[1]
            lt=get_locus_tag(name, digits=digits, phage=phage)
            prot_len=len_dict[lt]
            aln_len=int(fields[3])
            pct_id=float(fields[2])
            ev=fields[-2]
            # float() keeps this a true division even without
            # ``from __future__ import division`` on Python 2.
            pct_cov=(float(aln_len)/prot_len)*100

            # The dict itself records which locus tags were already seen, so
            # only the first qualifying row per gene is stored.
            if pct_id>35 and pct_cov>75 and lt not in blast_dict:
                blast_dict[lt]=[hit, pct_cov, pct_id, ev]

    if db=="kegg":
        # list() so entries can be removed while looping.
        for k in list(blast_dict):
            desc=REST.kegg_find("genes", blast_dict[k][0]).read()
            K=re.search(r"K[0-9]{5}", desc)
            if K is None:
                # No K number in the KEGG description: there is nothing to
                # report for this gene, so drop it instead of crashing.
                del blast_dict[k]
                continue
            KEGG=K.group(0)
            # Rest of the line following the first K number.
            ann=re.search(r"(?<=K[0-9]{5}).*", desc).group(0)
            blast_dict[k]+=[KEGG, ann]

    elif db=="cog":
        for c in blast_dict:
            hit=blast_dict[c][0]
            # The second "|"-separated field of the hit ID is the cog_dict key.
            cog=cog_dict[hit.split("|")[1]]
            func=cog_defs[cog].replace("\n","")
            blast_dict[c]+=[cog, func]

    elif db=="pfam":
        for p in blast_dict:
            hit=blast_dict[p][0]
            # Drop everything after the first "." before the pfam_defs lookup.
            pfam=pfam_dict[hit].split(".")[0]
            function=pfam_defs[pfam].replace("\n","")
            blast_dict[p]+=[pfam, function]

    elif db=="aclame":
        for a in blast_dict:
            hit=blast_dict[a][0]
            annotation=aclame_dict[hit]
            blast_dict[a]+=[hit, annotation]

    elif db=="cvp":
        for v in blast_dict:
            hit=blast_dict[v][0]
            func=cvp_dict[hit]
            blast_dict[v]+=[hit, func]

    return blast_dict

In [53]:
def find_best_hit(gene_id, dict_list, names=None):
    """Return ``[annotation, source_name]`` for the lowest-e-value hit of ``gene_id``.

    Parameters:
        gene_id:   locus tag to look up.
        dict_list: list of blast dicts; each value is a list whose index 3 is
                   the e-value and whose last element is the annotation.
        names:     labels for the dicts, in the same order as ``dict_list``.
                   When omitted, the module-level ``dict_names`` list is used,
                   as the original two-argument version did.

    Only hits with an e-value strictly below 1 are considered.  If none
    qualifies, ``["", ""]`` is returned.
    """
    if names is None:
        # Backward-compatible fallback for two-argument callers.
        names=dict_names
    evals=1
    annotation=""
    best_hit=""
    # Iterate over the list that was passed in; the loop bound previously came
    # from an unrelated, undefined global (``first_look``).
    for i, blast_dict in enumerate(dict_list):
        if gene_id in blast_dict:
            hit=blast_dict[gene_id]
            if float(hit[3])<evals:
                evals=float(hit[3])
                best_hit=names[i]
                annotation=hit[-1]
    return [annotation, best_hit]



#below: considers hits to more informative databases before less informative databases
#dict_list* are lists of blast_dicts and dl*_names are the names of the dicts in the same order

def _lowest_evalue_hit(gene_id, dict_list, names):
    """Return ``[annotation, name]`` of the lowest-e-value hit for ``gene_id``, or None."""
    candidates=[]
    for i, blast_dict in enumerate(dict_list):
        if gene_id in blast_dict:
            hit=blast_dict[gene_id]
            candidates.append((float(hit[3]), hit[-1], names[i]))
    if not candidates:
        return None
    # min() returns the first minimal element, so ties go to the earlier dict.
    best=min(candidates, key=lambda candidate: candidate[0])
    return [best[1], best[2]]


def find_best_hit2(gene_id, dict_list1, dl1_names, dict_list2=None, dl2_names=None):
    """Pick the best annotation for ``gene_id``, preferring the first tier of dicts.

    Parameters:
        gene_id:    locus tag to look up.
        dict_list1: first-tier blast dicts; each value is a list whose index 3
                    is the e-value and whose last element is the annotation.
        dl1_names:  labels for ``dict_list1``, in the same order.
        dict_list2: optional second-tier blast dicts, consulted only when no
                    first-tier dict contains ``gene_id``.
        dl2_names:  labels for ``dict_list2``, in the same order.

    Returns:
        ``[annotation, source_name]`` of the lowest-e-value hit in the first
        tier that has any hit, or ``["", ""]`` when neither tier has one.
    """
    # None defaults replace the shared mutable ``[]`` defaults.
    tiers=(
        (dict_list1, dl1_names),
        (dict_list2 or [], dl2_names or []),
    )
    for dict_list, names in tiers:
        best_annotation=_lowest_evalue_hit(gene_id, dict_list, names)
        if best_annotation is not None:
            return best_annotation
    return ["", ""]

In [68]:
##putting together info for .gff3 file

phages=["1.161.O."]

import re
from Bio.KEGG import REST

# For every phage prefix: load the BLAST results against each database, pick
# the best annotation per prodigal-predicted CDS and write one tab-separated
# feature line per CDS.
for phage in phages:

    #prodigal and fasta files:
    prod=phage+"gene"
    faa=phage+"faa"

    #blast files:
    pfam_blast=phage+"vs.Pfam.out"
    cog_blast=phage+"vs.cogs_2003-2014.out"
    aclame_blast=phage+"vs.aclame.out"
    cvp_blast=phage+"vs.CVP.out"
    kegg_blast=phage+"vs.kegg.out"

    # Sequence name: value after "=" in the third ";"-separated field of the
    # first line of the gene file, with double quotes removed.
    with open(prod) as gene_handle:
        Sequence=gene_handle.readline().split(";")[2].split("=")[1].replace('"','')

    #how many genes are we dealing with?
    digits=get_digits(prod)

    #create protein length dictionary
    len_dict=get_prot_lens(faa_file=faa, gene_file=prod)

    #kegg blast dict processing: finding KO's by calling kegg's REST API
    kegg_blast_dict=create_blast_dict(blast=kegg_blast, len_dict=len_dict, phage=phage, digits=digits, db="kegg")

    #pfam blast dict processing:
    pfam_blast_dict=create_blast_dict(blast=pfam_blast, len_dict=len_dict, phage=phage, digits=digits, db="pfam")

    #cog blast dict processing:
    cog_blast_dict=create_blast_dict(blast=cog_blast, len_dict=len_dict, phage=phage, digits=digits, db="cog")

    #aclame blast dict processing:
    aclame_blast_dict=create_blast_dict(blast=aclame_blast, len_dict=len_dict, phage=phage, digits=digits, db="aclame")

    #cvp blast dict processing:
    cvp_blast_dict=create_blast_dict(blast=cvp_blast, len_dict=len_dict, phage=phage, digits=digits, db="cvp")

    #prioritize and name dicts:
    first_looks=[kegg_blast_dict, pfam_blast_dict, cog_blast_dict, aclame_blast_dict]
    flnames=["kegg","pfam","cog","aclame"]
    second_look=[cvp_blast_dict]
    slnames=["CVP"]
    OGs=[kegg_blast_dict, pfam_blast_dict, cog_blast_dict]
    OG_names=["KEGG","PFam","COG"]
    annotes=[aclame_blast_dict, cvp_blast_dict]
    annotes_names=["ACLAME","CAMERA_viral_proteins"]

    #run through annotations of each prodigal-identified CDS:
    with open(prod) as gene_handle:
        prod=gene_handle.readlines()

    # NOTE(review): ``phage`` already ends with ".", so this writes e.g.
    # "1.161.O..gff3" (double dot).  Name kept unchanged in case other code
    # looks for it -- confirm whether that is intended.
    # NOTE(review): the output deviates from the GFF3 specification (no
    # "##gff-version 3" header, attributes joined with ", " rather than ";",
    # and start > end for complement features).  The format is left
    # byte-identical here; confirm what downstream consumers expect.
    with open(phage+".gff3","w") as out:
        out.write(Sequence+"\n")

        # Lines are read in pairs starting at index 2: prod[i] holds the CDS
        # location and prod[i+1] the matching note line.
        for i in range(2,len(prod)-1,2):

            #extract locus tag:
            info=prod[i+1]
            number=info.split(";")[0].split("=")[2].split("_")[1]
            z="0"*(digits-len(number))
            t="NVP"+phage.replace(".","")+"_"+z+number
            col9="ID="+t

            #ID best hit:
            best_hits=find_best_hit2(t, dict_list1=first_looks, dl1_names=flnames, dict_list2=second_look, dl2_names=slnames)

            #ID CDS start, stop and orientation:
            loc=prod[i]
            if len(loc.split())!=2:
                # Previously this case fell through: the first record hit a
                # NameError and later records silently reused the previous
                # gene's coordinates.  Fail loudly instead.
                raise ValueError("unexpected CDS location line for "+t+": "+repr(loc))
            if "complement" in loc.split()[1]:
                strand="-"
                # For complement features the second coordinate is written
                # as the start and the first as the stop.
                start=loc.split("..")[1].replace(")\n","")
                stop=loc.split("(")[1].split("..")[0]
            else:
                strand="+"
                start=loc.split()[1].split("..")[0]
                stop=loc.split()[1].split("..")[1].replace("\n","")

            #establish name:
            Name=best_hits[0]
            if len(Name)==0:
                col9+=', Name=hypothetical protein'
            else:
                col9+=", Name="+Name.replace('"','')

            #Add OG annotations (second-to-last field of each blast dict entry):
            for og_name, db in zip(OG_names, OGs):
                if t in db:
                    col9+=', Ontology_term="'+og_name+":"+db[t][-2]+'"'

            #Add db closest hits to notes
            for annote_name, db in zip(annotes_names, annotes):
                if t in db:
                    col9+=', note="'+annote_name+":"+db[t][-2]+'"'

            line="\t".join([Sequence, "prod", "CDS", start, stop, ".", strand, "0", col9])+"\n"

            out.write(line)

###Adding tRNA-Scan annotations to the gff3 file if phage encodes tRNAs:

>>In progress

In [46]:
# Scratch cell: print locus tags and coordinates parsed from one tRNA table.
# NOTE(review): ``tRNA`` is assigned here but not used below; the file that
# is actually parsed is ``tyes[1]``.
tRNA="/Users/jmb/Desktop/ViralFate/tRNA_info/data/nahant_tRNA_count/1.161.O.trnas.txt"
import os
import glob

tRNAs=glob.glob("/Users/jmb/Desktop/ViralFate/tRNA_info/data/nahant_tRNA_count/*")

# Keep only the non-empty files.
tyes=[path for path in tRNAs if os.path.getsize(path)>0]

# NOTE(review): index 1 picks the second non-empty file in whatever order
# glob returned them, and raises IndexError when fewer than two exist.
with open(tyes[1]) as handle:
    t=handle.readlines()

# The first three lines of the table are skipped.
for line in t[3:]:
    l=line.split("\t")

    # Tag = "NVP" + second "_"-separated piece of column 0 with dots removed
    # + "_tRNA_" + column 1.
    locus_tag="NVP"+l[0].split("_")[1].replace(".","")+"_tRNA_"+l[1]
    # Single-argument print(...) calls give the same output as the Python 2
    # print statements used before and match the print() style used elsewhere
    # in this file.
    print(locus_tag)
    start=l[2]
    stop=l[3]
    aa=l[4]
    codon=l[5]

    print(start+"\t"+stop+"\t")
    print(l)

NVP1012O_tRNA_1
16475	16405	
['Vibriophage_1.012.O._10N.261.48.C12 ', '1', '16475', '16405', 'Tyr', 'TAC', '0', '0', '32.76\n']


In [26]:
tRNA

[]