## Pipeline for annotation of Nahant phage genomes

No more waiting for other folks to get pipelines up and running... let's figure this out.

>UBLAST of prodigal-identified proteins against several sequence databases:
>>COG

>>Pfam

>>ACLAME

>>CAMERA Viral Proteins (CVP)

>>KEGG

>>TARA Oceans Database

>UBLAST run via run_ublasts.py script in ./scripts; implemented on server using ./scripts/ublast_runs.slurm

In [118]:
import subprocess

def run_prodigal_phage(inputfasta, out_gene, out_prot):
    """Run prodigal in ``-p meta`` mode on one FASTA file.

    inputfasta: path handed to ``prodigal -i``.
    out_gene: path handed to ``-o``.
    out_prot: path handed to ``-a``.

    The argument vector is built as a list instead of splitting a command
    string on spaces, so paths that contain spaces reach prodigal intact.
    The exit status is not checked and nothing is returned.
    """
    args = [
        "prodigal",
        "-i", inputfasta,
        "-o", out_gene,
        "-a", out_prot,
        "-p", "meta",
    ]
    subprocess.call(args)
    
def run_blastp(inputfasta="input.fasta", output_file="blast.out", database="databast.fasta", evalue="0.001"):
    """Run blastp and write tabular (``-outfmt 6``) results to output_file.

    inputfasta: query FASTA path (``-query``).
    output_file: destination for the hit table (``-out``).
    database: BLAST database name/path (``-db``).
    evalue: e-value cutoff (``-evalue``); converted with str() so numbers
        are accepted as well as strings.

    The original command string had no space between the query path and
    ``-evalue``, which fused them into a single bogus argument. Building the
    list explicitly fixes that and keeps paths with spaces in one piece.
    The exit status is not checked and nothing is returned.
    """
    args = [
        "blastp",
        "-db", database,
        "-query", inputfasta,
        "-evalue", str(evalue),
        "-outfmt", "6",
        "-out", output_file,
    ]
    subprocess.call(args)

def run_formatdb(fastafile, protein="yes"):
    """Build a BLAST database from fastafile with makeblastdb.

    fastafile: FASTA path handed to ``makeblastdb -in``.
    protein: the string "no" selects ``-dbtype nucl``; any other value
        selects ``-dbtype prot``.

    Arguments are passed as a list so a path containing spaces is not split.
    The exit status is not checked and nothing is returned.
    """
    dbtype = "nucl" if protein == "no" else "prot"
    subprocess.call(["makeblastdb", "-in", fastafile, "-dbtype", dbtype])
    
def run_formatudb(fastafile, databasefile="db.udb", ublast_path="/home/sbiller/usearch7.0.1090_i86linux64"):
    """Build a UBLAST database with ``usearch -makeudb_ublast``.

    fastafile: FASTA path to index.
    databasefile: path of the database file to write (``-output``).
    ublast_path: path of the usearch executable.

    Arguments are passed as a list so paths containing spaces are not split.
    The exit status is not checked and nothing is returned.
    """
    args = [ublast_path, "-makeudb_ublast", fastafile, "-output", databasefile]
    subprocess.call(args)
    
def run_ublastp(fastafile, out_file, udb, evalue, ublast_path="/home/sbiller/usearch7.0.1090_i86linux64"):
    """Search fastafile against a UBLAST database, keeping only the top hit.

    fastafile: query FASTA path (``-ublast``).
    out_file: destination of the tabular hit file (``-blast6out``).
    udb: database path (``-db``).
    evalue: e-value cutoff (``-evalue``); converted with str() so numbers
        are accepted as well as strings.
    ublast_path: path of the usearch executable.

    The run always uses ``-accel 0.5`` and ``-top_hit_only``. Arguments are
    passed as a list so paths containing spaces are not split. The exit
    status is not checked and nothing is returned.
    """
    args = [
        ublast_path,
        "-ublast", fastafile,
        "-db", udb,
        "-evalue", str(evalue),
        "-accel", "0.5",
        "-blast6out", out_file,
        "-top_hit_only",
    ]
    subprocess.call(args)
    
def run_trna_scan(input_file, output):
    """Run tRNAscan-SE on input_file, replacing any previous output file.

    input_file: sequence file handed to tRNAscan-SE.
    output: path for ``-o``; an existing file at this path is deleted first.

    The flags ``-G -D -N`` are passed unchanged. The exit status is not
    checked; a completion message is printed regardless.
    """
    # Imported locally: no top-level ``import os`` is visible in this notebook.
    import errno
    import os

    # Remove-and-ignore-missing avoids the exists()/remove() race.
    try:
        os.remove(output)
    except OSError as err:
        if err.errno != errno.ENOENT:
            raise

    args = ["tRNAscan-SE", "-o", output, "-G", "-D", "-N", input_file]
    subprocess.call(args)
    print("tRNA scan of " + input_file + " is done!")
    
def find_best_hit(gene_id, dict_list, names=None):
    """Pick the hit with the lowest e-value for gene_id across several tables.

    gene_id: key looked up in every table.
    dict_list: list of dicts mapping gene IDs to hit records. Records are
        read positionally: element 1 is parsed as the e-value and the last
        element is taken as the annotation text.
    names: labels for the tables, in dict_list order. When omitted, the
        module-level list ``dict_names`` is used, as the original code did.

    Returns [annotation, table_label]. Both are "" when no table holds a hit
    with an e-value below 1. On ties the earliest table wins.
    """
    if names is None:
        # Backward-compatible fallback to the notebook-level global.
        names = dict_names

    lowest = 1
    annotation = ""
    best_hit = ""
    # The original looped over len(first_look), an unrelated (undefined)
    # global; the tables to scan are the ones passed in.
    for idx, table in enumerate(dict_list):
        if gene_id not in table:
            continue
        hit = table[gene_id]
        evalue = float(hit[1])
        if evalue < lowest:
            lowest = evalue
            best_hit = names[idx]
            annotation = hit[-1]
    return [annotation, best_hit]

#considers hits to more informative databases before less informative databases
#dict_lists are lists of blast_dict tables and dl_names are the names of the dicts in the same order

def find_best_hit2(gene_id, dict_list1, dl1_names, dict_list2=None, dl2_names=None):
    """Return the best annotation for gene_id, preferring the first tier of tables.

    gene_id: key looked up in every table.
    dict_list1: first-tier list of dicts mapping gene IDs to hit records.
    dl1_names: labels for dict_list1, in the same order.
    dict_list2: optional second-tier tables, consulted only when no
        first-tier table contains gene_id.
    dl2_names: labels for dict_list2, in the same order.

    Hit records are read positionally: element 3 is parsed as the e-value
    and the last element is the annotation text.

    Returns [annotation, table_label] for the lowest e-value within the
    first tier that has any hit (earliest table wins ties), or ["", ""]
    when neither tier contains gene_id.
    """
    # None instead of mutable [] defaults; behaviour for callers is unchanged.
    tiers = (
        (dict_list1, dl1_names),
        (dict_list2 or [], dl2_names or []),
    )
    for tables, labels in tiers:
        candidates = []
        for idx, table in enumerate(tables):
            if gene_id in table:
                hit = table[gene_id]
                candidates.append((float(hit[3]), hit[-1], labels[idx]))
        if candidates:
            # min() with a key returns the first minimal entry, matching the
            # original es.index(min(es)) tie-breaking.
            best = min(candidates, key=lambda entry: entry[0])
            return [best[1], best[2]]
    return ["", ""]

Below: Creating DB dicts for quick access to sequence codes, annotations and OGs

In [105]:
#create DB dicts for BLAST file analysis:

#ACLAME: aclame_dict maps column 0 of the table (protein ID) to column 2
#(the value this notebook calls the NCBI annotation).
#The file is read inside a with-block so the handle is closed promptly.
with open("./databases/DB_Info/aclame/aclame_proteins_all_0.4.tab") as aclame_handle:
    aclame=aclame_handle.readlines()
aclame_dict={}

#the slice skips the first two lines and the last line of the table
#(presumably header/footer rows -- TODO confirm against the ACLAME file)
for line in aclame[2:-1]:
    fields=line.split("\t")
    protein=fields[0]
    ncbi_ann=fields[2]
    aclame_dict[protein]=ncbi_ann

##COG needs two dbs; both files are read inside with-blocks so the handles
##are closed promptly.
with open("./databases/DB_Info/COG/cog2003-2014.csv") as cog_handle:
    cogs=cog_handle.readlines()         #all COG sequences and COG groups
with open("./databases/DB_Info/COG/cognames2003-2014.tab") as cognames_handle:
    cogs2=cognames_handle.readlines()   #COG group definitions/functions

cog_dict={}
cog_defs={}

#dict from gi to COG: comma-separated column 0 -> column 6
for line in cogs:
    fields=line.split(",")
    gi=fields[0]
    cog=fields[6]
    cog_dict[gi]=cog
#dict from COG to function definition: tab-separated column 0 -> column 2
#(column 2 is stored as-is, including any trailing newline)
for line in cogs2:
    fields=line.split("\t")
    cog=fields[0]
    func=fields[2]
    cog_defs[cog]=func

##Pfam needs two DBs as well; both files are read inside with-blocks so the
##handles are closed promptly.
with open("./databases/DB_Info/PFam/Pfam-A.titles.txt") as titles_handle:
    pfams=titles_handle.readlines()   #ID all pfam sequences in BLAST db
with open("./databases/DB_Info/PFam/Pfam-A.clans.tsv") as clans_handle:
    pfams2=clans_handle.readlines()   #Matches IDs to "clans" with functions
pfam_dict={}
pfam_defs={}

#sequence to pfam: first space-separated token (minus ">") -> the part of
#the third token that precedes the first ";"
for line in pfams:
    tokens=line.split(" ")
    seq=tokens[0].replace(">","")
    pfam=tokens[2].split(";")[0]
    pfam_dict[seq]=pfam
#pfam to function: tab-separated column 0 -> column 4
#(column 4 is stored as-is, including any trailing newline)
for line in pfams2:
    fields=line.split("\t")
    pfam=fields[0]
    function=fields[4]
    pfam_defs[pfam]=function
    
##CAMERA Viral Proteins annotations are complicated; extracting definitions from sequence titles.
##The file is read inside a with-block so the handle is closed promptly.
with open("./databases/DB_Info/CVP/CVP_titles.txt") as cvp_handle:
    cvp=cvp_handle.readlines()

cvp_dict={}

#not used in this cell; kept because other cells may rely on the name
headers=[]

for line in cvp:
    defs={}
    line=line.replace(">","")
    annotation=""
    #a title is "<ID> /key=value /key=value ..."; split once on "/"
    parts=line.split("/")
    ID=parts[0].replace(" ","")
    info=parts[1:]
    for i in info:
        if "=" in i:
            key_value=i.split("=")
            defs[key_value[0]]=key_value[1]
    #prefer the DESCRIPTION field and fall back to definition; titles with
    #neither get an empty annotation
    if "DESCRIPTION" in defs:
        annotation=defs["DESCRIPTION"]
    elif "definition" in defs:
        annotation=defs["definition"]

    cvp_dict[ID]=annotation.replace('"','')


### Now looking into the actual BLAST files:

In [14]:
#modules for blast file processing
def get_digits(prodfile):
    """Return how many digits are needed to number the entries of prodfile.

    prodfile: path of a text file that is treated as holding two lines per
        entry, so the entry count is taken as half the line count.

    Returns the number of characters in ``str(line_count // 2)``. Floor
    division is used explicitly: with true division (Python 3, or after
    ``from __future__ import division``) the original ``/`` produced a float
    such as 170.0 and the width came out as 5 instead of 3.
    """
    with open(prodfile) as handle:
        line_count = sum(1 for _ in handle)
    return len(str(line_count // 2))


def get_locus_tag(line, digits, phage):
    """Build a locus tag of the form ``NVP<phage without dots>_<padded number>``.

    line: text whose first tab-separated field starts with the query ID; the
        ID ends at the first space and the gene number is whatever follows
        its last underscore.
    digits: minimum width of the number; shorter numbers are left-padded
        with zeros, longer ones are kept unchanged.
    phage: phage identifier; every "." is removed before use.
    """
    first_field = line.split("\t")[0]
    query_id = first_field.split(" ")[0]
    gene_number = query_id.split("_")[-1]
    prefix = "NVP" + phage.replace(".", "")
    return prefix + "_" + gene_number.rjust(digits, "0")


def write_cds_gff3(gff, phage, out):
    """Write one tab-separated CDS line per gene call, with best-hit annotations.

    gff: path of the gene-call file to parse. The layout this code relies on:
        line 0 is a ";"-separated header whose third field is ``key=value``
        and supplies the sequence ID; line 1 is skipped; after that, lines
        alternate between a location line (``<feature> <location>``) and a
        qualifier line carrying the gene ID; the final line is ignored.
    phage: phage identifier used to build locus tags (dots removed).
    out: path of the output file (overwritten).

    Reads the module-level tables pfam_blast_dict, aclame_blast_dict,
    cog_blast_dict and cvp_blast_dict, and calls find_best_hit2 to name each
    CDS. Returns nothing.
    """
    with open(gff) as gene_handle:
        prod = gene_handle.readlines()
    # Width for zero-padded gene numbers. Floor division keeps this an int
    # under true division; "/" would give e.g. 170.0 and a width of 5.
    digits = len(str(len(prod) // 2))

    OGs = {"Pfam": pfam_blast_dict, "COG": cog_blast_dict}
    first_looks = [pfam_blast_dict, aclame_blast_dict, cog_blast_dict]
    flnames = ["pfam", "aclame", "cog"]
    second_look = [cvp_blast_dict]
    slnames = ["cvp"]

    SeqID = prod[0].split(";")[2].split("=")[1].replace('"', '')

    with open(out, "w") as out_handle:
        for i in range(2, len(prod) - 1, 2):
            loc = prod[i]
            info = prod[i + 1]

            loc_fields = loc.split()
            if len(loc_fields) != 2:
                # Not a "<feature> <location>" line. The original fell
                # through and reused the previous gene's coordinates (or hit
                # a NameError on the first record); skip the record instead.
                continue

            if "complement" in loc_fields[1]:
                strand = "-"
                # NOTE(review): for complement features the larger coordinate
                # is written first; kept as in the original -- confirm
                # against whatever consumes this file.
                start = loc.split("..")[1].replace(")\n", "")
                stop = loc.split("(")[1].split("..")[0]
            else:
                strand = "+"
                start = loc_fields[1].split("..")[0]
                stop = loc_fields[1].split("..")[1].replace("\n", "")

            # The gene number follows the underscore in the qualifier's ID value.
            number = info.split(";")[0].split("=")[2].split("_")[1]
            z = "0" * (digits - len(number))
            t = "NVP" + phage.replace(".", "") + "_" + z + number
            col9 = "ID=" + t

            best_hits = find_best_hit2(t, dict_list1=first_looks, dl1_names=flnames, dict_list2=second_look, dl2_names=slnames)
            Name = best_hits[0]
            if len(Name) == 0:
                col9 += ', Name=hypothetical protein'
            else:
                col9 += ", Name=" + Name.replace('"', '')
                col9 += ", note=annotation from " + best_hits[1]

            # Orthologous-group IDs sit second from last in each hit record.
            for d in OGs.keys():
                db = OGs[d]
                if t in db.keys():
                    col9 += ', Ontology_term="' + d + ":" + db[t][-2] + '"'
            col9 += "\n"
            out_handle.write(SeqID + "\tprod\tCDS\t" + start + "\t" + stop + "\t.\t" + strand + "\t0\t" + col9)



In [25]:
#phage identifier; used as the file-name prefix for every input and output below
phage="1.161.O."

#zero-padding width for locus tags, derived from the line count of the gene file
digits=get_digits(phage+"gene")

#Set up BLAST dicts:
#NOTE(review): the create_*_blast_dict helpers are not defined in the visible
#part of this notebook -- confirm they still exist before re-running this cell
aclame_blast_dict=create_aclame_blast_dict(blast=phage+"vs.aclame.out")
cog_blast_dict=create_cog_blast_dict(blast=phage+"vs.cogs_2003-2014.out")
pfam_blast_dict=create_pfam_blast_dict(blast=phage+"vs.Pfam.out")
cvp_blast_dict=create_cvp_blast_dict(blast=phage+"vs.CVP.out")

#NOTE(review): `gff` is not assigned anywhere in the visible code; presumably
#it is the gene file (phage+"gene") -- confirm
write_cds_gff3(gff, phage, phage+"cds.gff3")

In [None]:
import glob
import os

#every file in ./genomes is treated as one phage genome
phages=glob.glob("./genomes/*")
phage_list=[]

#glob returns paths that include the directory ("./genomes/<name>"); keep only
#the file name so the prefix can be reused under ./genes, ./blasts and ./gff3,
#then strip the "final.fasta" suffix to get a prefix such as "1.161.O."
for p in phages:
    phage_list.append(os.path.basename(p).replace("final.fasta",""))

for phage in phage_list:
    #each phage has its own gene file; write_cds_gff3 parses that same file,
    #so it is passed per phage instead of one notebook-level `gff`
    #(assumes the ./genes/<prefix>gene layout used by get_digits -- confirm)
    gff="./genes/"+phage+"gene"
    digits=get_digits(gff)
    #"./blasts/aclame/" needs its trailing slash like the other databases
    aclame_blast_dict=create_aclame_blast_dict(blast="./blasts/aclame/"+phage+"vs.aclame.out")
    cog_blast_dict=create_cog_blast_dict(blast="./blasts/cogs_2003-2014/"+phage+"vs.cogs_2003-2014.out")
    pfam_blast_dict=create_pfam_blast_dict(blast="./blasts/Pfam/"+phage+"vs.Pfam.out")
    cvp_blast_dict=create_cvp_blast_dict(blast="./blasts/CVP/"+phage+"vs.CVP.out")
    write_cds_gff3(gff, phage, "./gff3/"+phage+"cds.gff3")

In [25]:
from pyfaidx import Fasta

#protein FASTA path and identifier prefix for the phage handled in this cell;
#get_prot_lens reads `phage` as a module-level name
faa="./1.161.O.faa"
phage="1.161.O."

def get_prot_lens(faa_file, gene_file):
    """Map each record of a protein FASTA to its sequence length, keyed by locus tag.

    faa_file: FASTA path opened with pyfaidx ``Fasta``.
    gene_file: gene-call file handed to get_digits to choose the zero-padding
        width of the locus tags.

    The locus tags are built with the module-level ``phage`` (it is not a
    parameter). Returns a dict {locus_tag: length of str(record)}.
    """
    pad_width = get_digits(gene_file)
    records = Fasta(faa_file)
    return {
        get_locus_tag(key, digits=pad_width, phage=phage): len(str(records[key]))
        for key in records.keys()
    }

#protein length lookup (locus tag -> length) for the faa/phage set above
prot_lens=get_prot_lens(faa_file=faa, gene_file=phage+"gene")

In [59]:
from __future__ import division

def create_blast_dict(blast, len_dict, phage, digits):
    """Parse a tab-separated hit table into {locus_tag: [hit, pct_cov, pct_id, evalue]}.

    blast: path of the hit table. Columns are read positionally: 0 = query
        label, 1 = hit ID, 2 = percent identity, 3 = alignment length,
        second-to-last = e-value (kept as a string).
    len_dict: {locus_tag: protein length}, used to compute query coverage.
    phage, digits: forwarded to get_locus_tag to build the locus tag.

    Only the first line per locus tag with identity > 35 and coverage > 75
    (alignment length as a percentage of protein length) is kept. Blank
    lines are skipped.

    Raises KeyError when a query's locus tag is missing from len_dict.
    """
    blast_dict = {}
    with open(blast) as handle:
        for line in handle:
            if not line.strip():
                continue
            fields = line.split("\t")
            lt = get_locus_tag(fields[0], digits=digits, phage=phage)
            if lt in blast_dict:
                # An earlier line already qualified for this gene.
                continue

            # The original read the global prot_lens here and silently
            # ignored the len_dict argument.
            prot_len = len_dict[lt]
            aln_len = int(fields[3])
            pct_id = float(fields[2])
            # float() keeps this a true division even without the
            # __future__ import.
            pct_cov = (aln_len / float(prot_len)) * 100

            if pct_id > 35 and pct_cov > 75:
                blast_dict[lt] = [fields[1], pct_cov, pct_id, fields[-2]]

    return blast_dict

In [131]:
phages=["1.161.O."]

import re
from Bio.KEGG import REST

#For every phage: load the filtered hit tables for each database, attach
#database-specific IDs and annotations to every hit record, then print one
#attribute string (ID, Name, Ontology_term, note) per gene call.
for phage in phages:

    prod=phage+"gene"
    faa=phage+"faa"

    #zero-padding width for this phage's locus tags; computed per phage so a
    #stale notebook-level `digits` from another genome is never reused
    digits=get_digits(prod)

    #create gene length dictionary
    len_dict=get_prot_lens(faa_file=faa, gene_file=prod)

    #blast files:
    pfam_blast=phage+"vs.Pfam.out"
    cog_blast=phage+"vs.cogs_2003-2014.out"
    aclame_blast=phage+"vs.aclame.out"
    cvp_blast=phage+"vs.CVP.out"
    kegg_blast=phage+"vs.kegg.out"

    #kegg blast dict processing: finding KO's by calling kegg's REST API
    kegg_blast_dict=create_blast_dict(blast=kegg_blast, len_dict=len_dict, phage=phage, digits=digits)
    #iterate over a copy of the keys because entries may be removed below
    for k in list(kegg_blast_dict.keys()):
        desc= REST.kegg_find("genes", kegg_blast_dict[k][0]).read()
        K=re.search(r"K[0-9]{5}", desc)
        if K is None:
            #the description carries no KO number: drop the hit so it cannot
            #crash on K.group(0) or win best-hit selection with no annotation
            del kegg_blast_dict[k]
            continue
        KEGG=K.group(0)
        #annotation = the rest of the line after the KO number
        a=re.search(r"(?<=K[0-9]{5}).*", desc)
        ann=a.group(0)
        kegg_blast_dict[k]+=[KEGG, ann]

    #pfam blast dict processing:
    pfam_blast_dict=create_blast_dict(blast=pfam_blast, len_dict=len_dict, phage=phage, digits=digits)
    for p in pfam_blast_dict.keys():
        hit=pfam_blast_dict[p][0]
        #drop the version suffix after "." before looking up the function
        pfam=pfam_dict[hit].split(".")[0]
        function=pfam_defs[pfam].replace("\n","")
        pfam_blast_dict[p]+=[pfam, function]

    #cog blast dict processing:
    cog_blast_dict=create_blast_dict(blast=cog_blast, len_dict=len_dict, phage=phage, digits=digits)
    for c in cog_blast_dict.keys():
        hit=cog_blast_dict[c][0]
        #the second "|"-separated field of the hit ID is the cog_dict key
        cog=cog_dict[(hit.split("|")[1])]
        func=cog_defs[cog].replace("\n","")
        cog_blast_dict[c]+=[cog, func]

    #aclame blast dict processing:
    aclame_blast_dict=create_blast_dict(blast=aclame_blast, len_dict=len_dict, phage=phage, digits=digits)
    for a in aclame_blast_dict.keys():
        hit=aclame_blast_dict[a][0]
        annotation=aclame_dict[hit]
        aclame_blast_dict[a]+=[hit, annotation]

    #cvp blast dict processing:
    cvp_blast_dict=create_blast_dict(blast=cvp_blast, len_dict=len_dict, phage=phage, digits=digits)
    for v in cvp_blast_dict.keys():
        hit=cvp_blast_dict[v][0]
        func=cvp_dict[hit]
        cvp_blast_dict[v]+=[hit, func]

    #after the loops above every record is
    #[hit, pct_cov, pct_id, evalue, database ID, annotation]

    #prioritize and name dicts:
    first_looks=[kegg_blast_dict, pfam_blast_dict, cog_blast_dict, aclame_blast_dict]
    flnames=["kegg","pfam","cog","aclame"]
    second_look=[cvp_blast_dict]
    slnames=["CVP"]
    OGs=[kegg_blast_dict, pfam_blast_dict, cog_blast_dict]
    OG_names=["KEGG","PFam","COG"]
    annotes=[aclame_blast_dict, cvp_blast_dict]
    annotes_names=["ACLAME","CAMERA_viral_proteins"]

    #run through annotations of each prodigal-identified CDS:
    with open(prod) as prod_handle:
        prod=prod_handle.readlines()
    #gene calls start at line 2 and come as (location line, qualifier line)
    #pairs; the final line is ignored
    for i in range(2,len(prod)-1,2):

        #extract locus tag:
        info=prod[i+1]
        number=info.split(";")[0].split("=")[2].split("_")[1]
        z="0"*(digits-len(number))
        t="NVP"+phage.replace(".","")+"_"+z+number
        col9="ID="+t

        #ID best hit:
        best_hits=find_best_hit2(t, dict_list1=first_looks, dl1_names=flnames, dict_list2=second_look, dl2_names=slnames)

        #ID CDS start, stop and orientation:
        #(parsed here but not part of the printed attribute string)
        loc=prod[i]
        if len(loc.split())==2:
            if "complement" in prod[i].split()[1]:
                strand="-"
                start=loc.split("..")[1].replace(")\n","")
                stop=loc.split("(")[1].split("..")[0]
            else:
                strand="+"
                start=loc.split()[1].split("..")[0]
                stop=loc.split()[1].split("..")[1].replace("\n","")
        #establish name:
        Name=best_hits[0]
        if len(Name)==0:
            col9+=', Name=hypothetical protein'
        else:
            col9+=", Name="+Name.replace('"','')

        #Add OG annotations:
        for d in range(0, len(OGs)):
            db=OGs[d]
            if t in db.keys():
                col9+=', Ontology_term="'+OG_names[d]+":"+db[t][-2]+'"'

        #Add db closest hits
        for d in range(0, len(annotes)):
            db=annotes[d]
            if t in db.keys():
                col9+=', note="'+annotes_names[d]+":"+db[t][-2]+'"'
        print(col9)

ID=NVP1161O_001, Name=hypothetical protein
ID=NVP1161O_002, Name=Terminase-like family, Ontology_term="PFam:PF03237", db_xref="CAMERA_viral_proteins:NCBI_PEP_323514092"
ID=NVP1161O_003, Name=C-5 cytosine-specific DNA methylase, Ontology_term="PFam:PF00145", db_xref="CAMERA_viral_proteins:NCBI_PEP_535137"
ID=NVP1161O_004, Name=hypothetical protein
ID=NVP1161O_005, Name=predicted protein , db_xref="CAMERA_viral_proteins:CAMPEP_0000011774"
ID=NVP1161O_006, Name=predicted protein , db_xref="CAMERA_viral_proteins:CAMPEP_0000011360"
ID=NVP1161O_007, Name=hypothetical protein
ID=NVP1161O_008, Name=hypothetical protein
ID=NVP1161O_009, Name=predicted protein , db_xref="CAMERA_viral_proteins:CAMPEP_0000011538"
ID=NVP1161O_010, Name=hypothetical protein , db_xref="CAMERA_viral_proteins:NCBI_PEP_323513857"
ID=NVP1161O_011, Name=predicted protein , db_xref="CAMERA_viral_proteins:CAMPEP_0000011418"
ID=NVP1161O_012, Name=hypothetical protein , db_xref="CAMERA_viral_proteins:CAMPEP_0000014762"
ID=NVP

{'NVP1161O_168': ['mtt:Ftrac_0033',
  81.3953488372093,
  35.7,
  '2.8e-12',
  'K00287',
  ' dihydrofolate reductase [EC:1.5.1.3]'],
 'NVP1161O_169': ['spl:Spea_1024',
  90.45936395759718,
  44.1,
  '8.1e-49',
  'K00560',
  ' thymidylate synthase [EC:2.1.1.45]']}