In [24]:
import sqlite3
import os

Set up a sqlite3 database... below appears to work for the tara file:

In [59]:
conn=sqlite3.connect('tara4.sqlite')
c=conn.cursor()
c.execute('''CREATE TABLE taratbl
            (ID primary key, gene, egg, ko, kfunc)''')
with open("./example.tara.tsv") as infile:
    for r in infile:
        vec=r.replace('"','').split("\t")
        ID=vec[0]
        gene=vec[1]
        egg=vec[2]
        ko=vec[3]
        kfunc=vec[4]
        c.execute("INSERT INTO taratbl VALUES ('"+ID+"','"+gene+"','"+egg+"','"+ko+"','"+kfunc+"')")
conn.commit()
conn.close()

Now to query the database:

In [70]:
conn=sqlite3.connect('tara4.sqlite')
c=conn.cursor()
tara_id="OM-RGC.v1.040154628"


c.execute("SELECT * from taratbl where ID='"+tara_id+"'")
output=c.fetchall()
gene=output[0][1]
egg=output[0][2]
ko=output[0][3]
kfunc=output[0][4]

print(gene, egg, ko, kfunc)
print gene

(u'483219.LILAB_09250', u'', u'', u'')
483219.LILAB_09250


In [1]:
!scp jbrown@eofe4.mit.edu:/nobackup1/jbrown/annotation/blasts/tara.translated/1.161.O.* ./

1.161.O.vs.tara.translated.out                100% 3476KB   3.4MB/s   00:00    


In [15]:
# %load ./scripts/annotation_functions.py
#Functions to run annotation-associated programs:

import subprocess
import os
import cPickle as pickle

def run_prodigal_phage(inputfasta, out_gene, out_prot):
    to_run="prodigal -i "+inputfasta+" -o "+out_gene+" -a "+out_prot+" -p meta"
    subprocess.call(to_run.split(" "))
    
def run_formatudb(fastafile, databasefile="db.udb", ublast_path="/home/sbiller/usearch7.0.1090_i86linux64"):
    to_run=ublast_path+" -makeudb_ublast "+fastafile+" -output "+databasefile
    subprocess.call(to_run.split(" "))
    
def run_ublastp(fastafile, out_file, udb, evalue, ublast_path="/home/sbiller/usearch7.0.1090_i86linux64"):
    to_run=ublast_path+" -ublast "+fastafile+" -db "+udb+" -evalue "+evalue+" -accel 0.5 -blast6out "+out_file+" -top_hit_only"
    subprocess.call(to_run.split(" "))
    
def run_trna_scan(input_file, output):
    if os.path.exists(output):
        os.remove(output)
    args=["tRNAscan-SE", "-o", output, "-G", "-D","-N", input_file]
    subprocess.call(args)
    print("tRNA scan of "+input_file+" is done!")

def run_crt(path_to_crt, input_fasta, output):
    args="java -cp "+path_to_crt+" crt -minNR 2 "+input_fasta+" "+output
    subprocess.call(args.split(" "))
    
#general info

#get number of genes in genome to know how many digits to use in locus tag


def get_digits(faa):
    faa=open(faa).read()
    digits=len(str(faa.count(">")))
    return digits

#create locus tag from protein sequence name in BLAST output file:
def get_locus_tag(line, digits, phage):
    query=line.split("\t")[0].split(" ")[0]
    number=query.split("_")[-1]
    z="0"*(digits-len(number))
    return "NVP"+phage.replace(".","")+"_"+z+number
        

from pyfaidx import Fasta

def get_prot_lens(faa_file, phage):
    len_dict={}
    digits=get_digits(faa_file)
    #def make_seq_len_dict(faa):
    f=Fasta(faa_file)
    for i in f.keys():
        name=get_locus_tag(i, digits=digits, phage=phage)
        length=len(str(f[i]))
        len_dict[name]=length
    return len_dict

#set up dict of general info from BLAST:
def set_up_blast_dict(blast, prod, faa):
    phage=blast.split("v")[0].replace("/","")
    digits=get_digits(faa)
    len_dict=get_prot_lens(faa, phage)
    records=[]
    blast_dict={}
    
    blast=open(blast).readlines()
    for line in blast:
        name=line.split(" ")[0]
        hit=line.split("\t")[1]
        lt=get_locus_tag(name, digits=digits, phage=phage)
        prot_len=len_dict[lt]
        aln_len=int(line.split("\t")[3])
        pct_id=float(line.split("\t")[2])
        ev=line.split("\t")[-2]
        pct_cov=(aln_len/prot_len)*100

        if pct_id>35 and pct_cov>75 and lt not in records:
            records.append(lt)
            blast_dict[lt]=[hit, pct_cov, pct_id, ev]
    return blast_dict

#load blast database dictionaries:
aclame_dict=pickle.load(open("./databases/pickled_dicts/aclame_dict.p","rb"))
cog_dict=pickle.load(open("./databases/pickled_dicts/cog_dict.p","rb"))
cog_defs=pickle.load(open("./databases/pickled_dicts/cog_def.p","rb"))
pfam_dict=pickle.load(open("./databases/pickled_dicts/pfam_dict.p","rb"))
pfam_defs=pickle.load(open("./databases/pickled_dicts/pfam_def.p","rb"))
cvp_dict=pickle.load(open("./databases/pickled_dicts/cvp_dict.p","rb"))

#functions for adding annotations/info to BLAST hit based on BLAST database

def add_kegg_descript(hit):
    desc= REST.kegg_find("genes", hit).read()
    K=re.search(r"K[0-9]{5}", desc)
    KEGG=K.group(0)
    a=re.search(r"(?<=K[0-9]{5}).*", desc)
    ann=a.group(0)
    return [KEGG, ann]

def add_cog_descript(hit):
    cog=cog_dict[(hit.split("|")[1])]
    func=cog_defs[cog].replace("\n","")
    return [cog, func]

def add_pfam_descript(hit):
    pfam=pfam_dict[hit].split(".")[0]
    function=pfam_defs[pfam].replace("\n","")
    return [pfam, function]

def add_aclame_descript(hit):
    annotation=aclame_dict[hit]
    return [hit, annotation]

def add_cvp_descript(hit):
    func=cvp_dict[hit]
    return [hit, func]

def add_tara_descript(hit):   #right now just adding the closest hit, TARA sequences come with COG/Pfam info etc 
    return [hit, hit]

db_dict={"kegg":add_kegg_descript, "cog":add_cog_descript, "pfam":add_pfam_descript, "aclame":add_aclame_descript,
        "cvp":add_cvp_descript, "tara":add_tara_descript}

def annotated_blast_dict(blast, prod, faa, db):
    blast_dict=set_up_blast_dict(blast, prod, faa)
    blast_db_function=db_dict[db]
    for i in blast_dict.keys():
        hit=blast_dict[i][0]
        info=blast_db_function(hit)
        blast_dict[i]+=info
    
    return blast_dict

#i=prodigal line that begins with a location identifier..
#function meant to iterate over the length of a prodigal file every two lines starting at line 3 as such: 
'''
for i in range (2, len(open(prod_file).readlines())-1,2):
    get_prod_cds_info(i,...)
'''

def get_prod_cds_info(i, prod, digits, phage):  
    loc=prod[i]
    if len(loc.split())==2:
        if "complement" in prod[i].split()[1]:
            strand="-"
            start=loc.split("..")[1].replace(")\n","")
            stop=loc.split("(")[1].split("..")[0]
        else:
            strand="+"
            start=loc.split()[1].split("..")[0]
            stop=loc.split()[1].split("..")[1].replace("\n","")
        start=start.replace(">","").replace("<","")
        stop=stop.replace(">","").replace("<","")
    info=prod[i+1]
    number=info.split(";")[0].split("=")[2].split("_")[1]
    z="0"*(digits-len(number))
    t="NVP"+phage.replace(".","")+"_"+z+number
    return [t, start, stop, strand]

#below: considers hits to more informative databases before less informative databases
#dict_list* are lists of blast_dicts and dl*_names are the names of the dicts in the same order

def find_best_hit2(gene_id, dict_list1, dl1_names, dict_list2=[], dl2_names=[]):
    evals=1
    annotation=""
    best_hit=""
    hits=[]
    es=[]
    names=[]
    for i in range(0, len(dict_list1)):
        if gene_id in dict_list1[i].keys():
            hit=dict_list1[i][gene_id]
            hits.append(hit[-1])
            es.append(float(hit[3]))
            names.append(dl1_names[i])
    if len(hits)>0:
        best_annotation=[hits[es.index(min(es))],names[es.index(min(es))]]
    else:
        for i in range(0, len(dict_list2)):
            if gene_id in dict_list2[i].keys():
                hit=dict_list2[i][gene_id]
                hits.append(hit[-1])
                es.append(float(hit[3]))
                names.append(dl2_names[i])
        if len(hits)>0:
            best_annotation=[hits[es.index(min(es))],names[es.index(min(es))]]
        else:
            best_annotation=["",""]

    #print("best annotation for"+gene_id+" is from "+best_hit+" with e-value "+str(evals)+" and annotation of "+annotation)
    return best_annotation 

def gff3_header(prod):
    Sequence=open(prod).readlines()[0].split(";")[2].split("=")[1].replace('"','')
    return Sequence+"\n"

#merge BLAST results into one gff3
    
def cds_blast_annotations_to_gff3(phage):
    #prodigal and fasta files:
    prod=phage+"gene"
    faa=phage+"faa"
    
    #blast files:
    pfam_blast=phage+"vs.Pfam.out"
    cog_blast=phage+"vs.cogs_2003-2014.out"
    aclame_blast=phage+"vs.aclame.out"
    cvp_blast=phage+"vs.CVP.out"
    kegg_blast=phage+"vs.kegg.out"
    tara_blast=phage+"vs.tara.out"

    Sequence=open(prod).readlines()[0].split(";")[2].split("=")[1].replace('"','')
    
    #set up dicts from all BLASTs
    kegg_blast_dict=annotated_blast_dict(blast=kegg_blast, prod=prod, faa=faa, db="kegg")
    pfam_blast_dict=annotated_blast_dict(blast=pfam_blast, prod=prod, faa=faa, db="pfam")
    cog_blast_dict=annotated_blast_dict(blast=cog_blast, prod=prod, faa=faa, db="cog")
    aclame_blast_dict=annotated_blast_dict(blast=aclame_blast, prod=prod, faa=faa, db="aclame")
    cvp_blast_dict=annotated_blast_dict(blast=cvp_blast, prod=prod, faa=faa, db="cvp")
    tara_blast_dict=annotated_blast_dict(blast=tara_blast, prod=prod, faa=faa, db="tara")

    #prioritize and name dicts:
    #preferred blast dbs to annotate from if there's a match:
    first_looks=[kegg_blast_dict, pfam_blast_dict, cog_blast_dict, aclame_blast_dict]
    flnames=["kegg","pfam","cog","aclame"]
    #secondary database(s) to annotate from:
    second_look=[cvp_blast_dict]
    slnames=["CVP"]
    #databases with orthologous groups to include in annotation
    OGs=[kegg_blast_dict, pfam_blast_dict, cog_blast_dict]
    OG_names=["KEGG","PFam","COG"]
    #databases where the closest hit will be referenced, but no other info will be provided:
    annotes=[aclame_blast_dict, cvp_blast_dict, tara_blast_dict]
    annotes_names=["ACLAME","CAMERA_viral_proteins","TARA_Oceans_Dataset"]
    
    out=""  #set up string to write to
    
    #run through annotations of each prodigal-identified CDS:
    prod=open(prod).readlines()
    digits=get_digits(faa)
    
    #write gff3 lines from prodigal files and blast dicts:
    for i in range(2,len(prod)-1,2):

        coords=get_prod_cds_info(i, prod, digits, phage)
        locus_tag=coords[0]
        start=coords[1]
        stop=coords[2]
        strand=coords[3]
        
        #set up col9
        col9="ID="+locus_tag
        
        #ID best hit:
        best_hits=find_best_hit2(locus_tag, dict_list1=first_looks, dl1_names=flnames, dict_list2=second_look, dl2_names=slnames)

        #establish name:
        Name=best_hits[0]
        if len(Name)==0:
            col9+='; Name=hypothetical protein'
        else:
            col9+="; Name="+Name.replace('"','')

        #Add OG annotations:
        for d in range(0, len(OGs)):
            og_dict=OGs[d]
            if locus_tag in og_dict.keys():
                col9+='; Ontology_term="'+OG_names[d]+":"+og_dict[locus_tag][-2]+'"'

        #Add db closest hits to notes
        for d in range(0, len(annotes)):
            annote_dict=annotes[d]
            if locus_tag in annote_dict.keys():
                col9+='; note="'+annotes_names[d]+"_best_match:"+annote_dict[locus_tag][-2]+'"'
        out+=Sequence+"\t"+"prod"+"\t"+"CDS"+"\t"+start+"\t"+stop+"\t"+"."+"\t"+strand+"\t"+"0"+"\t"+col9+"\n"

    return out

def CRISPR_gff3(input_fasta, crt_output):
    crtout=open(crt_output).readlines()

    sequence=open(input_fasta).readlines()
    name= [i.replace(">","") for i in sequence if i.startswith(">")][0]
    out=""
    for line in crtout:
        if line.startswith("CRISPR"):
            vec=line.split()
            number=vec[1]
            start=vec[3]
            stop=vec[5]
            ID="NVP"+name.split("_")[1].replace(".","")+"_CRISPR-like_"+number
            out+=name.replace("\n","")+"\t"+"crt"+"\tputative CRISPR feature\t%s\t%s\t.\t.\t.\tID=%s" % (start, stop, ID)
            out+=", note=CRISPR region\n"
    return out

def tRNA_scan_to_gff3(tRNAScanSE_file):
    if os.path.getsize(tRNAScanSE_file)>0:
        t=open(tRNAScanSE_file).readlines()
        tanns=""
        for line in t[3:]:
            l=line.split("\t")

            locus_tag="NVP"+l[0].split("_")[1].replace(".","")+"_tRNA_"+l[1]
            start=l[2]
            stop=l[3]
            if start<stop:
                strand="+"
            else:
                strand="-"
            aa=l[4]
            codon=l[5]
            SeqID=l[0]
            col9="ID="+locus_tag+", aa="+aa+", codon="+codon
            out=SeqID+"\t"+"tRNAScanSE"+"\t"+"tRNA"+"\t"+start+"\t"+stop+"\t"+l[-1].replace("\n","")+"\t"+strand+"\t"+"0"+"\t"+col9+"\n"
            tanns+=out
        return tanns
    else:
        print "no tRNAs found in genome"
        return ""
    
#put them all together:
def write_gff3_file(phage):
    prod=phage+".gene"
    faa=phage+".faa"
    genomic_fasta="./genomes/%sfinal.fasta" % phage
    
    out=open(phage+"test.gff3","w")
    #out.write(gff3_header(phage+"gene"))
    out.write(cds_blast_annotations_to_gff3(phage))
    
    trna="../tRNA_info/data/nahant_tRNA_count/%strnas.txt" % phage 
    if os.path.getsize(trna)>0:
        out.write(tRNA_scan_to_gff3(trna))
    
    crt_output=phage+"crt"
    out.write(CRISPR_gff3(genomic_fasta, crt_output))
    
    out.close()


In [16]:
tarablast=set_up_blast_dict("./1.161.O.vs.tara.translated.out","1.161.O.gene","1.161.O.faa")

In [19]:
for k in tarablast.keys()[0:10]:
    print tarablast[k]

['OM-RGC.v1.031044528', 100, 84.8, '1.1e-41']
['OM-RGC.v1.029216919', 100, 89.4, '2.8e-49']
['OM-RGC.v1.023443165', 100, 95.6, '1.3e-79']
['OM-RGC.v1.005269655', 100, 37.1, '6.4e-46']
['OM-RGC.v1.036345776', 100, 85.2, '6.6e-21']
['OM-RGC.v1.018155996', 100, 76.6, '1.1e-74']
['OM-RGC.v1.037561010', 100, 94.4, '5.3e-22']
['OM-RGC.v1.022734525', 100, 78.8, '2.5e-70']
['OM-RGC.v1.005709073', 100, 41.2, '9e-50']
['OM-RGC.v1.002040829', 100, 64.5, '1.2e-165']


In [None]:
def query_tara_db(tid):
    conn=sqlite3.connect('/pool001/jbrown/tara_db.sqlite')
    c=conn.cursor()

    c.execute("SELECT * from taratbl where ID='"+tid+"'")
    output=c.fetchall()
    gene=output[0][1]
    egg=output[0][2]
    ko=output[0][3]
    kfunc=output[0][4]
    conn.close()
    
    return [gene, egg, ko, kfunc]


In [None]:
for k in tarablast.keys():
    tid=tarablast[k][0]
    tann=query_tara_db(tid)
    tarablast[k]+=tann
    out.write(k+"\t"+"\t".join(tarablast[k]))

In [21]:
mylist=["hello","how","are","you","?"]
"\t".join(mylist)

'hello\thow\tare\tyou\t?'

In [25]:
def query_tara_db(tid):
    conn=sqlite3.connect('./tara4.sqlite')
    c=conn.cursor()

    c.execute("SELECT * from taratbl where ID='"+tid+"'")
    output=c.fetchall()
    gene=output[0][1]
    egg=output[0][2]
    ko=output[0][3]
    kfunc=output[0][4]
    conn.close()
    
    return [gene, egg, ko, kfunc]


In [28]:
tid="OM-RGC.v1.040154624"

q=query_tara_db(tid)
out="\t".join(q)
print out

472759.Nhal_2297			


In [None]:
for k in tarablast.keys():
    tid=tarablast[k][0]
    tann=query_tara_db(tid)
    print tann
    together=tarablast[k]+tann
    print together
    print "\t".join(together)
    to_write="\t".join(together)
    out.write(k+"\t"+to_write+"\n")
out.close()

Connect to eggNOG REST api:

In [29]:
import urllib2

In [47]:
url = 'http://eggnogapi.embl.de/nog_data/text/go_terms/COG5511'
response = urllib2.urlopen(url).read()

HTTPError: HTTP Error 500: Internal Server Error

In [40]:
import collections
from restful_lib import Connection


In [50]:
kegg_url = "http://rest.kegg.jp"
conn = Connection(kegg_url)

In [52]:
allKOs = conn.request_get('list/ko', headers={'Accept':'text/json'})
print allKOs['headers']
print type(allKOs['body'])

{'status': '200', 'content-location': u'http://rest.kegg.jp/list/ko', 'transfer-encoding': 'chunked', 'server': 'Apache', 'date': 'Wed, 04 Nov 2015 21:58:35 GMT', 'content-type': 'text/plain; charset=utf-8'}
<type 'unicode'>


In [48]:
test = conn.request_get('text/go_terms/COG5511', headers={'Accept':'text/json'})

In [49]:
print test

{u'body': u'\n    <!DOCTYPE HTML PUBLIC "-//IETF//DTD HTML 2.0//EN">\n    <html>\n        <head>\n            <title>Error: 500 Internal Server Error</title>\n            <style type="text/css">\n              html {background-color: #eee; font-family: sans;}\n              body {background-color: #fff; border: 1px solid #ddd;\n                    padding: 15px; margin: 15px;}\n              pre {background-color: #eee; border: 1px solid #ddd; padding: 5px;}\n            </style>\n        </head>\n        <body>\n            <h1>Error: 500 Internal Server Error</h1>\n            <p>Sorry, the requested URL <tt>&#039;http://eggnogapi.embl.de/nog_data/text/go_terms/COG5511&#039;</tt>\n               caused an error:</p>\n            <pre>Internal Server Error</pre>\n        </body>\n    </html>\n', u'headers': {'status': '500', 'content-length': '774', 'server': 'dylan.embl.de', 'connection': 'close', 'date': 'Wed, 04 Nov 2015 21:06:04 GMT', 'content-type': 'text/html; charset=UTF-8'}}


Screw this... I'm going to just download the EGGNOG database onto the server and set up another sqlite3 library!

In [None]:
conn=sqlite3.connect('tara4.sqlite')
c=conn.cursor()
c.execute('''CREATE TABLE eggnog
            (ID primary key, gene, egg, ko, kfunc)''')


with open("./example.tara.tsv") as infile:
    for r in infile:
        vec=r.replace('"','').split("\t")
        ID=vec[0]
        gene=vec[1]
        egg=vec[2]
        ko=vec[3]
        kfunc=vec[4]
        c.execute("INSERT INTO taratbl VALUES ('"+ID+"','"+gene+"','"+egg+"','"+ko+"','"+kfunc+"')")
conn.commit()
conn.close()