In [97]:
from doekbase.data_api.sequence.assembly.api import AssemblyAPI
from doekbase.data_api.annotation.genome_annotation.api import GenomeAnnotationAPI
from doekbase.data_api.taxonomy.taxon.api import TaxonAPI
from doekbase.workspace.client import Workspace
from doekbase.data_api.core import ObjectAPI
from doekbase.handle.Client import AbstractHandle as handleClient

services = {"workspace_service_url": "https://ci.kbase.us/services/ws/",
            "shock_service_url": "https://ci.kbase.us/services/shock-api/",
            "handle_service_url": "https://ci.kbase.us/services/handle_service/"}

import os
import string

token = os.environ["KB_AUTH_TOKEN"]


In [98]:
genome_ref = 'ReferenceGenomeAnnotations/kb|g.166819'#6838/146'#ReferenceGenomeAnnotationsV5/kb|g.166819
working_directory ="./"

ga_api = GenomeAnnotationAPI(services, token=token, ref=genome_ref)

tax_api = ga_api.get_taxon()

asm_api = ga_api.get_assembly()

In [134]:
genome_name = str(ga_api.get_id())

print genome_name

valid_chars = "-_.(){0}{1}".format(string.ascii_letters, string.digits)
temp_file_name = ""
filename_chars = list()

for character in genome_name:
    if character in valid_chars:
        filename_chars.append(character)
    else:
        filename_chars.append("_")

if len(filename_chars) == 0:
    temp_file_name = "GenbankFile"
else:
    temp_file_name = "".join(filename_chars)+".gbk"

output_file = os.path.join(working_directory,temp_file_name)

contig_ids = asm_api.get_contig_ids()
contig_lengths = asm_api.get_contig_lengths(contig_ids)

628


In [135]:
def writeHeader(contig_id, contig_lengths, fulltax, tax_api, outFile):
    outFile.write("LOCUS       " + contig_id + "             " + str(contig_lengths[contig_id]) + " bp    " +"DNA\n")
    sn = tax_api.get_scientific_name()
    outFile.write("DEFINITION  " + sn + " genome.\n")
    outFile.write("SOURCE      " + sn + "\n")
    outFile.write("  ORGANISM  " + sn + "\n")

    fulltax = tax_api.get_scientific_lineage();
    fulltaxstring = ';'.join(fulltax)
    print fulltax

    if (fulltax):
        formatTax = ""

        counter = 0
        index = 0
        while (index < len(fulltax)):
            formatTax.join(fulltax[index])
            if (index < len(fulltax) - 1):
                formatTax.join(" ")
            counter=counter+ len(fulltax[index]) + 1
            index=index+1

            if (counter >= 65 or len(fulltaxstring) < 80):
                formatTax.join("\n")
                formatTax.join("            ")
                counter = 0

        outFile.write("            " + formatTax + ".\n")

    outFile.write("COMMENT     Exported from the DOE KnowledgeBase.\n")

    outFile.write("FEATURES             Location/Qualifiers\n")
    outFile.write("     source          1.." + str(contig_lengths[contig_id]) + "\n")
    outFile.write("                     /organism=\"" + tax_api.get_scientific_name() + "\"\n")
    outFile.write("                     /mol_type=\"DNA\"\n") 

In [143]:
def writeFeatures(ga_api, regions, outFile):
    feature_ids = ga_api.get_feature_ids(filters={"region_list":regions})
    features = ga_api.get_features(feature_ids['by_type']['gene'])
    print len(features)
    
    for feat in features:
        
        print feat
        print features[feat]
        function = features[feat]['feature_function']
        allfunction = function.split(" ")
        
        formatFunction = getAnnotation(function, allfunction, 48, 58)
      
        if (features[feat]['feature_type'] == "CDS") :
            outFile.write("     gene            ")
        else :
            if (function.find("tRNA") != -1) :
                outFile.write("     tRNA            ")
            else :
                outFile.write("     misc_RNA        ")

        #out = writeCDS(location, outFile)

        if (features[feat]['feature_type'] == "CDS") :
            outFile.write("     CDS             ")
            #writeCDS(location, outFile)
            outFile.write("                     /gene=\"" + id + "\"\n")

        #outFile.write("                     /note=\"" + formatNote)
        #outFile.write("                     /codon_start=1\n")
        #outFile.write"                     /transl_table=11\n")
        #outFile.write("                     /product=\"" + id + "\"\n")
        outFile.write("                     /function=\"" + formatFunction)

        if (features[feat]['feature_type'] == "CDS") :
            str = "                     /protein_id=\"" + id + "\"\n"
            outFile.write(str)

        aliases = cur.getAliases()
        if (aliases != null) :
            for s in aliases :
                outFile.write("                     /db_xref=\"id:" + s + "\"\n")

        proteinTranslation = cur['amino_acid_sequence']
        if (proteinTranslation != null):
            outFile.write("                     /translation=\"" + formatString(proteinTranslation, 44, 58));

In [115]:
def writeCDS():
    added = 0
    complement = false
    join = false
    for n in range(0, len(location)) :
        now4 = location[n]
        if (added == 0 and now4['strand'] == "-") :
            outFile.write("complement(")
            complement = true

        if (len(location) > 1) :
            if (added == 0):
                outFile.write("join(")
            join = true
        
        #outpu location info
        #if (complement is false):
        #    outFile.write(now4.getE2() + ".." + (now4.getE2() + (long) now4.getE4() - 1))
        #else :
        #    outFile.write((now4.getE2() - (long) now4.getE4() + 1) + ".." + now4.getE2())

        #if (location.size() > 0 and n < location.size() - 1)
        #    outFile.write(",")
        added = added + 1

    if (complement and join):
        outFile.write("))\n")
    elif (complement or join):
        outFile.write(")\n")
    else:
        outFile.write("\n")

In [153]:
def getAnnotation(function,  allfunction, first, nexta) :
    formatFunction = ""
    
    isfirst = 1
    if (len(function) < first) :
        formatFunction.join(function + "\"\n")
    else :
        counter2 = 0
        index2 = 0
        while (index2 < len(allfunction)) :

            counter2 = counter2 + allfunction[index2].length() + 1

            if (((isfirst == 1 and counter2 >= first) or counter2 >= nexta)) :
                
                if (isfirst == 1):
                    isfirst = 0;

                if (index2 < len(allfunction)) :
                    formatFunction.join("\n")
                    formatFunction.join("                     ")
                    formatFunction.join(allfunction[index2])
                    counter2 = len(allfunction[index2])
                    if (index2 < len(allfunction) - 1):
                        counter2 = counter2 + 1
                        formatFunction.join(" ")
                    else:
                        formatFunction.join("\"\n")
                
            else :
                if (index2 < len(allfunction)) :
                    formatFunction.join(allfunction[index2])
                    if (index2 < len(allfunction) - 1) :
                        counter2 = counter2 + 1
                        formatFunction.join(" ")
                    else:
                        formatFunction.join("\"\n")
                else:
                    formatFunction.join("\"\n")
            
            index2 = index2 + 1
        
    if (len(formatFunction) == 0) :
        formatFunction.join("\"\n")
        
    formatindex = formatFunction.find("\"\n")    
    
    print formatindex
    if (formatindex != len(formatFunction) - 2):
        formatFunction.join("\"\n")
        
    print "formatFunction :"+ formatFunction+":"
    if(len(formatFunction) == 0):
        formatFunction = ""
    formatFunction

In [117]:
def formatString(s, one, two) :
    s = s.replace("\"", "")
    out = ""
    first = 1
    for start in range(0, len(s)) :
        if (first == 1) :
            lista = [len(s), start+one]
            last = min(lista)
            isLast = false
            if (last == len(s)):
                isLast = true
            out.join(s[start : last])
            if (isLast):
                out.join("\"\n")
            else :
                out.join("\n")
            
            first = 0;
            start = start+one;
        else :
            lista = [len(s), start+two]
            last = min(lista)
            out.join("                     ")
            isLast = false
            if (last == len(s)):
                isLast = true
            out.join(s[start: last])
            start = start + two
            if (isLast) :
                out.join("\"\n")            
            else:
                out.join("\n");
    out

In [154]:

with open(output_file, "w") as outFile:

    #
    #create per-contig section in gbk file
    #
    for contig_id in contig_ids:

        start=1
        stop=contig_lengths[contig_id]

        print contig_id
        writeHeader(contig_id, contig_lengths, fulltax, tax_api, outFile)
        regions = []
        contig_add = {"contig_id": c, "start": start, "length": stop-start, "strand": "+"}
        contig_add2 = {"contig_id": c, "start": stop, "length": stop-start, "strand": "-"}        
        regions.append(contig_add)
        regions.append(contig_add2)

        writeFeatures(ga_api, regions, outFile)

        #write contig sequence

        break

outFile.close()

kb|g.166819.c.7
[u'cellular organisms', u'Eukaryota', u'Viridiplantae', u'Chlorophyta', u'prasinophytes', u'Mamiellophyceae', u'Mamiellales', u'Bathycoccaceae', u'Ostreococcus']
426
kb|g.166819.locus.7394
-1
formatFunction ::


TypeError: cannot concatenate 'str' and 'NoneType' objects