In [1]:
from cogent3.core.annotation_db import GffAnnotationDb
from cogent3.core.annotation_db import GenbankAnnotationDb
from cogent3.parse.gff import gff_parser
from cogent3 import open_
from cogent3.parse.genbank import MinimalGenbankParser
from cogent3 import load_unaligned_seqs
gff_path = "/Users/kiratalreja/Downloads/prok_NoLocusTags.gff"
genbank_path = "/Users/kiratalreja/Downloads/NC_000913.3.gb"
import unittest

In [39]:
def test_make_db():
    """Test if the gff3 database is correctly created
    in terms of the names of columns & the number of entries"""

    db = GffAnnotationDb(gff_path)

    """Test the name of columns created"""
    sql_columns = []
    data = db.db.execute("""SELECT * FROM GFF""")
    for column in data.description:
        sql_columns.append(column[0])
    got = sql_columns
    parsed_gff = gff_parser(gff_path)
    expected = list((list(parsed_gff)[0]).keys())[:-1]
    assert got == expected
    
    sql_columns = []
    db = GenbankAnnotationDb(genbank_path)
    data = db.db.execute("""SELECT * FROM GENBANK""")
    for column in data.description:
        sql_columns.append(column[0])
    got = sql_columns
    expected = ["LocusID","Type","Spans","Locus_Tag","Start","End","Strand"]
    assert got == expected
    
   
    

In [57]:
def test_make_sql_query():
    """Test that the SQLlite queries are correctly formed"""
    
    fasta_path = "/Users/kiratalreja/Desktop/short.fa"
    seqs = load_unaligned_seqs(fasta_path, moltype="dna")
    seq = seqs.seqs[0]

    gff_path = "/Users/kiratalreja/Downloads/prok_NoLocusTags.gff"
    db = GffAnnotationDb(gff_path)

    """only the bio_type is provided"""
    expected = ('SELECT * FROM GFF WHERE Type == ?', ['gene'])
    got = db._make_sql_query(seq_name=None,bio_type="gene",identifier=None,start=None,end=None)
    assert got == expected 

    """only the identifier is provided"""
    expected = ('SELECT * FROM GFF WHERE Attributes like ?', ['%RandomIdentifier%'])
    got = db._make_sql_query(seq_name=None,bio_type=None,identifier="RandomIdentifier",start=None,end=None)
    assert got == expected

    """both identifier and bio_type provided"""
    expected = ('SELECT * FROM GFF WHERE Type == ? AND Attributes like ?',['CDS', '%RandomIdentifier%'])
    got = db._make_sql_query(seq_name=None,bio_type="CDS",identifier="RandomIdentifier",start=None,end=None)
    assert got == expected

    """start provided along with identifier and bio_type"""
    expected = ('SELECT * FROM GFF WHERE Type == ? AND Attributes like ? AND Start >= ?',['CDS', '%RandomIdentifier%', 0])
    got = db._make_sql_query(seq_name=None,bio_type="CDS",identifier="RandomIdentifier",start=0,end=None)
    assert got == expected

    """end provided along with identifier and bio_type"""
    expected = ('SELECT * FROM GFF WHERE Type == ? AND Attributes like ? AND End < ?',['CDS', '%RandomIdentifier%', 5000])
    got = db._make_sql_query(seq_name=None,bio_type="CDS",identifier="RandomIdentifier",start=None,end=5000)
    assert got == expected

    """start and end provided along with identifier and bio_type"""
    expected = ('SELECT * FROM GFF WHERE Type == ? AND Attributes like ? AND Start >= ? AND End < ?',['CDS', '%RandomIdentifier%', 0, 5000])
    got = db._make_sql_query(seq_name=None,bio_type="CDS",identifier="RandomIdentifier",start=0,end=5000)
    assert got == expected

    """all five attributes provided"""
    expected = ('SELECT * FROM GFF WHERE SeqID == ? AND Type == ? AND Attributes like ? AND Start >= ? AND End < ?', ['1', 'CDS', '%RandomIdentifier%', 0, 5000])
    got = db._make_sql_query(seq_name="1",bio_type="CDS",identifier="RandomIdentifier",start=0,end=5000)
    assert got == expected

    """check exception when both bio_type and identifier are missing"""
    import pytest

    with pytest.raises(ValueError):
        db._make_sql_query(seq_name=None,bio_type=None,identifier=None,start=None,end=None)

    """check exception when both bio_type and identifier are missing, even if other attributes"""

    with pytest.raises(ValueError):
        db._make_sql_query(seq_name="1",bio_type=None,identifier=None,start=0,end=1000)

    """check exception when only seq_name is provided"""

    with pytest.raises(ValueError):
        db._make_sql_query(seq_name="1",bio_type=None,identifier=None,start=None,end=None)

    """check exception when only start is provided"""

    with pytest.raises(ValueError):
        db._make_sql_query(seq_name=None,bio_type=None,identifier=None,start=0,end=None)
    
    """check exception when only end is provided"""

    with pytest.raises(ValueError):
        db._make_sql_query(seq_name=None,bio_type=None,identifier=None,start=None,end=1000)

In [59]:
def test_populate_from_file():
    """Test that the database is populated with the correct
    number of columns"""
    from cogent3.core.annotation import GffAnnotationDb
    from cogent3.parse.gff import gff_parser

    gff_path = "/Users/kiratalreja/Downloads/prok_NoLocusTags.gff"
    db = GffAnnotationDb(gff_path)

    """test the number of rows populated"""
    db.db.execute(""" SELECT * FROM GFF """)
    got = len(list(db.db.fetchall()))
    parsed_gff = gff_parser(gff_path)
    expected = len(list(parsed_gff))
    assert got == expected


In [68]:
def test_db_query():

    """Test that the SQL query returns the correct
    number of rows for different combinations of bio_type/identifier"""

    from cogent3 import load_unaligned_seqs
    from cogent3.core.annotation import GffAnnotationDb

    fasta_path = "/Users/kiratalreja/Desktop/short.fa"
    seqs = load_unaligned_seqs(fasta_path, moltype="dna")
    seq = seqs.seqs[0]

    gff_path = "/Users/kiratalreja/Downloads/prok_NoLocusTags.gff"
    db = GffAnnotationDb()
    db.populate_from_file(gff_path)

    """multiple hits for the same identifier"""
    got = len(db.db_query(start=0, end=len(seq), identifier="CDS4"))
    expected = 2
    assert got == expected

    """query for an ID and recieve the children to the ID along with the parent"""
    got = len(db.db_query(start=0, end=len(seq), identifier="gene4"))
    expected = 3
    assert got == expected

    """query for an ID, with no children"""
    got = len(db.db_query(start=0, end=len(seq), identifier="trna1"))
    expected = 1
    assert got == expected

    """query for a bio type, with multiple hits"""
    got = len(db.db_query(start=0, end=len(seq), bio_type="gene"))
    expected = 8
    assert got == expected

    """query for an ID & a bio type, with a single hit"""
    got = len(db.db_query(start=0, end=len(seq), identifier="gene0", bio_type="CDS"))
    expected = 1
    assert got == expected

    """query for an ID & a bio type, with multiple hits"""
    got = len(db.db_query(start=0, end=len(seq), identifier="CDS4", bio_type="CDS"))
    expected = 2
    assert got == expected

In [70]:
def test_find_records():

    """Test that the coordinates the grouped correctly, and features
    formed properly"""

    from cogent3 import load_unaligned_seqs
    from cogent3.core.annotation import GffAnnotationDb

    gff_path = "/Users/kiratalreja/Downloads/prok_NoLocusTags.gff"
    db = GffAnnotationDb()
    db.populate_from_file(gff_path)
    fasta_path = "/Users/kiratalreja/Desktop/short.fa"
    seqs = load_unaligned_seqs(fasta_path, moltype="dna")
    seq = seqs.seqs[0]

    """combine rows with the same ID"""
    got = len(db.find_records(start=0, end=len(seq), identifier="CDS4"))
    expected = 1
    assert got == expected

    """combine rows with the same ID, when bio_type given"""
    got = len(db.find_records(start=0, end=len(seq), bio_type="CDS"))
    expected = 6
    assert got == expected

    """combine rows with the same ID, when children rows are fetched with the parent"""
    got = len(db.find_records(start=0, end=len(seq), identifier="gene4"))
    expected = 2
    assert got == expected

    """unique ID, single row returned"""
    got = len(db.find_records(start=0, end=len(seq), identifier="id020000"))
    expected = 1
    assert got == expected


In [None]:
test_find_records()

In [6]:
def test_ordered_values():
    from cogent3.core.annotation_db import _ordered_values
    from cogent3.parse.gff import gff_parser
    parsed_gff = gff_parser(gff_path)
    example_dict = list(parsed_gff)[0]  
    got = _ordered_values(example_dict)
    expected = ['sequence001', 'mine','gene', 189, 255, '.', '+', '.', {'ID': 'gene0','Dbxref': 'ASAP:ABE-0000006','gene': 'thrL','gene_synonym': 'ECK0001'}]
    assert got == expected

    def return_type(values):
        types = []
        for v in values:
            types.append(type(v))
        return types

    got = return_type(_ordered_values(example_dict))
    expected = [str, str, str, int, int, str, str, str, dict]
    assert got == expected
    

In [4]:
from cogent3.core.annotation_db import _ordered_values
from cogent3.parse.gff import gff_parser
parsed_gff = gff_parser(gff_path)
example_dict = list(parsed_gff)[0]  
got = _ordered_values(example_dict)
expected = ['sequence001', 'mine','gene', 189, 255, '.', '+', '.', {'ID': 'gene0','Dbxref': 'ASAP:ABE-0000006','gene': 'thrL','gene_synonym': 'ECK0001'}]
assert got == expected

def return_type(values):
    types = []
    for v in values:
        types.append(type(v))
    return types

got = return_type(_ordered_values(example_dict))
expected = [str, str, str, int, int, str, str, str, dict]
assert got == expected
    

In [9]:
expected = [1,2,3,4]

In [16]:
def myfunc(a, b):
  return a + b

x = map(myfunc, ('apple', 'banana', 'cherry'), ('orange', 'lemon', 'pineapple'))

In [17]:
x

<map at 0x7f7e912314e0>

In [12]:
map(type(),expected)

TypeError: type() takes 1 or 3 arguments

In [7]:
test_ordered_values()

In [8]:
parsed_gff = gff_parser(gff_path)
example_dict = list(parsed_gff)[0]  

In [10]:
from cogent3.core.annotation_db import _ordered_values
_ordered_values(example_dict)

['sequence001',
 'mine',
 'gene',
 189,
 255,
 '.',
 '+',
 '.',
 {'ID': 'gene0',
  'Dbxref': 'ASAP:ABE-0000006',
  'gene': 'thrL',
  'gene_synonym': 'ECK0001'}]

IndexError: list index out of range

In [13]:
example_dict = ['sequence001', 'mine','gene', 189, 255, '.', '+', '.', {'ID': 'gene0','Dbxref': 'ASAP:ABE-0000006','gene': 'thrL','gene_synonym': 'ECK0001'}]

In [18]:
def return_type(values):
    types = []
    for v in values:
        types.append(type(v))
    return types

In [27]:
assert return_type(example_dict) == [str, str, str, int, int, str, str, str, dict]

In [21]:
def test_distinct():
    from cogent3.core.annotation_db import GffAnnotationDb

    gff_path = "/Users/kiratalreja/Downloads/prok_NoLocusTags.gff"
    db = GffAnnotationDb(gff_path)

    #nothing provided
    got = db.distinct()
    expected = set()
    assert got == expected

    #only bio_type
    got = len(db.distinct(bio_type=True)['Type'])
    expected = 9
    assert got == expected 
    
    #only seqID
    got = db.distinct(seq_name=True)
    expected = {'SeqID': {'sequence001'}}
    assert got == expected
    
    #only identifier
    got = len(db.distinct(identifier=True)['identifier'])
    expected = 21
    assert got == expected 
    
    #all three values
    got = len(db.distinct(bio_type=True,seq_name=True,identifier=True))
    expected = 3
    assert got == expected

    got = len(db.distinct(bio_type=True,seq_name=True,identifier=True)['Type'])
    expected = 9
    assert got == expected

    got = len(db.distinct(bio_type=True,seq_name=True,identifier=True)['identifier'])
    expected = 21
    assert got == expected

    got = len(db.distinct(bio_type=True,seq_name=True,identifier=True)['SeqID'])
    expected = 1
    assert got == expected




In [15]:
from cogent3.core.annotation_db import GffAnnotationDb

gff_path = "/Users/kiratalreja/Downloads/prok_NoLocusTags.gff"
db = GffAnnotationDb(gff_path)

#nothing provided
got = db.distinct()
expected = set()
assert got == expected

#only bio_type
got = len(db.distinct(bio_type=True)['Type'])
expected = 9
assert got == expected 

got = db.distinct(seq_name=True)
expected = {'SeqID': {'sequence001'}}
assert got == expected

got = len(db.distinct(identifier=True)['identifier'])
expected = 21
assert got == expected 


In [20]:
len(db.distinct(bio_type=True,seq_name=True,identifier=True))

3

In [22]:
test_distinct()

In [1]:
example = {'Type': {'CDS',
    'gene',
    'misc_feature',
    'mobile_element',
    'mobile_genetic_element',
    'rRNA',
    'rep_origin',
    'repeat_region',
    'tRNA'}}

In [5]:
len(example['Type'])

9

In [17]:
from cogent3.core.annotation_db import GffAnnotationDb

gff_path = "/Users/kiratalreja/Downloads/prok_NoLocusTags.gff"
db = GffAnnotationDb(gff_path)
db.distinct(identifier=True)

{'identifier': {'CDS0',
  'CDS1',
  'CDS2',
  'CDS3',
  'CDS4',
  'CDS5',
  'gene0',
  'gene1',
  'gene2',
  'gene3',
  'gene4',
  'gene5',
  'gener1',
  'genet1',
  'id010000',
  'id020000',
  'id040000',
  'id060000',
  'id070000',
  'rrna1',
  'trna1'}}

In [None]:
{'identifier': {'CDS0',
  'CDS1',
  'CDS2',
  'CDS3',
  'CDS4',
  'CDS5',
  'gene0',
  'gene1',
  'gene2',
  'gene3',
  'gene4',
  'gene5',
  'gener1',
  'genet1',
  'id010000',
  'id020000',
  'id040000',
  'id060000',
  'id070000',
  'rrna1',
  'trna1'}}

In [3]:
def test_make_sql_query():
    """Test that the SQLlite queries are correctly formed"""

    from cogent3 import load_unaligned_seqs
    from cogent3.core.annotation_db import GffAnnotationDb

    fasta_path = "/Users/kiratalreja/Desktop/short.fa"
    seqs = load_unaligned_seqs(fasta_path, moltype="dna")
    seq = seqs.seqs[0]

    gff_path = "/Users/kiratalreja/Downloads/prok_NoLocusTags.gff"
    db = GffAnnotationDb(gff_path)

    """only bio_type provided"""
    expected = (
           'SELECT * FROM GFF WHERE Start >= ? AND End < ? AND Type == ?', [0, 13720, 'CDS']
    )
    got = db._make_sql_query(start=0, end=len(seq), bio_type="CDS")
    assert got == expected

    """only identifier provided"""
    expected = (
        "SELECT * FROM GFF WHERE Start >= ? AND End < ? AND Attributes like ?",
        [0, 13720, "%RandomID%"],
    )
    got = db._make_sql_query(start=0, end=len(seq), identifier="RandomID")
    assert expected == got

    """both bio_type and identifier provided"""
    expected = (
        "SELECT * FROM GFF WHERE Start >= ? AND End < ? AND Type == ? AND Attributes like ?",
        [0, 13720, "CDS", "%RandomID%"],
    )
    got = db._make_sql_query(
        start=0, end=len(seq), bio_type="CDS", identifier="RandomID"
    )
    assert expected == got

    """check exception when both bio_type and identifier are missing"""
    import pytest

    with pytest.raises(ValueError):
        db._make_sql_query(start=0, end=len(seq))


In [4]:
test_make_sql_query()

In [31]:
("SELECT * FROM GFF WHERE SeqID == ? AND Start >= ? AND End < ? AND Type == ?",[0, 13720, "CDS"]) == ('SELECT * FROM GFF WHERE SeqID == ? AND Start >= ? AND End < ? AND Type == ?', [0, 13720, 'CDS'])

True

In [2]:
from cogent3.core.annotation_db import GffAnnotationDb
from cogent3.core.annotation_db import GenbankAnnotationDb
from cogent3.parse.gff import gff_parser
from cogent3 import open_
from cogent3.parse.genbank import MinimalGenbankParser
from cogent3 import load_unaligned_seqs
gff_path = "/Users/kiratalreja/Downloads/prok_NoLocusTags.gff"
genbank_path = "/Users/kiratalreja/Downloads/NC_000913.3.gb"
import unittest

def test_make_db():
    """Test if the gff3 database is correctly created
    in terms of the names of columns & the number of entries"""

    db = GffAnnotationDb(gff_path)

    """Test the name of columns created"""
    sql_columns = []
    data = db.db.execute("""SELECT * FROM GFF""")
    for column in data.description:
        sql_columns.append(column[0])
    got = sql_columns
    parsed_gff = gff_parser(gff_path)
    expected = list((list(parsed_gff)[0]).keys())[:-1]
    assert got == expected
    
    sql_columns = []
    db = GenbankAnnotationDb(genbank_path)
    data = db.db.execute("""SELECT * FROM GENBANK""")
    for column in data.description:
        sql_columns.append(column[0])
    got = sql_columns
    expected = ["LocusID","Type","Spans","Locus_Tag","Start","End","Strand"]
    assert got == expected
    
   
def test_make_sql_query():
    """Test that the SQLlite queries are correctly formed"""
    
    fasta_path = "/Users/kiratalreja/Desktop/short.fa"
    seqs = load_unaligned_seqs(fasta_path, moltype="dna")
    seq = seqs.seqs[0]

    gff_path = "/Users/kiratalreja/Downloads/prok_NoLocusTags.gff"
    db = GffAnnotationDb(gff_path)

    """only the bio_type is provided"""
    expected = ('SELECT * FROM GFF WHERE Type == ?', ['gene'])
    got = db._make_sql_query(seq_name=None,bio_type="gene",identifier=None,start=None,end=None)
    assert got == expected 

    """only the identifier is provided"""
    expected = ('SELECT * FROM GFF WHERE Attributes like ?', ['%RandomIdentifier%'])
    got = db._make_sql_query(seq_name=None,bio_type=None,identifier="RandomIdentifier",start=None,end=None)
    assert got == expected

    """both identifier and bio_type provided"""
    expected = ('SELECT * FROM GFF WHERE Type == ? AND Attributes like ?',['CDS', '%RandomIdentifier%'])
    got = db._make_sql_query(seq_name=None,bio_type="CDS",identifier="RandomIdentifier",start=None,end=None)
    assert got == expected

    """start provided along with identifier and bio_type"""
    expected = ('SELECT * FROM GFF WHERE Type == ? AND Attributes like ? AND Start >= ?',['CDS', '%RandomIdentifier%', 0])
    got = db._make_sql_query(seq_name=None,bio_type="CDS",identifier="RandomIdentifier",start=0,end=None)
    assert got == expected

    """end provided along with identifier and bio_type"""
    expected = ('SELECT * FROM GFF WHERE Type == ? AND Attributes like ? AND End < ?',['CDS', '%RandomIdentifier%', 5000])
    got = db._make_sql_query(seq_name=None,bio_type="CDS",identifier="RandomIdentifier",start=None,end=5000)
    assert got == expected

    """start and end provided along with identifier and bio_type"""
    expected = ('SELECT * FROM GFF WHERE Type == ? AND Attributes like ? AND Start >= ? AND End < ?',['CDS', '%RandomIdentifier%', 0, 5000])
    got = db._make_sql_query(seq_name=None,bio_type="CDS",identifier="RandomIdentifier",start=0,end=5000)
    assert got == expected

    """all five attributes provided"""
    expected = ('SELECT * FROM GFF WHERE SeqID == ? AND Type == ? AND Attributes like ? AND Start >= ? AND End < ?', ['1', 'CDS', '%RandomIdentifier%', 0, 5000])
    got = db._make_sql_query(seq_name="1",bio_type="CDS",identifier="RandomIdentifier",start=0,end=5000)
    assert got == expected

    """check exception when both bio_type and identifier are missing"""
    import pytest

    with pytest.raises(ValueError):
        db._make_sql_query(seq_name=None,bio_type=None,identifier=None,start=None,end=None)

    """check exception when both bio_type and identifier are missing, even if other attributes"""

    with pytest.raises(ValueError):
        db._make_sql_query(seq_name="1",bio_type=None,identifier=None,start=0,end=1000)

    """check exception when only seq_name is provided"""

    with pytest.raises(ValueError):
        db._make_sql_query(seq_name="1",bio_type=None,identifier=None,start=None,end=None)

    """check exception when only start is provided"""

    with pytest.raises(ValueError):
        db._make_sql_query(seq_name=None,bio_type=None,identifier=None,start=0,end=None)
    
    """check exception when only end is provided"""

    with pytest.raises(ValueError):
        db._make_sql_query(seq_name=None,bio_type=None,identifier=None,start=None,end=1000)

def test_populate_from_file():
    """Test that the database is populated with the correct
    number of columns"""
    from cogent3.parse.gff import gff_parser

    gff_path = "/Users/kiratalreja/Downloads/prok_NoLocusTags.gff"
    db = GffAnnotationDb(gff_path)

    """test the number of rows populated"""
    db.db.execute(""" SELECT * FROM GFF """)
    got = len(list(db.db.fetchall()))
    parsed_gff = gff_parser(gff_path)
    expected = len(list(parsed_gff))
    assert got == expected

def test_db_query():

    """Test that the SQL query returns the correct
    number of rows for different combinations of bio_type/identifier"""

    from cogent3 import load_unaligned_seqs

    fasta_path = "/Users/kiratalreja/Desktop/short.fa"
    seqs = load_unaligned_seqs(fasta_path, moltype="dna")
    seq = seqs.seqs[0]

    gff_path = "/Users/kiratalreja/Downloads/prok_NoLocusTags.gff"
    db = GffAnnotationDb(gff_path)

    """multiple hits for the same identifier"""
    got = len(db.db_query(start=0, end=len(seq), identifier="CDS4"))
    expected = 2
    assert got == expected

    """query for an ID and recieve the children to the ID along with the parent"""
    got = len(db.db_query(start=0, end=len(seq), identifier="gene4"))
    expected = 3
    assert got == expected

    """query for an ID, with no children"""
    got = len(db.db_query(start=0, end=len(seq), identifier="trna1"))
    expected = 1
    assert got == expected

    """query for a bio type, with multiple hits"""
    got = len(db.db_query(start=0, end=len(seq), bio_type="gene"))
    expected = 8
    assert got == expected

    """query for an ID & a bio type, with a single hit"""
    got = len(db.db_query(start=0, end=len(seq), identifier="gene0", bio_type="CDS"))
    expected = 1
    assert got == expected

    """query for an ID & a bio type, with multiple hits"""
    got = len(db.db_query(start=0, end=len(seq), identifier="CDS4", bio_type="CDS"))
    expected = 2
    assert got == expected

def test_find_records():

    """Test that the coordinates the grouped correctly, and features
    formed properly"""

    from cogent3 import load_unaligned_seqs

    gff_path = "/Users/kiratalreja/Downloads/prok_NoLocusTags.gff"
    db = GffAnnotationDb(gff_path)
    fasta_path = "/Users/kiratalreja/Desktop/short.fa"
    seqs = load_unaligned_seqs(fasta_path, moltype="dna")
    seq = seqs.seqs[0]

    """combine rows with the same ID"""
    got = len(db.find_records(start=0, end=len(seq), identifier="CDS4"))
    expected = 1
    assert got == expected

    """combine rows with the same ID, when bio_type given"""
    got = len(db.find_records(start=0, end=len(seq), bio_type="CDS"))
    expected = 6
    assert got == expected

    """combine rows with the same ID, when children rows are fetched with the parent"""
    got = len(db.find_records(start=0, end=len(seq), identifier="gene4"))
    expected = 2
    assert got == expected

    """unique ID, single row returned"""
    got = len(db.find_records(start=0, end=len(seq), identifier="id020000"))
    expected = 1
    assert got == expected

if __name__ == "__main__":
    test_db_query()
    test_find_records()
    test_make_sql_query()
    test_populate_from_file()



In [7]:
test_make_sql_query()

In [3]:
expected = ('SELECT * FROM GFF WHERE Type == ? AND Attributes like ? AND Start >= ?',['CDS', '%RandomIdentifier%', 0])

In [None]:
def test_populate_from_file():
    """Test that the database is populated with the correct
    number of columns"""
    from cogent3.parse.gff import gff_parser

    gff_path = "/Users/kiratalreja/Downloads/prok_NoLocusTags.gff"
    db = GffAnnotationDb(gff_path)

    """test the number of rows populated"""
    db.db.execute(""" SELECT * FROM GFF """)
    got = len(list(db.db.fetchall()))
    parsed_gff = gff_parser(gff_path)
    expected = len(list(parsed_gff))
    assert got == expected

In [16]:
def test_populate_from_file_genbank():
    """Test that the database is populated with the correct
    number of columns"""

    db = GenbankAnnotationDb(genbank_path)

    """test the number of rows populated"""
    db.db.execute(""" SELECT * FROM Genbank """)
    got = len(list(db.db.fetchall()))
    
    with open_(genbank_path) as infile:
      data = list(MinimalGenbankParser(infile.readlines()))

    record = data[0]
    features = record['features']
    expected = 0
    for feature in features:
        if "locus_tag" not in list(feature.keys()):
            continue
        expected +=1

    assert got == expected        



    #parsed_gff = gff_parser(gff_path)
    #expected = len(list(parsed_gff))
    #assert got == expected

In [17]:
test_populate_from_file_genbank()

In [10]:
with open_(genbank_path) as infile:
     data = list(MinimalGenbankParser(infile.readlines()))

record = data[0]
features = record['features']

In [14]:
expected = 0
for feature in features:
    if "locus_tag" not in list(feature.keys()):
        continue
    expected +=1

In [15]:
count

9198

In [36]:
db = GenbankAnnotationDb(genbank_path)

In [19]:
with open_(genbank_path) as infile:
     data = list(MinimalGenbankParser(infile.readlines()))

record = data[0]

In [26]:
record['locus']

'NC_000913'

In [21]:
len(record['sequence'])

4641652

In [35]:
len(db.db_query(end = len(record['sequence'])/2 ,bio_type='CDS'))

2199

In [28]:
db.distinct(identifier=True)

{'identifier': {'b4515',
  'b1807',
  'b1310',
  'b4584',
  'b4348',
  'b0287',
  'b4665',
  'b3179',
  'b0799',
  'b1488',
  'b4755',
  'b0358',
  'b1058',
  'b2436',
  'b4372',
  'b1752',
  'b3302',
  'b3747',
  'b1870',
  'b4096',
  'b1677',
  'b1645',
  'b2893',
  'b3547',
  'b3745',
  'b2869',
  'b4512',
  'b4604',
  'b2358',
  'b2948',
  'b0935',
  'b2679',
  'b0267',
  'b2586',
  'b0180',
  'b3365',
  'b3082',
  'b2652',
  'b0120',
  'b3507',
  'b2277',
  'b3998',
  'b2252',
  'b3085',
  'b2667',
  'b1667',
  'b4770',
  'b1225',
  'b2879',
  'b0709',
  'b3535',
  'b4621',
  'b4209',
  'b1224',
  'b2377',
  'b0078',
  'b1327',
  'b4336',
  'b0333',
  'b3870',
  'b0317',
  'b1826',
  'b0453',
  'b2742',
  'b1317',
  'b0280',
  'b0415',
  'b0559',
  'b2215',
  'b4236',
  'b2871',
  'b1008',
  'b2333',
  'b4450',
  'b1596',
  'b1157',
  'b2830',
  'b3367',
  'b0875',
  'b0065',
  'b4644',
  'b0084',
  'b0764',
  'b3736',
  'b0809',
  'b0503',
  'b4654',
  'b0596',
  'b2026',
  'b240

In [46]:
from cogent3.core.annotation_db import _fetch_from_features

In [51]:
_fetch_from_features(record['features'][1])

['gene', '[[189, 255]]', 'b0001', 189, 255, 1]

In [52]:
def test_fetch_from_feature():
    from cogent3.core.annotation_db import _fetch_from_features
    with open_(genbank_path) as infile:
       data = list(MinimalGenbankParser(infile.readlines()))

    record = data[0]
    features = record['features']
    got = _fetch_from_features(features[1])
    expected = ['gene', '[[189, 255]]', 'b0001', 189, 255, 1]
    assert got == expected

    def return_type(values):
        types = []
        for v in values:
            types.append(type(v))
        return types

    got = return_type(_fetch_from_features(features[1]))
    expected = [str,str,str,int,int,int]
    assert got == expected





    


In [53]:
test_fetch_from_feature()

In [54]:
db = GenbankAnnotationDb(genbank_path)

In [64]:
db.distinct(seq_name=True)

{'LocusID': {'NC_000913'}}

In [63]:
len(db.distinct(bio_type=True,identifier=True, seq_name=True))

{'LocusID': {'NC_000913'},
 'Type': {'CDS', 'gene', 'misc_feature', 'ncRNA', 'rRNA', 'tRNA'},
 'identifier': {'b4515',
  'b1807',
  'b1310',
  'b4584',
  'b4348',
  'b0287',
  'b4665',
  'b3179',
  'b0799',
  'b1488',
  'b4755',
  'b0358',
  'b1058',
  'b2436',
  'b4372',
  'b1752',
  'b3302',
  'b3747',
  'b1870',
  'b4096',
  'b1677',
  'b1645',
  'b2893',
  'b3547',
  'b3745',
  'b2869',
  'b4512',
  'b4604',
  'b2358',
  'b2948',
  'b0935',
  'b2679',
  'b0267',
  'b2586',
  'b0180',
  'b3365',
  'b3082',
  'b2652',
  'b0120',
  'b3507',
  'b2277',
  'b3998',
  'b2252',
  'b3085',
  'b2667',
  'b1667',
  'b4770',
  'b1225',
  'b2879',
  'b0709',
  'b3535',
  'b4621',
  'b4209',
  'b1224',
  'b2377',
  'b0078',
  'b1327',
  'b4336',
  'b0333',
  'b3870',
  'b0317',
  'b1826',
  'b0453',
  'b2742',
  'b1317',
  'b0280',
  'b0415',
  'b0559',
  'b2215',
  'b4236',
  'b2871',
  'b1008',
  'b2333',
  'b4450',
  'b1596',
  'b1157',
  'b2830',
  'b3367',
  'b0875',
  'b0065',
  'b4644',
 

In [74]:
db.find_records(start=0,end=5000)

ValueError: no arguments provided