### Here's how my database is set up:
I've got a table of genes, a table of enzymes, and a table of pathways. Depending on how much time I get later, I might also make some associative tables linking them.

### Gene Table
-- Planned schema. INTEGER PRIMARY KEY (not INT) makes id an alias for SQLite's auto-assigned rowid.
CREATE TABLE genes (id INTEGER PRIMARY KEY, name TEXT, description TEXT, organism TEXT,
chromosome TEXT, start INT, end INT, strand VARCHAR(1), sequence TEXT, translated TEXT);

**Start with name, description, organism, and nucleotide sequence. Additional fields might include chromosome, start and end position, strand, and translated sequence. For eukaryotes, the nucleotide sequence should be the spliced mRNA and the coordinates should span the entire locus.**

### Enzyme Table
-- Planned schema. EC is TEXT because Enzyme Commission numbers are dotted (e.g. 2.3.1.12), not integers.
-- INTEGER PRIMARY KEY (not INT) makes id an alias for SQLite's auto-assigned rowid.
CREATE TABLE enzymes (id INTEGER PRIMARY KEY, name TEXT, function TEXT,
EC TEXT, pathway TEXT);
**EC = Enzyme Commission number**
**Start with name, function, and Enzyme Commission (EC) number. Multiple genes can encode enzymes that perform the same function, so there ought to be fewer enzymes than genes.**

### Pathway Table
-- Planned schema. INTEGER PRIMARY KEY (not INT) makes id an alias for SQLite's auto-assigned rowid.
CREATE TABLE pathways (id INTEGER PRIMARY KEY, name TEXT, description TEXT);

In [25]:
#This segment connects to the metabolism database and readies it to receive input
import sqlite3
from Bio import Entrez
from Bio import SeqIO
# Contact address passed along with every Entrez request made below.
Entrez.email = "hverdonk@berkeley.edu"

# One shared connection and cursor on the metabolism database file;
# the later cells reuse `conn` and `c` for all reads and writes.
conn = sqlite3.connect("metabolism.db")
c = conn.cursor()

In [48]:
# Creates the empty Gene Table.
# IF NOT EXISTS makes the cell safe to re-run: a plain CREATE TABLE raises
# sqlite3.OperationalError once the table is already in metabolism.db.
c.execute("""CREATE TABLE IF NOT EXISTS genes (name TEXT, 
                                description TEXT, 
                                organism TEXT,  
                                nt_sequence TEXT)""")
conn.commit()

In [112]:
# Recreates the Enzyme Table from scratch, discarding any rows stored earlier.
# IF EXISTS lets this cell run on a fresh database too: a plain DROP TABLE
# raises sqlite3.OperationalError when the table has not been created yet.
c.execute("""DROP TABLE IF EXISTS enzymes""")
c.execute("""CREATE TABLE enzymes (name TEXT, 
                                    description TEXT, 
                                    EC TEXT, 
                                    pathway TEXT)""")
conn.commit()

In [33]:
# Creates the empty Pathway Table.
# IF NOT EXISTS makes the cell safe to re-run: a plain CREATE TABLE raises
# sqlite3.OperationalError once the table is already in metabolism.db.
c.execute("""CREATE TABLE IF NOT EXISTS pathways (name TEXT, 
                                    description TEXT)""")
conn.commit()

In [34]:
# Fill the Pathway Table with the three pathways covered by this notebook.
# The column list is spelled out so the statement keeps working if columns
# are added or reordered (e.g. a leading id column).
# Note: re-running this cell inserts the same three rows again.
c.execute("""INSERT INTO pathways (name, description)
                  VALUES ('glycolysis', 
                            'converts glucose into pyruvate to generate ATP and NADH'),
                            ('citric acid cycle', 
                            'the complete oxidation of glucose derivatives to carbon dioxide to produce ATP'), 
                            ('pentose phosphate pathway', 
                            'converts glucose into pentose to generate NADPH and ribose 5-phosphate');""")
conn.commit()

In [35]:
# Dump every row of the pathways table as a list of tuples
# (execute() hands back the cursor, so the fetch can be chained onto it).
print(c.execute("SELECT * FROM pathways").fetchall())

[('glycolysis', 'converts glucose into pyruvate to generate ATP and NADH'), ('citric acid cycle', 'the complete oxidation of glucose derivatives to carbon dioxide to produce ATP'), ('pentose phosphate pathway', 'converts glucose into pentose to generate NADPH and ribose 5-phosphate')]


In [110]:
# Finds accession numbers for nucleotide records that match our search term.
# TODO (from the original note): for every enzyme, add which pathway it's part of.
handle = Entrez.esearch(db='nucleotide',
                        term='embl accession FITM01000007.1',
                        sort='relevance',
                        idtype='acc')
accession_ids = Entrez.read(handle)['IdList']
handle.close()

# Fetches and prints the GenBank record for each accession number found above.
for i in accession_ids:
    handle = Entrez.efetch(db='nucleotide', id=i, rettype='gb', retmode='text')
    record = SeqIO.read(handle, 'gb')
    handle.close()
    # Convert to text before appending the blank lines: `SeqRecord + str` is
    # sequence concatenation in Biopython, so the previous
    # `SeqIO.read(...) + '\n'` printed a record with newlines added to its sequence.
    print(str(record) + '\n' + '\n')

In [113]:
# Entrez search terms (EC numbers) used to look up enzymes for each pathway.
citric_acid_terms = ['EC 2.3.1.12', 'EC 1.2.4.1', 'EC 1.8.1.4', 'EC 4.1.1.49', 'EC 4.1.1.32']
glycolysis_terms = ['EC 5.4.2.2', 'EC 5.3.1.9', 'EC 4.2.1.11', 'EC 2.7.2.3']
# NOTE(review): some of these (e.g. EC 2.7.1.11, EC 4.1.2.13) look like
# glycolysis enzymes rather than pentose phosphate ones; confirm the assignment.
pentose_phosphate_terms = ['EC 4.3.1.9', 'EC 2.7.1.11', 'EC 4.1.2.13', 'EC 3.1.3.11']
# Nucleotide accession numbers collected by fill_enzyme_table (one per stored enzyme hit).
genes=[]


def fill_enzyme_table(cycle_name, cycle_list, max_hits=3):
    """Fill the enzymes table with the top search hits for each EC term.

    For every term in ``cycle_list`` the Entrez ``protein`` database is
    searched (sorted by relevance) and up to ``max_hits`` hits are fetched as
    GenBank records. Each hit is inserted into ``enzymes`` as
    (record name, record description, search term, ``cycle_name``), and the
    accession number found in the record's ``db_source`` annotation is
    appended to the module-level ``genes`` list.

    Only a few hits per term are stored because adding every search result
    kept timing out.

    Args:
        cycle_name: pathway name written to the ``pathway`` column.
        cycle_list: iterable of Entrez search terms (EC numbers).
        max_hits: maximum number of hits stored per term (default 3).

    Terms with no hits are skipped. Terms with fewer than ``max_hits`` hits
    store what is available instead of raising IndexError. The inserts are
    not committed here; call ``conn.commit()`` afterwards.
    """
    embl_prefix = "embl accession "
    for t in cycle_list:
        handle = Entrez.esearch(db='protein',
                        term=t,
                        sort='relevance',
                        idtype='acc')
        results = Entrez.read(handle)['IdList']
        handle.close()
        # Slicing (rather than indexing 0..2) copes with short or empty hit lists.
        for next_result in results[:max_hits]:
            handle = Entrez.efetch(db='protein', id=next_result, rettype='gb', retmode='text')
            temp = SeqIO.read(handle, 'gb')
            handle.close()
            # Only strip the prefix when it is really there; records without an
            # "embl accession ..." db_source contribute an enzyme row but no gene,
            # instead of a mangled accession string or a KeyError.
            db_source = temp.annotations.get('db_source', '')
            if db_source.startswith(embl_prefix):
                genes.append(db_source[len(embl_prefix):])
            c.execute("INSERT INTO enzymes (name, description, EC, pathway) VALUES (?, ?, ?, ?)",
                      (temp.name, temp.description, t, cycle_name))

# Populate the enzymes table (and the `genes` list) one pathway at a time.
fill_enzyme_table("citric acid cycle", citric_acid_terms)
fill_enzyme_table("glycolysis", glycolysis_terms)
fill_enzyme_table("pentose phosphate pathway", pentose_phosphate_terms)
# Persist the inserted rows, as the other write cells do; without a commit
# they are lost when the connection is closed.
conn.commit()


In [114]:
# Show every row currently stored in the enzymes table.
enzyme_rows = c.execute("SELECT * FROM enzymes").fetchall()
print(enzyme_rows)

[('CBW75502', 'Dihydrolipoamide acetyltransferase component of pyruvate dehydrogenase complex (EC 2.3.1.12) [Paraburkholderia rhizoxinica HKI 454]', 'EC 2.3.1.12', 'citric acid cycle'), ('SAY38694', 'Dihydrolipoamide acetyltransferase component of pyruvate dehydrogenase complex (EC 2.3.1.12) [Candidatus Synechococcus spongiarum]', 'EC 2.3.1.12', 'citric acid cycle'), ('CTQ99038', 'Dihydrolipoamide acetyltransferase component of pyruvate dehydrogenase complex (EC 2.3.1.12) [Kibdelosporangium sp. MJ126-NF4]', 'EC 2.3.1.12', 'citric acid cycle'), ('CBW75503', 'Pyruvate dehydrogenase E1 component (EC 1.2.4.1) [Paraburkholderia rhizoxinica HKI 454]', 'EC 1.2.4.1', 'citric acid cycle'), ('CZB10753', 'Pyruvate dehydrogenase E1 component beta subunit (EC 1.2.4.1) [Candidatus Synechococcus spongiarum]', 'EC 1.2.4.1', 'citric acid cycle'), ('CZB19904', 'Pyruvate dehydrogenase E1 component alpha subunit (EC 1.2.4.1) [Candidatus Synechococcus spongiarum]', 'EC 1.2.4.1', 'citric acid cycle'), ('CBW

In [115]:
# Show the accession numbers collected in `genes` by fill_enzyme_table (may contain repeats).
print(genes)

['FR687359.1', 'FITM01000071.1', 'LN877229.1', 'FR687359.1', 'FITM01000007.1', 'FITM01000129.1', 'FR687359.1', 'FR687359.1', 'FITM01000101.1', 'HG966617.1', 'FO203355.1', 'AM236080.1', 'FR687359.1', 'LN877229.1', 'LN877229.1']
