### Here's how my database is set up:
I've got a table of genes, a table of enzymes, and a table of pathways. Depending on how much time I get later, I might also make some associative tables linking them.

### Gene Table
CREATE TABLE genes (id INT PRIMARY KEY ASC, name TEXT, description TEXT, organism TEXT,
chromosome TEXT, start INT, end INT, strand VARCHAR(1), sequence TEXT, translated TEXT)

**Start with name, description, organism, and nucleotide sequence. Additional fields might include chromosome, start and end position, strand, and translated sequence. For eukaryotes, the nucleotide sequence should be the spliced mRNA and the coordinates should span the entire locus.**

### Enzyme Table
CREATE TABLE enzymes (id INT PRIMARY KEY ASC, name TEXT, function TEXT, 
EC INT, pathway TEXT)
**EC = Enzyme Commission number**
**name, function, and enzyme commission (EC) number. Multiple genes encode enzymes that perform the same function, so there ought to be fewer enzymes than genes.**

### Pathway Table
CREATE TABLE pathways (id INT PRIMARY KEY ASC, name TEXT, description TEXT)

In [25]:
#This segment connects to the metabolism database and readies it to receive input
import sqlite3
from Bio import Entrez
from Bio import SeqIO
# Open the SQLite database file and grab a cursor for running statements.
conn = sqlite3.connect('metabolism.db')
c = conn.cursor()
# Contact address supplied to Biopython's Entrez module for its NCBI requests.
Entrez.email = 'hverdonk@berkeley.edu'

In [144]:
#Creates empty Enzyme Table
# IF EXISTS lets this cell run on a fresh database, where there is no
# enzymes table to drop yet (a plain DROP TABLE raises OperationalError).
c.execute("""DROP TABLE IF EXISTS enzymes""")
c.execute("""CREATE TABLE enzymes (name TEXT, 
                                    description TEXT, 
                                    EC TEXT, 
                                    pathway TEXT)""")
conn.commit()

In [48]:
#Creates empty Gene Table
# IF EXISTS lets this cell run on a fresh database, where there is no
# genes table to drop yet (a plain DROP TABLE raises OperationalError).
c.execute("""DROP TABLE IF EXISTS genes""")
c.execute("""CREATE TABLE genes (name TEXT, 
                                description TEXT, 
                                organism TEXT,  
                                nt_sequence TEXT)""")
conn.commit()

In [33]:
#Creates empty Pathway Table
# Drop any previous copy first so the cell can be re-run; without this a
# second run fails because the pathways table already exists.
c.execute("""DROP TABLE IF EXISTS pathways""")
c.execute("""CREATE TABLE pathways (name TEXT, 
                                    description TEXT)""")
conn.commit()

In [34]:
#Fill the Pathway Table
# One (name, description) tuple per pathway row.
pathway_rows = [
    ('glycolysis',
     'converts glucose into pyruvate to generate ATP and NADH'),
    ('citric acid cycle',
     'the complete oxidation of glucose derivatives to carbon dioxide to produce ATP'),
    ('pentose phosphate pathway',
     'converts glucose into pentose to generate NADPH and ribose 5-phosphate'),
]
# Parameterized executemany keeps the data out of the SQL text and does not
# depend on SQLite's multi-row VALUES syntax.
c.executemany("INSERT INTO pathways VALUES (?, ?)", pathway_rows)
conn.commit()

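The next cell searches the Entrez protein database once per organism and EC number, and stores the first hit for each search in the enzymes table. It also saves each hit's source annotation in a list so the matching genes can be looked up afterwards. (TODO: expand this explanation.)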

In [145]:
#Fills enzymes table

# Every search term has the form "<organism>[ORGN] EC <number>", and each
# EC number is queried once per organism, in this order.
_ORGANISMS = ('homo sapiens', 'drosophila', 'Escherichia coli')


def _entrez_terms(ec_numbers):
    """Build the per-organism search terms for a sequence of EC numbers.

    Each entry is either a single EC number string used for every organism,
    or a tuple giving one EC number per organism (same order as _ORGANISMS).
    """
    terms = []
    for entry in ec_numbers:
        if isinstance(entry, str):
            per_organism = (entry,) * len(_ORGANISMS)
        else:
            per_organism = entry
        terms.extend('{}[ORGN] EC {}'.format(organism, ec)
                     for organism, ec in zip(_ORGANISMS, per_organism))
    return terms


citric_acid_terms = _entrez_terms([
    '2.3.1.12',
    '1.2.4.1',
    '1.8.1.4',
    # The Escherichia coli query in this group uses a different EC number.
    ('4.1.1.32', '4.1.1.32', '4.1.1.49'),
])
glycolysis_terms = _entrez_terms(['5.4.2.2', '5.3.1.9', '4.2.1.11', '2.7.2.3'])
pentose_phosphate_terms = _entrez_terms(['4.3.1.9', '2.7.1.11', '4.1.2.13', '3.1.3.11'])

# Filled by fill_enzyme_table with one source annotation per enzyme hit.
genes = []

def fill_enzyme_table(cycle_name, cycle_list):
    """Fill the enzymes table with the first Entrez protein hit for each term.

    For every search term, the protein database is queried (sorted by
    relevance) and the first result is fetched as a GenBank record. The
    record's ``db_source`` annotation is appended to the module-level
    ``genes`` list, and one row is inserted into the enzymes table. Terms
    with no search results are skipped. Nothing is committed here.

    Args:
        cycle_name: Pathway name stored in the ``pathway`` column.
        cycle_list: Iterable of Entrez search terms.

    Raises:
        KeyError: If a fetched record has no ``db_source`` annotation.
    """
    for term in cycle_list:
        search_handle = Entrez.esearch(db='protein',
                                       term=term,
                                       sort='relevance',
                                       idtype='acc')
        try:
            results = Entrez.read(search_handle)['IdList']
        finally:
            # Release the connection even if parsing the reply fails.
            search_handle.close()
        if not results:
            continue

        fetch_handle = Entrez.efetch(db='protein', id=results[0],
                                     rettype='gb', retmode='text')
        try:
            record = SeqIO.read(fetch_handle, 'gb')
        finally:
            fetch_handle.close()

        genes.append(record.annotations['db_source'])
        # NOTE(review): the EC column receives the search term exactly as
        # passed in, not an EC number extracted from it -- confirm intended.
        c.execute("INSERT INTO enzymes VALUES (?, ?, ?, ?)",
                  (record.name, record.description, term, cycle_name))

fill_enzyme_table("citric acid cycle", citric_acid_terms)
fill_enzyme_table("glycolysis", glycolysis_terms)
fill_enzyme_table("pentose phosphate pathway", pentose_phosphate_terms)
# Persist the inserted enzyme rows; without a commit they are lost when the
# connection is closed.
conn.commit()


In order to get the gene accession numbers out of the enzyme entries, I had to grab all of the text surrounding each number. This next section strips away the unnecessary text and puts the accession numbers in a new list.

In [166]:
# Keep only the last whitespace-separated token of each collected source
# string; that token is the gene accession number.
gene_acc_nums = [source_text.split()[-1] for source_text in genes]

print(gene_acc_nums)

['AK299562.1', 'QDKR01000002.1', 'M24848.1', 'LOJL01000004.1', 'AK300077.1', 'LOJL01000003.1', 'AK316206.1', 'LOJL01000036.1', 'AK300100.1', 'QDKR01000003.1', 'AAB36062.1', 'LOJL01000043.1', 'M14328.1', 'AAB26877.1', 'LOJL01000011.1', 'AK301740.1', 'LOJL01000025.1', 'AK303681.1', 'LOJM01000004.1', 'M21190.1', 'LOJM01000004.1', 'M19922.1', 'LOJL01000003.1']


In [156]:
def fill_gene_table(genes_list):
    """Fill the genes table with the first Entrez nucleotide hit for each term.

    For every search term, the nucleotide database is queried (sorted by
    relevance) and the first result is fetched as a GenBank record. One row
    holding the record's name, description, organism annotation and sequence
    is inserted into the genes table. Terms with no search results are
    skipped. Nothing is committed here.

    Args:
        genes_list: Iterable of Entrez search terms.

    Raises:
        KeyError: If a fetched record has no ``organism`` annotation.
    """
    for term in genes_list:
        search_handle = Entrez.esearch(db='nucleotide',
                                       term=term,
                                       sort='relevance',
                                       idtype='acc')
        try:
            results = Entrez.read(search_handle)['IdList']
        finally:
            # Release the connection even if parsing the reply fails.
            search_handle.close()
        if not results:
            continue

        fetch_handle = Entrez.efetch(db='nucleotide', id=results[0],
                                     rettype='gb', retmode='text')
        try:
            record = SeqIO.read(fetch_handle, 'gb')
        finally:
            fetch_handle.close()

        c.execute("INSERT INTO genes VALUES (?, ?, ?, ?)",
                  (record.name, record.description,
                   record.annotations['organism'], str(record.seq)))


fill_gene_table(gene_acc_nums)
# Persist the inserted gene rows; without a commit they are lost when the
# connection is closed.
conn.commit()

In [160]:
#how to print a whole table
# Run the query, pull every row into a list, then show the list.
pathway_rows = c.execute("""SELECT * FROM pathways""").fetchall()
print(pathway_rows)

[('glycolysis', 'converts glucose into pyruvate to generate ATP and NADH'), ('citric acid cycle', 'the complete oxidation of glucose derivatives to carbon dioxide to produce ATP'), ('pentose phosphate pathway', 'converts glucose into pentose to generate NADPH and ribose 5-phosphate')]
