In [73]:
import sqlite3 as sql

# 1. Create Database and 3 fundamental tables

* `sql.connect()` opens the SQLite database file at the given path, creating the file if it does not exist yet.
* Use `.cursor()` on the connection to create a cursor.
* Use `cursor_name.execute()` to execute SQL statements.

In [74]:
# Open the SQLite database file in the current working directory
# (sqlite3 creates the file when it does not exist yet).
con = sql.connect("Enzyme_in_Pathway.db")
# Cursor shared by the following cells to execute SQL statements on `con`.
c = con.cursor()

In [75]:
# Create the three entity tables: genes, pathway and enzyme.
# IF NOT EXISTS keeps the cell re-runnable: with a plain CREATE TABLE, running
# it again on an existing database file raises
# sqlite3.OperationalError ("table genes already exists").
c.execute("""CREATE TABLE IF NOT EXISTS genes(id TEXT,
                                               name TEXT,
                                               description TEXT,
                                               organism TEXT,
                                               nucleotide_sequence TEXT,
                                               chromosome TEXT,
                                               start INT,
                                               end INT,
                                               strand VARCHAR(1),
                                               translated_sequence TEXT);""")

c.execute("""CREATE TABLE IF NOT EXISTS pathway(id TEXT,
                                                 name TEXT,
                                                 description TEXT);""")

c.execute("""CREATE TABLE IF NOT EXISTS enzyme(id TEXT,
                                                name TEXT,
                                                function TEXT,
                                                EC_number TEXT);""")



<sqlite3.Cursor at 0x7f5c3c77e880>

# 2. Create Associative Tables

1. `enzymes_in_pathways_with_order` associative table: this table indicates which pathway each enzyme belongs to; an additional column records the order of the enzyme within that pathway.
2. `genes_to_enzymes` associative table: this table links gene IDs to enzyme IDs.



In [76]:
# Create the two associative (link) tables.
# IF NOT EXISTS keeps the cell re-runnable; a plain CREATE TABLE raises
# sqlite3.OperationalError once the tables are already in the database file.
#
# NOTE(review): the id columns below are declared INT while genes.id, enzyme.id
# and pathway.id are declared TEXT, and the first table is keyed on gene_id
# although the accompanying text describes enzymes in pathways -- confirm the
# intended schema before relying on joins.
c.execute("""CREATE TABLE IF NOT EXISTS enzymes_in_pathways_with_order(gene_id INT,
                                                                        pathway_id INT,
                                                                        order_in_pathway INT);""")

c.execute("""CREATE TABLE IF NOT EXISTS genes_to_enzymes(gene_id INT,
                                                          enzyme_id INT);""")



<sqlite3.Cursor at 0x7f5c3c77e880>

# 3. Insert information about the 3 pathways into the pathway table

## 1. Go to KEGG and search for the 3 pathways (with a browser).
* website address of glycolysis: https://www.genome.jp/dbget-bin/www_bget?map00010

* website address of the citrate cycle : https://www.genome.jp/dbget-bin/www_bget?pathway+map00020

* website address of the pentose phosphate : https://www.genome.jp/dbget-bin/www_bget?map00030

## 2. Insert each pathway's ID (KEGG format), name and description into the pathway table.


In [77]:
# One (id, name, description) tuple per pathway row; the text values are the
# ones used for the pathway table in this notebook.
pathway_rows = [
    ("map00010", "Glycolysis", "Glycolysis is the process of converting glucose into pyruvate and generating small amounts of ATP (energy) and NADH (reducing power). It is a central pathway that produces important precursor metabolites: six-carbon compounds of glucose-6P and fructose-6P and three-carbon compounds of glycerone-P, glyceraldehyde-3P, glycerate-3P, phosphoenolpyruvate, and pyruvate [MD:M00001]. Acetyl-CoA, another important precursor metabolite, is produced by oxidative decarboxylation of pyruvate [MD:M00307]. When the enzyme genes of this pathway are examined in completely sequenced genomes, the reaction steps of three-carbon compounds from glycerone-P to pyruvate form a conserved core module [MD:M00002], which is found in almost all organisms and which sometimes contains operon structures in bacterial genomes. Gluconeogenesis is a synthesis pathway of glucose from noncarbohydrate precursors. It is essentially a reversal of glycolysis with minor variations of alternative paths [MD:M00003]."),
    ("map00020", "Citrate cycle", "The citrate cycle (TCA cycle, Krebs cycle) is an important aerobic pathway for the final steps of the oxidation of carbohydrates and fatty acids. The cycle starts with acetyl-CoA, the activated form of acetate, derived from glycolysis and pyruvate oxidation for carbohydrates and from beta oxidation of fatty acids. The two-carbon acetyl group in acetyl-CoA is transferred to the four-carbon compound of oxaloacetate to form the six-carbon compound of citrate. In a series of reactions two carbons in citrate are oxidized to CO2 and the reaction pathway supplies NADH for use in the oxidative phosphorylation and other metabolic processes. The pathway also supplies important precursor metabolites including 2-oxoglutarate. At the end of the cycle the remaining four-carbon part is transformed back to oxaloacetate. According to the genome sequence data, many organisms seem to lack genes for the full cycle [MD:M00009], but contain genes for specific segments [MD:M00010 M00011]."),
    ("map00030", "Pentose phosphate pathway", "The pentose phosphate pathway is a process of glucose turnover that produces NADPH as reducing equivalents and pentoses as essential parts of nucleotides. There are two different phases in the pathway. One is irreversible oxidative phase in which glucose-6P is converted to ribulose-5P by oxidative decarboxylation, and NADPH is generated [MD:M00006]. The other is reversible non-oxidative phase in which phosphorylated sugars are interconverted to generate xylulose-5P, ribulose-5P, and ribose-5P [MD:M00007]. Phosphoribosyl pyrophosphate (PRPP) formed from ribose-5P [MD:M00005] is an activated compound used in the biosynthesis of histidine and purine/pyrimidine nucleotides. This pathway map also shows the Entner-Doudoroff pathway where 6-P-gluconate is dehydrated and then cleaved into pyruvate and glyceraldehyde-3P [MD:M00008]."),
]

# Same parameterised statement for every row, executed in list order.
insert_pathway_sql = """INSERT INTO pathway(id, name, description) VALUES(?,?,?)"""
for pathway_row in pathway_rows:
    c.execute(insert_pathway_sql, pathway_row)

# Persist the three rows.
con.commit()


# 4. Insert Enzymes into table

In [78]:
from Bio.KEGG import REST
from Bio.KEGG.KGML import KGML_parser as ks
from Bio.KEGG import Enzyme

def get_enzymes(enzyme):
    """Fetch one entry from KEGG and parse it as an enzyme record.

    Parameters
    ----------
    enzyme : str
        Identifier handed unchanged to ``REST.kegg_get`` (this notebook
        passes EC numbers such as ``"1.1.1.1"``).

    Returns
    -------
    The record object produced by ``Enzyme.read``.
    """
    request = REST.kegg_get(enzyme)
    try:
        return Enzyme.read(request)
    finally:
        # Release the response handle even when parsing raises.
        request.close()

# Fetch and parse the KEGG entry "1.1.1.1" once when this cell runs
# (get_enzymes issues the request through REST.kegg_get).
record = get_enzymes("1.1.1.1")

def insert_enzymes(record):
    """Insert one parsed enzyme record into the ``enzyme`` table and commit.

    Parameters
    ----------
    record
        Object exposing ``name``, ``comment`` and ``entry`` (as returned by
        ``get_enzymes``). ``name`` is a sequence of names -- ``generate``
        indexes ``record.name[0]``; ``comment`` is treated the same way
        whenever it is a list or tuple.

    Notes
    -----
    Writes through the notebook-level cursor ``c`` and connection ``con``.
    Only ``name``, ``function`` and ``EC_number`` are filled; the ``id``
    column is left NULL.
    """
    def as_text(value, separator):
        # Calling str() on a list would store its Python repr
        # ("['a', 'b']"); join the items into plain text instead.
        if isinstance(value, (list, tuple)):
            return separator.join(str(item) for item in value)
        return str(value)

    c.execute("""INSERT INTO enzyme(name, function, EC_number) VALUES(?,?,?)""",
              (as_text(record.name, "; "),
               as_text(record.comment, " "),
               as_text(record.entry, "")))
    con.commit()
    
    



### I obtained the list of enzymes for each of the 3 pathways from KEGG; the links are given here:
glycolysis enzymes: https://www.genome.jp/dbget-bin/get_linkdb?-t+enzyme+path:map00010

In [79]:
# EC numbers to load for glycolysis (map00010).
enzyme_list1 = [
    "1.1.1.1",
    "1.1.1.2",
    "1.1.1.27",
    "1.1.2.7",
]

# Iterate over enzyme_list1 itself: a bare `enzyme_list` is never defined in
# this notebook, so indexing it raises NameError on a fresh kernel.
for ec_number in enzyme_list1:
    insert_enzymes(get_enzymes(ec_number))



TCA cycle enzymes: https://www.genome.jp/dbget-bin/get_linkdb?-t+enzyme+path:map00020

In [80]:
# EC numbers to load for the citrate (TCA) cycle (map00020).
enzyme_list2 = [
    "1.1.1.286",
    "1.1.1.37",
    "1.1.1.41",
    "1.1.1.42",
]

# Iterate over enzyme_list2 itself: a bare `enzyme_list` is never defined in
# this notebook, so indexing it raises NameError on a fresh kernel.
for ec_number in enzyme_list2:
    insert_enzymes(get_enzymes(ec_number))

pentose phosphate pathway enzymes: https://www.genome.jp/dbget-bin/get_linkdb?-t+enzyme+path:map00030

In [81]:
# EC numbers to load for the pentose phosphate pathway (map00030).
enzyme_list3 = [
    "1.1.1.215",
    "1.1.1.343",
    "1.1.1.359",
    "1.1.1.360",
]

# Iterate over enzyme_list3 itself: a bare `enzyme_list` is never defined in
# this notebook, so indexing it raises NameError on a fresh kernel.
for ec_number in enzyme_list3:
    insert_enzymes(get_enzymes(ec_number))

# Accessing the enzymes' gene data
1. Use `.esearch()` to search the NCBI databases.
2. Use `.efetch()` to get more complete information about an item.
3. Wrap these calls in functions.


In [82]:
def search_for(db,term1):
    """Run an NCBI ESearch query through Biopython and return the result handle.

    Parameters
    ----------
    db : str
        Name of the Entrez database to search (``generate`` passes
        ``"nucleotide"``).
    term1 : str
        Search term sent to ESearch.

    Returns
    -------
    The handle returned by ``Entrez.esearch``; ``deal_genbank`` parses it
    with ``Entrez.read``.
    """
    from Bio import Entrez as ent

    # Entrez asks callers to identify themselves with a contact address.
    ent.email = 'lijiaxiaoxiong@icloud.com'

    # No `idtype` is sent, so ESearch answers with its default id type;
    # deal_genbank() converts those ids with int(), which an accession-style
    # id would not survive. (A misspelled `idytpe='acc'` keyword is not an
    # ESearch parameter and is therefore not passed.)
    # NOTE(review): the E-utilities documentation gives 'relevance' as an
    # example sort value; 'relevant' is kept unchanged here -- confirm which
    # value is intended.
    return ent.esearch(db=db, term=term1, sort='relevant')
    

In [102]:
def deal_genbank(handle):
    """Fetch the GenBank record for every id in an ESearch result and store it in ``genes``.

    Parameters
    ----------
    handle
        ESearch result handle (``generate`` passes the value returned by
        ``search_for``). It is parsed with ``Entrez.read`` and every entry
        of its ``IdList`` is fetched from the ``nucleotide`` database.

    Returns
    -------
    The ``handle`` that was passed in.

    Notes
    -----
    One row is inserted and committed per id through the notebook-level
    cursor ``c`` and connection ``con``. Coordinates, strand and chromosome
    are read from the record's first feature; the translation is taken from
    the last CDS feature that carries one.
    """
    from Bio import Entrez as ent
    from Bio import SeqIO

    def first_value(qualifiers, key):
        """Return the first value stored under ``key``, or '' when it is missing."""
        values = qualifiers.get(key)
        if not values:
            return ''
        # Qualifier values are handled as lists; a bare value is passed through.
        if isinstance(values, (list, tuple)):
            return values[0]
        return values

    for seq_id in ent.read(handle)['IdList']:
        # Separate name for the per-record handle so the search handle is not shadowed.
        fetch_handle = ent.efetch(db='nucleotide', id=seq_id, rettype='gb', retmode='text')
        try:
            record = SeqIO.read(fetch_handle, "genbank")
        finally:
            fetch_handle.close()

        start = end = strand = None
        chromosome = ''
        if record.features:
            lead = record.features[0]
            start = int(lead.location.start)
            end = int(lead.location.end)
            # strand can be None; store NULL instead of failing in int().
            if lead.location.strand is not None:
                strand = int(lead.location.strand)
            chromosome = first_value(lead.qualifiers, "chromosome")

        trans = ''
        for feature in record.features:
            # A CDS without a translation qualifier is skipped instead of raising KeyError.
            if feature.type == "CDS" and feature.qualifiers.get("translation"):
                trans = first_value(feature.qualifiers, "translation")

        # "end" is quoted because END is an SQL keyword; the id is bound as text
        # to match the TEXT id column.
        c.execute("""INSERT INTO genes(id, name, description, organism, nucleotide_sequence, chromosome, start, "end", strand, translated_sequence) VALUES (?,?,?,?,?,?,?,?,?,?);""",
                  (str(seq_id), str(record.name), str(record.description),
                   str(record.annotations["organism"]), str(record.seq),
                   str(chromosome), start, end, strand, str(trans)))

        con.commit()
    return handle
    

In [97]:
def generate(enzyme, species):
    """Look up an enzyme in KEGG, then search NCBI for its gene and store the hits.

    The search term is ``species``, a space, and the first entry of the
    enzyme record's ``name``. The search runs against the ``nucleotide``
    database and its result is passed to ``deal_genbank``.

    Parameters
    ----------
    enzyme
        Identifier forwarded to ``get_enzymes``.
    species : str
        Text placed in front of the enzyme name in the search term.
    """
    enzyme_record = get_enzymes(enzyme)
    query = " ".join((species, enzyme_record.name[0]))
    search_handle = search_for("nucleotide", query)
    deal_genbank(search_handle)
    