In [14]:
# OPTIONAL: Load the "autoreload" eX_orig[alias]tension so that code can change
%load_ext autoreload

# OPTIONAL: always reload modules so that as you change code in src, it gets loaded
%autoreload 2

%matplotlib inline

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [15]:
# Import Biopython modules to interact with KEGG
from Bio import SeqIO
from Bio.KEGG import REST
from Bio.KEGG.KGML import KGML_parser
from Bio.Graphics.KGML_vis import KGMLCanvas

import pandas as pd
import io
import numpy as np

In [16]:
mouse_pathways = REST.kegg_list("pathway", "mmu").read()
mouse_pathways.split("\n")[0:5]

['mmu01100\tMetabolic pathways - Mus musculus (house mouse)',
 'mmu01200\tCarbon metabolism - Mus musculus (house mouse)',
 'mmu01210\t2-Oxocarboxylic acid metabolism - Mus musculus (house mouse)',
 'mmu01212\tFatty acid metabolism - Mus musculus (house mouse)',
 'mmu01230\tBiosynthesis of amino acids - Mus musculus (house mouse)']

In [27]:
mouse_pathways.split("\n")

['mmu01100\tMetabolic pathways - Mus musculus (house mouse)',
 'mmu01200\tCarbon metabolism - Mus musculus (house mouse)',
 'mmu01210\t2-Oxocarboxylic acid metabolism - Mus musculus (house mouse)',
 'mmu01212\tFatty acid metabolism - Mus musculus (house mouse)',
 'mmu01230\tBiosynthesis of amino acids - Mus musculus (house mouse)',
 'mmu01232\tNucleotide metabolism - Mus musculus (house mouse)',
 'mmu01250\tBiosynthesis of nucleotide sugars - Mus musculus (house mouse)',
 'mmu01240\tBiosynthesis of cofactors - Mus musculus (house mouse)',
 'mmu00010\tGlycolysis / Gluconeogenesis - Mus musculus (house mouse)',
 'mmu00020\tCitrate cycle (TCA cycle) - Mus musculus (house mouse)',
 'mmu00030\tPentose phosphate pathway - Mus musculus (house mouse)',
 'mmu00040\tPentose and glucuronate interconversions - Mus musculus (house mouse)',
 'mmu00051\tFructose and mannose metabolism - Mus musculus (house mouse)',
 'mmu00052\tGalactose metabolism - Mus musculus (house mouse)',
 'mmu00053\tAscorbate 

In [28]:
# Filter all pathways for repair pathways
repair_pathways = []
for line in mouse_pathways.rstrip().split("\n"):
    entry, description = line.split("\t")
    if "homologous" in description:
        repair_pathways.append(entry)

repair_pathways

['mmu03450']

In [34]:
repair_pathways = [
    "mmu03460", # Fanconi Anemia,
    "mmu03450", # Non-homologous end joining
    "mmu03440"
]

In [35]:
# Get the genes for pathways and add them to a list
pathway_genes = {}
for pathway in repair_pathways:
    pathway_genes[pathway] = []
    pathway_file = REST.kegg_get(pathway).read()  # query and read each pathway

    # iterate through each KEGG pathway file, keeping track of which section
    # of the file we're in, only read the gene in each pathway
    current_section = None
    for line in pathway_file.rstrip().split("\n"):
        section = line[:12].strip()  # section names are within 12 columns
        if not section == "":
            current_section = section

        if current_section == "GENE":
            gene_identifiers, gene_description = line[12:].split("; ")
            gene_id, gene_symbol = gene_identifiers.split()

            if not gene_symbol in pathway_genes[pathway]:
                pathway_genes[pathway].append(gene_symbol)

# print(pathway_genes.keys())
# print(pathway_genes)

for p in repair_pathways:
    print(p)
    print(pathway_genes[p])
    print("\n")



mmu03460
['Atrip', 'Atr', 'Fancm', 'Faap24', 'Cenps', 'Cenpx', 'Telo2', 'Hes1', 'Faap100', 'Fanca', 'Fancb', 'Fancc', 'Fance', 'Fancf', 'Fancg', 'Fancl', 'Wdr48', 'Usp1', 'Ube2t', 'Fanci', 'Fancd2', 'Brca2', 'Palb2', 'Rad51c', 'Rad51', 'Brca1', 'Brip1', 'Fan1', 'Mlh1', 'Pms2', 'Rev1', 'Rev3l', 'Polh', 'Poli', 'Polk', 'Poln', 'Rmi1', 'Rmi2', 'Top3b', 'Top3a', 'Blm', 'Rpa1', 'Rpa2', 'Rpa3', 'Gm6195', 'Mus81', 'Eme1', 'Eme2', 'Ercc4', 'Ercc1', 'Slx1b', 'Slx4']


mmu03450
['Xrcc6', 'Xrcc5', 'Dclre1c', 'Prkdc', 'Poll', 'Polm', 'Dntt', 'Lig4', 'Xrcc4', 'Nhej1', 'Rad50', 'Mre11a', 'Fen1']


mmu03440
['Ssbp1', 'Rad50', 'Mre11a', 'Nbn', 'Atm', 'Brca1', 'Bard1', 'Rbbp8', 'Brip1', 'Topbp1', 'Abraxas1', 'Uimc1', 'Babam1', 'Babam2', 'Brcc3', 'Brcc3dc', 'Palb2', 'Brca2', 'Sem1', 'Sycp3', 'Rpa1', 'Rpa2', 'Rpa3', 'Gm6195', 'Rad51', 'Rad52', 'Rad51b', 'Rad51c', 'Rad51d', 'Xrcc2', 'Xrcc3', 'Rad54l', 'Rad54b', 'Pold1', 'Pold2', 'Pold3', 'Pold4', 'Blm', 'Top3b', 'Top3a', 'Mus81', 'Eme1']




In [19]:
# polymerases

polymerases = REST.kegg_find("mmu", "polymerase").read().split("\n")
all_polys = []
for p in polymerases:
    ps = p.split("\t")
    if len(ps) > 1:
        all_polys.append(ps[1].split(",")[0].split(";")[0])

[p for p in all_polys if "Pol" in p]

['Polr1f',
 'Polr1h',
 'Polr2e',
 'Polr2k',
 'Polr3d',
 'Polr3f',
 'Polr1e',
 'Polr3g',
 'Polr2b',
 'Polr1d',
 'Polr3gl',
 'Pole2',
 'Polr1c',
 'Polg2',
 'Polm',
 'Polb',
 'Polr1a',
 'Pold4',
 'Polr1g',
 'Pold2',
 'Polr2g',
 'Polrmt',
 'Polr1b',
 'Poli',
 'Polr3a',
 'Pole3',
 'Pola2',
 'Polg',
 'Polr2m',
 'Polh',
 'Polr2l',
 'Polr1has',
 'Polr2j',
 'Pole4',
 'Pola1',
 'Pold1',
 'Polr3k',
 'Polr2i',
 'Polr2c',
 'Polr3h',
 'Polr2f',
 'Poll',
 'Poldip3',
 'Polr2d',
 'Pold3',
 'Polr3b',
 'Polr3e',
 'Polr2h',
 'Poldip2',
 'Polr3c',
 'Poln',
 'Pole',
 'Polq',
 'Polk',
 'Polr2a']

### Downloading KGML object

In [20]:
# from Bio import SeqIO
# from Bio.KEGG.REST import *
# from Bio.KEGG.KGML import KGML_parser
# from Bio.Graphics.KGML_vis import KGMLCanvas
# from Bio.Graphics.ColorSpiral import ColorSpiral

In [21]:
# # A bit of helper code to shorten long text
# def head(text, lines=10):
#     """Print the first lines lines of the passed text."""
#     print('\n'.join(text.split('\n')[:lines] + ['[...]']))

In [22]:
# head(kegg_list('mmu03450').read())

In [23]:
# import os

# kegg_dir = "artifacts/kegg"
# pathway = "mmu03450"
# pathway_file_path = "{}/{}.xml".format(kegg_dir, pathway)
# os.makedirs(kegg_dir, exist_ok=True)
# ko_map = (kegg_get("mmu03450", "kgml").read())
# pathway_file = open(pathway_file_path, "w")
# pathway_file.write(ko_map)
# pathway_file.close()

In [24]:
# from src.parse_KGML import *

# (tree, pathway, nodes, genes) = KGML2Graph(pathway_file_path)

In [25]:
# pathway.reactions