## Parse single structure (subtree) from K page

In [136]:
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

def get_subtree_text(url, timeout=30):
    """Download an entry page and parse its indented hierarchy listing into a tree.

    The gene identifier is taken from the last six characters of ``url``
    (e.g. ``K02703`` for ``https://www.kegg.jp/entry/K02703``).  The
    ``<span>`` whose text contains that identifier preceded by a
    non-breaking space is treated as the hierarchy listing.  Each non-empty
    line of that span becomes one node; its depth is the number of leading
    spaces plus one.

    Args:
        url: Address of the entry page; must end with the six-character
            gene identifier.
        timeout: Seconds to wait for the server before the request is
            aborted.

    Returns:
        The root ``TreeNode`` (its ``value`` is ``None``) holding one node
        per listing line.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        ValueError: If no ``<span>`` on the page mentions the gene id.
    """
    # A timeout keeps a stalled connection from hanging a long scraping loop.
    response = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of trying to parse an error page.
    response.raise_for_status()
    gene_id = url[-6:]

    soup = BeautifulSoup(response.content, 'html.parser')
    target = None
    for span in soup.find_all('span'):
        if '\xa0' + gene_id in span.text:
            # Later matches overwrite earlier ones: the last matching span wins.
            target = span
    if target is None:
        # Previously this case surfaced as an UnboundLocalError further down.
        raise ValueError(
            "no <span> mentioning {!r} found at {}".format(gene_id, url)
        )

    # Strip the wrapper markup and turn non-breaking spaces into plain
    # spaces so that indentation can be measured per line.
    lines = (
        str(target)
        .replace('\xa0', ' ')
        .replace('</span>', '')
        .replace('<br/>', '')
        .replace('<span class="nowrap">', '')
        .split('\n')
    )

    root = TreeNode(None)
    for line in lines:
        if line:  # skip empty lines
            depth = len(line) - len(line.lstrip(' ')) + 1
            add_node(root, line.strip(), depth)
    return root


class TreeNode:
    """A node of an ordered tree.

    Attributes:
        value: Payload stored in this node (``None`` is used for nodes that
            carry no text, such as the root).
        children: Child nodes in insertion order; empty on creation.
    """

    def __init__(self, value) -> None:
        # Each instance gets its own list so nodes never share children.
        self.children = list()
        self.value = value


def print_tree(node, level=0):
    """Print the tree rooted at ``node`` in pre-order, one value per line.

    Every line is indented by two spaces per level.  Nodes whose ``value``
    is ``None`` print nothing, but their children are still indented one
    level deeper.

    Args:
        node: Root of the (sub)tree to print; needs ``value`` and
            ``children`` attributes.
        level: Indentation level used for ``node`` itself.
    """
    pending = [(node, level)]
    while pending:
        current, indent = pending.pop()
        if current.value is not None:
            print('  ' * indent + current.value)
        # Push children right-to-left so the leftmost one is popped first.
        pending.extend(
            (child, indent + 1) for child in reversed(current.children)
        )


def add_node(root, value, depth):
    """Append a new node holding ``value`` at ``depth`` levels below ``root``.

    Starting at ``root`` the function descends ``depth - 1`` levels, each
    time following the most recently added child.  When a level has no
    child yet, an empty placeholder node (``value`` is ``None``) is created
    and attached so the new node still lands at the requested depth.

    Args:
        root: Node to start from.
        value: Payload for the new node.
        depth: Target depth; ``1`` attaches directly to ``root``.  Values
            below ``1`` behave like ``1``.
    """
    parent = root
    for _ in range(depth - 1):
        if not parent.children:
            # The placeholder must be attached to the tree; the previous
            # code called ``append()`` without an argument, which raised
            # TypeError and left the placeholder orphaned.
            parent.children.append(TreeNode(None))
        parent = parent.children[-1]

    parent.children.append(TreeNode(value))

In [128]:
import pickle

# Load the collection of entry-page URLs that the scraping loop iterates over.
# NOTE(review): pickle.load can execute arbitrary code; only open files that
# were produced locally by a trusted run.
with open("gene_link_K_page.pkl", 'rb') as p:
    urls = pickle.load(p)

In [137]:
# Fetch and parse every entry page, keyed by the six-character gene id that
# ends each URL.  One HTTP request is made per URL, so this cell is slow.
gene_subtree_dic = dict()

for url in tqdm(urls):
    subtree = get_subtree_text(url)
    gene_subtree_dic[url[-6:]] = subtree

# Persist the parsed trees so later cells can reload them without scraping.
with open("gene_subtree_dic.pkl", 'wb') as p:
    pickle.dump(gene_subtree_dic, p)

100%|██████████| 63/63 [03:35<00:00,  3.43s/it]


In [141]:
def _tree_signature(node):
    """Return a nested tuple ``(value, (child signatures, ...))`` for ``node``."""
    return (
        node.value,
        tuple(_tree_signature(child) for child in node.children),
    )


# print_tree() returns None, so comparing its return values is always True
# regardless of the trees' contents.  Print both trees for inspection, then
# compare their actual structure and values.
# NOTE: `a` is assigned by the single-page fetch cell further down; run that
# cell first on a fresh kernel.
print_tree(gene_subtree_dic['K02111'])
print_tree(a)
_tree_signature(gene_subtree_dic['K02111']) == _tree_signature(a)

  KEGG Orthology (KO) [BR:<a href="/brite/ko00001+K02111">ko00001</a>]
    09100 Metabolism
      09102 Energy metabolism
        00190 Oxidative phosphorylation
          K02111  ATPF1A, atpA; F-type H+/Na+-transporting ATPase subunit alpha
        00195 Photosynthesis
          K02111  ATPF1A, atpA; F-type H+/Na+-transporting ATPase subunit alpha
    09180 Brite Hierarchies
      09181 Protein families: metabolism
        00194 Photosynthesis proteins
          K02111  ATPF1A, atpA; F-type H+/Na+-transporting ATPase subunit alpha
  Enzymes [BR:<a href="/brite/ko01000+K02111">ko01000</a>]
    7. Translocases
      7.1  Catalysing the translocation of protons
        7.1.2  Linked to the hydrolysis of a nucleoside triphosphate
          7.1.2.2  H+-transporting two-sector ATPase
            K02111  ATPF1A, atpA; F-type H+/Na+-transporting ATPase subunit alpha
      7.2  Catalysing the translocation of inorganic cations
        7.2.2  Linked to the hydrolysis of a nucleoside triphosphat

True

In [144]:
# List the keys of the reloaded dictionary.
# NOTE: gene_subtree_dic_from_pkl is assigned in the following cell; on a
# fresh kernel that cell has to run before this one.
gene_subtree_dic_from_pkl.keys()

dict_keys(['K02108', 'K02109', 'K02110', 'K02111', 'K02112', 'K02113', 'K02114', 'K02115', 'K02634', 'K02635', 'K02636', 'K02637', 'K02638', 'K02639', 'K02640', 'K02641', 'K02642', 'K02643', 'K02689', 'K02690', 'K02691', 'K02692', 'K02693', 'K02694', 'K02695', 'K02696', 'K02697', 'K02698', 'K02699', 'K02700', 'K02701', 'K02702', 'K02703', 'K02704', 'K02705', 'K02706', 'K02707', 'K02708', 'K02709', 'K02710', 'K02711', 'K02712', 'K02713', 'K02714', 'K02716', 'K02717', 'K02718', 'K02719', 'K02720', 'K02721', 'K02722', 'K02723', 'K02724', 'K03541', 'K03542', 'K03689', 'K08901', 'K08902', 'K08903', 'K08904', 'K08905', 'K08906', 'K14332'])

In [145]:
# Reload the dictionary of parsed trees written by the scraping cell.
# NOTE(review): pickle.load can execute arbitrary code; only open files that
# were produced locally by a trusted run.
with open("gene_subtree_dic.pkl", 'rb') as p:
    gene_subtree_dic_from_pkl = pickle.load(p)

# Spot-check one entry to confirm the trees survived the round trip.
print_tree(gene_subtree_dic_from_pkl['K02109'])

  KEGG Orthology (KO) [BR:<a href="/brite/ko00001+K02109">ko00001</a>]
    09100 Metabolism
      09102 Energy metabolism
        00190 Oxidative phosphorylation
          K02109  ATPF0B, atpF; F-type H+-transporting ATPase subunit b
        00195 Photosynthesis
          K02109  ATPF0B, atpF; F-type H+-transporting ATPase subunit b
    09180 Brite Hierarchies
      09181 Protein families: metabolism
        00194 Photosynthesis proteins
          K02109  ATPF0B, atpF; F-type H+-transporting ATPase subunit b
  Photosynthesis proteins [BR:<a href="/brite/ko00194+K02109">ko00194</a>]
    Photosystem and electron transport system
      F-type ATPase [OT]
        K02109  ATPF0B, atpF; F-type H+-transporting ATPase subunit b


In [115]:
# Fetch and parse a single entry page as a quick check of get_subtree_text.
# NOTE(review): the recorded output ("Finished requesting." and an
# "Adding tree nodes" progress bar) is not produced by get_subtree_text as
# defined in this notebook; presumably it comes from an earlier revision.
# Re-run to refresh.
a = get_subtree_text("https://www.kegg.jp/entry/K02111")

Finished requesting.


Adding tree nodes: 100%|██████████| 26/26 [00:00<?, ?it/s]


In [125]:
# Follow the first child five levels down from the root and show its value.
a.children[0].children[0].children[0].children[0].children[0].value

'K02111  ATPF1A, atpA; F-type H+/Na+-transporting ATPase subunit alpha'

In [116]:
# Display the whole tree parsed from the single K02111 entry page.
print_tree(a)

  KEGG Orthology (KO) [BR:<a href="/brite/ko00001+K02111">ko00001</a>]
    09100 Metabolism
      09102 Energy metabolism
        00190 Oxidative phosphorylation
          K02111  ATPF1A, atpA; F-type H+/Na+-transporting ATPase subunit alpha
        00195 Photosynthesis
          K02111  ATPF1A, atpA; F-type H+/Na+-transporting ATPase subunit alpha
    09180 Brite Hierarchies
      09181 Protein families: metabolism
        00194 Photosynthesis proteins
          K02111  ATPF1A, atpA; F-type H+/Na+-transporting ATPase subunit alpha
  Enzymes [BR:<a href="/brite/ko01000+K02111">ko01000</a>]
    7. Translocases
      7.1  Catalysing the translocation of protons
        7.1.2  Linked to the hydrolysis of a nucleoside triphosphate
          7.1.2.2  H+-transporting two-sector ATPase
            K02111  ATPF1A, atpA; F-type H+/Na+-transporting ATPase subunit alpha
      7.2  Catalysing the translocation of inorganic cations
        7.2.2  Linked to the hydrolysis of a nucleoside triphosphat

## Parse the full structure (tree) from the BR page (TODO: not finished yet)

In [None]:
import requests
from bs4 import BeautifulSoup

In [None]:
# Download the ko00194 hierarchy page.
# NOTE(review): this rebinds `a`, which cells in the previous section use for
# a parsed TreeNode; re-running those cells after this one will see a
# Response object instead. No timeout is set on the request.
a = requests.get("https://www.kegg.jp/brite/ko00194")

<Response [200]>

In [None]:
# Parse the downloaded page body with the built-in HTML parser.
soup = BeautifulSoup(a.content, 'html.parser')

In [None]:
# Take the second line of the last <script> element; per the printed output
# this is the `const env = {...}` assignment that carries the hierarchy data.
# NOTE(review): purely positional indexing; it breaks if the page adds a
# script tag or reformats the script body.
b = str(soup.find_all('script')[-1]).split('\n')[1]

In [None]:
# Dump the extracted line for inspection (very long single-line output).
print(b)

	const env = {"visibleIndentHandle":false,"columnWidth":[],"htextNo":"00194","isIndexFile":false,"alignTerminalNode":false,"zoomout":null,"org":"ko","joinPruningColumn":[],"devMode":false,"joinPruningQuery":[],"highlight":{},"root":{"expanded":true,"children":[{"children":[{"values":["Photosystem II (P680 chlorophyll a) [OT]"],"children":[{"children":[{"values":["<a href=\"/entry/K02703\">K02703</a>  psbA; photosystem II P680 reaction center D1 protein [EC:<a href=\"/entry/1.10.3.9\">1.10.3.9</a>]"],"children":[],"expanded":false},{"children":[],"values":["<a href=\"/entry/K02706\">K02706</a>  psbD; photosystem II P680 reaction center D2 protein [EC:<a href=\"/entry/1.10.3.9\">1.10.3.9</a>]"],"expanded":false},{"values":["<a href=\"/entry/K02705\">K02705</a>  psbC; photosystem II CP43 chlorophyll apoprotein"],"children":[],"expanded":false},{"values":["<a href=\"/entry/K02704\">K02704</a>  psbB; photosystem II CP47 chlorophyll apoprotein"],"children":[],"expanded":false},{"children":[]