## Parse single structure (subtree) from K page

In [235]:
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

# It can be faster to get text and BR link in the same function, not implemented yet.

def get_subtree_text(url):
    """ 
    url like https://www.kegg.jp/entry/K02703 are from 
    >>> with open(pathway_gene_name_filename, 'r') as f:
            gene_ids = []
            for i in f.readlines():
                gene_ids.append(i[:6])
        urls = []
        prefix = 'https://www.genome.jp/entry/'
        for i in gene_ids:
            urls.append(prefix+i)
    """
    response = requests.get(url)
    webpage = response.content
    gene_id = url[-6:]

    soup = BeautifulSoup(webpage, 'html.parser')
    for i in soup.find_all('span'):
        if '\xa0'+gene_id in i.text:
            target = i
    target = str(target).replace('\xa0', ' ')\
    .replace('</span>', '').replace('<br/>', '')\
    .replace('<span class="nowrap">', '').split('\n')
    root = TreeNode(None)
    for i in target:
        if i: # skip empty lines
            depth = len(i) - len(i.lstrip(' ')) + 1
            add_node(root, i.strip(), depth)
    return root


class TreeNode():
    def __init__(self, value) -> None:
        self.value = value
        self.children = []


def print_tree(node, level=0):
    if node.value is not None:
        try:
            print('  ' * level + node.value)
        except Exception as e:
            print(e)
            print(f"{node.value=}, {node.children=}")
            return
    for child in node.children:
        print_tree(child, level + 1)


def add_node(root, value, depth):
    current = root
    
    for _ in range(depth - 1):
        if current.children:
            current = current.children[-1]
        else:
            new_child = TreeNode(None)
            current.children.append()
            current = new_child
    
    new_node = TreeNode(value)
    current.children.append(new_node)

In [3]:
import pickle

with open("gene_link_K_page.pkl", 'rb') as p:
    urls = pickle.load(p)

In [137]:
gene_subtree_dic = {}

for url in tqdm(urls):
    gene_subtree_dic[url[-6:]] = get_subtree_text(url)

with open("gene_subtree_dic.pkl", 'wb') as p:
    pickle.dump(gene_subtree_dic, p)

100%|██████████| 63/63 [03:35<00:00,  3.43s/it]


In [119]:
with open("gene_subtree_dic.pkl", 'rb') as p:
    gene_subtree_dic_from_pkl = pickle.load(p)

## Parse total structure (tree) from BR page (need to be finished)

In [319]:
import re
import time
import requests
from tqdm import tqdm
from bs4 import BeautifulSoup
import pickle

In [321]:
def get_single_BR_url_from_k_page(k_url):
    """ 
    url like https://www.kegg.jp/entry/K02703 are from 
    >>> with open(pathway_gene_name_filename, 'r') as f:
            gene_ids = []
            for i in f.readlines():
                gene_ids.append(i[:6])
        urls = []
        prefix = 'https://www.genome.jp/entry/'
        for i in gene_ids:
            urls.append(prefix+i)
    """
    response = requests.get(k_url)
    webpage = response.content
    gene_id = k_url[-6:]

    soup = BeautifulSoup(webpage, 'html.parser')
    for i in soup.find_all('span'):
        if '\xa0'+gene_id in i.text:
            target = i
    pattern = re.compile(r'BR:<a href="(.+?)"')
    links = pattern.findall(str(target))
    return links

def parse_BR_tree(data):
    node = TreeNode(data.get('values', None))
    for child_data in data.get('children', []):
        child_node = parse_BR_tree(child_data)
        node.children.append(child_node)
    return node

def clean_br_tree_dic(node):
    # Basically convert lists in value into strings
    if 'values' in node:
        if isinstance(node['values'], list):
            node['values'] = ''.join(node['values'])
    
    if 'children' in node:
        for child in node['children']:
            clean_br_tree_dic(child)

def get_total_BR_link_tuple(urls):
    br_links = set()
    prefix = "https://www.genome.jp"
    for i in tqdm(urls):
        tmp_links = get_single_BR_url_from_k_page(i)
        for j in tmp_links:
            tmp_full_link = prefix + j
            br_links.add(tmp_full_link)

    total_BR_link = set()
    for i in br_links:
        total_BR_link.add(i[:-7])
    
    return br_links, total_BR_link

def get_total_BR_tree_dic(br_links_tuple):
    max_attempts = 5
    total_BR_tree_dic = dict()
    
    for i in tqdm(list(br_links_tuple[1])):
        current_attempt = 1
        name = i[-7:]
        while current_attempt <= max_attempts:
            try:
                br_tree = _get_single_BR_tree(i)
                total_BR_tree_dic[name] = br_tree
                break  # Exit the loop if the function runs without errors
            except Exception as e:
                print(f"Attempt {current_attempt}: An error occurred - {str(e)}")
                time.sleep(1)
                current_attempt += 1

        if current_attempt > max_attempts:
            print(f"Max attempts reached. Function execution failed on page {i}.")
    
    return total_BR_tree_dic

def _get_single_BR_tree(br_url):
    br_page = requests.get(br_url)
    soup = BeautifulSoup(br_page.content, 'html.parser')
    raw = str(soup.find_all('script')[-1]).split('\n')[1][13:]
    pattern = r'<a.*?a>'
    dic_text = re.sub(pattern, '', raw)
    replace_list = ['"expanded":false,', ',"expanded":false',
            '"expanded":true,', ',"expanded":true',
            '"expanded":true,', ',"expanded":true', ',"isRoot":true', '"isRoot":true,', '<b>', '</b>',
            ',"devMode":false', ',"columnWidth":[]', ',"visibleIndentHandle":false', ',"columnTitle":["Chaperone"]',
            ',"org":"ko"', ',"isIndexFile":false', ',"joinPruningQuery":[]', ',"alignTerminalNode":false',
            ',"zoomout":null', ',"highlight":{}', ',"joinPruningColumn":[]'
            ',"htextNo":"03110"']
    for i in replace_list:
        dic_text = dic_text.replace(i, '')

    dic_text = dic_text.replace('"values":[]', '"values": None').replace('"values": []', '"values": None')
    tree_dic = eval(dic_text)
    tree_dic = tree_dic['root']

    clean_br_tree_dic(tree_dic)
    br_tree = parse_BR_tree(tree_dic)
    
    return br_tree

In [56]:
br_links, total_BR_link = get_total_BR_link_tuple(urls)
br_links_tuple = br_links, total_BR_link
with open("br_links_tuple.pkl", 'wb') as p:
    pickle.dump(br_links_tuple, p)

In [322]:
total_BR_tree_dic = get_total_BR_tree_dic(br_links_tuple)
with open("total_BR_tree_dic.pkl", 'wb') as p:
    pickle.dump(total_BR_tree_dic, p)

  0%|          | 0/4 [00:00<?, ?it/s]

Attempt 1: An error occurred - name 'false' is not defined


 50%|█████     | 2/4 [00:05<00:04,  2.48s/it]

Attempt 2: An error occurred - name 'false' is not defined


 75%|███████▌  | 3/4 [00:26<00:10, 10.90s/it]

Attempt 3: An error occurred - name 'false' is not defined
Attempt 4: An error occurred - name 'false' is not defined


100%|██████████| 4/4 [00:42<00:00, 10.61s/it]
