In [None]:
from bs4 import BeautifulSoup
import re
import pandas as pd
import requests
from tqdm import tqdm
import numpy as np
import json

In [None]:
# Read the tab-separated BKMS file into a DataFrame.
bkms_reactions = pd.read_csv('bkms/1Sep2023_bkms-mapped.txt',sep='\t')

In [None]:
# Distinct values of the 'Reaction_ID_KEGG' column.
# NOTE(review): later cells split these values on ',' and wrap them in str(), so an
# entry presumably can hold several IDs or be a non-string (e.g. NaN) - confirm against the data.
kegg_rids = bkms_reactions['Reaction_ID_KEGG'].drop_duplicates().tolist()

In [None]:
# Number of distinct 'Reaction_ID_KEGG' values; reuses the list built in the previous cell
# instead of recomputing it from the DataFrame.
len(kegg_rids)

In [None]:
class KeggPage():
    """Download a web page and extract the text of its 'Other DBs' table row.

    Attributes:
        soup: BeautifulSoup tree of the response body (parsed with 'html.parser').
        other_dbs: text of the last <tr> whose text contains 'Other DBs',
            or None when the page has no such row.

    NOTE(review): other cells in this notebook read `other_dbs_idx`, which this
    class does not set.
    """

    def __init__(self, url, timeout=30):
        """Fetch `url` and parse it.

        Args:
            url: address of the page to download.
            timeout: seconds passed to requests.get so a stalled connection
                raises instead of blocking forever.
        """
        res = requests.get(url, timeout=timeout)
        self.soup = BeautifulSoup(res.text, 'html.parser')
        self.get_other_dbs()

    def get_other_dbs(self):
        """Set `self.other_dbs` from the parsed page (None if no row matches)."""
        # Always define the attribute so callers can test for None instead of
        # having to catch AttributeError.
        self.other_dbs = None
        for tr in self.soup.find_all('tr'):
            # No break: when several rows contain the marker, the last one wins.
            if 'Other DBs' in tr.text:
                self.other_dbs = tr.text
    
#     def get_other_dbs_ref(self):
        

In [None]:
# Fetch one example entry page (R03024) to explore its HTML structure.
kp = KeggPage('https://www.genome.jp/entry/R03024')

In [None]:
# Text of the 'Other DBs' row captured for the example page.
kp.other_dbs

In [None]:
# Text of every <tr> on the example page. find_all() only yields tag objects,
# so the previous `is not None` guard could never trigger and is dropped.
[tr.text for tr in kp.soup.find_all('tr')]

In [None]:
# Is the second-to-last <tr> on the example page the one mentioning 'Other DBs'?
'Other DBs' in kp.soup.find_all('tr')[-2].text

In [None]:
# Print the text of the tables nested in each matching <td> of the example page.
# NOTE(review): `other_dbs_idx` is not assigned by the KeggPage class defined in this
# notebook, so this lookup raises AttributeError unless it is set elsewhere.
for p in kp.soup.find_all('td', class_='td{}'.format(kp.other_dbs_idx)):
    print([table.text for table in p.find_all('table')])

In [None]:
# Tables nested inside the last matching <td> of the example page.
# NOTE(review): `other_dbs_idx` is not assigned by the KeggPage class defined in this
# notebook, so this lookup raises AttributeError unless it is set elsewhere.
kp.soup.find_all('td', class_='td{}'.format(kp.other_dbs_idx))[-1].find_all('table')

In [None]:
# Fetch the entry page for each of the first 100 distinct 'Reaction_ID_KEGG' values,
# using the first ID when a value lists several separated by commas.
# Non-string values (e.g. NaN for a missing ID) have no .split and are skipped
# rather than crashing the whole batch.
kps = [
    KeggPage('https://www.genome.jp/entry/' + rid.split(',')[0])
    for rid in kegg_rids[:100]
    if isinstance(rid, str)
]

In [None]:
# Show the 'Other DBs' text of every fetched page, or 'failed' when a page has
# none (attribute absent or None). This replaces a bare `except:` that would also
# have hidden unrelated errors and keyboard interrupts.
for k in kps:
    print(getattr(k, 'other_dbs', None) or 'failed')

In [None]:
# For up to the first ten fetched pages, print the page index followed by the text
# of the tables nested in each matching <td>. Slicing (rather than indexing
# range(10)) avoids an IndexError when fewer than ten pages were fetched.
# NOTE(review): `other_dbs_idx` is not assigned by the KeggPage class defined in this
# notebook, so this lookup raises AttributeError unless it is set elsewhere.
for i, kp in enumerate(kps[:10]):
    print(i)
    for p in kp.soup.find_all('td', class_='td{}'.format(kp.other_dbs_idx)):
        print([y.text for y in p.find_all('table')])

In [None]:
def get_genes_for_kegg_rid (krid, timeout=60):
    """Return the gene-entry links listed on the LinkDB page for one reaction ID.

    Args:
        krid: reaction ID string appended to the LinkDB query URL.
        timeout: seconds passed to requests.get; without it a stalled
            connection would block the caller indefinitely.

    Returns:
        List of href strings, in page order, for every <a> tag whose href is
        non-empty and contains the substring 'entry'.
    """
    url = 'https://www.genome.jp/dbget-bin/get_linkdb?-t+genes+rn:' + krid
    res = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(res.text, 'html.parser')
    # Read each href once instead of calling a.get('href') three times per link.
    hrefs = (a.get('href') for a in soup.find_all('a'))
    return [href for href in hrefs if href and 'entry' in href]

In [None]:
# Spot-check the gene-link lookup on a single reaction ID.
get_genes_for_kegg_rid('R04132')

In [None]:
# NOTE(review): `kegg_rids_to_genes` is first assigned in a later cell, so this line
# raises NameError when the notebook is run top to bottom on a fresh kernel.
kegg_rids_to_genes['R04132']

In [None]:
# Same lookup for the first entry of `kegg_rids`.
get_genes_for_kegg_rid(kegg_rids[0])

In [None]:
# Flatten `kegg_rids` (whose entries may list several comma-separated IDs) into
# one list of individual reaction IDs. Missing values are skipped instead of
# becoming the literal string 'nan', and repeats are removed (first occurrence
# kept) so the name matches the content and no ID is queried twice.
unique_kegg_rids = list(dict.fromkeys(
    rid.strip()
    for entry in kegg_rids
    if pd.notna(entry)
    for rid in str(entry).split(',')
    if rid.strip()
))

In [None]:
# Preview the first few IDs and the total count rather than dumping the whole list.
unique_kegg_rids[:10], len(unique_kegg_rids)

In [None]:
from joblib import Parallel, delayed

In [None]:
# Look up the gene links for every reaction ID, one HTTP request at a time;
# tqdm shows progress over the list.
kegg_genes = [get_genes_for_kegg_rid(x) for x in tqdm(unique_kegg_rids)]

In [None]:
# Reduce each link to the text after its final '/', keeping one list per reaction ID.
kegg_gene_ids = [
    [href.rsplit('/', 1)[-1] for href in hrefs]
    for hrefs in kegg_genes
]

In [None]:
# Uncomment the lines below to save the reaction-ID -> gene-ID mapping as JSON.
# with open('kegg_rids_to_genes.json','w') as f:
#     json.dump(dict(zip(unique_kegg_rids, kegg_gene_ids)), f)


In [None]:
def parse_kegg_header (kegg_header):
    """Parse one FASTA header line of the form '>org:gene ID description [..]'.

    Args:
        kegg_header: header string starting with '>'.

    Returns:
        A dict with keys 'gene_id' (first token, e.g. 'iho:Igni_0595'),
        'id' (second token, e.g. 'K14534') and 'common_name' (the remaining
        text, cut before the last whitespace-preceded '[' when there is one).
        When the header contains an '[EC:...]' group, 'ec_numbers' holds its
        whitespace-separated entries. Returns None (after printing
        "No match found.") when the header does not have this shape.
    """
    # The part after ':' is matched as any run of non-whitespace so that gene
    # IDs containing '.' or '-' are accepted, not only word characters.
    prefix = r'>(\w+:\S+)\s+(\w+)\s+'

    match = None
    if '[' in kegg_header:
        # Greedy (.*) stops the name at the LAST ' [' in the header.
        match = re.match(prefix + r'(.*)\s+\[', kegg_header)
    if match is None:
        # No bracket, or a bracket not preceded by whitespace: take the rest
        # of the line as the name instead of giving up.
        match = re.match(prefix + r'(.*)', kegg_header)

    if match is None:
        print("No match found.")
        return None

    result = {
        'gene_id': match.group(1),
        'id': match.group(2),
        'common_name': match.group(3),
    }

    # [^\]]* keeps the capture inside the EC bracket even when another ']'
    # appears later in the header; split() drops empty tokens from repeated spaces.
    ec_match = re.search(r'\[EC:([^\]]*)\]', kegg_header)
    if ec_match:
        result['ec_numbers'] = ec_match.group(1).split()
    return result
    
def parse_kegg_fasta(kegg_fasta):
    """Turn a FASTA text block into the parsed-header dict plus its sequence.

    The text is split on '\\n'; the SECOND line is handed to parse_kegg_header
    and every following line is concatenated into the 'sequence' entry.

    Raises:
        IndexError: when the text has fewer than two lines.
        TypeError: when parse_kegg_header returns None for the header line.
    """
    lines = kegg_fasta.split('\n')
    record = parse_kegg_header(lines[1])
    record['sequence'] = ''.join(lines[2:])
    return record

# Try the header parser on a sample header that carries two EC numbers.
input_string = ">iho:Igni_0595 K14534 4-hydroxybutyryl-CoA dehydratase / vinylacetyl-CoA-Delta-isomerase [EC:4.2.1.120 5.3.3.3]"
parse_kegg_header(input_string)

In [None]:
# Scratch cell: run the EC-number regex on the same sample header.
# NOTE(review): the module-level `match` assigned here is not read by any cell shown.
input_string = ">iho:Igni_0595 K14534 4-hydroxybutyryl-CoA dehydratase / vinylacetyl-CoA-Delta-isomerase [EC:4.2.1.120 5.3.3.3]"

pattern = r'\[EC:(.*)\]'

match = re.search(pattern, input_string)

In [None]:
# Map each reaction ID to its list of gene IDs; if an ID occurs more than once
# in `unique_kegg_rids`, the dict keeps a single key holding the last list.
kegg_rids_to_genes = dict(zip(unique_kegg_rids, kegg_gene_ids))

In [None]:
# NOTE(review): this empty dict is not read by any cell shown here; the populated
# mapping built further down is named `kegg_rids_to_sequences` (plural).
kegg_rids_to_sequence = {}

In [None]:
def retrieve_sequence_for_first_gene(list_of_genes, timeout=60):
    """Fetch and parse the FASTA record of the first non-empty gene ID.

    Args:
        list_of_genes: iterable of gene IDs; falsy entries ('' or None) are ignored.
        timeout: seconds passed to requests.get, so a stalled connection
            raises instead of blocking a worker forever.

    Returns:
        The dict produced by parse_kegg_fasta for the first usable gene ID, or
        None when there is no usable ID or when fetching/parsing raises
        IndexError (e.g. the page has no <pre> block) or TypeError (the usable
        IDs are printed in that case).
    """
    candidates = [gene for gene in list_of_genes if gene]
    if not candidates:
        # Explicit rather than falling off the end of the function.
        return None
    try:
        res = requests.get('https://www.genome.jp/entry/-f+-n+a+' + candidates[0], timeout=timeout)
        soup = BeautifulSoup(res.text, 'html.parser')
        return parse_kegg_fasta(soup.find_all('pre')[0].text)
    except IndexError:
        return None
    except TypeError:
        print(candidates)
        return None
        

In [None]:
# Fetch one sequence record per gene-ID list with 16 joblib workers; the inputs
# are the values of `kegg_rids_to_genes`, in that dict's order.
kegg_sequences = Parallel(n_jobs=16)(delayed(retrieve_sequence_for_first_gene)(x) for x in tqdm(list(kegg_rids_to_genes.values())))

In [None]:
# Pair every fetched record with the reaction ID it belongs to.
# `kegg_sequences` was built by iterating `kegg_rids_to_genes.values()`, so its
# order and length match that dict's keys; zipping against the keys keeps IDs and
# sequences aligned even if the source ID list contains repeats.
kegg_rids_to_sequences = dict(zip(kegg_rids_to_genes.keys(), kegg_sequences))

In [None]:
# Reaction IDs for which no sequence record was obtained (value is None).
{r for r,s in kegg_rids_to_sequences.items() if s is None}

In [None]:
# Number of distinct reaction IDs in the ID -> genes mapping.
len(kegg_rids_to_genes)

In [None]:
# Count the lookups that returned no gene links at all.
sum(1 for hrefs in kegg_genes if len(hrefs) == 0)

In [None]:
# Scratch check: None is falsy, so this branch never runs and nothing is printed.
if None:
    print ('hi')

In [None]:
# Spot-check: fetch the sequence record for the first gene recorded for R04132.
retrieve_sequence_for_first_gene(kegg_rids_to_genes['R04132'])

In [None]:
# Gene IDs recorded for R04132.
kegg_rids_to_genes['R04132']

In [None]:
# Live lookup of the gene links for R07370 (issues a new HTTP request).
get_genes_for_kegg_rid('R07370')

In [None]:
# Gene IDs recorded for R00672.
kegg_rids_to_genes['R00672']

In [None]:
# Gene IDs recorded for R00015.
kegg_rids_to_genes['R00015']

In [None]:
# Preview the first few entries and the total size rather than dumping the whole mapping.
dict(list(kegg_rids_to_genes.items())[:5]), len(kegg_rids_to_genes)