In [None]:
import pandas as pd
import json
import re
import requests
import numpy as np

In [None]:
# Load the cached UniProt query results (used below for UniProt ID -> sequence).
# Each element of `queried` is one line of the file, newline included.
with open('20Nov2023uniprot_chkpoint_for_brenda.txt', 'r') as f:
    queried = list(f)

In [None]:
# Report the line count before and after discarding every line that contains
# the substring 'rror' (matches both "Error" and "error").
# NOTE(review): the filter is applied to every line, so a record header whose
# text happens to contain 'rror' would be dropped as well.
print (len(queried))
queried = list(filter(lambda line: 'rror' not in line, queried))
print (len(queried))

In [None]:
def extract_header_information(header):
    """Split a UniProt-style FASTA header line into its individual fields.

    Expected layout (leading '>' optional)::

        >db|UniqueIdentifier|EntryName ProteinName OS=... [OX=...] [GN=...] PE=... SV=...

    Args:
        header: The header line as a string.

    Returns:
        A dict with the keys "db", "UniqueIdentifier", "EntryName",
        "ProteinName", "OrganismName", "OrganismIdentifier", "GeneName",
        "ProteinExistence" and "SequenceVersion". The optional fields
        ("OrganismIdentifier", "GeneName") are None when absent from the
        header. Returns None (after printing "Failed parsing") when the header
        does not match the layout.
    """
    # OX, PE and SV are single whitespace-free tokens, so they are matched with
    # \S+. A greedy '.+' for OX would swallow a following ' GN=...' field, and a
    # lazy '.+?' as the final group would capture only one character of SV.
    regex_pattern = (
        r'>?([^|]+)\|([^|]+)\|([^ ]+) (.*?)'
        r'(?: OS=(.+?))'
        r'(?: OX=(\S+))?'
        r'(?: GN=(.+?))?'
        r'(?: PE=(\S+))'
        r'(?: SV=(\S+))'
    )
    field_names = (
        "db",
        "UniqueIdentifier",
        "EntryName",
        "ProteinName",
        "OrganismName",
        "OrganismIdentifier",
        "GeneName",
        "ProteinExistence",
        "SequenceVersion",
    )

    match = re.match(regex_pattern, header)
    if match is None:
        print ("Failed parsing")
        return None

    # Capture groups 1..9 line up one-to-one with field_names.
    return dict(zip(field_names, match.groups()))

# Sanity check: parse one known header and print every extracted field.
header_example = '>sp|Q01740|FMO1_HUMAN Flavin-containing monooxygenase 1 OS=Homo sapiens OX=9606 PE=1 SV=3'

result = extract_header_information(header_example)
if not result:
    print("Header format doesn't match the pattern.")
else:
    print("Extracted Information:")
    for key, value in result.items():
        print(f"{key}: {value}")


def parse_uniprot_result (uniprot_result):
    """Parse one record (header line followed by sequence lines).

    The first line is handed to extract_header_information; all remaining
    lines are concatenated, newlines removed, into the sequence.

    Returns:
        The header-field dict with an extra 'sequence' key, or None when the
        header could not be parsed (the header and sequence are printed in
        that case to help with debugging).
    """
    header, _, body = uniprot_result.partition('\n')
    sequence = body.replace('\n', '')
    info = extract_header_information(header)
    if info is None:
        print (header)
        print (sequence)
        return info
    info['sequence'] = sequence
    return info

def parse_uniprot_batch (uniprot_batch):
    """Parse a block of text containing several records.

    Records are separated on '\\n>', so only the first record keeps its
    leading '>'.

    Args:
        uniprot_batch: The concatenated records as one string.

    Returns:
        A list of the dicts produced by parse_uniprot_result, in input order.
        Records whose header cannot be parsed are left out.
    """
    result_ls = []
    for uniprot_result in uniprot_batch.split('\n>'):
        # str.split never yields None, so test for actual content instead:
        # blank chunks (e.g. from an empty batch) are skipped rather than
        # being reported as parse failures.
        if not uniprot_result.strip():
            continue
        parsed = parse_uniprot_result(uniprot_result)
        if parsed is not None:
            result_ls.append(parsed)
    return result_ls

In [None]:
# The lines in `queried` still carry their newline characters, so joining them
# with '' rebuilds the text that parse_uniprot_batch splits into records.
parsed_queried = parse_uniprot_batch(''.join(queried))

In [None]:
# Load the previously built dictionary mapping UniProt ID -> sequence.
with open('protein_id_to_sequence.json', 'r') as f:
    pid2seq = json.load(f)

In [None]:
# Merge the freshly parsed BRENDA UniProt records into the ID -> sequence map.
# An ID that is already present is overwritten with the newly parsed sequence.
pid2seq.update({entry['UniqueIdentifier']: entry['sequence'] for entry in parsed_queried})

In [None]:
# Save the extended UniProt ID -> sequence dictionary under a new, dated file
# name (the input file 'protein_id_to_sequence.json' is left untouched).
with open('20Nov2023_protein_id_to_sequence.json','w') as f:
    json.dump(pid2seq, f)

In [None]:
# Load the scraped BRENDA data linking reactions to UniProt IDs.
with open('../parse_reaction_dbs/brenda/scraped_brenda_substrate_results.json', 'r') as f:
    # Holds UniProt IDs only; they are converted to sequences further below.
    brenda2uniprot = json.load(f)

In [None]:
# Clean the BRENDA dict, keeping for every EC number only the entries that
#   * have a UniProt ID (not '-' and not empty),
#   * do not have the placeholder substrate 'additional information', and
#   * do not have the unknown product '?'.
# Each kept entry gains a 'molecules' set: the union of its ' + '-separated
# products and substrates.
cleaned_brenda2uniprot = {}
for ec in brenda2uniprot:
    new_entries = []
    for entry in brenda2uniprot[ec]:
        if (
            entry['UNIPROT'] != '-'
            and len(entry['UNIPROT'])
            and entry['SUBSTRATE'] != 'additional information'
            and entry['PRODUCT'] != '?'
        ):
            # Work on a shallow copy so the loaded brenda2uniprot data is not
            # mutated (and stays JSON-serialisable: sets are not).
            new_entry = dict(entry)
            new_entry['molecules'] = set(entry['PRODUCT'].split(' + ')).union(set(entry['SUBSTRATE'].split(' + ')))
            new_entries.append(new_entry)
    cleaned_brenda2uniprot[ec] = new_entries

In [None]:
# Read the tab-separated BKMS table, then discard the leftover index columns.
bkms = pd.read_csv('bkms/1Sep2023_bkms-mapped.txt', sep='\t')
bkms = bkms.drop(columns=['Unnamed: 0.1','index','Unnamed: 0'])

In [None]:
# Rows that carry a BRENDA reaction ID. The explicit .copy() makes
# brenda_subset an independent frame, so the columns added to it later do not
# trigger SettingWithCopyWarning and can never write through to bkms.
brenda_subset = bkms.dropna(subset=['Reaction_ID_BRENDA']).copy()
print (len (brenda_subset))

In [None]:
def get_reactants (reaction):
    """Return the left-hand-side species of a reaction string.

    The two sides are separated by '<=>' (checked first) or by ' = '; the
    species on each side are separated by ' + '.

    Args:
        reaction: Reaction string, e.g. 'A + B <=> C'.

    Returns:
        List of species names found before the separator.

    Raises:
        ValueError: If the string contains neither '<=>' nor ' = '.
    """
    if '<=>' in reaction:
        # Split on the bare arrow so a missing space around it is tolerated.
        left_side = reaction.split('<=>')[0]
    elif ' = ' in reaction:
        left_side = reaction.split(' = ')[0]
    else:
        raise ValueError(f"Reaction has no '<=>' or ' = ' separator: {reaction!r}")
    return left_side.strip().split(' + ')
def get_products (reaction):
    """Return the right-hand-side species of a reaction string.

    The two sides are separated by '<=>' (checked first) or by ' = '; the
    species on each side are separated by ' + '.

    Args:
        reaction: Reaction string, e.g. 'A + B <=> C'.

    Returns:
        List of species names found after the (first) separator.

    Raises:
        ValueError: If the string contains neither '<=>' nor ' = '.
    """
    if '<=>' in reaction:
        # Split on the bare arrow so a missing space around it is tolerated
        # (splitting on ' <=> ' would leave no second element to index).
        right_side = reaction.split('<=>')[1]
    elif ' = ' in reaction:
        right_side = reaction.split(' = ')[1]
    else:
        raise ValueError(f"Reaction has no '<=>' or ' = ' separator: {reaction!r}")
    return right_side.strip().split(' + ')
        
# Derive per-reaction species lists, plus the set of all species taking part
# in the reaction (products and reactants combined).
brenda_subset['reactants'] = brenda_subset['Reaction'].map(get_reactants)
brenda_subset['products'] = brenda_subset['Reaction'].map(get_products)
brenda_subset['molecules'] = brenda_subset['Reaction'].map(
    lambda rxn: set(get_products(rxn)) | set(get_reactants(rxn))
)

In [None]:
# Map every BRENDA-linked row index to a UniProt ID string.
#   idx2uniprot : row index -> 'SPONTANEOUS' or the 'UNIPROT' field of the
#                 best-matching BRENDA entry for the row's EC number
#   no_ec       : EC numbers that are missing from cleaned_brenda2uniprot
#   empty_ec    : EC numbers whose cleaned BRENDA page has no usable entry
idx2uniprot = {}
no_ec = []
empty_ec = []
for idx in brenda_subset.index:
    ec =  brenda_subset.loc[idx,'EC_Number']
    if ec =='SPONTANEOUS':
        idx2uniprot[idx] = 'SPONTANEOUS'
        continue

    if ec not in cleaned_brenda2uniprot:
        # Previously these rows were dropped silently and no_ec stayed empty.
        no_ec.append(ec)
        continue

    brenda_ec_page = cleaned_brenda2uniprot[ec]
    if not len(brenda_ec_page):
        empty_ec.append(ec)
        continue

    # Score each BRENDA entry by how many species it shares with the reaction.
    reaction_molecules = brenda_subset.loc[idx,'molecules']
    overlaps = [len(entry['molecules'].intersection(reaction_molecules)) for entry in brenda_ec_page]

    # np.argmax returns the first maximum, so ties go to the earliest entry.
    # NOTE(review): an entry is chosen even when the best overlap is 0.
    idx2uniprot[idx] = brenda_ec_page[np.argmax(overlaps)]['UNIPROT']

In [None]:
# Turn the UniProt ID strings into sequences: row index -> sequence (or the
# marker 'SPONTANEOUS'). Rows for which none of the IDs is in pid2seq get no
# entry at all.
idx2seq = {}
for k,pid in idx2uniprot.items():

    if pid == 'SPONTANEOUS':
        idx2seq[k] = 'SPONTANEOUS'
        # Nothing to look up for the marker value; without this the marker
        # itself would be treated as a UniProt ID below.
        continue

    # One field may list several IDs separated by ',' or ';'.
    uniprot_options = [x.strip() for x in pid.replace(';',',').split(',')]

    # There is no early exit, so when several IDs have a known sequence the
    # last one listed wins.
    for uid in uniprot_options:
        if uid in pid2seq:
            idx2seq[k] = pid2seq[uid]

In [None]:
# Number of BRENDA-linked rows that received a sequence (or the 'SPONTANEOUS' marker).
len(idx2seq)

In [None]:
# Attach the BRENDA-derived sequence to every row of bkms; rows without a
# match get None.
bkms['sequence'] = [idx2seq.get(idx) for idx in bkms.index]

In [None]:
# Show the rows that have a sequence after the BRENDA step.
bkms['sequence'].dropna()

## KEGG

In [None]:
# Load the KEGG lookup; it is indexed below by the IDs found in the
# 'Reaction_ID_KEGG' column, and each non-empty value is read via ['sequence'].
with open('../parse_reaction_dbs/kegg/kegg_rids_to_sequences.json', 'r') as f:
    kegg2seq = json.load(f)

In [None]:
# Prefer KEGG sequence over BRENDA-derived sequence
for idx in bkms[~bkms['Reaction_ID_KEGG'].isna()].index:
    # A cell may list several IDs separated by ',' or ';'. Strip each one so
    # that "R1, R2" matches both IDs (same handling as for the UniProt IDs).
    kegg_ids = [kid.strip() for kid in bkms.loc[idx, 'Reaction_ID_KEGG'].replace(';',',').split(',')]
    for kid in kegg_ids:
        if kid in kegg2seq:
            kegg_entry = kegg2seq[kid]
            # Skip empty/None entries. With several usable IDs the last one
            # listed wins, because each hit overwrites the previous one.
            if kegg_entry:
                bkms.loc[idx,'sequence'] = kegg_entry['sequence']

In [None]:
# Show the rows that have a sequence after the KEGG step.
bkms['sequence'].dropna()

In [None]:
# List the distinct values of the 'Remark' column.
bkms['Remark'].drop_duplicates()

## MetaCyc

In [None]:
# Load the MetaCyc table; the 'REACTION_ID' and 'sequence' columns are used below.
metacyc2seq = pd.read_csv('21Nov2023_metacyc_reactions_with_sequences.csv')

In [None]:
# Build a REACTION_ID -> sequence lookup. If a REACTION_ID occurs more than
# once, the dict keeps the value of its last occurrence.
metacyc2seq_dict = pd.Series(metacyc2seq['sequence'].values, index = metacyc2seq['REACTION_ID'].values).to_dict()

In [None]:
# Overwrite the sequence with the MetaCyc one wherever a MetaCyc reaction ID
# of the row has a known sequence.
for idx in bkms[~bkms['Reaction_ID_MetaCyc'].isna()].index:
    # A cell may list several IDs separated by ',' or ';'.
    metacyc_ids = [mid.strip() for mid in bkms.loc[idx, 'Reaction_ID_MetaCyc'].replace(';',',').split(',')]
    for mid in metacyc_ids:
        # Look up each individual ID. Looking up the whole unsplit cell value
        # (as before) can never match a row that lists more than one ID.
        if mid in metacyc2seq_dict:
            seq = metacyc2seq_dict[mid]
            # Do not replace an already assigned sequence with a missing value.
            if pd.notna(seq):
                bkms.loc[idx,'sequence'] = seq

In [None]:
# Show the rows that have a sequence after the MetaCyc step.
bkms['sequence'].dropna()

In [None]:
# Show the rows whose EC_Number contains a comma. str(x) lets the check run
# on non-string values (e.g. NaN) as well.
bkms[bkms['EC_Number'].map(lambda x: ',' in str(x))]

In [None]:
# Spot-check five randomly drawn rows that still have no sequence.
# The seed is fixed so the same rows are drawn on every run.
np.random.seed(4)
# np.random.choice samples with replacement by default, so an index may repeat.
ridxs = np.random.choice(bkms[bkms['sequence'].isna()].index, 5)

bkms.loc[ridxs,:]

In [None]:
# Write the table, now including the 'sequence' column, as a tab-separated
# file without the index.
bkms.to_csv('bkms/21Nov2023_bkms-mapped_w_seqs.tsv', sep='\t', index=False)