In [None]:
import networkx as nx
from rdkit import Chem
from rdkit.Chem import AllChem
import pandas as pd
import numpy as np

import os

import matplotlib.pyplot as plt

import re
import uuid
# import plotly.graph_objects as go
import plotly.express as px
import molplotly


from queue import PriorityQueue

import json

import pickle5 as pickle

from tqdm import tqdm

import sys
sys.path.append('..')
from utils import *
# sys.path.append('../../../forward_enumeration/')
# from forward_enumeration import construct_enumeration_graph
# from forward_enumeration import construct_pathway_from_list

In [None]:
# Build the date prefix (day, abbreviated month, year) used to name the output files.
# The module is imported instead of the `date` class because the prefix string below is
# also called `date` (later cells rely on that name); with `from datetime import date`
# the class was overwritten by the string and re-running this cell raised AttributeError.
import datetime

today = datetime.date.today()
date = today.strftime('%d%b%Y')
print ('Date prefix:', date)

In [None]:
# enzymemap = pd.read_csv('/Users/Itai/Desktop/processed_reactions.csv')

In [None]:
# Path of a pickled graph to reuse; with an empty path the graph is rebuilt from the tables below.
PRECOMPUTED_GRAPH_PATH = ''

# One row per reaction database: (table path, column holding the reaction SMILES, merge suffix).
_REACTION_SOURCES = [
    ('../parse_reaction_dbs/rhea/21Nov2023_rhea_reaction_smiles_no_cofs_with_sequences.csv',
     'reaction_smiles_no_cofs', '_rhea'),
    ('../parse_reaction_dbs/bkms/21Nov2023_bkms-mapped_w_seqs.tsv',
     'smiles', '_bkms'),
    ('../parse_reaction_dbs/metacyc/21Nov2023_metacyc_reaction_smiles_no_cofs_with_sequences.tsv',
     'reaction_smiles', '_metacyc'),
]

# Parallel lists consumed by the graph-building cell (same order in all three).
REACTION_FILES = [source[0] for source in _REACTION_SOURCES]
REACTION_COLUMN = [source[1] for source in _REACTION_SOURCES]
REACTION_SET_NAME = [source[2] for source in _REACTION_SOURCES]

# Generate network
#
# Either load a previously pickled graph from PRECOMPUTED_GRAPH_PATH or rebuild it from the
# reaction tables listed in REACTION_FILES.
# NOTE(review): `reaction_df`, `df_subset`, `all_smiles`, `metadata` and `seq_lengths` are only
# created when the graph is rebuilt; later cells that use them fail if the graph was loaded
# from the pickle instead.

def _aa_seq_length(seq, spontaneous_flag, ec_number, commentaries):
    """Return the sequence length recorded for a single reaction row.

    Returns 0 when the row is marked spontaneous (the sequence itself is 'SPONTANEOUS',
    the 'SPONTANEOUS?' flag is 'T', the EC number is 'SPONTANEOUS', or any commentary
    contains 'spon'), None when no sequence is available, and len(seq) otherwise.
    """
    is_spontaneous = (
        seq == 'SPONTANEOUS'
        or spontaneous_flag == 'T'
        or ec_number == 'SPONTANEOUS'
        or any('spon' in str(text).lower() for text in commentaries)
    )
    if is_spontaneous:
        return 0
    if pd.isna(seq):
        return None
    return len(seq)


met = None
if PRECOMPUTED_GRAPH_PATH:
    try:
        # NOTE: unpickling can execute arbitrary code; only load graph files you produced yourself.
        with open(PRECOMPUTED_GRAPH_PATH, 'rb') as f:
            met = pickle.load(f)
        print ('Loading pre-computed graph')
    except Exception as err:
        # Best-effort cache: fall back to rebuilding, but say why the load failed.
        # (Exception, not a bare except, so Ctrl-C / SystemExit are not swallowed.)
        met = None
        print ('Could not load pre-computed graph from {!r}: {!r}'.format(PRECOMPUTED_GRAPH_PATH, err))

if met is None:
    print ('RECOMPUTING GRAPH')
    reaction_datasets = []

    for reaction_file, smiles_column in zip(REACTION_FILES, REACTION_COLUMN):
        # All tables are read as tab-separated, whatever their file extension.
        reactions = pd.read_csv(reaction_file, sep='\t')
        reactions['smiles'] = reactions[smiles_column]
        if 'Reaction' in reactions.columns:
            # Reversible reactions ('<=>') are added a second time in the flipped direction.
            reversibles = reactions[reactions['Reaction'].map(lambda x: '<=>' in str(x))].copy()
            reversibles['smiles'] = reversibles['smiles'].map(flip_reaction)
            # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
            reactions = pd.concat([reactions, reversibles]).reset_index()
        reactions = reactions.dropna(subset=['smiles'])
        # Drop reactions containing wildcard atoms.
        reactions = reactions[reactions['smiles'].map(lambda x: '*' not in x)]
        reactions['smiles'] = reactions['smiles'].map(standardize_reaction_smiles)
        reaction_datasets.append(reactions)

    # Outer-merge the tables on the standardized SMILES, one after the other.
    # NOTE(review): in the second and later merges the left-hand suffix is applied to the
    # already merged table, so a clashing column that originally came from an earlier
    # table is labelled with the suffix of the table merged just before.
    reaction_df = reaction_datasets[0]
    for i in range(len(reaction_datasets)-1):
        merge_suffixes = REACTION_SET_NAME[i:i+2]
        print (merge_suffixes)
        reaction_df = reaction_df.merge(reaction_datasets[i+1], on='smiles', how='outer',
                                        suffixes=merge_suffixes)

    # Remove leftover index columns if present.
    reaction_df = reaction_df.drop(columns=['level_0', 'Unnamed: 0.1', 'Unnamed: 0'], errors='ignore')

    #merge sequence columns: first non-missing value of sequence, sequence_rhea, sequence_bkms
    reaction_df['sequence'] = (reaction_df['sequence']
                               .combine_first(reaction_df['sequence_rhea'])
                               .combine_first(reaction_df['sequence_bkms']))

    # get length of AA sequences (0 = spontaneous, None = unknown)
    seq_lengths = [
        _aa_seq_length(seq, spontaneous_flag, ec_number, (commentary_metacyc, commentary_kegg))
        for seq, spontaneous_flag, ec_number, commentary_metacyc, commentary_kegg in zip(
            reaction_df['sequence'],
            reaction_df['SPONTANEOUS?'],
            reaction_df['EC_Number'],
            reaction_df['Commentary_MetaCyc'],
            reaction_df['Commentary_KEGG'],
        )
    ]

    reaction_df['seq_length'] = seq_lengths
    # Explicit copy so the assignment below never writes into (or warns about) reaction_df.
    df_subset = reaction_df.loc[:, ['smiles', 'seq_length']].copy()

    # remove reactions with no SMILES
    df_subset['smiles'] = [str(r) if r and not pd.isna(r) else None for r in df_subset['smiles']]
    df_subset = df_subset.dropna(subset=['smiles'])
    # Keep only strings with exactly two '>' separators, i.e. three '>'-delimited parts.
    df_subset = df_subset[df_subset['smiles'].map(lambda x : len(x.split('>'))==3)]
    # One row per unique reaction, keeping the largest recorded sequence length.
    df_subset = df_subset.groupby('smiles').max()

    # all_smiles and metadata are aligned: both follow the row order of df_subset.
    all_smiles = [str(r) for r in df_subset.index]
    metadata = [{'aa_seq_len':x} for x in df_subset['seq_length']]

    print ('{} reactions in graph'.format(len(all_smiles)))

    print ('Computing graph')

    met = construct_pathway_from_list(all_smiles, metadata = metadata)

# Report basic size statistics of the network.
node_count = len(met.nodes)
print ('Number of nodes', node_count)
edge_count = len(met.edges)
print ('Number of edges', edge_count)
# Connected components are counted on an undirected copy of the graph.
undirected_met = nx.Graph(met)
print ('Number of connected components', nx.number_connected_components(undirected_met))


# Node keys containing '>>' are counted as reactions, all other keys as chemicals.
reaction_node_count = sum(1 for node in met.nodes if '>>' in node)
print ('Number of reaction nodes', reaction_node_count)
chemical_node_count = sum(1 for node in met.nodes if '>>' not in node)
print ('Number of chemical nodes', chemical_node_count)

In [None]:
# TMP: check how the enzymemap database compares to BKMS+BRENDA+Rhea
# enzymemap_met = construct_pathway_from_list(enzymemap['unmapped'].values, ['']*len(enzymemap['unmapped'].values))
# reactions_to_new_chems = {n:list(enzymemap_met.pred[n]) for n in enzymemap_met.nodes if n not in met.nodes and '>' not in n}

In [None]:
# Load the metabolite name <-> SMILES table and build a lookup dict in each direction.
n_t_s_df = pd.read_csv('../../../spectranalysis/13Jul2023_metabolite_name_to_smiles_df.tsv', sep='\t')

name_to_smiles = {}
smiles_to_name = {}
# For repeated names or SMILES the last row in the table wins.
for metabolite_name, metabolite_smiles in zip(n_t_s_df['name'], n_t_s_df['smiles']):
    name_to_smiles[metabolite_name] = metabolite_smiles
    smiles_to_name[metabolite_smiles] = metabolite_name

In [None]:
# Number of graph nodes whose key has an entry in smiles_to_name.
sum(node in smiles_to_name for node in met.nodes)

In [None]:
# Collect chemical nodes (no '>>', no '*' wildcard) that have no entry in smiles_to_name.
not_in_dict = []
for node in met.nodes:
    if '>>' in node or '*' in node:
        continue
    if node not in smiles_to_name:
        not_in_dict.append(node)
not_in_dict
# Chem.Draw.MolsToGridImage([Chem.MolFromSmiles(n) for n in not_in_dict])

In [None]:
# Display the merged reaction table.
# NOTE: reaction_df is only created when the graph is rebuilt, not when it is loaded from a pickle.
reaction_df

In [None]:
# Spot check: show what is stored on the node keyed by one small reaction SMILES.
# Presumably this is the metadata passed to construct_pathway_from_list (aa_seq_len) - TODO confirm.
met.nodes['C#C.O>>CC=O']

In [None]:
# Save the merged reaction table and the network, both prefixed with the date string.
# NOTE: running this cell writes both files into the current working directory.
reactions_out_path = '{}_all_reaction_from{}.csv'.format(date, ''.join(REACTION_SET_NAME))
network_out_path = '{}_whole_metabolic_network_labeled.pkl'.format(date)

# The table is written tab-separated even though the file name ends in .csv.
reaction_df.to_csv(reactions_out_path, sep='\t')

# nx.write_gpickle was removed in networkx 3.0; dumping with pickle directly works on all
# versions and matches the pickle.load call used to read a pre-computed graph.
with open(network_out_path, 'wb') as f:
    pickle.dump(met, f, protocol=pickle.HIGHEST_PROTOCOL)

print ('Reactions saved to {}'.format(reactions_out_path))
print ('Network saved to {}'.format(network_out_path))

In [None]:
# List the column labels of the merged reaction table (including the merge-suffixed ones).
reaction_df.columns

In [None]:
# Show the rows of the merged table for one specific reaction, hiding every column that
# contains a missing value in the selected rows.
query_smiles = r'C=CC1=C(C)C2=Cc3c(C=C)c(C)c4n3[Fe-2]35n6c(c(C)c(CCC(=O)O)c6=CC6=[N+]3C(=C4)C(C)=C6CCC(=O)O)=CC1=[N+]25>>C=CC1=C(C)C(/C=c2/[nH]/c(=C\c3[nH]c(/C=C4\NC(=O)C(C)=C4C=C)c(C)c3CCC(=O)O)c(CCC(=O)O)c2C)=NC1=O.[Co].[Fe+2]'
matching_rows = reaction_df[reaction_df['smiles'] == query_smiles]
matching_rows.dropna(axis=1)

In [None]:
# For every reaction row, print the first available sequence, checking the columns in the
# order sequence, sequence_rhea, sequence_bkms; rows with no sequence print nothing.
sequence_columns = ['sequence', 'sequence_rhea', 'sequence_bkms']
for candidates in zip(*(reaction_df[column] for column in sequence_columns)):
    available = [candidate for candidate in candidates if not pd.isna(candidate)]
    if available:
        print (available[0])
        

In [None]:
# Show what is stored on the graph node keyed by this reaction SMILES.
reaction_node_key = r'C=CC1=C(C)C2=Cc3c(C=C)c(C)c4n3[Fe-2]35n6c(c(C)c(CCC(=O)O)c6=CC6=[N+]3C(=C4)C(C)=C6CCC(=O)O)=CC1=[N+]25>>C=CC1=C(C)C(/C=c2/[nH]/c(=C\c3[nH]c(/C=C4\NC(=O)C(C)=C4C=C)c(C)c3CCC(=O)O)c(CCC(=O)O)c2C)=NC1=O.[Co].[Fe+2]'
met.nodes[reaction_node_key]

In [None]:
# Inspect the `pred` entry of one chemical node.
# Presumably these are the reaction nodes with an edge into this chemical (networkx
# predecessor view) - TODO confirm how construct_pathway_from_list orients edges.
met.pred['O=C1N=C(c2c[nH]c3ccccc23)C=C1c1c[nH]c2ccccc12']

In [None]:
# Inspect the node for this reaction written with its small co-reactants/co-products
# (O=O, [H+], O, O=C=O) included in the SMILES.
met.nodes['O=C(O)c1[nH]c(-c2c[nH]c3ccccc23)cc1-c1c[nH]c2ccccc12.O=O.[H+]>>O.O=C1N=C(c2c[nH]c3ccccc23)C=C1c1c[nH]c2ccccc12.O=C=O']

In [None]:
# Inspect the node for the same main substrate -> product pair written without the small
# co-reactants/co-products; raises KeyError if this form of the reaction is not in the graph.
met.nodes['O=C(O)c1[nH]c(-c2c[nH]c3ccccc23)cc1-c1c[nH]c2ccccc12>>O=C1N=C(c2c[nH]c3ccccc23)C=C1c1c[nH]c2ccccc12']

In [None]:
# Rows whose 'Reaction' text mentions 'deoxyvio'; keep only columns with at least 3 non-missing values.
mentions_deoxyvio = ['deoxyvio' in str(reaction_text) for reaction_text in reaction_df['Reaction']]
deoxyvio_rows = reaction_df[mentions_deoxyvio]
deoxyvio_rows.dropna(axis=1, thresh=3)

In [None]:
# Reactions whose recorded sequence length is 0, i.e. reactions flagged as spontaneous.
# `metadata` is used instead of `seq_lengths`: `all_smiles` and `metadata` are both built from
# the de-duplicated table and share its order, whereas `seq_lengths` has one entry per row of
# `reaction_df`, so zipping it with `all_smiles` paired lengths with the wrong reactions.
[smiles for smiles, info in zip(all_smiles, metadata) if info['aa_seq_len'] == 0]

In [None]:
# Show the reaction SMILES stored at row label 34232 of the merged table.
# NOTE(review): the hard-coded label depends on the input files and merge order - confirm it
# still points at the intended reaction after regenerating reaction_df.
reaction_df.loc[34232, 'smiles']