In [1]:
%%script false --no-raise-error
%%bash

# Download the two raw TCDB assets into the 'tcdb' folder.
# NOTE(review): the '2024_04_08' tag in the file names is presumably the download date — confirm.
cd tcdb
# protein sequences (parsed later as FASTA)
wget --quiet -O tcdb_2024_04_08.faa http://www.tcdb.org/public/tcdb
# substrate table (read later as a 2-column TSV: TC code, raw substrates)
wget --quiet -O tcdb_2024_04_08.tsv https://www.tcdb.org/cgi-bin/substrates/getSubstrates.py


# Libraries

In [2]:
import ast
import difflib
import glob
import json
import logging
import os
import pickle
import shutil
import statistics
import subprocess

import cobra
import gempipe
import libsbml
import matplotlib.pyplot as plt
import pandas as pnd
from Bio import SeqIO, SeqRecord, Seq
from matplotlib.patches import Patch

# Create new assets

## create reference faa

In [3]:
# Create 'tcdb_generated/tcdb_2024_04_08_formatted.faa': a copy of the raw TCDB
# FASTA in which every sequence gets an ID of the form
# 'TCDB.<TC code with underscores>_comp<N>', where N counts the sequences that
# share the same TC code, in order of appearance in the input file.


tc_to_comps = {}   # TC code (underscored) -> set of the new IDs carrying that code
tc_counter = {}    # TC code (underscored) -> number of sequences seen so far
sr_dict = {}       # new ID -> renamed SeqRecord
sr_list = []       # records in the order they are written to disk


# the 'with' block guarantees that the input handle is closed once parsing is done
with open('tcdb/tcdb_2024_04_08.faa', "r") as r_handler:
    for record in SeqIO.parse(r_handler, "fasta"):
        # the TC code is the last '|'-separated field of the record ID
        tc_code = record.id.rsplit('|', 1)[-1]

        # only complete, 5-level TC codes are retained
        n_digits = len(tc_code.split('.'))
        if n_digits != 5:
            print("WARNING: ignoring", record.id)
            continue

        tc_code = tc_code.replace('.', '_')
        if tc_code not in tc_to_comps:
            tc_to_comps[tc_code] = set()
            tc_counter[tc_code] = 0

        tc_counter[tc_code] += 1
        gempipe_id = f'TCDB.{tc_code}_comp{tc_counter[tc_code]}'
        tc_to_comps[tc_code].add(gempipe_id)

        # 'record.description' includes 'record.id', so the original ID is not lost
        sr = SeqRecord.SeqRecord(record.seq, id=gempipe_id, description=record.description)
        sr_dict[gempipe_id] = sr


# TC codes keep their order of first appearance; within each TC code the IDs are
# sorted as plain strings (so 'comp10' comes before 'comp2')
for tc_code in tc_to_comps:
    for comp in sorted(tc_to_comps[tc_code]):
        sr_list.append(sr_dict[comp])
with open('tcdb_generated/tcdb_2024_04_08_formatted.faa', 'w') as w_handler:
    count = SeqIO.write(sr_list, w_handler, "fasta")
    
    



## create database

In [4]:
%%time
%%bash


# Build the DIAMOND database (-d) from the formatted TCDB FASTA (--in).
diamond makedb --in tcdb_generated/tcdb_2024_04_08_formatted.faa -d tcdb_generated/tcdb_2024_04_08_formatted.dmnd


diamond v2.0.15.153 (C) Max Planck Society for the Advancement of Science
Documentation, support and updates available at http://www.diamondsearch.org
Please cite: http://dx.doi.org/10.1038/s41592-021-01101-x Nature Methods (2021)

#CPU threads: 72
Scoring parameters: (Matrix=BLOSUM62 Lambda=0.267 K=0.041 Penalties=11/1)
Database input file: tcdb_generated/tcdb_2024_04_08_formatted.faa
Opening the database file...  [0.001s]
Loading sequences...  [0.094s]
Masking sequences...  [0.047s]
Writing sequences...  [0.013s]
Hashing sequences...  [0.003s]
Loading sequences...  [0s]
Writing trailer...  [0s]
Closing the input file...  [0s]
Closing the database file...  [0.002s]

Database sequences  23592
  Database letters  10405765
     Database hash  417c088b128cf023c25d6da95ac22bae
        Total time  0.163000s


CPU times: user 38.3 ms, sys: 21.8 ms, total: 60.1 ms
Wall time: 219 ms


## create tcdb_gprs

In [5]:
# Build the TCDB gene/protein/reaction table, one row per component:

gpr_records = []
for tc_code, counter in tc_counter.items():
    comp_ids = [f'{tc_code}_comp{n}' for n in range(1, counter + 1)]
    # every component of a TC code shares one protein ID: the '+'-joined component list
    abs_gpr = 'P_' + '+'.join(comp_ids)
    for comp_id in comp_ids:
        gpr_records.append({
            'gene': 'G_' + comp_id,
            'protein': abs_gpr,
            'reaction': 'R_' + tc_code,
            'model': 'TCDB',
        })
tcdb_gprs = pnd.DataFrame.from_records(gpr_records)


# same formatting as carveme/gempipe: '<model>.<gene ID without its "G_" prefix>'
tcdb_gprs['BiGG_gene'] = tcdb_gprs.apply(
    lambda gpr: f"{gpr['model']}.{gpr['gene'][2:]}",
    axis=1,
)

tcdb_gprs.to_csv('tcdb_generated/tcdb_gprs.csv', index=False)

print(tcdb_gprs.shape)
tcdb_gprs.head()

(23592, 5)


Unnamed: 0,gene,protein,reaction,model,BiGG_gene
0,G_1_A_83_1_5_comp1,P_1_A_83_1_5_comp1+1_A_83_1_5_comp2+1_A_83_1_5...,R_1_A_83_1_5,TCDB,TCDB.1_A_83_1_5_comp1
1,G_1_A_83_1_5_comp2,P_1_A_83_1_5_comp1+1_A_83_1_5_comp2+1_A_83_1_5...,R_1_A_83_1_5,TCDB,TCDB.1_A_83_1_5_comp2
2,G_1_A_83_1_5_comp3,P_1_A_83_1_5_comp1+1_A_83_1_5_comp2+1_A_83_1_5...,R_1_A_83_1_5,TCDB,TCDB.1_A_83_1_5_comp3
3,G_1_A_59_1_2_comp1,P_1_A_59_1_2_comp1,R_1_A_59_1_2,TCDB,TCDB.1_A_59_1_2_comp1
4,G_8_B_26_1_3_comp1,P_8_B_26_1_3_comp1,R_8_B_26_1_3,TCDB,TCDB.8_B_26_1_3_comp1


## classify BiGG transporters

In [6]:
# BiGG/CarveMe universal models ('neg' and 'pos' universes) whose transporters are classified:
uni_models = tuple(gempipe.get_universe(gram) for gram in ('neg', 'pos'))

In [7]:

# Reaction string of the ATP synthase: it spans two compartments but is skipped.
ATPS_RSTRING = 'adp_c + 4.0 h_p + pi_c <=> atp_c + h2o_c + 3.0 h_c'


def _pure_ids(metabolites):
    """Return the set of metabolite IDs without their trailing '_<compartment>' suffix."""
    return set([m.id.rsplit('_', 1)[0] for m in metabolites])


def _compartments(metabolites):
    """Return the set of compartment suffixes (text after the last '_') of the metabolites."""
    return set([m.id.rsplit('_', 1)[-1] for m in metabolites])


def _classify_transporter(r):
    """Classify one reaction as a transporter.

    Parameters
    ----------
    r : reaction object exposing 'id', 'reaction', 'metabolites', 'reactants'
        and 'products' (as the reactions of the universal models do).

    Returns
    -------
    dict or None
        None when the reaction is 'Growth', the ATP synthase, or involves a
        single compartment. Otherwise a record with keys 'rid', 'sub', 'type',
        'rstring' (plus 'prod' for the 'COA' and 'PTS' types). Reactions whose
        mechanism is not recognized get sub='?' and type='?'.
    """
    if r.id == 'Growth':
        return None
    if r.reaction == ATPS_RSTRING:
        return None   # ATPs

    # a transporter involves more than one compartment
    if len(_compartments(r.metabolites)) <= 1:
        return None

    puremids_involved = _pure_ids(r.metabolites)
    puremids_involved_reacs = _pure_ids(r.reactants)
    puremids_involved_prods = _pure_ids(r.products)

    def record(sub, rtype, prod=None):
        rec = {'rid': r.id, 'sub': sub}
        if prod is not None:
            rec['prod'] = prod
        rec['type'] = rtype
        rec['rstring'] = r.reaction
        return rec

    # diffusion / facilitated diffusion / uniports (?) / unknown mechanisms (Biolog evidence):
    if len(r.metabolites) == 2:
        sub = list(r.metabolites)[0].id.rsplit('_', 1)[0]
        return record(sub, 'DIF')

    if len(puremids_involved) == 6:
        # ABC transporters: the substrate is the only species besides the 5 cofactors
        abc_cofactors = ['atp', 'h2o', 'adp', 'h', 'pi']
        if all([i in puremids_involved for i in abc_cofactors]):
            sub = list(puremids_involved - set(abc_cofactors))[0]
            return record(sub, 'ABC')
        # ABC transporters with GTP instead of ATP
        # redox transports (q8, mql7, mql8)
        # non-transforming pts
        return record('?', '?')

    if len(puremids_involved) == 2:
        # symport when reactants and products each sit in a single compartment,
        # antiport otherwise; the first coupling species found decides the type
        if len(_compartments(r.reactants)) == len(_compartments(r.products)) == 1:
            couplings = [('h', 'SYM-h'), ('na1', 'SYM-na')]
            # unmatched example: XANt3 ura_e + xan_e --> ura_c + xan_c
        else:
            couplings = [('h', 'ANT-h'), ('na1', 'ANT-na'), ('k', 'ANT-k'), ('pi', 'ANT-pi')]
            # unmatched examples: aminoacids (arg__L, ...), other organic acids (succ, ...), no3
        for coupled, rtype in couplings:
            if coupled in puremids_involved:
                sub = list(puremids_involved - set([coupled]))[0]
                return record(sub, rtype)
        return record('?', '?')

    if len(puremids_involved) == 3:
        # 3-comps symporters/antiporters (h, mg2, ca2, ni2, ...) and special cases such as:
        # CADVtpp 15dap_c + h_p + lys__L_p --> 15dap_p + h_c + lys__L_c Lysine/Cadaverine antiporter (periplasm)
        # CMCBTFL cmcbtt_c + fe3_e --> fcmcbtt_c CMCBTFL
        # DH23t 23dhb_c + h_c --> d23hb_e
        # ENTERHpp enter_p + h2o_c --> 23dhbzs3_p Enterobactin hydrolase
        # None of them is classified yet.
        return record('?', '?')

    if len(puremids_involved) == 7:
        if all([i in puremids_involved for i in ['atp', 'coa', 'h', 'amp', 'ppi']]):
            # CoA-coupling
            if 'atp' in puremids_involved_reacs:
                sub = list(puremids_involved_reacs - set(['atp', 'coa', 'h']))[0]
                prod = list(puremids_involved_prods - set(['amp', 'ppi', 'h']))[0]
            else:   # if written right-to-left
                sub = list(puremids_involved_prods - set(['atp', 'coa', 'h']))[0]
                prod = list(puremids_involved_reacs - set(['amp', 'ppi', 'h']))[0]
            return record(sub, 'COA', prod)
        # FEDCabc atp_c + 2.0 cit_e + fe3_e + h2o_c --> adp_c + 2.0 cit_c + fe3_c + h_c + pi_c
        # GLN_NAct atp_c + gln__L_e + h2o_c + na1_e --> adp_c + gln__L_c + h_c + na1_c + pi_c
        return record('?', '?')

    if 'pep' in puremids_involved and 'pyr' in puremids_involved:
        # PEP:Pyr PTS
        if 'pep' in puremids_involved_reacs:
            sub = list(puremids_involved_reacs - set(['pep', 'h']))[0]
            prod = list(puremids_involved_prods - set(['pyr', 'h']))[0]
        else:   # if written right-to-left
            sub = list(puremids_involved_prods - set(['pep', 'h']))[0]
            prod = list(puremids_involved_reacs - set(['pyr', 'h']))[0]
        return record(sub, 'PTS', prod)

    # everything else (including the model-specific accoa/coa reactions) is unclassified
    return record('?', '?')


def _lookup_metabolite_name(puremid, models):
    """Return the name of 'puremid', trying compartments 'e', 'p', 'c' in each model.

    When several models know the metabolite, the name from the last one wins;
    None is returned if no model has it in any of the three compartments.
    """
    name = None
    for model in models:
        for compartment in ('e', 'p', 'c'):
            try:
                name = model.metabolites.get_by_id(f'{puremid}_{compartment}').name
            except KeyError:   # metabolite absent from this compartment: try the next one
                continue
            break
    return name


processed_rids = set()
bigg_tr_records = []
for uni in uni_models:
    for r in uni.reactions:
        # a reaction shared by both universes is recorded only once
        if r.id in processed_rids:
            continue
        tr_record = _classify_transporter(r)
        if tr_record is None:
            continue
        bigg_tr_records.append(tr_record)
        processed_rids.add(r.id)


# convert records to dataframe
bigg_tr_df = pnd.DataFrame.from_records(bigg_tr_records)


# insert the 'name' column, to be later matched:
bigg_tr_df['name'] = None
for index, row in bigg_tr_df.iterrows():
    if row["sub"] == '?':
        continue
    bigg_tr_df.loc[index, 'name'] = _lookup_metabolite_name(row["sub"], uni_models)


# reorder columns ('reindex' also creates 'prod' when no COA/PTS reaction was found)
bigg_tr_df = bigg_tr_df.reindex(columns=['rid', 'sub', 'name', 'prod', 'type', 'rstring'])


print('type == ?:', len(bigg_tr_df[bigg_tr_df['type']=='?']))


# save to disk (before sorting, so the file keeps the discovery order):
bigg_tr_df.to_csv('tcdb_generated/bigg_tr_df.csv')
bigg_tr_df = bigg_tr_df.sort_values(by='type', ascending=False)
bigg_tr_df

type == ?: 137


Unnamed: 0,rid,sub,name,prod,type,rstring
433,CITt4_1,cit,Citrate,,SYM-na,cit_e + na1_e --> cit_c + na1_c
113,4OXPTNtpp,4oxptn,4 Oxopentanoate,,SYM-na,4oxptn_p + na1_p --> 4oxptn_c + na1_c
540,CYTDt4,cytd,Cytidine,,SYM-na,cytd_e + na1_e --> cytd_c + na1_c
164,ACONCt4pp,acon_C,Cis-Aconitate,,SYM-na,acon_C_p + na1_p --> acon_C_c + na1_c
1237,MFUMt8,fum,Fumarate,,SYM-na,fum_e + na1_e --> fum_c + na1_c
...,...,...,...,...,...,...
487,CRNt8pp,?,,,?,crn__D_c + crn_p --> crn__D_p + crn_c
1715,TETDHpp1,?,,,?,mql8_c + tet_p --> 2.0 h_p + mqn8_c + 2.0 tsul_p
1716,TETDHpp2,?,,,?,2dmmql8_c + tet_p --> 2dmmq8_c + 2.0 h_p + 2.0...
1717,TETDHpp3,?,,,?,q8h2_c + tet_p --> 2.0 h_p + q8_c + 2.0 tsul_p


In [8]:
# PTS (pep/pyr) dict: transported substrate -> product released by the reaction

pts_rows = bigg_tr_df.loc[bigg_tr_df['type'] == 'PTS']
pts_assoc = dict(zip(pts_rows['sub'], pts_rows['prod']))
pts_assoc

{'sucr': 'suc6p',
 'ascb__L': 'ascb6p',
 'galt': 'galt1p',
 'man': 'man6p',
 'acmana': 'acmanap',
 'tag__D': 'tag1p__D',
 'manglyc': 'man6pglyc',
 'glc__D': 'g6p',
 'malt': 'malt6p',
 'galam': 'galam6p',
 'lcts': 'lac6p',
 'arbt': 'arbt6p',
 'tre': 'tre6p',
 'acmum': 'acmum6p',
 'gam': 'gam6p',
 'dachi': 'chtbs6p',
 'gal': 'dgal6p',
 'mnl': 'mnl1p',
 'acgam': 'acgam6p',
 'tgt': 'tag6p__D',
 'glc__aD': 'g6p_A',
 'xyl__D': 'xu5p__D',
 'sbt__D': 'sbt6p',
 'cellb': 'cellb6p',
 'fru': 'f1p',
 'chtbs': 'chtbs6p',
 'fuc': 'fc1p',
 'salcn': 'salcn6p'}

In [9]:
# COA dict: transported substrate -> product released by the CoA-coupled reaction

coa_rows = bigg_tr_df.loc[bigg_tr_df['type'] == 'COA']
coa_assoc = dict(zip(coa_rows['sub'], coa_rows['prod']))
coa_assoc

{'phenona': 'pnonacoa',
 'dca': 'dcacoa',
 'ttdca': 'tdcoa',
 'ddca': 'ddcacoa',
 'ttdcea': 'tdecoa',
 'pheocta': 'poctacoa',
 'phehxa': 'phxacoa',
 '6atha': '6athcoa',
 'phedca': 'pdcacoa',
 'phept': 'pptcoa',
 'hdca': 'pmtcoa',
 'octa': 'occoa',
 'hpta': 'hptcoa',
 'hxa': 'hxcoa',
 'lnlc': 'lnlccoa',
 'ibt': 'ibcoa',
 'ocdcea': 'odecoa',
 'ptsla': 'ptslacoa',
 'vacc': 'vacccoa',
 'ocdca': 'stcoa',
 'but': 'btcoa',
 'nona': 'nonacoa',
 'pta': 'ptcoa',
 '3mb': 'ivcoa',
 'hdcea': 'hdcoa',
 'phehpa': 'phpcoa'}

## parse chebi annotation

In [10]:
subs_df = pnd.read_csv('tcdb/tcdb_2024_04_08.tsv', sep='\t', header=None, names=['tc_code', 'raw'])


def _parse_raw_substrates(raw):
    """Convert 'CHEBI:<id>;<name>|CHEBI:<id>;<name>|...' into a {'<id>': '<name>'} dict."""
    chebi_to_name = {}
    for raw_substrate in raw.split('|'):
        # split on the first ';' only, so that a name containing ';' is kept whole
        chebi, substrate = raw_substrate.split(';', 1)
        chebi_to_name[chebi.replace('CHEBI:', '')] = substrate
    return chebi_to_name


# the 'chebi' column stores the dict as a string; it is written once per row
subs_df['chebi'] = [str(_parse_raw_substrates(raw)) for raw in subs_df['raw']]
# drop redundant columns
subs_df = subs_df.drop(columns='raw')


print(subs_df.shape)
subs_df.head()

(8724, 2)


Unnamed: 0,tc_code,chebi
0,2.A.1.15.8,{'15992': '1-hydroxy-2-naphthoate'}
1,2.A.50.1.1,"{'5584': 'hydron', '5448': 'glycerol'}"
2,2.A.29.8.3,"{'3424': 'carnitinium', '8946': 'S-adenosyl-L-..."
3,1.B.48.1.11,{'25367': 'molecule'}
4,1.C.118.1.1,{'25367': 'molecule'}


## create chebi-to-bigg dictionary

In [11]:
# Create a chebi-to-bigg dictionary.
# Children are included!

add_parents = False


def _load_pickle(path):
    """Load one pickled object from 'path', closing the file afterwards."""
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


mx2ch = _load_pickle('mnx_dicts_M/chebi.pickle')                  # MNX ID -> ChEBI IDs
mx2mxc = _load_pickle('mnx_dicts_M/mnx_to_mnxchilds.pickle')      # MNX ID -> child MNX IDs
mx2bg = _load_pickle('mnx_dicts_M/bigg.metabolite.pickle')        # MNX ID -> BiGG IDs
b2o = _load_pickle('pubchem_dicts/bigg_to_others_extended.pickle')  # BiGG ID -> {'chebi': [...], ...}


# invert 'chebi.pickle' to create a ch2mx dict:
ch2mx = {}
for mnx_id, chebi_ids in mx2ch.items():
    for chebi_id in chebi_ids:
        ch2mx.setdefault(chebi_id, set()).add(mnx_id)
# unwrap the single-element sets; ambiguous ChEBI IDs are reported and left as sets
for key, value in ch2mx.items():
    if len(value) > 1:
        print(key, value)
    else: ch2mx[key] = list(value)[0]
# Note: 'ch2mx' is a 1-to-1 dictionary as long as nothing was printed above


def mnx_to_bigg(mnx_id, mnx_to_bigg_ids=None, mnx_to_children=None, _visited=None):
    """Collect the BiGG IDs associated with a MetaNetX ID.

    If 'mnx_id' has its own BiGG translation, only that translation is
    returned. Otherwise its children are explored recursively and the union
    of their BiGG IDs is returned.

    Parameters
    ----------
    mnx_id : hashable
        MetaNetX ID to translate.
    mnx_to_bigg_ids : mapping, optional
        MNX ID -> iterable of BiGG IDs. Defaults to the module-level 'mx2bg'.
    mnx_to_children : mapping, optional
        MNX ID -> iterable of child MNX IDs. Defaults to the module-level 'mx2mxc'.
    _visited : set, optional
        Internal: IDs already explored in the current traversal.

    Returns
    -------
    set
        BiGG IDs found (empty if none).
    """
    if mnx_to_bigg_ids is None:
        mnx_to_bigg_ids = mx2bg
    if mnx_to_children is None:
        mnx_to_children = mx2mxc
    if _visited is None:
        _visited = set()

    # guard against cyclic parent/child relations, which would recurse forever
    if mnx_id in _visited:
        return set()
    _visited.add(mnx_id)

    if mnx_id in mnx_to_bigg_ids:
        return set(mnx_to_bigg_ids[mnx_id])

    # children are explored only when the ID itself has no BiGG translation;
    # exploring them unconditionally would add all the children as well
    bigg_ids = set()
    if mnx_id in mnx_to_children:
        for mxc_id in mnx_to_children[mnx_id]:
            bigg_ids |= mnx_to_bigg(mxc_id, mnx_to_bigg_ids, mnx_to_children, _visited)
    return bigg_ids


ch2bi = {}
for chebi_id, mnx_ids in ch2mx.items():
    # A ChEBI ID linked to several MNX IDs is left as a set by the inversion step:
    # take the union over all of them instead of failing on an unhashable key.
    if not isinstance(mnx_ids, (set, frozenset, list, tuple)):
        mnx_ids = [mnx_ids]
    bigg_ids = set()
    for mnx_id in mnx_ids:
        bigg_ids |= mnx_to_bigg(mnx_id)
    ch2bi[chebi_id] = bigg_ids
# Note: 'ch2bi' is a 1-to-many dict


if add_parents:
    # Supplement the dict with some parents:
    # for example: '17925' goes from "{'Glc_aD', 'glc__aD'}" to "{'Glc_aD', 'glc__D', 'glc__aD'}"
    for puremid in b2o.keys():
        for chebi_id in b2o[puremid]['chebi']:
            ch2bi.setdefault(chebi_id, set()).add(puremid)



In [12]:
# sanity check: list every ChEBI ID whose BiGG translation contains 'glc__D'
for chebi_id, bigg_ids in ch2bi.items():
    if 'glc__D' not in bigg_ids:
        continue
    print(chebi_id, bigg_ids)

4167 {'glc__D'}
20898 {'man', 'gal', 'gal_bD', 'glc__D'}
16362 {'man', 'gal', 'gal_bD', 'glc__D'}
12904 {'man', 'gal', 'gal_bD', 'glc__D'}
4092 {'man', 'gal', 'gal_bD', 'glc__D'}
12905 {'man', 'gal', 'gal_bD', 'glc__D'}
12903 {'man', 'gal', 'gal_bD', 'glc__D'}
167473 {'gal', 'man', 'glc__D', 'gal_bD', 'gal__L'}
5418 {'glc__D'}
14313 {'glc__D'}
17234 {'glc__D'}
33929 {'glc__D'}
24277 {'glc__D'}


In [13]:


        
# Mappings provided by MetaNetX are not complete.
# For example, 17234 and 5418 (both 'glucose') are not considered.
# Moreover, using the official ChEBI mapping, there is no means to obtain a common unique ID.
# Therefore, relevant substrates whose ChEBI ID is not picked up by MetaNetX are hardcoded here
# (ChEBI ID -> BiGG metabolite ID without compartment suffix).
# The full table of 'missing' substrates is saved.
# Each time a relevant substrate is needed, it can be extracted from the table and inserted in the hardcoded list.
hardcoded = {
    '39150': '4oxptn',
    '15824': 'fru',
    '12936': 'gal',
    '17315': 'gam',
    '15748': 'glcur',
    '16024': 'man',
    '30624': 'manur',
    '16988': 'rib__D',
    '16443': 'tag__D',
    '4261': 'val__D',
    '6260': 'leu__L',
    '60903': '4abzglu',
    '28009': 'acgam',
    '7201': 'acgal',
    '59640': 'acgam',
    '17728': 'mmet',
    '33935': 'all__D',
    '35274': 'nh4',
    '22599': 'arab__L',
    '36084': '34dhbz',
    '23981': 'etha',
    '28757': 'fru',
    '5172': 'fru',
    '5182': 'fuc__L',
    '5201': 'fusa',
    '24175': 'galur',
    '5417': 'gam',
    '36315': 'g3pi',
    '24898': 'ile__L',
    '24927': 'tcynt',
    '25017': 'leu__L',
    '6671': 'maltttr',
    '14575': 'man',
    '26546': 'rmn',
    '35396': 'tartr__D',
    '30661': 'thr__L',
    '10080': 'xylb',
    '10085': 'xyl__D',
    '15332': 'xyl__D',
    '5256': 'gal',
}
      

# create 'tcdb_unmapped.tsv': ChEBI IDs with neither a BiGG translation nor a hardcoded one
unmapped = []
for index, row in subs_df.iterrows():
    # the 'chebi' column stores the repr of a dict: parse it as a literal instead of eval()
    chebi_dict = ast.literal_eval(row['chebi'])
    for chebi_id, chebi_name in chebi_dict.items():
        # '.get': an ID absent from 'ch2bi' counts as unmapped instead of raising KeyError
        if not ch2bi.get(chebi_id) and chebi_id not in hardcoded:
            unmapped.append({'ID': chebi_id, 'name': chebi_name})
# explicit columns keep the table well-formed even when nothing is unmapped
unmapped = pnd.DataFrame(unmapped, columns=['ID', 'name'])
unmapped = unmapped.drop_duplicates()
unmapped = unmapped.sort_values('name')
unmapped.to_csv('tcdb_generated/tcdb_unmapped.tsv', sep='\t', index=False)
print('unmapped', len(unmapped))


# create the 'bigg' column: repr of a {ChEBI ID: set of BiGG IDs} dict for each row
bigg_column = []
for index, row in subs_df.iterrows():
    row_dict_bigg = {}
    for chebi_id in ast.literal_eval(row['chebi']):

        # translation from the conversion dictionary (empty set if there is none)
        bigg_ids = ch2bi.get(chebi_id, set())

        # without a translation, fall back on the 'hardcoded' dictionary
        if bigg_ids == set() and chebi_id in hardcoded:
            bigg_ids = set([hardcoded[chebi_id]])

        row_dict_bigg[chebi_id] = bigg_ids

    bigg_column.append(str(row_dict_bigg))
subs_df['bigg'] = bigg_column


#1066 > 1038 > 1030 (with 'add_parents')
print(subs_df.shape)
subs_df.head()

unmapped 998
(8724, 3)


Unnamed: 0,tc_code,chebi,bigg
0,2.A.1.15.8,{'15992': '1-hydroxy-2-naphthoate'},{'15992': {'1h2nap'}}
1,2.A.50.1.1,"{'5584': 'hydron', '5448': 'glycerol'}","{'5584': {'h'}, '5448': {'glyc'}}"
2,2.A.29.8.3,"{'3424': 'carnitinium', '8946': 'S-adenosyl-L-...","{'3424': {'crn'}, '8946': {'amet'}}"
3,1.B.48.1.11,{'25367': 'molecule'},{'25367': set()}
4,1.C.118.1.1,{'25367': 'molecule'},{'25367': set()}


In [14]:
%%script false --no-raise-error

# For every 'unmapped' substrate, look for the BiGG transporter substrate with the
# most similar name. Good matches are meant to be copied by hand into 'hardcoded'.


n_unmapped = len(unmapped)
for position, (_, query) in enumerate(unmapped.iterrows(), start=1):
    query_name = query['name']
    if query_name is None:
        continue

    best_ratio, best_name, best_bigg = 0, None, None
    for _, candidate in bigg_tr_df.iterrows():
        candidate_name = candidate['name']
        if candidate_name is None:
            continue
        ratio = difflib.SequenceMatcher(None, query_name.lower(), candidate_name.lower()).ratio()
        # keep only matches scoring at least 0.8 that strictly beat the best so far
        if ratio >= 0.8 and ratio > best_ratio:
            best_ratio, best_name, best_bigg = ratio, candidate_name, candidate['sub']

    if best_name is not None:
        print("'"+ query['ID'] +"': '"+ best_bigg +"'", '---', query_name, '>>>', best_name, '---', f'{position}/{n_unmapped}')



## get tcdb annotations

In [15]:
# mamba install natsort
from natsort import natsorted


# TC codes back in dotted notation; a family is made of the first 4 levels of a code:
dotted_tc_codes = [code.replace('_', '.') for code in tc_to_comps]
print('tcdb IDs:', len(dotted_tc_codes))
good_families = {'.'.join(code.split('.')[:4]) for code in dotted_tc_codes}
print('tcdb families:', len(good_families))


# natural order, so that e.g. '1.A.2' precedes '1.A.10':
good_families = natsorted(good_families)


# "super"families (first 3 levels), each represented by its first family in natural order:
superfamilies = {}
for good_family in good_families:
    superfamilies.setdefault('.'.join(good_family.split('.')[:3]), good_family)
print('tcdb superfamilies:', len(superfamilies))
                        

tcdb IDs: 18653
tcdb families: 3353
tcdb superfamilies: 1498


In [16]:
%%script false --no-raise-error


import shlex
import time


n_superfamilies = len(superfamilies)

# download descriptions using lynx terminal browser (one page per superfamily,
# fetched through its representative family):
for i, (superfamily, family) in enumerate(superfamilies.items()):

    with open(f'tcdb_generated/lynx_stdout/{superfamily}.txt', 'w') as stdout, open(f'tcdb_generated/lynx_stderr/{superfamily}.txt', 'w') as stderr:

        # mamba install lynx
        # Both the URL (it contains '?', a shell glob character) and the output path
        # are built from TC codes read from a downloaded file: quote them for the shell.
        url = shlex.quote(f'http://www.tcdb.org/search/result.php?tc={family}')
        html_path = shlex.quote(f'tcdb_generated/lynx_dumped/{superfamily}.html')
        command = f"lynx -source {url} > {html_path}"
        process = subprocess.run(command, shell=True, stdout=stdout, stderr=stderr)

    # a failed download would otherwise go unnoticed until the parsing step
    if process.returncode != 0:
        print(f"WARNING: lynx exited with code {process.returncode} for {family}")

    time.sleep(2)  # avoid ban

    print(f"Done {i+1}/{n_superfamilies} {round((i+1)/n_superfamilies*100,1)}% ({family})", end='\r')



In [17]:
#%%script false --no-raise-error


# convert the html table into csv file.
from bs4 import BeautifulSoup


def _is_full_tc_code(tcdb_id):
    """Return True if 'tcdb_id' has the 5-level form digit.letters.digit.digit.digit."""
    levels = tcdb_id.split('.')
    if len(levels) != 5:
        return False
    if not levels[1].isalpha():   # second level is a char
        return False
    return all(levels[position].isdigit() for position in (0, 2, 3, 4))


def html_table_to_csv(html_file, csv_file):
    """Convert the first HTML table of 'html_file' into a CSV file and a DataFrame.

    Only the rows whose first cell is a complete 5-level TC code are kept.
    Each kept row is padded or truncated to exactly 5 cells; the column names
    are the texts of the table's 'th' cells.

    Parameters
    ----------
    html_file : str
        Path of the HTML page to parse.
    csv_file : str
        Path of the CSV file to write (without index).

    Returns
    -------
    pandas.DataFrame
        The same table that was written to 'csv_file'.

    Raises
    ------
    ValueError
        If the page contains no table at all.
    """
    # get the table using beautifulsoup (the 'with' block closes the file right away):
    with open(html_file, 'r') as handle:
        soup = BeautifulSoup(handle.read(), 'html.parser')
    table = soup.find('table')
    if table is None:
        raise ValueError(f"no <table> found in '{html_file}'")

    # get the headers:
    headers = [header.text.strip() for header in table.find_all('th')]

    # get the rows:
    rows = []
    for tr in table.find_all('tr'):
        cells = [cell.text.strip() for cell in tr.find_all('td')]
        if cells == []:
            continue

        # check if it is a valid TC code:
        if not _is_full_tc_code(cells[0]):
            continue

        # assure exactly 5 elements in the list.
        # NOTE(review): the padding values are one-element lists, not plain strings;
        # this looks unintended but is kept as is, in case later steps rely on it.
        cells = cells + [['EMPTYCELL'], ['EMPTYCELL'], ['EMPTYCELL'], ['EMPTYCELL'], ['EMPTYCELL']]
        rows.append(cells[0:5])

    # convert to dataframe:
    df = pnd.DataFrame(rows, columns=headers)
    df.to_csv(csv_file, index=False)

    return df

    

# parse every downloaded page (one per superfamily) and stack the resulting tables
n_superfamilies = len(superfamilies)
family_tables = []
for done, (superfamily, family) in enumerate(superfamilies.items(), start=1):
    percent = round(done / n_superfamilies * 100, 1)
    print(f"Parsing {done}/{n_superfamilies} {percent}% ({family})", end='\r')
    family_tables.append(
        html_table_to_csv(
            f'tcdb_generated/lynx_dumped/{superfamily}.html',
            f'tcdb_generated/raw_csv/{superfamily}.csv',
        )
    )
des_df = pnd.concat(family_tables, axis=0)
print()
print(des_df.shape)


# rows repeating an already seen 'TCID' are dropped (first occurrence wins)
duplicates = des_df.duplicated('TCID')
des_df = des_df.loc[~duplicates]
print(des_df.shape, '(after dups removal)')


# keep only the TC codes for which at least one amino acid sequence is available
tcids_with_comps = [code.replace('_', '.') for code in tc_to_comps]
has_sequence = des_df['TCID'].isin(tcids_with_comps)
des_df = des_df.loc[has_sequence]
print(des_df.shape, '(after no_aaseq removal)')


des_df.to_csv('tcdb_generated/des_df.csv', index=False)
des_df.to_excel('tcdb_generated/des_df.xlsx', index=False)

des_df.head()

Parsing 1498/1498 100.0% (9.B.459.1)
(18702, 5)
(18702, 5) (after dups removal)
(18653, 5) (after no_aaseq removal)


Unnamed: 0,TCID,Name,Domain,Kingdom/Phylum,Protein(s)
0,1.A.1.1.1,Two TMS K+ and water channel (conducts K+ (KD ...,Bacteria,Actinomycetota,Skc1 (KcsA) of Streptomyces lividans
1,1.A.1.2.1,Voltage-sensitive K+ channel (PNa+/PK+ ≈ 0.1) ...,Eukaryota,"Metazoa, Arthropoda",Shab11 of Drosophila melanogaster
2,1.A.1.2.2,Voltage-sensitive K+ channel of 498 aas and 6 ...,Eukaryota,"Metazoa, Arthropoda",Shaw2 of Drosophila melanogaster
3,1.A.1.2.3,Voltage-sensitive fast transient outward curre...,Eukaryota,"Metazoa, Arthropoda",Shal2 of Drosophila melanogaster
4,1.A.1.2.4,"Margatoxin-sensitive voltage-gated K+ channel,...",Eukaryota,"Metazoa, Chordata",Kv1.3 homomers and Kv1.3/Kv1.5 heteromers of H...


In [18]:
# number of distinct TC codes having at least one sequence
len(tc_to_comps)

18653

## parse TCDB annotation

In [19]:

# Keep only the entries having both a name and a domain.
# errors='ignore' keeps the cell re-runnable: on a second execution the
# 'Protein(s)' column is already gone and a plain drop would raise KeyError.
des_df = (
    des_df
    .drop(columns=['Protein(s)'], errors='ignore')
    .dropna(subset=['Name', 'Domain'])
    .reset_index(drop=True)
)


In [20]:

# normalize the names: '\xa0' (non-breaking space) becomes a regular space
des_df['Name'] = des_df['Name'].map(lambda name: name.replace('\xa0', ' '))
# the short description is whatever precedes the first '. ' of the name
des_df['short'] = des_df['Name'].map(lambda name: name.split('. ', 1)[0])


# create new columns:
des_df['tcat'] = None
des_df['cot'] = None
des_df['dir'] = None
des_df['rev'] = None
for index, row in des_df.iterrows():
    
    
 
    # try to determine direction
    dirs = set()
    if   'efflux' in row['short'].lower(): dirs.add('secretion')
    elif 'secret' in row['short'].lower(): dirs.add('secretion')
    elif 'export' in row['short'].lower(): dirs.add('secretion')
    elif 'resistance' in row['short'].lower(): dirs.add('secretion')
    if   'uptake' in row['short'].lower(): dirs.add('uptake')
    elif 'import' in row['short'].lower(): dirs.add('uptake')
    if dirs == set(): dirs.add('uptake')
    des_df.loc[index, 'dir'] = str(dirs)
    
    
      
    if row['TCID'].startswith('2.A.1.'):
        
        # determine if symport / antiport / uniport
        if   'symport' in row['short'].lower(): des_df.loc[index, 'tcat'] = 'sym'
        elif 'antiport' in row['short'].lower(): des_df.loc[index, 'tcat'] = 'anti'
        elif 'anitport' in row['short'].lower(): des_df.loc[index, 'tcat'] = 'anti'
        else: des_df.loc[index, 'tcat'] = 'uni'
        
        # determine the co-transported species
        cot = []
        if 'Na+' in row['short']: cot.append('na1')   # {'9175': 'sodium(1+)'}
        if 'K+' in row['short']: cot.append('k')     # {'8345': 'potassium(1+)'}
        if 'Li+' in row['short']: pass   # lithium not present in BiGG database ?!
        if 'Ca2+' in row['short']: cot.append('ca2')   # {'3308': 'calcium(2+)'}
        if 'Mg2+' in row['short']: cot.append('mg2')  # {'6635': 'magnesium(2+)'}
        if 'H+' in row['short']: cot.append('h')   # {'5584': 'hydron'}
        if cot==[]: cot.append('h')
        if des_df.loc[index, 'tcat'] == 'uni': cot = []  # empty if uniport
        des_df.loc[index, 'cot'] = str(cot)
                
        # reversibility
        des_df.loc[index, 'rev'] = '<=>'
        
        
    elif row['TCID'].startswith('2.A.2.'):  
        
        # always a symport
        des_df.loc[index, 'tcat'] = 'sym'
        
        cot = []
        if 'Na+' in row['short']: cot.append('na1')   # {'9175': 'sodium(1+)'}
        if 'H+' in row['short']: cot.append('h')   # {'5584': 'hydron'}
        if cot==[]: cot.append('h')
        des_df.loc[index, 'cot'] = str(cot)
        
        # reversibility
        des_df.loc[index, 'rev'] = '-->'
        
        
    elif row['TCID'].startswith('2.A.3.'):
        # this category contains transporters that perform both symport and antiport, 
        # for example https://tcdb.org/search/result.php?tc=2.A.3.2.1
        
        if not 'antiport' in row['short'].lower(): 
            
            des_df.loc[index, 'tcat'] = 'sym'
            
            cot = ['h']   # always
            des_df.loc[index, 'cot'] = str(cot)
            
            # reversibility
            des_df.loc[index, 'rev'] = '-->'
            
        else: continue  ### antiport function to be handled
            
    
    elif row['TCID'].startswith('2.A.6.'):
        
        des_df.loc[index, 'tcat'] = 'sym'
        
        cot = ['h']  
        des_df.loc[index, 'cot'] = str(cot)
        
        # reversibility
        des_df.loc[index, 'rev'] = '-->'
    
    
    elif row['TCID'].startswith('2.A.8.'):
        
        des_df.loc[index, 'tcat'] = 'sym'
        
        cot = ['h']  
        des_df.loc[index, 'cot'] = str(cot)
        
        # reversibility
        des_df.loc[index, 'rev'] = '-->'
        
        
    elif row['TCID'].startswith('2.A.10.'):
        
        des_df.loc[index, 'tcat'] = 'sym'
        
        cot = ['h']  
        des_df.loc[index, 'cot'] = str(cot)
        
        # reversibility
        des_df.loc[index, 'rev'] = '-->'
        
        
    elif row['TCID'].startswith('2.A.14.'):
        
        des_df.loc[index, 'tcat'] = 'sym'
        
        cot = ['h'] 
        des_df.loc[index, 'cot'] = str(cot)
        
        # reversibility
        des_df.loc[index, 'rev'] = '-->'
        
        
    elif row['TCID'].startswith('2.A.18.'):
        
        des_df.loc[index, 'tcat'] = 'sym'
        
        cot = ['h']  
        des_df.loc[index, 'cot'] = str(cot)
        
        # reversibility
        des_df.loc[index, 'rev'] = '-->'
        
        
    elif row['TCID'].startswith('2.A.21.'):
        
        des_df.loc[index, 'tcat'] = 'sym'
        
        cot = ['na1']  
        des_df.loc[index, 'cot'] = str(cot)
        
        # reversibility
        des_df.loc[index, 'rev'] = '-->'
        
        
    elif row['TCID'].startswith('2.A.25.'):
        
        des_df.loc[index, 'tcat'] = 'sym'
        
        cot = []
        if 'Na+' in row['short']: cot.append('na1')   # {'9175': 'sodium(1+)'}
        if 'H+' in row['short']: cot.append('h')   # {'5584': 'hydron'}
        if cot==[]: cot.append('h') 
        des_df.loc[index, 'cot'] = str(cot)
        
        # reversibility
        des_df.loc[index, 'rev'] = '-->'
        
        
    elif row['TCID'].startswith('2.A.26.'):
        
        des_df.loc[index, 'tcat'] = 'sym'
        
        cot = []
        if 'Na+' in row['short']: cot.append('na1')   # {'9175': 'sodium(1+)'}
        if 'H+' in row['short']: cot.append('h')   # {'5584': 'hydron'}
        if cot==[]: cot.append('h') 
        des_df.loc[index, 'cot'] = str(cot)
        
        # reversibility
        des_df.loc[index, 'rev'] = '-->'
        
        
    elif row['TCID'].startswith('2.A.27.'):
        
        des_df.loc[index, 'tcat'] = 'sym'
        
        cot = ['na1']  
        des_df.loc[index, 'cot'] = str(cot)
        
        # reversibility
        des_df.loc[index, 'rev'] = '-->'
        
        
    elif row['TCID'].startswith('2.A.46.'):
        
        des_df.loc[index, 'tcat'] = 'sym'
        
        cot = ['na1']  
        des_df.loc[index, 'cot'] = str(cot)
        
        # reversibility
        des_df.loc[index, 'rev'] = '-->'
        
        
    elif row['TCID'].startswith('2.A.50.'):
        
        des_df.loc[index, 'tcat'] = 'sym'
        
        cot = ['h']  
        des_df.loc[index, 'cot'] = str(cot)
        
        # reversibility
        des_df.loc[index, 'rev'] = '-->'
        
        
    elif row['TCID'].startswith('3.A.1.'):
        des_df.loc[index, 'tcat'] = 'abc'
        
        des_df.loc[index, 'cot'] = '-'
        
        # reversibility
        des_df.loc[index, 'rev'] = '-->'
        
        
    elif row['TCID'].startswith('4.A.'):
        des_df.loc[index, 'tcat'] = 'pts'
        
        des_df.loc[index, 'cot'] = '-'
        
        # reversibility
        des_df.loc[index, 'rev'] = '-->'
        
        
    elif row['TCID'].startswith('4.C.1.'):
        des_df.loc[index, 'tcat'] = 'coa'
        
        des_df.loc[index, 'cot'] = '-'
        
        # reversibility
        des_df.loc[index, 'rev'] = '-->'
        
    else:
        pass
        # print("ERROR: unhandled family!", row['TCID'])
        
        
    

        
        
# Write 'des_df' (with the 'tcat', 'cot' and 'rev' columns filled in by the loop above)
# to an Excel file.
des_df.to_excel('tcdb_generated/des_parsed.xlsx')
# Print the frame size, then preview the first rows as the cell output.
print(des_df.shape)
des_df.head()

(18653, 9)


Unnamed: 0,TCID,Name,Domain,Kingdom/Phylum,short,tcat,cot,dir,rev
0,1.A.1.1.1,Two TMS K+ and water channel (conducts K+ (KD ...,Bacteria,Actinomycetota,Two TMS K+ and water channel (conducts K+ (KD ...,,,{'uptake'},
1,1.A.1.2.1,Voltage-sensitive K+ channel (PNa+/PK+ ≈ 0.1) ...,Eukaryota,"Metazoa, Arthropoda",Voltage-sensitive K+ channel (PNa+/PK+ ≈ 0.1) ...,,,{'uptake'},
2,1.A.1.2.2,Voltage-sensitive K+ channel of 498 aas and 6 ...,Eukaryota,"Metazoa, Arthropoda",Voltage-sensitive K+ channel of 498 aas and 6 ...,,,{'uptake'},
3,1.A.1.2.3,Voltage-sensitive fast transient outward curre...,Eukaryota,"Metazoa, Arthropoda",Voltage-sensitive fast transient outward curre...,,,{'uptake'},
4,1.A.1.2.4,"Margatoxin-sensitive voltage-gated K+ channel,...",Eukaryota,"Metazoa, Chordata","Margatoxin-sensitive voltage-gated K+ channel,...",,,{'uptake'},


## merge 'des_df' and 'subs_df'

In [21]:
# Attach the substrate annotation ('subs_df') to every description ('des_df').
# The left join keeps the descriptions that have no matching 'tc_code' in 'subs_df'.
tcdb_rs = pnd.merge(des_df, subs_df, left_on='TCID', right_on='tc_code', how='left')
# (disabled) keep row only if substrate is described:
#tcdb_rs = tcdb_rs[tcdb_rs['tc_code'].notna()]
# reorder columns (this also drops the 'tc_code' column coming from 'subs_df'):
tcdb_rs = tcdb_rs[['TCID', 'Domain', 'Kingdom/Phylum', 'short', 'Name', 'dir', 'tcat', 'cot', 'rev', 'chebi', 'bigg']]
# rename columns:
tcdb_rs = tcdb_rs.rename(columns={'TCID': 'tc_code', 'short': 'des_short', 'Name': 'des_long', 'Domain': 'domain', 'Kingdom/Phylum': 'kingdom_phylum'})


# add the rid, e.g. '1.A.1.1.1' -> 'R_1_A_1_1_1'.
# regex=False is required: as a regular expression, '.' would match every character.
tcdb_rs['rid'] = 'R_' + tcdb_rs['tc_code'].str.replace('.', '_', regex=False)


# add an empty 'chebi' / 'bigg' annotation (the string '{}') where it was not available,
# i.e. where the left join produced a missing value:
tcdb_rs = tcdb_rs.fillna({'chebi': str(dict()), 'bigg': str(dict())})


print(tcdb_rs.shape)
tcdb_rs.head()

(18653, 12)


Unnamed: 0,tc_code,domain,kingdom_phylum,des_short,des_long,dir,tcat,cot,rev,chebi,bigg,rid
0,1.A.1.1.1,Bacteria,Actinomycetota,Two TMS K+ and water channel (conducts K+ (KD ...,Two TMS K+ and water channel (conducts K+ (KD ...,{'uptake'},,,,"{'8345': 'potassium(1+)', '5585': 'water'}","{'8345': {'k'}, '5585': {'h2o'}}",R_1_A_1_1_1
1,1.A.1.2.1,Eukaryota,"Metazoa, Arthropoda",Voltage-sensitive K+ channel (PNa+/PK+ ≈ 0.1) ...,Voltage-sensitive K+ channel (PNa+/PK+ ≈ 0.1) ...,{'uptake'},,,,{'8345': 'potassium(1+)'},{'8345': {'k'}},R_1_A_1_2_1
2,1.A.1.2.2,Eukaryota,"Metazoa, Arthropoda",Voltage-sensitive K+ channel of 498 aas and 6 ...,Voltage-sensitive K+ channel of 498 aas and 6 ...,{'uptake'},,,,{'8345': 'potassium(1+)'},{'8345': {'k'}},R_1_A_1_2_2
3,1.A.1.2.3,Eukaryota,"Metazoa, Arthropoda",Voltage-sensitive fast transient outward curre...,Voltage-sensitive fast transient outward curre...,{'uptake'},,,,{'8345': 'potassium(1+)'},{'8345': {'k'}},R_1_A_1_2_3
4,1.A.1.2.4,Eukaryota,"Metazoa, Chordata","Margatoxin-sensitive voltage-gated K+ channel,...","Margatoxin-sensitive voltage-gated K+ channel,...",{'uptake'},,,,{'8345': 'potassium(1+)'},{'8345': {'k'}},R_1_A_1_2_4


## build rstrings

In [22]:
tcdb_rs['rstrings'] = None

# BiGG ids of the co-transported ions. They are excluded from the substrates,
# as we fish them only via substring matching.
cotransported_mids = set(['h', 'mg2', 'ca2', 'k', 'na1'])
# Transport categories for which a reaction string is written below.
handled_tcats = ('abc', 'uni', 'sym', 'anti', 'pts', 'coa')

for index, row in tcdb_rs.iterrows():

    # Maps each ChEBI id of the row to the set of reaction strings built for it.
    rstrings = {}

    # A missing 'bigg' annotation shows up as a float (NaN) and cannot be parsed: skip it.
    if not isinstance(row['bigg'], float):
        row_tcat = row['tcat']
        row_rev = row['rev']

        # NOTE(review): 'bigg', 'dir' and 'cot' are parsed back with eval(); this presumes
        # they are stringified Python literals produced by earlier cells of this notebook.
        # Never feed these columns with text coming straight from an external source.
        for chebi_id, mids in eval(row['bigg']).items():
            reactions = rstrings.setdefault(chebi_id, set())

            # Rows without a handled transport category keep an empty set per ChEBI id.
            if row_tcat not in handled_tcats:
                continue

            for mid in mids - cotransported_mids:

                # PTS and CoA-ligase reactions need the id of the converted product;
                # substrates without a known association are skipped.
                if row_tcat == 'pts' and mid not in pts_assoc: continue
                if row_tcat == 'coa' and mid not in coa_assoc: continue

                # Compartment the substrate leaves and the one it reaches:
                # uptake is extracellular -> cytosol, anything else is treated as secretion.
                if 'uptake' in eval(row['dir']):
                    from_comp, to_comp = 'e', 'c'
                else:
                    from_comp, to_comp = 'c', 'e'

                if row_tcat == 'abc':
                    # ATP hydrolysis always takes place in the cytosol.
                    reactions.add(f'atp_c + h2o_c + {mid}_{from_comp} {row_rev} {mid}_{to_comp} + pi_c + h_c + adp_c')

                elif row_tcat == 'uni':
                    reactions.add(f'{mid}_{from_comp} {row_rev} {mid}_{to_comp}')

                elif row_tcat == 'sym':
                    # One reaction per co-transported species, moving along with the substrate.
                    for cot in eval(row['cot']):
                        reactions.add(f'{mid}_{from_comp} + {cot}_{from_comp} {row_rev} {mid}_{to_comp} + {cot}_{to_comp}')

                elif row_tcat == 'anti':
                    # One reaction per co-transported species, moving against the substrate.
                    for cot in eval(row['cot']):
                        reactions.add(f'{mid}_{from_comp} + {cot}_{to_comp} {row_rev} {mid}_{to_comp} + {cot}_{from_comp}')

                elif row_tcat == 'pts':
                    # Secretion should not exist for this category.
                    reactions.add(f'{mid}_{from_comp} + pep_c {row_rev} {pts_assoc[mid]}_{to_comp} + pyr_c')

                elif row_tcat == 'coa':
                    # Secretion should not exist for this category.
                    reactions.add(f'{mid}_{from_comp} + h_{from_comp} + coa_{to_comp} {row_rev} {coa_assoc[mid]}_{to_comp} + h_{to_comp} + ppi_{to_comp}')


    tcdb_rs.loc[index, 'rstrings'] = str(rstrings)



print(tcdb_rs.shape)
tcdb_rs.head()

(18653, 13)


Unnamed: 0,tc_code,domain,kingdom_phylum,des_short,des_long,dir,tcat,cot,rev,chebi,bigg,rid,rstrings
0,1.A.1.1.1,Bacteria,Actinomycetota,Two TMS K+ and water channel (conducts K+ (KD ...,Two TMS K+ and water channel (conducts K+ (KD ...,{'uptake'},,,,"{'8345': 'potassium(1+)', '5585': 'water'}","{'8345': {'k'}, '5585': {'h2o'}}",R_1_A_1_1_1,"{'8345': set(), '5585': set()}"
1,1.A.1.2.1,Eukaryota,"Metazoa, Arthropoda",Voltage-sensitive K+ channel (PNa+/PK+ ≈ 0.1) ...,Voltage-sensitive K+ channel (PNa+/PK+ ≈ 0.1) ...,{'uptake'},,,,{'8345': 'potassium(1+)'},{'8345': {'k'}},R_1_A_1_2_1,{'8345': set()}
2,1.A.1.2.2,Eukaryota,"Metazoa, Arthropoda",Voltage-sensitive K+ channel of 498 aas and 6 ...,Voltage-sensitive K+ channel of 498 aas and 6 ...,{'uptake'},,,,{'8345': 'potassium(1+)'},{'8345': {'k'}},R_1_A_1_2_2,{'8345': set()}
3,1.A.1.2.3,Eukaryota,"Metazoa, Arthropoda",Voltage-sensitive fast transient outward curre...,Voltage-sensitive fast transient outward curre...,{'uptake'},,,,{'8345': 'potassium(1+)'},{'8345': {'k'}},R_1_A_1_2_3,{'8345': set()}
4,1.A.1.2.4,Eukaryota,"Metazoa, Chordata","Margatoxin-sensitive voltage-gated K+ channel,...","Margatoxin-sensitive voltage-gated K+ channel,...",{'uptake'},,,,{'8345': 'potassium(1+)'},{'8345': {'k'}},R_1_A_1_2_4,{'8345': set()}


In [23]:
# Bacteria filter: keep the entries annotated as 'Bacteria', together with the entries
# whose domain is an empty string or missing. The row index is renumbered from 0.
# (A strict variant would keep only the rows where 'domain' equals 'Bacteria'.)

tcdb_rs_bac = tcdb_rs[tcdb_rs['domain'].isin(['Bacteria', '']) | tcdb_rs['domain'].isna()].reset_index(drop=True)
print("Bacteria filter:", len(tcdb_rs), '>>>', len(tcdb_rs_bac))

Bacteria filter: 18653 >>> 8958


## save 'tcdb_rs' (bacterial subset, 'tcdb_rs_bac')


In [24]:
# Save the filtered dataframe ('tcdb_rs_bac') as both CSV and Excel.
# Note that the output files are named 'tcdb_rs', not 'tcdb_rs_bac'.
tcdb_rs_bac.to_csv('tcdb_generated/tcdb_rs.csv')
tcdb_rs_bac.to_excel('tcdb_generated/tcdb_rs.xlsx')
