In [12]:
import pandas as pd
import numpy as np
import time
import re
from rdkit import Chem
from rdkit.Chem import AllChem, DataStructs, rdFingerprintGenerator
from rdkit.Chem import PandasTools
from sklearn.linear_model import LogisticRegression
import pubchempy as pcp

In [13]:
# Read data downloaded from TransportDB - all transporters for E. coli 536
df = pd.read_csv('data/ecoli_ed1a.csv')
df

Unnamed: 0,Protein-Name,Substrate,Subtype,Family,Family Name,Transporter Class,TC number
0,ECED1_0811,aspartate:alanine antiporter,,AAE,The Aspartate:Alanine Exchanger (AAE) Family,Secondary Transporter,2.A.81
1,ECED1_4376,aspartate:alanine antiporter,,AAE,The Aspartate:Alanine Exchanger (AAE) Family,Secondary Transporter,2.A.81
2,ECED1_1546,aminobenzoyl-glutamate,,AbgT,The p-Aminobenzoyl-glutamate Transporter (AbgT...,Secondary Transporter,2.A.68
3,ECED1_2819,Unclassified,,AEC,The Auxin Efflux Carrier (AEC) Family,Secondary Transporter,2.A.69
4,ECED1_0006,sodium ion:alanine symporter,,AGCS,The Alanine or Glycine:Cation Symporter (AGCS)...,Secondary Transporter,2.A.25
...,...,...,...,...,...,...,...
639,ECED1_4980,ascorbate,EnzymeIIC,SSPTS,Sugar Specific PTS,Phosphotransferase System (PTS),4.A
640,ECED1_0564,Unclassified,,OMF,The Outer Membrane Factor (OMF) Family,Outer Membrane Porins,1.B.17
641,ECED1_0760,Unclassified,,OMF,The Outer Membrane Factor (OMF) Family,Outer Membrane Porins,1.B.17
642,ECED1_0264,Unclassified,,OOP,The OmpA-OmpF Porin (OOP) Family,Outer Membrane Porins,1.B.6


In [14]:
substrates = df['Substrate'].tolist()
# Remove duplicates
substrates = list(set(substrates))
# Remove empty strings
substrates = [s for s in substrates if s]
substrates

['glucose',
 'metabolite (alpha-ketoglutarate?)',
 'mannitol/fructose',
 'glucitol/sorbitol',
 'large-conductance mechanosensitive ion channel',
 'gluconate',
 'peptide uptake',
 'aspartate:alanine antiporter',
 'multidrug efflux (Bcr/CflA subfamily)',
 'amino acid (lysine/arginine/ornithine/histidine/octopine)',
 'mannitol',
 'ammonium',
 'nitrate/nitrite',
 'multidrug efflux',
 'potassium ion uptake',
 'sodium ion:phosphate symporter',
 'GABA',
 'thiamin',
 'leucine/valine',
 '2-keto-3-deoxygluconate',
 'glycerol uptake',
 'D-galactonate',
 'glycerol-3-phosphate',
 'Autoinducer-2 export',
 'sugar (maltose?)',
 'metabolite (benzoate?)',
 'short-chain fatty acid',
 'amino acid efflux',
 'daunorubicin',
 'cobalt',
 'phosphate',
 'sodium ion:calcium ion antiporter',
 'ribose',
 'polysaccharide export',
 'cytosine/purines/uracil/thiamine/allantoin',
 'multidrug',
 'sulfate',
 'glycine betaine/carnitine/choline',
 'galactitol',
 'branched-chain amino acid',
 'copper ion',
 'sodium ion/?',


In [15]:
# process names
substrates = [s.split('/') for s in substrates]
substrates = [item for sublist in substrates for item in sublist]
print(substrates)

# Define function to retrieve SMILES
def get_smiles(chemical_name):
    name = re.sub("[^A-Za-z0-9 ]+", " ", chemical_name)
    print(name)
    try:
        compound = pcp.get_compounds(name, 'name')
        if compound:
            return compound[0].canonical_smiles
    except Exception as e:
        return str(e)


['glucose', 'metabolite (alpha-ketoglutarate?)', 'mannitol', 'fructose', 'glucitol', 'sorbitol', 'large-conductance mechanosensitive ion channel', 'gluconate', 'peptide uptake', 'aspartate:alanine antiporter', 'multidrug efflux (Bcr', 'CflA subfamily)', 'amino acid (lysine', 'arginine', 'ornithine', 'histidine', 'octopine)', 'mannitol', 'ammonium', 'nitrate', 'nitrite', 'multidrug efflux', 'potassium ion uptake', 'sodium ion:phosphate symporter', 'GABA', 'thiamin', 'leucine', 'valine', '2-keto-3-deoxygluconate', 'glycerol uptake', 'D-galactonate', 'glycerol-3-phosphate', 'Autoinducer-2 export', 'sugar (maltose?)', 'metabolite (benzoate?)', 'short-chain fatty acid', 'amino acid efflux', 'daunorubicin', 'cobalt', 'phosphate', 'sodium ion:calcium ion antiporter', 'ribose', 'polysaccharide export', 'cytosine', 'purines', 'uracil', 'thiamine', 'allantoin', 'multidrug', 'sulfate', 'glycine betaine', 'carnitine', 'choline', 'galactitol', 'branched-chain amino acid', 'copper ion', 'sodium ion'

In [16]:
print(get_smiles(substrates[1]))

metabolite  alpha ketoglutarate 
None


In [17]:
# Query PubChem and collect SMILES
results = []
for substrate in substrates:
    smiles = get_smiles(substrate)
    results.append({
        'Substrate': substrate,
        'SMILES': smiles
    })
    time.sleep(0.2)  # Add delay to be polite to PubChem servers

smiles_df = pd.DataFrame(results)


glucose
metabolite  alpha ketoglutarate 
mannitol
fructose
glucitol
sorbitol
large conductance mechanosensitive ion channel
gluconate
peptide uptake
aspartate alanine antiporter
multidrug efflux  Bcr
CflA subfamily 
amino acid  lysine
arginine
ornithine
histidine
octopine 
mannitol
ammonium
nitrate
nitrite
multidrug efflux
potassium ion uptake
sodium ion phosphate symporter
GABA
thiamin
leucine
valine
2 keto 3 deoxygluconate
glycerol uptake
D galactonate
glycerol 3 phosphate
Autoinducer 2 export
sugar  maltose 
metabolite  benzoate 
short chain fatty acid
amino acid efflux
daunorubicin
cobalt
phosphate
sodium ion calcium ion antiporter
ribose
polysaccharide export
cytosine
purines
uracil
thiamine
allantoin
multidrug
sulfate
glycine betaine
carnitine
choline
galactitol
branched chain amino acid
copper ion
sodium ion
 
lipoprotein releasing
iron hydroxamate
zinc
cadmium
cobalt ion
toluene tolerance
L lactate
tricarboxylate  TctC 
protons
C4 dicarboxylate
nickel
magnesium ion
spermidine
p

In [None]:
pfas_smiles = [
    "FC(F)(F)C(F)(F)C(F)(F)C(F)(F)F", # Example long-chain PFAS
    "FC(F)(F)C(F)(F)C(F)(F)F"          # Example short-chain PFAS
]
pfas_mols = [Chem.MolFromSmiles(smile) for smile in pfas_smiles]
mfpgen = rdFingerprintGenerator.GetMorganGenerator(radius=2, fpSize=2048)
# Generate fingerprints for PFAS
pfas_fps = [mfpgen.GetFingerprint(mol) for mol in pfas_mols]

# filter none values from smiles_df
smiles_df = smiles_df[smiles_df['SMILES'].notna()]
substrate_fps = []
for substrate in smiles_df['SMILES']:
    mol = Chem.MolFromSmiles(substrate)
    if mol:
        fp = mfpgen.GetFingerprint(mol)
        substrate_fps.append(fp)
    else:
        substrate_fps.append(None)
# Remove None values
substrate_fps = [fp for fp in substrate_fps if fp is not None]
# Create a DataFrame for fingerprints
fingerprint_df = pd.DataFrame({
    'Substrate': smiles_df['Substrate'],
    'SMILES': smiles_df['SMILES'],
    'Fingerprint': substrate_fps
})



Unnamed: 0,Substrate,SMILES,Fingerprint
0,glucose,C(C1C(C(C(C(O1)O)O)O)O)O,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,mannitol,C(C(C(C(C(CO)O)O)O)O)O,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,fructose,C1C(C(C(C(O1)(CO)O)O)O)O,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,glucitol,C(C(C(C(C(CO)O)O)O)O)O,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
5,sorbitol,C(C(C(C(C(CO)O)O)O)O)O,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
...,...,...,...
186,glutamate,C(CC(=O)O)C(C(=O)O)N,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
187,aspartate?),C(C(C(=O)O)N)C(=O)O,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
190,glycine betaine,C[N+](C)(C)CC(=O)[O-],"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
201,xylose,C1C(C(C(C(O1)O)O)O)O,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [31]:
# calculate similarity
def calculate_similarity(fp1, fp2):
    return DataStructs.TanimotoSimilarity(fp1, fp2)
# Calculate similarity for each substrate against PFAS
similarity_results = []
for substrate_fp in substrate_fps:
    simsum = 0
    for pfas_fp in pfas_fps:
        simsum += calculate_similarity(substrate_fp, pfas_fp)
    similarity_results.append(simsum / len(pfas_fps))

# Create a DataFrame for similarity results
similarity_df = pd.DataFrame({
    'Substrate': smiles_df['Substrate'],
    'Similarity': similarity_results
})
# Merge the similarity results with the original DataFrame
merged_df = pd.merge(smiles_df, similarity_df, on='Substrate', how='left')
merged_df = merged_df.dropna()
print(merged_df[merged_df['Similarity'] > 0.5])

Empty DataFrame
Columns: [Substrate, SMILES, pfas_fps, Similarity]
Index: []
