In [1]:
!pip install rdkit mordred



In [2]:
import pandas as pd
import numpy as np
from rdkit import Chem
from mordred import Calculator, descriptors
import warnings
warnings.filterwarnings('ignore') # To suppress runtime and deprication warnings. 


# Feature Calculation

from https://github.com/rohitfarmer/bittersweet/blob/master/notebooks/feature-calculation-mordred.ipynb

In [81]:
df_sweet_train = pd.read_csv('data/sweet-train.tsv', sep='\t')[['Name','SMILES','Reference','Sweet']]
df_sweet_test = pd.read_csv('data/sweet-test.tsv', sep='\t')[['Name','SMILES','Reference','Sweet']]
df_bitter_train = pd.read_csv('data/bitter-train.tsv', sep='\t')[['Name','SMILES','Reference','Bitter']]
df_bitter_test = pd.read_csv('data/bitter-test.tsv', sep='\t')[['Name','SMILES','Reference','Bitter']]

# Concatenate train and test set
# df_sweet = pd.concat([sweet_train[['Name','SMILES','Sweet']], sweet_test[['Name','SMILES','Sweet']] ])
# df_bitter = pd.concat([bitter_train[['Name','SMILES','Bitter']], bitter_test[['Name','SMILES','Bitter']] ])


# Rename taste column as Target.
df_sweet_train.rename(columns={'Sweet':'Target'}, inplace=True)
df_sweet_test.rename(columns={'Sweet':'Target'}, inplace=True)
df_bitter_train.rename(columns={'Bitter':'Target'}, inplace=True)
df_bitter_test.rename(columns={'Bitter':'Target'}, inplace=True)

In [83]:
df_sweet_train['Reference'].value_counts()

BitterDB                                                            592
Rojas et al. (2017)                                                 486
Fenaroli’s Handbook of Flavor Ingredient                            462
The Good Scents Company Database                                    201
SuperSweet                                                          198
ToxNet                                                               72
Biochemical Targets of Plant Bioactive Compounds by Gideon Polya     71
Wiener et al. (2017) - Phyto-Dictionary                              51
Rodgers et al. (2006)                                                29
Wiener et al. (2017) - Bitter-New                                    25
Wiener et al. (2017) - UNIMI                                         18
Name: Reference, dtype: int64

In [84]:
df_sweet_test['Reference'].value_counts()

Rojas et al. (2017)    161
Name: Reference, dtype: int64

In [85]:
df_sweet_train['Target'].value_counts()

True     1139
False    1066
Name: Target, dtype: int64

In [86]:
df_sweet_test['Target'].value_counts()

True     108
False     53
Name: Target, dtype: int64

In [78]:
df_sweet_test['Target'].describe()

count      161
unique       2
top       True
freq       108
Name: Target, dtype: object

In [87]:
df_bitter_train['Target'].value_counts()

False    1444
True      813
Name: Target, dtype: int64

In [90]:
df_bitter_test['Target'].value_counts()

True     105
False     66
Name: Target, dtype: int64

In [46]:
print(df_sweet_train.count())
df_sweet_train[df_sweet_train['SMILES'].isna()] # check if any of the SMILES is a NaN.

Name         2105
SMILES       2205
Reference    2205
Target       2205
dtype: int64


Unnamed: 0,Name,SMILES,Reference,Target


In [47]:
print(df_sweet_test.count())
df_sweet_test[df_sweet_test['SMILES'].isna()] # check if any of the SMILES is a NaN.

Name         161
SMILES       161
Reference    161
Target       161
dtype: int64


Unnamed: 0,Name,SMILES,Reference,Target


In [48]:
print(df_bitter_train.count())
df_bitter_train[df_bitter_train['SMILES'].isna()] # check if any of the SMILES is a NaN.

Name         2176
SMILES       2257
Reference    2257
Target       2257
dtype: int64


Unnamed: 0,Name,SMILES,Reference,Target


In [49]:
print(df_bitter_test.count())
df_bitter_test[df_bitter_test['SMILES'].isna()] # check if any of the SMILES is a NaN.

Name         115
SMILES       171
Reference    171
Target       171
dtype: int64


Unnamed: 0,Name,SMILES,Reference,Target


In [50]:
df_sweet_train['Target'] = df_sweet_train['Target'].astype(int) # Convert to numeric
df_sweet_test['Target'] = df_sweet_test['Target'].astype(int) # Convert to numeric
df_bitter_train['Target'] = df_bitter_train['Target'].astype(int) # Convert to numeric
df_bitter_test['Target'] = df_bitter_test['Target'].astype(int) # Convert to numeric

In [54]:
# Convert SMILES into Canonical SMILES using RDKit.
def convert_to_canonical_smiles(df):  
    length = df.shape[0]

    name = []
    smiles = []
    canon_smiles = []
    reference = []
    target = []
    exceptions = 0
    failed_smiles = []
    failed_name = []
    failed_target = []
    failed_reference = []
    for i in range(length):
        try:
            c = Chem.CanonSmiles(str(df.iloc[i]['SMILES']))
            canon_smiles.append(c)
            smiles.append(str(df.iloc[i]['SMILES']))
            target.append(str(df.iloc[i]['Target']))
            name.append(str(df.iloc[i]['Name']))
            reference.append(str(df.iloc[i]['Reference']))
        except:
            exceptions += 1
            failed_smiles.append(str(df.iloc[i]['SMILES']))
            failed_name.append(str(df.iloc[i]['Target']))
            failed_target.append(str(df.iloc[i]['Name']))
            failed_reference.append(str(df.iloc[i]['Reference']))
            continue
            
    return {'Name':name, 'SMILES':smiles, 'Canonical SMILES':canon_smiles, 'Reference': reference, 'Target':target}

In [55]:
valid_sweet_train = convert_to_canonical_smiles(df_sweet_train)
valid_sweet_test = convert_to_canonical_smiles(df_sweet_test)
valid_bitter_train = convert_to_canonical_smiles(df_bitter_train)
valid_bitter_test = convert_to_canonical_smiles(df_bitter_test)

df_sweet_valid_train = pd.DataFrame(valid_sweet_train)
df_bitter_valid_train = pd.DataFrame(valid_bitter_train)
df_sweet_valid_test = pd.DataFrame(valid_sweet_test)
df_bitter_valid_test = pd.DataFrame(valid_bitter_test)

[20:13:03] Explicit valence for atom # 2 N, 4, is greater than permitted
[20:13:03] Explicit valence for atom # 2 N, 4, is greater than permitted
[20:13:03] Explicit valence for atom # 5 N, 4, is greater than permitted
[20:13:03] Explicit valence for atom # 6 N, 4, is greater than permitted
[20:13:03] Explicit valence for atom # 7 N, 4, is greater than permitted
[20:13:03] Explicit valence for atom # 9 N, 4, is greater than permitted
[20:13:03] Explicit valence for atom # 11 N, 4, is greater than permitted
[20:13:03] Explicit valence for atom # 3 N, 4, is greater than permitted
[20:13:03] Explicit valence for atom # 3 N, 4, is greater than permitted
[20:13:03] Explicit valence for atom # 3 N, 4, is greater than permitted
[20:13:03] Explicit valence for atom # 2 N, 4, is greater than permitted
[20:13:03] SMILES Parse Error: unclosed ring for input: '[NaH].[NaH].O1[C@@H](OC2CC[C@]3([C@H]4[C@@](CC[C@H]3C2(C)C)(C)[C@@]2(CC[C@@]3([C@@H](C2=CC4=O)C[C@@](CC3)(C(O)=O)C)C)C)C)C(O[C@@H]2OC(C(O)=

In [56]:
df_sweet_valid_train.describe()

Unnamed: 0,Name,SMILES,Canonical SMILES,Reference,Target
count,2186.0,2186,2186,2186,2186
unique,2038.0,2186,2183,11,2
top,,OCC1OC(C(C1O)O)(CO)OC1OC(CO)C(C(C1O)O)O,O=C1NS(=O)(=O)c2cc([N+](=O)[O-])ccc21,BitterDB,1
freq,100.0,1,2,590,1134


In [57]:
df_bitter_valid.describe()

Unnamed: 0,Name,SMILES,Canonical SMILES,Target
count,2393.0,2393,2393,2393
unique,2198.0,2393,2393,2
top,,OCC1OC(C(C1O)O)(CO)OC1OC(CO)C(C(C1O)O)O,OCC1OC(OC2(CO)OC(CO)C(O)C2O)C(O)C(O)C1O,0
freq,132.0,1,1,1501


In [58]:
df_sweet_valid_train[df_sweet_valid_train['Canonical SMILES'].isna()]

Unnamed: 0,Name,SMILES,Canonical SMILES,Reference,Target


In [59]:
df_sweet_valid_test[df_sweet_valid_test['Canonical SMILES'].isna()]

Unnamed: 0,Name,SMILES,Canonical SMILES,Reference,Target


In [60]:
df_bitter_valid_train[df_bitter_valid_train['Canonical SMILES'].isna()]

Unnamed: 0,Name,SMILES,Canonical SMILES,Reference,Target


In [61]:
df_bitter_valid_test[df_bitter_valid_test['Canonical SMILES'].isna()]

Unnamed: 0,Name,SMILES,Canonical SMILES,Reference,Target


In [62]:
# Remove duplicates
df_sweet_valid_train.drop_duplicates(subset="Canonical SMILES", inplace=True)
df_sweet_valid_test.drop_duplicates(subset="Canonical SMILES", inplace=True)
df_bitter_valid_train.drop_duplicates(subset="Canonical SMILES", inplace=True)
df_bitter_valid_test.drop_duplicates(subset="Canonical SMILES", inplace=True)

# Reset index
df_sweet_valid_train.reset_index(drop=True, inplace = True)
df_sweet_valid_test.reset_index(drop=True, inplace = True)
df_bitter_valid_train.reset_index(drop=True, inplace = True)
df_bitter_valid_test.reset_index(drop=True, inplace = True)

In [63]:
# Calculate Descriptors
calc = Calculator(descriptors, ignore_3D=False)
mols_sweet_train = [Chem.MolFromSmiles(smi) for smi in list(df_sweet_valid_train['Canonical SMILES'])]
mols_sweet_test = [Chem.MolFromSmiles(smi) for smi in list(df_sweet_valid_test['Canonical SMILES'])]
mols_bitter_train = [Chem.MolFromSmiles(smi) for smi in list(df_bitter_valid_train['Canonical SMILES'])]
mols_bitter_test = [Chem.MolFromSmiles(smi) for smi in list(df_bitter_valid_test['Canonical SMILES'])]



In [64]:
df_sweet_descriptors_train = calc.pandas(mols_sweet_train, quiet = True, ipynb = True)
df_sweet_descriptors_test = calc.pandas(mols_sweet_test, quiet = True, ipynb = True)
df_bitter_descriptors_train = calc.pandas(mols_bitter_train, quiet = True, ipynb = True)
df_bitter_descriptors_test = calc.pandas(mols_bitter_test, quiet = True, ipynb = True)

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)




  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)




  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)




  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)




  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


In [67]:
df_sweet_base_train = df_sweet_valid_train[['Name', 'Reference', 'SMILES', 'Canonical SMILES', 'Target']]
df_sweet_with_descriptors_train = pd.concat([
    df_sweet_base_train, 
    df_sweet_descriptors_train
    ], axis=1)
df_sweet_base_test = df_sweet_valid_test[['Name', 'Reference', 'SMILES', 'Canonical SMILES', 'Target']]
df_sweet_with_descriptors_test = pd.concat([
    df_sweet_base_test, 
    df_sweet_descriptors_test
    ], axis=1)

df_bitter_base_train = df_bitter_valid_train[['Name', 'Reference', 'SMILES', 'Canonical SMILES', 'Target']]
df_bitter_with_descriptors_train = pd.concat([
    df_bitter_base_train, 
    df_bitter_descriptors_train
    ], axis=1)
df_bitter_base_test = df_bitter_valid_test[['Name', 'Reference', 'SMILES', 'Canonical SMILES', 'Target']]
df_bitter_with_descriptors_test = pd.concat([
    df_bitter_base_test, 
    df_bitter_descriptors_test
    ], axis=1)

In [68]:
df_sweet_with_descriptors_train.to_csv('./mordred/sweet_descriptor_train.csv', index=False)
df_sweet_with_descriptors_test.to_csv('./mordred/sweet_descriptor_test.csv', index=False)
df_bitter_with_descriptors_train.to_csv('./mordred/bitter_descriptor_train.csv', index=False)
df_bitter_with_descriptors_test.to_csv('./mordred/bitter_descriptor_test.csv', index=False)

Unnamed: 0,Name,Reference,SMILES,Canonical SMILES,Target,ABC,ABCGG,nAcid,nBase,SpAbs_A,...,SRW10,TSRW10,MW,AMW,WPath,WPol,Zagreb1,Zagreb2,mZagreb1,mZagreb2
0,D-Fructose,Wiener et al. (2017) - Phyto-Dictionary,OC[C@]1(O)[C@@H](O)[C@H](O)[C@@H](CO1)O,OC[C@@]1(O)OC[C@@H](O)[C@@H](O)[C@@H]1O,0,8.829880,8.715591,0,0,14.640475,...,9.569063,43.370989,180.063388,7.502641,178,20,60.0,71.0,6.145833,2.722222
1,D-Glucose,Wiener et al. (2017) - Phyto-Dictionary,OC[C@@H]1[C@@H](O)[C@H](O)[C@@H](O)[C@H](O1)O,OC[C@H]1O[C@H](O)[C@H](O)[C@@H](O)[C@@H]1O,0,8.761080,8.651650,0,0,14.708146,...,9.396405,42.927926,180.063388,7.502641,182,19,58.0,68.0,6.055556,2.777778
2,D-Mannose,Wiener et al. (2017) - Phyto-Dictionary,OC[C@@H]1[C@@H](O)[C@H](O)[C@H](O)[C@H](O1)O,OC[C@H]1O[C@H](O)[C@@H](O)[C@@H](O)[C@@H]1O,0,8.761080,8.651650,0,0,14.708146,...,9.396405,42.927926,180.063388,7.502641,182,19,58.0,68.0,6.055556,2.777778
3,L-Rhamnose,Wiener et al. (2017) - Phyto-Dictionary,C[C@H]1[C@H](O)[C@@H](O)[C@@H](O)[C@H](O1)O,C[C@@H]1O[C@H](O)[C@H](O)[C@H](O)[C@H]1O,0,8.163363,8.029752,0,0,13.137460,...,9.331052,41.588577,164.068473,7.133412,140,17,54.0,63.0,5.805556,2.444444
4,D-Ribulose,Wiener et al. (2017) - Phyto-Dictionary,OCC(=O)[C@H](O)[C@H](O)CO,O=C(CO)[C@H](O)[C@H](O)CO,0,6.611250,7.282959,0,0,11.763639,...,8.575273,38.268062,150.052823,7.502641,125,12,40.0,43.0,5.833333,2.555556
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
159,,Wiener et al. (2017) - UNIMI,c1cc(O)c(O)cc1C[C@@H](C2=O)COc(c23)c(OC)c(OC)c...,COc1c(O)c(O)c2c(c1OC)OC[C@@H](Cc1ccc(O)c(O)c1)...,1,19.981978,16.397806,0,0,33.251113,...,10.378385,61.437322,362.100168,8.229549,1633,49,138.0,168.0,10.222222,5.833333
160,,Wiener et al. (2017) - UNIMI,c1cccc(c1C([O-])=O)NC(=O)/C=C/c2ccc(O)cc2,O=C(/C=C/c1ccc(O)cc1)Nc1ccccc1C(=O)[O-],1,15.913028,13.077290,1,0,26.247606,...,9.668714,54.181448,282.077181,8.547793,1064,28,102.0,114.0,7.416667,4.722222
161,,Wiener et al. (2017) - UNIMI,c1cccc(c1C([O-])=O)NC(=O)/C=C/c2cc(OC)c(cc2)OC,COc1ccc(/C=C/C(=O)Nc2ccccc2C(=O)[O-])cc1OC,1,17.884519,14.823609,1,0,30.336232,...,9.866357,57.946497,326.103396,8.152585,1535,35,116.0,132.0,8.777778,5.583333
162,,Wiener et al. (2017) - UNIMI,c1cccc(c1C([O-])=O)NC(=O)/C=C/c2cc(O)ccc2,O=C(/C=C/c1cccc(O)c1)Nc1ccccc1C(=O)[O-],1,15.913028,13.206444,1,0,26.180249,...,9.674389,54.189588,282.077181,8.547793,1050,28,102.0,114.0,7.416667,4.722222
