In [1]:
!pip install rdkit mordred



In [66]:
import os
import warnings

import numpy as np
import pandas as pd
from mordred import Calculator, descriptors
from rdkit import Chem
warnings.filterwarnings('ignore') # Suppress all Python warnings (runtime, deprecation, ...) for the rest of the session.


# Feature Calculation

Adapted from https://github.com/rohitfarmer/bittersweet/blob/master/notebooks/feature-calculation-mordred.ipynb

In [67]:
# Load the train/test splits of each taste class (tab-separated files).
sweet_train = pd.read_csv('data/sweet-train.tsv', sep='\t')
sweet_test = pd.read_csv('data/sweet-test.tsv', sep='\t')
bitter_train = pd.read_csv('data/bitter-train.tsv', sep='\t')
bitter_test = pd.read_csv('data/bitter-test.tsv', sep='\t')

# Stack train and test into one frame per taste, keeping only the columns
# used below, and rename the taste column to Target.
# ignore_index=True gives each combined frame a fresh 0..n-1 index instead of
# repeating the row labels of the two input frames.
df_sweet = pd.concat(
    [sweet_train[['Name', 'SMILES', 'Sweet']], sweet_test[['Name', 'SMILES', 'Sweet']]],
    ignore_index=True,
).rename(columns={'Sweet': 'Target'})
df_bitter = pd.concat(
    [bitter_train[['Name', 'SMILES', 'Bitter']], bitter_test[['Name', 'SMILES', 'Bitter']]],
    ignore_index=True,
).rename(columns={'Bitter': 'Target'})

In [68]:
# Non-null count per column, followed by any rows whose SMILES is missing.
print(df_sweet.count())
df_sweet.loc[df_sweet['SMILES'].isna()]

Name      2266
SMILES    2366
Target    2366
dtype: int64


Unnamed: 0,Name,SMILES,Target


In [70]:
# Non-null count per column, followed by any rows whose SMILES is missing.
print(df_bitter.count())
df_bitter.loc[df_bitter['SMILES'].isna()]

Name      2291
SMILES    2428
Target    2428
dtype: int64


Unnamed: 0,Name,SMILES,Target


In [196]:
df_sweet['Target'] = df_sweet['Target'].astype(int) # Cast the Target labels to int (presumably boolean in the TSV files - TODO confirm).

In [197]:
df_bitter['Target'] = df_bitter['Target'].astype(int) # Cast the Target labels to int (presumably boolean in the TSV files - TODO confirm).

In [200]:
# Convert SMILES into Canonical SMILES using RDKit.
def convert_to_canonical_smiles(df, failed=None):
    """Canonicalise every SMILES in ``df`` with RDKit, dropping rows that fail.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns 'Name', 'SMILES' and 'Target'.
    failed : list, optional
        When given, one ``{'Name': ..., 'SMILES': ..., 'Target': ...}`` dict
        (string values) is appended for each row whose SMILES could not be
        canonicalised, so the caller can inspect what was dropped.
        When omitted, failing rows are skipped silently.

    Returns
    -------
    dict of list
        Keys 'Name', 'SMILES', 'Canonical SMILES' and 'Target', each mapping
        to a list with one entry per successfully canonicalised row, in input
        order. Name, SMILES and Target are stored as ``str(value)`` - note
        that this makes Target a string ('0' / '1'), not a number.

    Raises
    ------
    KeyError
        If one of the required columns is missing from ``df``.
    """
    name = []
    smiles = []
    canon_smiles = []
    target = []

    # Iterate the three columns in lockstep rather than re-materialising the
    # row with df.iloc[i] for every single field.
    for raw_name, raw_smiles, raw_target in zip(df['Name'], df['SMILES'], df['Target']):
        row_name = str(raw_name)
        row_smiles = str(raw_smiles)
        row_target = str(raw_target)

        # Only the RDKit call is guarded: it raises for SMILES it cannot
        # parse. `except Exception` (not a bare except) lets KeyboardInterrupt
        # and SystemExit propagate.
        try:
            canonical = Chem.CanonSmiles(row_smiles)
        except Exception:
            if failed is not None:
                failed.append({'Name': row_name, 'SMILES': row_smiles, 'Target': row_target})
            continue

        name.append(row_name)
        smiles.append(row_smiles)
        canon_smiles.append(canonical)
        target.append(row_target)

    return {'Name':name, 'SMILES':smiles, 'Canonical SMILES':canon_smiles, 'Target':target}

In [216]:
# Canonicalise each taste set and wrap the resulting column lists in a frame.
valid_sweet = convert_to_canonical_smiles(df_sweet)
df_sweet_valid = pd.DataFrame(data=valid_sweet)

valid_bitter = convert_to_canonical_smiles(df_bitter)
df_bitter_valid = pd.DataFrame(data=valid_bitter)

[17:14:57] Explicit valence for atom # 2 N, 4, is greater than permitted
[17:14:57] Explicit valence for atom # 2 N, 4, is greater than permitted
[17:14:57] Explicit valence for atom # 5 N, 4, is greater than permitted
[17:14:57] Explicit valence for atom # 6 N, 4, is greater than permitted
[17:14:57] Explicit valence for atom # 7 N, 4, is greater than permitted
[17:14:57] Explicit valence for atom # 9 N, 4, is greater than permitted
[17:14:57] Explicit valence for atom # 11 N, 4, is greater than permitted
[17:14:57] Explicit valence for atom # 3 N, 4, is greater than permitted
[17:14:57] Explicit valence for atom # 3 N, 4, is greater than permitted
[17:14:57] Explicit valence for atom # 3 N, 4, is greater than permitted
[17:14:57] Explicit valence for atom # 2 N, 4, is greater than permitted
[17:14:57] SMILES Parse Error: unclosed ring for input: '[NaH].[NaH].O1[C@@H](OC2CC[C@]3([C@H]4[C@@](CC[C@H]3C2(C)C)(C)[C@@]2(CC[C@@]3([C@@H](C2=CC4=O)C[C@@](CC3)(C(O)=O)C)C)C)C)C(O[C@@H]2OC(C(O)=

In [217]:
# Per-column summary (count / unique / top / freq) of the canonicalised sweet set.
df_sweet_valid.describe()

Unnamed: 0,Name,SMILES,Canonical SMILES,Target
count,2342.0,2342,2342,2342
unique,2185.0,2342,2338,2
top,,OCC1OC(C(C1O)O)(CO)OC1OC(CO)C(C(C1O)O)O,O=C1NS(=O)(=O)c2cc([N+](=O)[O-])ccc21,1
freq,100.0,1,2,1240


In [218]:
# Per-column summary (count / unique / top / freq) of the canonicalised bitter set.
df_bitter_valid.describe()

Unnamed: 0,Name,SMILES,Canonical SMILES,Target
count,2404.0,2404,2404,2404
unique,2202.0,2397,2393,2
top,,c1coc(c12)c(OC)c3c(c2)ccc(=O)o3,O=C1NS(=O)(=O)c2c1cccc2[N+](=O)[O-],0
freq,137.0,2,2,1502


In [219]:
# Rows of the sweet set with a missing canonical SMILES (expected to be none).
df_sweet_valid.loc[df_sweet_valid['Canonical SMILES'].isna()]

Unnamed: 0,Name,SMILES,Canonical SMILES,Target


In [220]:
# Rows of the bitter set with a missing canonical SMILES (expected to be none).
df_bitter_valid.loc[df_bitter_valid['Canonical SMILES'].isna()]

Unnamed: 0,Name,SMILES,Canonical SMILES,Target


In [221]:
# Remove duplicates
# Keep the first row for each canonical SMILES, then renumber the rows 0..n-1
# so the frames line up positionally with the descriptor tables built later.
# Chained, non-mutating calls replace the inplace=True variants.
df_sweet_valid = (
    df_sweet_valid
    .drop_duplicates(subset="Canonical SMILES")
    .reset_index(drop=True)
)
df_bitter_valid = (
    df_bitter_valid
    .drop_duplicates(subset="Canonical SMILES")
    .reset_index(drop=True)
)

In [222]:
# Set up the Mordred calculator over the full descriptor set.
# NOTE(review): ignore_3D=False keeps the 3D descriptors enabled, but the
# molecules below are parsed from SMILES only and no conformers are generated
# in this cell - confirm that the 3D descriptors are actually wanted.
calc = Calculator(descriptors, ignore_3D=False)

# Parse each canonical SMILES into an RDKit molecule.
mols_sweet = [Chem.MolFromSmiles(canonical) for canonical in df_sweet_valid['Canonical SMILES']]
mols_bitter = [Chem.MolFromSmiles(canonical) for canonical in df_bitter_valid['Canonical SMILES']]



In [223]:
# Compute the Mordred descriptors for every molecule; calc.pandas returns one
# table per call. quiet=True is passed to suppress progress output.
df_sweet_descriptors = calc.pandas(mols_sweet, quiet = True, ipynb = True)
df_bitter_descriptors = calc.pandas(mols_bitter, quiet = True, ipynb = True)

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)




  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)




  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)




  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)




In [224]:
# Final table layout: identifier columns | descriptor columns | Target.
# The pieces are joined side by side, aligned on their row index.
df_sweet_base = df_sweet_valid.loc[:, ['Name', 'SMILES', 'Canonical SMILES']]
df_sweet_with_descriptors = pd.concat(
    [df_sweet_base, df_sweet_descriptors, df_sweet_valid.loc[:, ['Target']]],
    axis='columns',
)

df_bitter_base = df_bitter_valid.loc[:, ['Name', 'SMILES', 'Canonical SMILES']]
df_bitter_with_descriptors = pd.concat(
    [df_bitter_base, df_bitter_descriptors, df_bitter_valid.loc[:, ['Target']]],
    axis='columns',
)

In [225]:
# to_csv does not create missing parent directories, so make sure the output
# folder exists before writing.
os.makedirs('mordred', exist_ok=True)

# Write one descriptor table per taste class, without the row index.
df_sweet_with_descriptors.to_csv('mordred/sweet_descriptor.csv', index=False)
df_bitter_with_descriptors.to_csv('mordred/bitter_descriptor.csv', index=False)