In [8]:
# Install RDKit into the running kernel's environment (%pip targets the kernel,
# whereas !pip may hit a different interpreter). Pinned to the version this
# notebook was last executed with so that re-runs are reproducible.
%pip install -q rdkit==2022.9.2


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting rdkit
  Downloading rdkit-2022.9.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (29.3 MB)
[K     |████████████████████████████████| 29.3 MB 3.3 MB/s 
Installing collected packages: rdkit
Successfully installed rdkit-2022.9.2


In [10]:
import pandas as pd

In [224]:
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import DataStructs
import numpy as np

def get_fingerprint(mol, radius=2, nBits=1024):
  """Return the hashed Morgan count fingerprint of ``mol`` as a dense vector.

  Args:
    mol: RDKit molecule to featurize.
    radius: Morgan radius passed to RDKit (defaults to 2, the value that was
      previously hard-coded).
    nBits: Length of the hashed fingerprint (defaults to 1024, the value that
      was previously hard-coded).

  Returns:
    A 1-D float NumPy array of length ``nBits``; position ``i`` holds the
    count RDKit reports for hashed bit ``i`` and is zero where absent.
  """
  fp = AllChem.GetHashedMorganFingerprint(mol, radius, nBits=nBits)
  arr = np.zeros((nBits,))
  for key, val in fp.GetNonzeroElements().items():
    arr[key] = val
  return arr

def load_data(file_name, X_label, y_label, nBits=1024):
  """Load a tab-separated dataset and featurize it with Morgan fingerprints.

  Rows whose SMILES is missing or cannot be parsed by RDKit are skipped, so
  the returned arrays may be shorter than the file.

  Args:
    file_name: Path to the tab-separated file.
    X_label: Name of the column holding SMILES strings.
    y_label: Name of the column holding the binary label.
    nBits: Fingerprint length; forwarded to ``get_fingerprint`` (it used to be
      accepted but ignored).

  Returns:
    ``(X, y)`` where ``X`` is a float array of shape ``(n_valid, nBits)`` and
    ``y`` is an integer array of shape ``(n_valid,)`` containing 0/1.
  """
  df = pd.read_csv(file_name, sep='\t')[[X_label, y_label]]

  X_clean = []
  y_clean = []
  for smiles, label in zip(df[X_label], df[y_label]):
    # A missing SMILES is read as NaN (a float); RDKit only accepts strings.
    if not isinstance(smiles, str):
      continue
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
      continue

    X_clean.append(get_fingerprint(mol, nBits=nBits))
    # Compare by value, not identity: `label is True` is False for NumPy
    # booleans and for 0/1 integer labels, which silently turned every row
    # into the negative class.
    y_clean.append(1 if label == 1 else 0)

  # The reshape keeps the matrix two-dimensional even when no row survived.
  return np.array(X_clean).reshape(-1, nBits), np.array(y_clean, dtype=int)

In [204]:
# Load and featurize the training and test splits.
X_train, y_train = load_data('data/sweet-train.tsv', X_label='Canonical SMILES', y_label='Sweet')
X_test, y_test = load_data('data/sweet-test.tsv', X_label='Canonical SMILES', y_label='Sweet')

In [205]:
# Display the training feature matrix (NumPy abbreviates the repr of large arrays).
X_train

array([[1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [0., 2., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [239]:
from xgboost import XGBClassifier

In [240]:
from sklearn.metrics import accuracy_score

In [241]:
# Instantiate an XGBoost classifier with its default hyperparameters.
model = XGBClassifier()

In [242]:
# Train the classifier on the training-split features and labels.
model.fit(X_train, y_train)

XGBClassifier()

In [243]:
# Predict labels for the test-split features.
y_pred = model.predict(X_test)

In [244]:
# Score the predictions against the test labels and report the result.
test_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy of Model::", test_accuracy)

Accuracy of Model:: 0.7261146496815286


## Compute 2D Descriptors

In [248]:
from rdkit.ML.Descriptors.MoleculeDescriptors import MolecularDescriptorCalculator
from rdkit.Chem import Descriptors

In [251]:
# Collect the name (first field) of every entry in RDKit's descriptor table.
descriptors_list = [entry[0] for entry in Descriptors._descList]

In [252]:
# Display the collected descriptor names.
descriptors_list

['MaxEStateIndex',
 'MinEStateIndex',
 'MaxAbsEStateIndex',
 'MinAbsEStateIndex',
 'qed',
 'MolWt',
 'HeavyAtomMolWt',
 'ExactMolWt',
 'NumValenceElectrons',
 'NumRadicalElectrons',
 'MaxPartialCharge',
 'MinPartialCharge',
 'MaxAbsPartialCharge',
 'MinAbsPartialCharge',
 'FpDensityMorgan1',
 'FpDensityMorgan2',
 'FpDensityMorgan3',
 'BCUT2D_MWHI',
 'BCUT2D_MWLOW',
 'BCUT2D_CHGHI',
 'BCUT2D_CHGLO',
 'BCUT2D_LOGPHI',
 'BCUT2D_LOGPLOW',
 'BCUT2D_MRHI',
 'BCUT2D_MRLOW',
 'BalabanJ',
 'BertzCT',
 'Chi0',
 'Chi0n',
 'Chi0v',
 'Chi1',
 'Chi1n',
 'Chi1v',
 'Chi2n',
 'Chi2v',
 'Chi3n',
 'Chi3v',
 'Chi4n',
 'Chi4v',
 'HallKierAlpha',
 'Ipc',
 'Kappa1',
 'Kappa2',
 'Kappa3',
 'LabuteASA',
 'PEOE_VSA1',
 'PEOE_VSA10',
 'PEOE_VSA11',
 'PEOE_VSA12',
 'PEOE_VSA13',
 'PEOE_VSA14',
 'PEOE_VSA2',
 'PEOE_VSA3',
 'PEOE_VSA4',
 'PEOE_VSA5',
 'PEOE_VSA6',
 'PEOE_VSA7',
 'PEOE_VSA8',
 'PEOE_VSA9',
 'SMR_VSA1',
 'SMR_VSA10',
 'SMR_VSA2',
 'SMR_VSA3',
 'SMR_VSA4',
 'SMR_VSA5',
 'SMR_VSA6',
 'SMR_VSA7',
 'SMR_

In [283]:
def get_2d_descriptors(mol):
  """Compute every descriptor named in ``descriptors_list`` for ``mol``.

  Returns:
    A dict mapping each descriptor name reported by the calculator to the
    value it computed for ``mol``.
  """
  calculator = MolecularDescriptorCalculator(descriptors_list)
  names = calculator.GetDescriptorNames()
  values = calculator.CalcDescriptors(mol)
  return {name: value for name, value in zip(names, values)}

def get_fingerprint(mol, radius=2, nBits=1024):
  """Return the hashed Morgan count fingerprint of ``mol`` as a dense vector.

  NOTE: a function with this name is also defined in an earlier cell; running
  this cell replaces that definition.

  Args:
    mol: RDKit molecule to featurize.
    radius: Morgan radius passed to RDKit (defaults to 2, the value that was
      previously hard-coded).
    nBits: Length of the hashed fingerprint (defaults to 1024, the value that
      was previously hard-coded).

  Returns:
    A 1-D float NumPy array of length ``nBits``; position ``i`` holds the
    count RDKit reports for hashed bit ``i`` and is zero where absent.
  """
  fp = AllChem.GetHashedMorganFingerprint(mol, radius, nBits=nBits)
  arr = np.zeros((nBits,))
  for key, val in fp.GetNonzeroElements().items():
    arr[key] = val
  return arr

def load_data(file_name, X_label='Canonical SMILES', y_label='Sweet', nBits=1024, verbose=True):
  """Load a tab-separated dataset and featurize it with 2D descriptors.

  Rows whose SMILES is missing or cannot be parsed by RDKit are skipped, so
  the returned frames may have fewer rows than the file.

  Args:
    file_name: Path to the tab-separated file.
    X_label: Name of the column holding SMILES strings.
    y_label: Name of the column holding the binary label.
    nBits: Unused by the descriptor featurization; kept only so existing
      callers that pass it keep working.
    verbose: When true (the default, matching the previous behaviour) print
      one progress line per input row.

  Returns:
    ``(X, y)`` where ``X`` is a DataFrame with one column per descriptor and
    ``y`` is a single-column DataFrame of 0/1 labels, row-aligned with ``X``.
  """
  df = pd.read_csv(file_name, sep='\t')[[X_label, y_label]]
  total = len(df)

  X_clean = []
  y_clean = []
  # enumerate replaces the manual counter that shadowed the builtin `iter`.
  for row_number, (smiles, label) in enumerate(zip(df[X_label], df[y_label]), start=1):
    if verbose:
      print(f'Parsing {row_number}/{total}...')
    # A missing SMILES is read as NaN (a float); RDKit only accepts strings.
    if not isinstance(smiles, str):
      continue
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
      continue

    # Only the descriptors are returned, so the fingerprint that used to be
    # computed here (and then discarded) is no longer calculated.
    X_clean.append(get_2d_descriptors(mol))
    # Compare by value, not identity: `label is True` is False for NumPy
    # booleans and for 0/1 integer labels.
    y_clean.append(1 if label == 1 else 0)

  return pd.DataFrame(X_clean), pd.DataFrame(y_clean)


In [287]:
# Build the descriptor table and labels for the training split.
X_exp, y_exp = load_data('data/sweet-train.tsv', X_label='Canonical SMILES', y_label='Sweet')

Parsing 1/2205...
Parsing 2/2205...
Parsing 3/2205...
Parsing 4/2205...
Parsing 5/2205...
Parsing 6/2205...
Parsing 7/2205...
Parsing 8/2205...
Parsing 9/2205...
Parsing 10/2205...
Parsing 11/2205...
Parsing 12/2205...
Parsing 13/2205...
Parsing 14/2205...
Parsing 15/2205...
Parsing 16/2205...
Parsing 17/2205...
Parsing 18/2205...
Parsing 19/2205...
Parsing 20/2205...
Parsing 21/2205...
Parsing 22/2205...
Parsing 23/2205...
Parsing 24/2205...
Parsing 25/2205...
Parsing 26/2205...
Parsing 27/2205...
Parsing 28/2205...
Parsing 29/2205...
Parsing 30/2205...
Parsing 31/2205...
Parsing 32/2205...
Parsing 33/2205...
Parsing 34/2205...
Parsing 35/2205...
Parsing 36/2205...
Parsing 37/2205...
Parsing 38/2205...
Parsing 39/2205...
Parsing 40/2205...
Parsing 41/2205...
Parsing 42/2205...
Parsing 43/2205...
Parsing 44/2205...
Parsing 45/2205...
Parsing 46/2205...
Parsing 47/2205...
Parsing 48/2205...
Parsing 49/2205...
Parsing 50/2205...
Parsing 51/2205...
Parsing 52/2205...
Parsing 53/2205...
Pa

Unnamed: 0,0
0,1
1,1
2,1
3,1
4,1
...,...
2181,0
2182,0
2183,0
2184,0


In [195]:
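# Disabled experiment, kept for reference: BorutaShap feature selection.
# #!pip install BorutaShap
# from BorutaShap import BorutaShap
# # Create a BorutaShap selector using SHAP importances (classification = False selects regression mode)
# selector = BorutaShap(importance_measure = 'shap', classification = False)
# # Fit the selector
# # NOTE(review): X_clean_df and y_clean are not defined in the visible notebook cells -- confirm the intended inputs before re-enabling.
# selector.fit(X = X_clean_df, y = np.array(y_clean), n_trials = 10, sample = True, verbose = True)
# # n_trials -> number of iterations of the Boruta algorithm
# # sample -> subsample the data so the run goes faster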