# Feature computation

In [1]:
import numpy as np

from rdkit import Chem
from rdkit import DataStructs
from rdkit.Chem import Descriptors, rdFingerprintGenerator
from rdkit.Chem import Descriptors
from tqdm import tqdm

from statsmodels.distributions.empirical_distribution import ECDF

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


real_200_descr = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,25,26,27,28,29,30, 31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159,160,161,162,163,164,165,166,167,168,169,170,171,172,173,174,175,176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,191,192,193,194,195,196,197,198,199,200,201,202,203,204,205,206,207]

In [2]:
example_mols = np.array([
    'CC1CC2C3CCC4=CC(=O)C=CC4([C@]3(C(CC2([C@]1(C(=O)CCl)O)C)O)F)C',
    'CCCCCCOC(=O)N=C(C1=CC=C(C=C1)NCC2=NC3=C(N2C)C=CC(=C3)C(=O)N(CCC(=O)OCC)C4=CC=CC=N4)N',
    'CSCCC(C(=O)NCC(=O)NC(CC1=CNC2=CC=CC=C21)C(=O)NC(CCSC)C(=O)NC(CC(=O)O)C(=O)NC(CC3=CC=CC=C3)C(=O)N)NC(=O)C(CC4=CC=C(C=C4)OS(=O)(=O)O)NC(=O)C(CC(=O)O)N'
])

In [3]:
muv = pd.read_csv("./data/muv.csv")

print('Shape: ', muv.shape)

vals = muv.values.flatten()
print('# meas.: ', len([v for v in vals if str(v) != 'nan']))

muv.head() 

Shape:  (93087, 19)
# meas.:  436060


Unnamed: 0,MUV-466,MUV-548,MUV-600,MUV-644,MUV-652,MUV-689,MUV-692,MUV-712,MUV-713,MUV-733,MUV-737,MUV-810,MUV-832,MUV-846,MUV-852,MUV-858,MUV-859,mol_id,smiles
0,,,,,,,,0.0,,,,0.0,,,,,,CID2999678,Cc1cccc(N2CCN(C(=O)C34CC5CC(CC(C5)C3)C4)CC2)c1C
1,0.0,0.0,,,0.0,0.0,0.0,,,,0.0,,0.0,,,0.0,0.0,CID2999679,Cn1ccnc1SCC(=O)Nc1ccc(Oc2ccccc2)cc1
2,,,0.0,,,,,,,,,,,,,,0.0,CID2999672,COc1cc2c(cc1NC(=O)CN1C(=O)NC3(CCc4ccccc43)C1=O...
3,,0.0,0.0,,,0.0,,,,,,,,,,0.0,,CID5390002,O=C1/C(=C/NC2CCS(=O)(=O)C2)c2ccccc2C(=O)N1c1cc...
4,0.0,,,,0.0,,0.0,0.0,,,,,,,,,,CID2999670,NC(=O)NC(Cc1ccccc1)C(=O)O


In [4]:
smiles_molecules = list(muv["smiles"])

In [5]:
# create mol objects
mols = list()

for smiles in smiles_molecules:
    mol = Chem.MolFromSmiles(smiles)
    mols.append(mol)
  

In [6]:
# ECFP fingerprints
ecfps = list()

for mol in tqdm(mols):
    fp_sparseVec = rdFingerprintGenerator.GetCountFPs(
                    [mol], fpType=rdFingerprintGenerator.MorganFP
                )[0]
    fp = np.zeros((0,), np.int8)  # Generate target pointer to fill
    DataStructs.ConvertToNumpyArray(fp_sparseVec, fp)
    
    ecfps.append(fp)

ecfps = np.array(ecfps)

100%|██████████| 93087/93087 [00:08<00:00, 11129.76it/s]


In [7]:
# Descriptors
rdkit_descriptors = list()
for mol in tqdm(mols):
    descrs = list()
    for descr in Descriptors._descList:
        _, descr_calc_fn = descr
        descrs.append(descr_calc_fn(mol))       
    descrs = np.array(descrs)
    descrs = descrs[real_200_descr]
    rdkit_descriptors.append(descrs)   
rdkit_descriptors = np.array(rdkit_descriptors)

100%|██████████| 93087/93087 [16:25<00:00, 94.45it/s] 


In [8]:
# create pretraining, finetuning splits (-> split in targets)
muv.columns

pretraining_cols = list(muv.columns[0:9])  # 9 tasks
finetuning1_cols = list(muv.columns[9:13]) # 4 tasks
finetuning2_cols = list(muv.columns[13:17])  # 4 tasks

sample_num = muv.shape[0] 
split_point = int(0.8 * sample_num)  # 80% for pretraining
pretraining_set = muv[:split_point][pretraining_cols]
finetuning1_set = muv[split_point:][finetuning1_cols]
finetuning2_set = muv[split_point:][finetuning2_cols]

In [9]:
# split samples of preprocessing into train val and test sets
indices = np.arange(len(pretraining_set))
train_idx, temp_idx = train_test_split(indices, test_size=0.3, random_state=42)
val_idx, test_idx = train_test_split(temp_idx, test_size=0.33, random_state=42)

In [10]:
# report quantils for rdkit descriptors
descriptors_raw_forECDF = rdkit_descriptors[train_idx]

In [11]:
rdkit_descriptors_quantils = np.zeros_like(rdkit_descriptors)

for column in range(descriptors_raw_forECDF.shape[1]):
    raw_values_ecdf = descriptors_raw_forECDF[:,column].reshape(-1)
    raw_values = rdkit_descriptors[:,column]#.reshape(-1)

    ecdf = ECDF(raw_values_ecdf)
    quantils = ecdf(raw_values)
    rdkit_descriptors_quantils[:,column] = quantils

In [12]:
# get all data and target splits
df_rdkit_descriptors_quantils = pd.DataFrame(rdkit_descriptors_quantils)
df_ecfps = pd.DataFrame(ecfps)

all_features = np.array(pd.concat([df_rdkit_descriptors_quantils, df_ecfps], axis=1))
# 0 - 200: rdkit descriptors quantils; 200 - 2248: ecfps
X_train = all_features[train_idx]
X_val = all_features[val_idx]
X_test = all_features[test_idx]

y_train = np.array(pretraining_set)[train_idx]
y_val = np.array(pretraining_set)[val_idx]
y_test = np.array(pretraining_set)[test_idx]

y_finetuning1 = np.array(finetuning1_set)
y_finetuning2 = np.array(finetuning2_set)



In [13]:
#   - fit standardizer only on train set but apply to all
scaler = StandardScaler()
scaler.fit(X_train)

In [14]:
# standardize features
X_train_scaled = scaler.transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

In [15]:
# log standardized features
np.savez('./data/data_scaled.npz', X_train=X_train_scaled, X_test=X_test_scaled, X_val=X_val_scaled, y_train=y_train, y_test=y_test, y_val=y_val, y_finetuning1=y_finetuning1, y_finetuning2=y_finetuning2)
