# Calculate RDKit 2D features (including MolLogP)

In [30]:
import pandas as pd
import os

from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import Draw
import matplotlib.pyplot as plt
from tqdm import tqdm

In [25]:
# Column of the input CSVs that holds each molecule's SMILES string.
SMILES_COLUMN = 'smiles'
# Column of the input CSVs that holds the logP value carried into the output table.
VALUE_COLUMN = 'logP'
# Name of the descriptor column that the evaluation section compares with
# VALUE_COLUMN; it is not referenced in this section.
PREDS_COLUMN = 'MolLogP'
# Directory the train/validation/test split CSVs are read from.
DATASET_INPUT_PATH = '../../../data/3_final_data/split_data'

# Directory the feature tables and SMILES-index JSON files are written to.
DATASET_OUTPUT_PATH = '../../../data/raw/baselines/jtree'

In [26]:
from descriptastorus.descriptors import rdDescriptors
from rdkit import Chem
import logging


# Shared descriptor generator instance: the code below calls
# generator.process(smiles) and reads descriptor names from generator.columns.
generator = rdDescriptors.RDKit2D()


def rdkit_2d_features(smiles: str):
    """Compute the descriptor values of one molecule with the shared ``generator``.

    Args:
        smiles: SMILES string of the molecule.

    Returns:
        The descriptor values without the leading "computed ok" flag that
        ``generator.process`` puts in front of them. Callers match the values
        positionally against ``generator.columns``. An empty list is returned
        (and a warning logged) when the generator yields no result at all.
    """
    results = generator.process(smiles)
    if results is None:
        # No result to slice - presumably the SMILES could not be parsed
        # (TODO confirm against descriptastorus). Report it instead of failing
        # with "'NoneType' object is not subscriptable".
        logging.warning("Unable to process smiles %s", smiles)
        return []
    # n.b. the first element is true/false if the descriptors were properly computed
    processed, features = results[0], results[1:]
    if not processed:
        # The flag is a boolean, so test its truthiness; an ``is None`` check
        # would never fire for a failed computation.
        logging.warning("Unable to process smiles %s", smiles)
    # if the flag is false, the features are default values for the type
    return features

In [27]:
# Read the train / validation / test splits from DATASET_INPUT_PATH, in that order.
train_data, val_data, test_data = (
    pd.read_csv(os.path.join(DATASET_INPUT_PATH, csv_name))
    for csv_name in (
        'logp_wo_averaging_train.csv',
        'logp_wo_averaging_validation.csv',
        'logp_wo_averaging_test.csv',
    )
)

In [28]:
def create_feature_dataframe(df):
    """Build the per-molecule descriptor table for one data split.

    Args:
        df: DataFrame containing a ``SMILES_COLUMN`` and a ``VALUE_COLUMN`` column.

    Returns:
        A ``(rdkit_features, smiles_index_dict)`` tuple.
        ``rdkit_features`` has one row per row of ``df``, in the same order,
        with one column per descriptor name taken from ``generator.columns``
        plus ``SMILES_COLUMN`` and ``VALUE_COLUMN``.
        ``smiles_index_dict`` maps each SMILES string to its 0-based row
        position; when a SMILES occurs more than once, the last position wins.
    """
    features_names = [gen[0] for gen in generator.columns]
    rdkit_table = []
    smiles_index_dict = {}
    # Walk the two needed columns directly rather than building a full row
    # Series twice per molecule via df.iloc[i].
    rows = zip(df[SMILES_COLUMN], df[VALUE_COLUMN])
    for i, (smiles, logP) in enumerate(tqdm(rows, total=len(df))):
        # Pair every descriptor value with its name by position.
        features = dict(zip(features_names, rdkit_2d_features(smiles)))
        features[SMILES_COLUMN] = smiles
        features[VALUE_COLUMN] = logP
        rdkit_table.append(features)
        smiles_index_dict[smiles] = i
    # Build the frame once from the list of row dicts.
    rdkit_features = pd.DataFrame(rdkit_table)
    return rdkit_features, smiles_index_dict

In [31]:
# Featurize each split. Every call returns the descriptor table and the
# SMILES -> row-position dict for that split. The recorded progress bars
# show roughly 40 molecules per second, so the train split takes a few minutes.
train_data_rdkit, train_smiles_dict = create_feature_dataframe(train_data)
val_data_rdkit, val_smiles_dict = create_feature_dataframe(val_data)
test_data_rdkit, test_smiles_dict = create_feature_dataframe(test_data)

100%|██████████| 9643/9643 [04:00<00:00, 40.15it/s]
100%|██████████| 2067/2067 [00:53<00:00, 39.00it/s]
100%|██████████| 2067/2067 [00:52<00:00, 39.04it/s]


In [32]:
import json

# Neither DataFrame.to_csv nor open() creates missing directories, so make
# sure the output directory exists before writing anything.
os.makedirs(DATASET_OUTPUT_PATH, exist_ok=True)

# One (feature table, SMILES -> row-position dict, CSV name, JSON name) entry
# per split. NOTE: the 'drkit' spelling in the CSV names is kept as is because
# the evaluation section reads the files back under exactly these names.
for split_features, split_smiles_dict, csv_name, json_name in (
    (train_data_rdkit, train_smiles_dict,
     'logp_wo_averaging_train_drkit_feat.csv', 'logp_wo_averaging_train_smiles_dict.json'),
    (val_data_rdkit, val_smiles_dict,
     'logp_wo_averaging_val_drkit_feat.csv', 'logp_wo_averaging_val_smiles_dict.json'),
    (test_data_rdkit, test_smiles_dict,
     'logp_wo_averaging_test_drkit_feat.csv', 'logp_wo_averaging_test_smiles_dict.json'),
):
    # The CSV keeps the frame's index as its first column (to_csv default).
    split_features.to_csv(os.path.join(DATASET_OUTPUT_PATH, csv_name))
    with open(os.path.join(DATASET_OUTPUT_PATH, json_name), 'w') as f:
        json.dump(split_smiles_dict, f)


# Get errors: RMSE and R2 of MolLogP against the reference logP

In [33]:
import pandas as pd
import os

In [40]:
# Column names, redeclared with the same values as in the feature-calculation section.
SMILES_COLUMN = 'smiles'
VALUE_COLUMN = 'logP'
PREDS_COLUMN = 'MolLogP'


# Directory the saved feature CSVs are read back from; this is the same path
# string as DATASET_OUTPUT_PATH in the feature-calculation section.
DATA_PATH = '../../../data/raw/baselines/jtree'

In [41]:
def _load_eval_frame(csv_name):
    """Read one saved feature CSV from DATA_PATH, keeping only the evaluation columns.

    Args:
        csv_name: File name of the feature CSV inside ``DATA_PATH``.

    Returns:
        DataFrame with the ``SMILES_COLUMN``, ``VALUE_COLUMN`` and
        ``PREDS_COLUMN`` columns, indexed by the CSV's first column.
    """
    frame = pd.read_csv(os.path.join(DATA_PATH, csv_name), index_col=0)
    # Select through PREDS_COLUMN rather than a repeated 'MolLogP' literal so
    # this cell and the metric cell always refer to the same column.
    return frame[[SMILES_COLUMN, VALUE_COLUMN, PREDS_COLUMN]]


# These names are rebound here: from now on they hold the reduced evaluation
# frames, not the raw split CSVs.
train_data = _load_eval_frame('logp_wo_averaging_train_drkit_feat.csv')
val_data = _load_eval_frame('logp_wo_averaging_val_drkit_feat.csv')
test_data = _load_eval_frame('logp_wo_averaging_test_drkit_feat.csv')

In [42]:
from sklearn.metrics import mean_squared_error, r2_score

In [43]:
# RMSE is obtained as the square root (** 0.5) of the mean squared error.
# In every call the reference values (VALUE_COLUMN) are passed first and the
# MolLogP values (PREDS_COLUMN) second.
train_rmse = mean_squared_error(train_data[VALUE_COLUMN], train_data[PREDS_COLUMN])**0.5
train_r2 = r2_score(train_data[VALUE_COLUMN], train_data[PREDS_COLUMN])
val_rmse = mean_squared_error(val_data[VALUE_COLUMN], val_data[PREDS_COLUMN])**0.5
val_r2 = r2_score(val_data[VALUE_COLUMN], val_data[PREDS_COLUMN])
test_rmse = mean_squared_error(test_data[VALUE_COLUMN], test_data[PREDS_COLUMN])**0.5
test_r2 = r2_score(test_data[VALUE_COLUMN], test_data[PREDS_COLUMN])

In [44]:
# Report RMSE and R2 for each split (test, validation, train). The explicit
# '\n' plus print's own newline leaves a blank line after every value.
print(f'Test RMSE is {test_rmse!s}\n')
print(f'Test R2 is {test_r2!s}\n')
print(f'Val RMSE is {val_rmse!s}\n')
print(f'Val R2 is {val_r2!s}\n')
print(f'Train RMSE is {train_rmse!s}\n')
print(f'Train R2 is {train_r2!s}\n')

Test RMSE is 0.9563863600078659

Test R2 is 0.7283876679456853

Val RMSE is 0.8915539462330561

Val R2 is 0.764251706400189

Train RMSE is 0.9186532709897776

Train R2 is 0.761049391797775

