In [1]:
import pandas as pd
import os
import numpy as np
import yaml

In [2]:
def prepare_dataset(tar_list):
    df_blank = pd.DataFrame({'smiles':[]})
    for dataset in tar_list:
        df0 = pd.read_csv(os.path.join(DATASET_PATH, dataset))
        df_blank =  pd.merge(df_blank, df0, on=SMILES_COLUMN, how='outer')
    return df_blank

## LogP + LogD

In [4]:
DATASET_PATH = '../data/3_final_data'
DATASET_OUTPUT_PATH = '../data/3_final_data'
SMILES_COLUMN = 'smiles'
VALUE_COLUMNS = ['logP','logD']
DATASET_NAMES = ['logp_wo_averaging.csv', 'logd_Lip_wo_averaging.csv']

In [4]:
dataset = prepare_dataset(DATASET_NAMES)
dataset.to_csv(os.path.join(DATASET_OUTPUT_PATH, 'logp_logd_Lip_wo_averaging.csv'))

In [7]:
dataset.shape

(17685, 3)

In [4]:
dataset_logP = pd.read_csv(os.path.join(DATASET_PATH, DATASET_NAMES[0]))
dataset_logD = pd.read_csv(os.path.join(DATASET_PATH, DATASET_NAMES[1]))

### LogP without json + LogD

In [5]:
DATASET_PATH = '../data/3_final_data'
DATASET_OUTPUT_PATH = '../data/3_final_data'
SMILES_COLUMN = 'smiles'
VALUE_COLUMNS = ['logP','logD']
DATASET_NAMES = ['logp_wo_logp_json_wo_averaging.csv', 'logd_Lip_wo_averaging.csv']

In [8]:
dataset_logP = pd.read_csv(os.path.join(DATASET_PATH, DATASET_NAMES[0]))
dataset_logD = pd.read_csv(os.path.join(DATASET_PATH, DATASET_NAMES[1]))

In [6]:
dataset = prepare_dataset(DATASET_NAMES)
dataset.to_csv(os.path.join(DATASET_OUTPUT_PATH, 'logp_wo_logp_json_logd_Lip_wo_averaging.csv'))

In [7]:
dataset.shape

(17603, 3)

### Calculate number of symmetric molecules

In [12]:
import pandas as pd
import numpy as np
import seaborn as sns
import torch
from scipy import stats
from torch.nn.utils.rnn import pad_sequence
from sklearn.neural_network import MLPRegressor
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem
from sklearn.metrics import mean_squared_error, r2_score
import time
import os
from tqdm import tqdm
from scipy.stats import shapiro
from scipy.stats import normaltest
from scipy.stats import ttest_ind
from scipy.stats import mannwhitneyu
import seaborn as sns
import matplotlib.pyplot as plt

In [24]:
VALUE_COLUMN = 'logP'
symmetric_rull = lambda values: (len(values) - 1 <= len(values[values % 2 == 0])) or\
(len(values) - 1 <= len(values[(values % 2 == 1) & (values > 1)]))
values_list = [pd.Series(Chem.CanonicalRankAtoms(Chem.MolFromSmiles(smiles), breakTies=False)).value_counts()\
               for smiles in\
               dataset_logP[~dataset_logP[VALUE_COLUMN].isna()][SMILES_COLUMN]]
symmetric_indices = [symmetric_rull(values) for values in values_list]
not_symmetric_indices = [not e for e in symmetric_indices]

In [25]:
len(dataset_logP[symmetric_indices])

608

In [26]:
len(dataset_logP[not_symmetric_indices])

13080

In [16]:
VALUE_COLUMN = 'logD'
symmetric_rull = lambda values: (len(values) - 1 <= len(values[values % 2 == 0])) or\
(len(values) - 1 <= len(values[(values % 2 == 1) & (values > 1)]))
values_list = [pd.Series(Chem.CanonicalRankAtoms(Chem.MolFromSmiles(smiles), breakTies=False)).value_counts()\
               for smiles in\
               dataset_logD[~dataset_logD[VALUE_COLUMN].isna()][SMILES_COLUMN]]
symmetric_indices = [symmetric_rull(values) for values in values_list]
not_symmetric_indices = [not e for e in symmetric_indices]

In [21]:
len(dataset_logD[symmetric_indices])

33

In [22]:
len(dataset_logD[not_symmetric_indices])

4133

## Logp + LogS

In [5]:
DATASET_PATH = '../data/3_final_data'
DATASET_OUTPUT_PATH = '../data/3_final_data'
SMILES_COLUMN = 'smiles'
VALUE_COLUMNS = ['logP','logS']
DATASET_NAMES = ['logp_wo_averaging.csv', 'esol.csv']

In [8]:
dataset = prepare_dataset(DATASET_NAMES)

In [5]:



dataset.to_csv(os.path.join(DATASET_OUTPUT_PATH, 'logp_esol_wo_averaging.csv'))



In [6]:
dataset_logP = pd.read_csv(os.path.join(DATASET_PATH, DATASET_NAMES[0]))
dataset_logS = pd.read_csv(os.path.join(DATASET_PATH, DATASET_NAMES[1]))

In [9]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 14043 entries, 0 to 14042
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   smiles  14043 non-null  object 
 1   logP    13777 non-null  float64
 2   logS    1058 non-null   float64
dtypes: float64(2), object(1)
memory usage: 438.8+ KB


### LogP without json + LogS

In [27]:
DATASET_PATH = '../data/3_final_data'
DATASET_OUTPUT_PATH = '../data/3_final_data'
SMILES_COLUMN = 'smiles'
VALUE_COLUMNS = ['logP','logS']
DATASET_NAMES = ['logp_wo_logp_json_wo_averaging.csv', 'esol.csv']

In [28]:
dataset = prepare_dataset(DATASET_NAMES)
dataset.to_csv(os.path.join(DATASET_OUTPUT_PATH, 'logp_wo_logp_json_esol_wo_averaging.csv'))

In [29]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 13960 entries, 0 to 13959
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   smiles  13960 non-null  object 
 1   logP    13688 non-null  float64
 2   logS    1058 non-null   float64
dtypes: float64(2), object(1)
memory usage: 436.2+ KB


## Logp + FreeSolv

In [15]:
DATASET_PATH = '../data/3_final_data'
DATASET_OUTPUT_PATH = '../data/3_final_data'
SMILES_COLUMN = 'smiles'
VALUE_COLUMNS = ['logP','Energy']
DATASET_NAMES = ['logp_wo_averaging.csv', 'freesolv.csv']

In [16]:
dataset = prepare_dataset(DATASET_NAMES)

In [27]:

dataset.to_csv(os.path.join(DATASET_OUTPUT_PATH, 'logp_FreeSolv_wo_averaging.csv'))

In [11]:
dataset_logP = pd.read_csv(os.path.join(DATASET_PATH, DATASET_NAMES[0]))
dataset_Energy = pd.read_csv(os.path.join(DATASET_PATH, DATASET_NAMES[1]))

In [17]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 13940 entries, 0 to 13939
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   smiles  13940 non-null  object 
 1   logP    13777 non-null  float64
 2   Energy  565 non-null    float64
dtypes: float64(2), object(1)
memory usage: 435.6+ KB


### LogP without + json FreeSolv

In [24]:
DATASET_PATH = '../data/3_final_data'
DATASET_OUTPUT_PATH = '../data/3_final_data'
SMILES_COLUMN = 'smiles'
VALUE_COLUMNS = ['logP','Energy']
DATASET_NAMES = ['logp_wo_logp_json_wo_averaging.csv', 'freesolv.csv']

In [25]:
dataset = prepare_dataset(DATASET_NAMES)
dataset.to_csv(os.path.join(DATASET_OUTPUT_PATH, 'logp_wo_logp_json_FreeSolv_wo_averaging.csv'))

In [26]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 13851 entries, 0 to 13850
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   smiles  13851 non-null  object 
 1   logP    13688 non-null  float64
 2   Energy  565 non-null    float64
dtypes: float64(2), object(1)
memory usage: 432.8+ KB


## Check intersected molecules

In [11]:
DATASET_PATH = '../data/3_final_data'
SMILES_COLUMN = 'smiles'

In [12]:
DATASET_NAMES = ['logp_wo_logp_json_wo_averaging.csv', 'logd_Lip_wo_averaging.csv','esol.csv', 'freesolv.csv']

In [13]:
dataset_logP = pd.read_csv(os.path.join(DATASET_PATH, DATASET_NAMES[0]))
dataset_logD = pd.read_csv(os.path.join(DATASET_PATH, DATASET_NAMES[1]))
dataset_logS = pd.read_csv(os.path.join(DATASET_PATH, DATASET_NAMES[2]))
dataset_Energy = pd.read_csv(os.path.join(DATASET_PATH, DATASET_NAMES[3]))

### LogP + LogD

In [14]:
dataset_logP[dataset_logP[SMILES_COLUMN].isin(dataset_logD[SMILES_COLUMN])]

Unnamed: 0,smiles,logP
94,C#CCN(C)C(C)Cc1ccccc1,2.90
440,C=CCOc1ccccc1OCC(O)CNC(C)C,2.10
450,C=CCc1ccccc1OCC(O)CNC(C)C,3.10
607,CC(=O)CC(c1ccccc1)c1c(O)c2ccccc2oc1=O,2.60
612,CC(=O)CCCCn1c(=O)c2c(ncn2C)n(C)c1=O,0.29
...,...,...
13529,c1ccc2[nH]ncc2c1,1.77
13619,c1ccc2ccccc2c1,3.30
13620,c1ccc2cnccc2c1,2.08
13628,c1ccc2ncccc2c1,2.03


### LogP + LogS

In [15]:
dataset_logP[dataset_logP[SMILES_COLUMN].isin(dataset_logS[SMILES_COLUMN])]

Unnamed: 0,smiles,logP
0,BrC(Br)(Br)Br,3.42
15,Brc1cc(Br)c(Br)cc1Br,5.13
19,Brc1cc(Br)cc(Br)c1,4.51
26,Brc1ccc(Br)cc1,3.79
34,Brc1cccc(Br)c1,3.75
...,...,...
13649,c1ccoc1,1.34
13650,c1ccsc1,1.81
13658,c1cnc2c(c1)ccc1ncccc12,2.51
13661,c1cnc2ncncc2n1,-0.58


### LogP + FreeSolv

In [16]:
dataset_logP[dataset_logP[SMILES_COLUMN].isin(dataset_Energy[SMILES_COLUMN])]

Unnamed: 0,smiles,logP
11,BrCc1ccccc1,2.92
26,Brc1ccc(Br)cc1,3.79
38,Brc1ccccc1,2.99
87,C#CCCC,1.98
90,C#CCCCC,2.73
...,...,...
13619,c1ccc2ccccc2c1,3.30
13628,c1ccc2ncccc2c1,2.03
13644,c1ccccc1,2.13
13647,c1ccncc1,0.65


## Split datasets

### LogP + LogD

In [12]:
import pandas as pd
import os

In [13]:
DATASET_OUTPUT_PATH = '../data/raw/baselines/dmpnn'

In [14]:
def train_test_validation_split(df):
    from sklearn.model_selection import train_test_split
    train_data, rest_data = train_test_split(df, test_size=0.3)
    test_data, validation_data = train_test_split(rest_data, test_size=0.5)
    return train_data.reset_index(drop=True), validation_data.reset_index(drop=True), test_data.reset_index(drop=True)

In [15]:
file = 'logp_logd_Lip_wo_averaging'

In [16]:
data = pd.read_csv(os.path.join(DATASET_OUTPUT_PATH, file+'.csv'))
DATASET_PATH = '../data/3_final_data'

data.to_csv(os.path.join(DATASET_PATH, file+'.csv'))

print(file, 'shape: ', data.shape)    
train, validation, test = train_test_validation_split(data)
print('SPLITTED SHAPES:\n\ttrain: {0}\n\tvalidation: {1}\n\ttest: {2}\n'.format(train.shape, validation.shape, test.shape))

train.to_csv(os.path.join(DATASET_PATH, 'split_data', file + '_train.csv'))
validation.to_csv(os.path.join(DATASET_PATH, 'split_data', file + '_validation.csv'))
test.to_csv(os.path.join(DATASET_PATH, 'split_data',  file + '_test.csv'))

logp_logd_Lip_wo_averaging shape:  (17685, 4)
SPLITTED SHAPES:
	train: (12379, 4)
	validation: (2653, 4)
	test: (2653, 4)



### LogP + LogS

In [11]:
import pandas as pd
import os

In [12]:
DATASET_OUTPUT_PATH = '../data/raw/baselines/dmpnn'

In [13]:
file = 'logp_esol_wo_averaging'

In [14]:
def train_test_validation_split(df):
    from sklearn.model_selection import train_test_split
    train_data, rest_data = train_test_split(df, test_size=0.3)
    test_data, validation_data = train_test_split(rest_data, test_size=0.5)
    return train_data.reset_index(drop=True), validation_data.reset_index(drop=True), test_data.reset_index(drop=True)

In [15]:
data = pd.read_csv(os.path.join(DATASET_OUTPUT_PATH, file+'.csv'))
DATASET_PATH = '../data/3_final_data'

data.to_csv(os.path.join(DATASET_PATH, file+'.csv'))

print(file, 'shape: ', data.shape)    
train, validation, test = train_test_validation_split(data)
print('SPLITTED SHAPES:\n\ttrain: {0}\n\tvalidation: {1}\n\ttest: {2}\n'.format(train.shape, validation.shape, test.shape))

train.to_csv(os.path.join(DATASET_PATH, 'split_data', file + '_train.csv'))
validation.to_csv(os.path.join(DATASET_PATH, 'split_data', file + '_validation.csv'))
test.to_csv(os.path.join(DATASET_PATH, 'split_data',  file + '_test.csv'))

logp_esol_wo_averaging shape:  (14043, 4)
SPLITTED SHAPES:
	train: (9830, 4)
	validation: (2107, 4)
	test: (2106, 4)



### LogP + Energy

In [30]:
import pandas as pd
import os

In [31]:
DATASET_OUTPUT_PATH = '../data/raw/baselines/dmpnn'

In [32]:
file = 'logp_FreeSolv_wo_averaging'

In [33]:
def train_test_validation_split(df):
    from sklearn.model_selection import train_test_split
    train_data, rest_data = train_test_split(df, test_size=0.3)
    test_data, validation_data = train_test_split(rest_data, test_size=0.5)
    return train_data.reset_index(drop=True), validation_data.reset_index(drop=True), test_data.reset_index(drop=True)

In [34]:
data = pd.read_csv(os.path.join(DATASET_OUTPUT_PATH, file+'.csv'))
DATASET_PATH = '../data/3_final_data'

data.to_csv(os.path.join(DATASET_PATH, file+'.csv'))

print(file, 'shape: ', data.shape)    
train, validation, test = train_test_validation_split(data)
print('SPLITTED SHAPES:\n\ttrain: {0}\n\tvalidation: {1}\n\ttest: {2}\n'.format(train.shape, validation.shape, test.shape))

train.to_csv(os.path.join(DATASET_PATH, 'split_data', file + '_train.csv'))
validation.to_csv(os.path.join(DATASET_PATH, 'split_data', file + '_validation.csv'))
test.to_csv(os.path.join(DATASET_PATH, 'split_data',  file + '_test.csv'))

logp_FreeSolv_wo_averaging shape:  (13940, 4)
SPLITTED SHAPES:
	train: (9758, 4)
	validation: (2091, 4)
	test: (2091, 4)



## Create datasets with intersected molecules for logP+logD

In [30]:
import pandas as pd
import os
import numpy as np
import yaml

In [31]:
def prepare_dataset(tar_list):
    df_blank = pd.DataFrame({'smiles':[]})
    for dataset in tar_list:
        df0 = pd.read_csv(os.path.join(DATASET_PATH, dataset))
        df_blank =  pd.merge(df_blank, df0, on=SMILES_COLUMN, how='outer')
    return df_blank

In [9]:
DATASET_PATH = '../data/3_final_data'
DATASET_OUTPUT_PATH = '../data/raw/baselines/dmpnn'
SMILES_COLUMN = 'smiles'
VALUE_COLUMNS = ['logP','logD']
DATASET_NAMES = ['logp_wo_averaging.csv', 'logd_Lip_wo_averaging.csv']

In [10]:
dataset = prepare_dataset(DATASET_NAMES)

In [12]:
dataset_logP = dataset[~dataset[VALUE_COLUMNS[0]].isna()]
dataset_logD = dataset[~dataset[VALUE_COLUMNS[1]].isna()]

In [13]:
dataset_logP.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 13777 entries, 0 to 13776
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   smiles  13777 non-null  object 
 1   logP    13777 non-null  float64
 2   logD    258 non-null    float64
dtypes: float64(2), object(1)
memory usage: 430.5+ KB


In [14]:
dataset_logD.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4166 entries, 95 to 17684
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   smiles  4166 non-null   object 
 1   logP    258 non-null    float64
 2   logD    4166 non-null   float64
dtypes: float64(2), object(1)
memory usage: 130.2+ KB


In [17]:
DATASET_PATH = '../data/3_final_data'

dataset_logP.to_csv(os.path.join(DATASET_PATH, 'logp_258_Lip_wo_averaging'+'.csv'), index = False)
dataset_logD.to_csv(os.path.join(DATASET_PATH, 'logd_258_Logp_wo_averaging'+'.csv'), index = False)

### Logp wo logp.json + log D

In [32]:
DATASET_PATH = '../data/3_final_data'
DATASET_OUTPUT_PATH = '../data/3_final_data'
SMILES_COLUMN = 'smiles'
VALUE_COLUMNS = ['logP','logD']
DATASET_NAMES = ['logp_wo_logp_json_wo_averaging.csv', 'logd_Lip_wo_averaging.csv']

In [33]:
dataset = prepare_dataset(DATASET_NAMES)

In [34]:
dataset_logP = dataset[~dataset[VALUE_COLUMNS[0]].isna()]
dataset_logD = dataset[~dataset[VALUE_COLUMNS[1]].isna()]

In [35]:
dataset_logP.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 13688 entries, 0 to 13687
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   smiles  13688 non-null  object 
 1   logP    13688 non-null  float64
 2   logD    251 non-null    float64
dtypes: float64(2), object(1)
memory usage: 427.8+ KB


In [36]:
dataset_logD.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4166 entries, 94 to 17602
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   smiles  4166 non-null   object 
 1   logP    251 non-null    float64
 2   logD    4166 non-null   float64
dtypes: float64(2), object(1)
memory usage: 130.2+ KB


In [37]:
DATASET_PATH = '../data/3_final_data'

dataset_logP.to_csv(os.path.join(DATASET_PATH, 'logp_wo_logp_json_251_Lip_wo_averaging'+'.csv'), index = False)
dataset_logD.to_csv(os.path.join(DATASET_PATH, 'logd_251_logp_wo_logp_json_wo_averaging'+'.csv'), index = False)

### Split data

In [16]:
def train_test_validation_split(df):
    from sklearn.model_selection import train_test_split
    train_data, rest_data = train_test_split(df, test_size=0.3)
    test_data, validation_data = train_test_split(rest_data, test_size=0.5)
    return train_data.reset_index(drop=True), validation_data.reset_index(drop=True), test_data.reset_index(drop=True)

In [19]:
files = ['logp_258_Lip_wo_averaging', 'logd_258_Logp_wo_averaging']
for file in files:
    data = pd.read_csv(os.path.join(DATASET_PATH, file+'.csv'))
    print(file, 'shape: ', data.shape)    
    train, validation, test = train_test_validation_split(data)
    print('SPLITTED SHAPES:\n\ttrain: {0}\n\tvalidation: {1}\n\ttest: {2}\n'.format(train.shape, validation.shape, test.shape))

    train.to_csv(os.path.join(DATASET_PATH, 'split_data', file + '_train.csv'))
    validation.to_csv(os.path.join(DATASET_PATH, 'split_data', file + '_validation.csv'))
    test.to_csv(os.path.join(DATASET_PATH, 'split_data',  file + '_test.csv'))

logp_258_Lip_wo_averaging shape:  (13777, 3)
SPLITTED SHAPES:
	train: (9643, 3)
	validation: (2067, 3)
	test: (2067, 3)

logd_258_Logp_wo_averaging shape:  (4166, 3)
SPLITTED SHAPES:
	train: (2916, 3)
	validation: (625, 3)
	test: (625, 3)



# Datasets correlation

### LogP + LogD

In [19]:
DATASET_PATH = '../data/3_final_data'
SMILES_COLUMN = 'smiles'
VALUE_COLUMNS = ['logP','logD']
DATASET_NAMES = ['logp_wo_logp_json_wo_averaging.csv', 'logd_Lip_wo_averaging.csv']
file = 'logp_wo_logp_json_logd_Lip_wo_averaging'

data = pd.read_csv(os.path.join(DATASET_PATH, file+'.csv'))
dataset_logP = pd.read_csv(os.path.join(DATASET_PATH, DATASET_NAMES[0]))
dataset_logD = pd.read_csv(os.path.join(DATASET_PATH, DATASET_NAMES[1]))

In [20]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17603 entries, 0 to 17602
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Unnamed: 0  17603 non-null  int64  
 1   smiles      17603 non-null  object 
 2   logP        13688 non-null  float64
 3   logD        4166 non-null   float64
dtypes: float64(2), int64(1), object(1)
memory usage: 550.2+ KB


In [21]:
logP_logD_cross_section = data.dropna()

In [22]:
print('Pearson\'s r LogP/LogD', \
      round(logP_logD_cross_section[VALUE_COLUMNS[0]].corr(logP_logD_cross_section[VALUE_COLUMNS[1]]), 3))
print('Spearman\'s rho LogP/LogD', \
      round(logP_logD_cross_section[VALUE_COLUMNS[0]].corr(logP_logD_cross_section[VALUE_COLUMNS[1]], method='spearman'), 3))
print('Kendall\'s tau LogP/LogD', \
      round(logP_logD_cross_section[VALUE_COLUMNS[0]].corr(logP_logD_cross_section[VALUE_COLUMNS[1]], method='kendall'), 3))

Pearson's r LogP/LogD 0.647
Spearman's rho LogP/LogD 0.645
Kendall's tau LogP/LogD 0.503


### Logp + LogS

In [22]:
DATASET_PATH = '../data/3_final_data'
SMILES_COLUMN = 'smiles'
VALUE_COLUMNS = ['logP','logS']
DATASET_NAMES = ['logp_wo_averaging.csv', 'esol.csv']
file = 'logp_esol_wo_averaging'

data = pd.read_csv(os.path.join(DATASET_PATH, file+'.csv'))
dataset_logP = pd.read_csv(os.path.join(DATASET_PATH, DATASET_NAMES[0]))
dataset_logS = pd.read_csv(os.path.join(DATASET_PATH, DATASET_NAMES[1]))

In [23]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14043 entries, 0 to 14042
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Unnamed: 0    14043 non-null  int64  
 1   Unnamed: 0.1  14043 non-null  int64  
 2   smiles        14043 non-null  object 
 3   logP          13777 non-null  float64
 4   logS          1058 non-null   float64
dtypes: float64(2), int64(2), object(1)
memory usage: 548.7+ KB


In [24]:
logP_logS_cross_section = data.dropna()

In [25]:
print('Pearson\'s r LogP/LogS', \
      round(logP_logS_cross_section[VALUE_COLUMNS[0]].corr(logP_logS_cross_section[VALUE_COLUMNS[1]]), 3))
print('Spearman\'s rho LogP/LogS', \
      round(logP_logS_cross_section[VALUE_COLUMNS[0]].corr(logP_logS_cross_section[VALUE_COLUMNS[1]], method='spearman'), 3))
print('Kendall\'s tau LogP/LogS', \
      round(logP_logS_cross_section[VALUE_COLUMNS[0]].corr(logP_logS_cross_section[VALUE_COLUMNS[1]], method='kendall'), 3))

Pearson's r LogP/LogS -0.853
Spearman's rho LogP/LogS -0.841
Kendall's tau LogP/LogS -0.666


### LogP + Energy

In [26]:
DATASET_PATH = '../data/3_final_data'
SMILES_COLUMN = 'smiles'
VALUE_COLUMNS = ['logP','Energy']
DATASET_NAMES = ['logp_wo_averaging.csv', 'freesolv.csv']
file = 'logp_FreeSolv_wo_averaging'

data = pd.read_csv(os.path.join(DATASET_PATH, file+'.csv'))
dataset_logP = pd.read_csv(os.path.join(DATASET_PATH, DATASET_NAMES[0]))
dataset_Energy = pd.read_csv(os.path.join(DATASET_PATH, DATASET_NAMES[1]))

In [27]:
logP_Energy_cross_section = data.dropna()

In [28]:
print('Pearson\'s r LogP/LogS', \
      round(logP_Energy_cross_section[VALUE_COLUMNS[0]].corr(logP_Energy_cross_section[VALUE_COLUMNS[1]]), 3))
print('Spearman\'s rho LogP/LogS', \
      round(logP_Energy_cross_section[VALUE_COLUMNS[0]].corr(logP_Energy_cross_section[VALUE_COLUMNS[1]], method='spearman'), 3))
print('Kendall\'s tau LogP/LogS', \
      round(logP_Energy_cross_section[VALUE_COLUMNS[0]].corr(logP_Energy_cross_section[VALUE_COLUMNS[1]], method='kendall'), 3))

Pearson's r LogP/LogS 0.427
Spearman's rho LogP/LogS 0.453
Kendall's tau LogP/LogS 0.306


# Number of acid groups and amins

In [28]:
DATASET_PATH = '../data/3_final_data'
SMILES_COLUMN = 'smiles'
VALUE_COLUMNS = ['logP','logD']
DATASET_NAMES = ['logp_wo_averaging.csv', 'logd_Lip_wo_averaging.csv']
file = 'logp_logd_Lip_wo_averaging'

data = pd.read_csv(os.path.join(DATASET_PATH, file+'.csv'))
dataset_logP = pd.read_csv(os.path.join(DATASET_PATH, DATASET_NAMES[0]))
dataset_logD = pd.read_csv(os.path.join(DATASET_PATH, DATASET_NAMES[1]))

In [29]:
import rdkit
from rdkit.Chem import Fragments
from descriptastorus.descriptors import rdDescriptors, rdNormalizedDescriptors

## Description

### Amins

fr_NH2 - primary amines

fr_NH1 - Secondary amines

fr_NH0 - Tertiary amines

fr_Ar_NH - aromatic amines


### Acids

fr_COO2 - carboxylic acids

fr_COO - carboxylic acids

fr_Ar_COO - Aromatic carboxylic acide

fr_Al_COO - carboxylic acids


In [30]:
amins = ['fr_NH2', 'fr_NH1', 'fr_NH0',  'fr_Ar_NH', ]
acids = ['fr_COO2', 'fr_COO', 'fr_Ar_COO', 'fr_Al_COO']

In [31]:
generator = rdDescriptors.RDKit2D(amins+acids)
features ={}
for feature in amins+acids:
    features[feature] = []
from tqdm import tqdm

for i in tqdm(range(len(data))):
    feature = dict(zip(amins+acids,generator.process(data[SMILES_COLUMN][i])[1:]))
    for feat in feature.keys():
        features[feat].append(feature[feat])
    

100%|██████████| 17685/17685 [00:07<00:00, 2262.09it/s]


In [32]:
for feature in features.keys():
    data[feature] = features[feature]

In [33]:
data.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,smiles,logP,logD,fr_NH2,fr_NH1,fr_NH0,fr_Ar_NH,fr_COO2,fr_COO,fr_Ar_COO,fr_Al_COO
0,0,0,BrC(Br)(Br)Br,3.42,,0,0,0,0,0,0,0,0
1,1,1,BrC1C2CC3C(C2)C13,3.11,,0,0,0,0,0,0,0,0
2,2,2,BrC1CC2CCC1C2,3.54,,0,0,0,0,0,0,0,0
3,3,3,BrC1CCCCC1,3.2,,0,0,0,0,0,0,0,0
4,4,4,BrC=C(Br)Br,3.2,,0,0,0,0,0,0,0,0


In [36]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17685 entries, 0 to 17684
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Unnamed: 0    17685 non-null  int64  
 1   Unnamed: 0.1  17685 non-null  int64  
 2   smiles        17685 non-null  object 
 3   logP          13777 non-null  float64
 4   logD          4166 non-null   float64
 5   fr_NH2        17685 non-null  int64  
 6   fr_NH1        17685 non-null  int64  
 7   fr_NH0        17685 non-null  int64  
 8   fr_Ar_NH      17685 non-null  int64  
 9   fr_COO2       17685 non-null  int64  
 10  fr_COO        17685 non-null  int64  
 11  fr_Ar_COO     17685 non-null  int64  
 12  fr_Al_COO     17685 non-null  int64  
dtypes: float64(2), int64(10), object(1)
memory usage: 1.8+ MB


In [37]:
data['Total fragments']= data[amins+acids].sum(axis=1)

In [38]:
data.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,smiles,logP,logD,fr_NH2,fr_NH1,fr_NH0,fr_Ar_NH,fr_COO2,fr_COO,fr_Ar_COO,fr_Al_COO,Total fragments
0,0,0,BrC(Br)(Br)Br,3.42,,0,0,0,0,0,0,0,0,0
1,1,1,BrC1C2CC3C(C2)C13,3.11,,0,0,0,0,0,0,0,0,0
2,2,2,BrC1CC2CCC1C2,3.54,,0,0,0,0,0,0,0,0,0
3,3,3,BrC1CCCCC1,3.2,,0,0,0,0,0,0,0,0,0
4,4,4,BrC=C(Br)Br,3.2,,0,0,0,0,0,0,0,0,0


In [51]:
print('Number of molecule with acids or amins group  in LogP+LogD dataset = ', len(data[data['Total fragments']>0]), '; ', \
      round(len(data[data['Total fragments']>0])/len(data)*100,1), '% of all data')

print('Number of molecule with acids or amins group  in LogD dataset = ', \
      len(data[data['Total fragments']>0][data[SMILES_COLUMN].isin(dataset_logD[SMILES_COLUMN])]), '; ', \
      round(len(data[data['Total fragments']>0][data[SMILES_COLUMN].isin(dataset_logD[SMILES_COLUMN])])/len(dataset_logD)*100,1), '% of all data')

print('Number of molecule with acids or amins group  in LogP dataset = ', \
      len(data[data['Total fragments']>0][data[SMILES_COLUMN].isin(dataset_logP[SMILES_COLUMN])]), '; ', \
      round(len(data[data['Total fragments']>0][data[SMILES_COLUMN].isin(dataset_logP[SMILES_COLUMN])])/len(dataset_logP)*100,1), '% of all data')

Number of molecule with acids or amins group  in LogP+LogD dataset =  14819 ;  83.8 % of all data
Number of molecule with acids or amins group  in LogD dataset =  4066 ;  97.6 % of all data
Number of molecule with acids or amins group  in LogP dataset =  10993 ;  79.8 % of all data


  len(data[data['Total fragments']>0][data[SMILES_COLUMN].isin(dataset_logD[SMILES_COLUMN])]), '; ', \
  round(len(data[data['Total fragments']>0][data[SMILES_COLUMN].isin(dataset_logD[SMILES_COLUMN])])/len(dataset_logD)*100,1), '% of all data')
  len(data[data['Total fragments']>0][data[SMILES_COLUMN].isin(dataset_logP[SMILES_COLUMN])]), '; ', \
  round(len(data[data['Total fragments']>0][data[SMILES_COLUMN].isin(dataset_logP[SMILES_COLUMN])])/len(dataset_logP)*100,1), '% of all data')
