In [1]:
import pandas as pd
from rdkit import Chem
import os

# Dataset splitting

In [2]:
# Directory (relative to this notebook) that holds the final CSV datasets
# read by the splitting cells below.
DATASET_PATH = "../data/3_final_data"

In [3]:
def train_test_validation_split(df, train_fraction, random_state=None):
    """Randomly split ``df`` into train, validation and test parts.

    The train part is whatever remains after ``1 - train_fraction`` of the
    rows is set aside; the set-aside rows are then divided in half between
    validation and test.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows to split.
    train_fraction : float
        Fraction of rows intended for the train part.
    random_state : int, numpy RandomState or None, optional
        Forwarded to both ``train_test_split`` calls so the split can be
        reproduced. The default ``None`` keeps the original unseeded
        behaviour.

    Returns
    -------
    tuple
        ``(train, validation, test)`` -- note the order -- each with its
        index reset to ``0..n-1``.
    """
    # Local import: at this point of the notebook scikit-learn has not been
    # imported at module level yet.
    from sklearn.model_selection import train_test_split

    train_data, rest_data = train_test_split(
        df, test_size=1 - train_fraction, random_state=random_state)
    # train_test_split returns (first part, second part). The second part is
    # used as validation, so it receives the extra row when the remainder has
    # an odd number of rows.
    test_data, validation_data = train_test_split(
        rest_data, test_size=0.5, random_state=random_state)
    return (
        train_data.reset_index(drop=True),
        validation_data.reset_index(drop=True),
        test_data.reset_index(drop=True),
    )

In [4]:
# File stems (no ".csv" extension) of the datasets to split, grouped so that
# each group can be split with its own train fraction.
logp_datasets = [
    'logp_wo_logp_json_wo_averaging',
    'logp_wo_logp_json_251_Lip_wo_averaging',
    'logp_wo_logp_json_logd_Lip_wo_averaging',
]
logd_datasets = [
    'logd_Lip_wo_averaging',
    'logd_251_logp_wo_logp_json_wo_averaging',
]

In [5]:
# Random train/validation/test split of every logP dataset; 70% of the rows
# are requested for training.
train_fraction = 0.7

# DataFrame.to_csv does not create missing parent directories, so make sure
# the output directory exists before the first write.
output_dir = os.path.join(DATASET_PATH, 'split_data')
os.makedirs(output_dir, exist_ok=True)

for file in logp_datasets:
    data = pd.read_csv(os.path.join(DATASET_PATH, file+'.csv'))
    print(file, 'shape: ', data.shape)
    train, validation, test = train_test_validation_split(data, train_fraction)
    print('SPLITTED SHAPES:\n\ttrain: {0}\n\tvalidation: {1}\n\ttest: {2}\n'.format(train.shape, validation.shape, test.shape))

    # NOTE(review): to_csv is left at its default index=True, so each file
    # gets the row index as an extra unnamed first column. Kept as-is because
    # other readers of these files may rely on that layout -- confirm before
    # switching to index=False.
    train.to_csv(os.path.join(output_dir, file + '_train.csv'))
    validation.to_csv(os.path.join(output_dir, file + '_validation.csv'))
    test.to_csv(os.path.join(output_dir, file + '_test.csv'))

logp_wo_logp_json_wo_averaging shape:  (13688, 2)
SPLITTED SHAPES:
	train: (9581, 2)
	validation: (2054, 2)
	test: (2053, 2)

logp_wo_logp_json_251_Lip_wo_averaging shape:  (13688, 3)
SPLITTED SHAPES:
	train: (9581, 3)
	validation: (2054, 3)
	test: (2053, 3)

logp_wo_logp_json_logd_Lip_wo_averaging shape:  (17603, 4)
SPLITTED SHAPES:
	train: (12322, 4)
	validation: (2641, 4)
	test: (2640, 4)



In [6]:
# Random train/validation/test split of every logD dataset; 80% of the rows
# are requested for training.
train_fraction = 0.8

# DataFrame.to_csv does not create missing parent directories, so make sure
# the output directory exists before the first write.
output_dir = os.path.join(DATASET_PATH, 'split_data')
os.makedirs(output_dir, exist_ok=True)

for file in logd_datasets:
    data = pd.read_csv(os.path.join(DATASET_PATH, file+'.csv'))
    print(file, 'shape: ', data.shape)
    train, validation, test = train_test_validation_split(data, train_fraction)
    print('SPLITTED SHAPES:\n\ttrain: {0}\n\tvalidation: {1}\n\ttest: {2}\n'.format(train.shape, validation.shape, test.shape))

    # NOTE(review): to_csv is left at its default index=True, so each file
    # gets the row index as an extra unnamed first column. Kept as-is because
    # other readers of these files may rely on that layout -- confirm before
    # switching to index=False.
    train.to_csv(os.path.join(output_dir, file + '_train.csv'))
    validation.to_csv(os.path.join(output_dir, file + '_validation.csv'))
    test.to_csv(os.path.join(output_dir, file + '_test.csv'))

logd_Lip_wo_averaging shape:  (4166, 2)
SPLITTED SHAPES:
	train: (3332, 2)
	validation: (417, 2)
	test: (417, 2)

logd_251_logp_wo_logp_json_wo_averaging shape:  (4166, 3)
SPLITTED SHAPES:
	train: (3332, 3)
	validation: (417, 3)
	test: (417, 3)



## Analyze the distribution of symmetric molecules

In [5]:
def _is_symmetric(mol):
    """Heuristic symmetry flag for a single RDKit molecule.

    Atoms are ranked with ``Chem.CanonicalRankAtoms(..., breakTies=False)``
    and grouped by rank. The molecule is flagged when all rank groups but at
    most one have an even size, or when all but at most one have an odd size
    greater than one.
    """
    class_sizes = pd.Series(Chem.CanonicalRankAtoms(mol, breakTies=False)).value_counts()
    n_classes = len(class_sizes)
    n_even = len(class_sizes[class_sizes % 2 == 0])
    n_odd_repeated = len(class_sizes[(class_sizes % 2 == 1) & (class_sizes > 1)])
    return (n_classes - 1 <= n_even) or (n_classes - 1 <= n_odd_repeated)


def analyze_data_split_symmetry(args):
    """Print how many molecules of each split are flagged as symmetric.

    Parameters
    ----------
    args : dict
        Maps a split name (e.g. ``'train'``) to a DataFrame with a ``smiles``
        column.

    Returns
    -------
    None
        One line per split is printed: the count of flagged molecules and
        their percentage of that split. An empty split is reported as
        ``0`` molecules and ``0.0%`` instead of raising.
    """
    for name, arg in args.items():
        mols = [Chem.MolFromSmiles(s) for s in arg.smiles]
        symmetry_list = [_is_symmetric(mol) for mol in mols]
        symmetric_mols_count = sum(symmetry_list)
        # Guard the percentage: an empty split would otherwise divide by zero.
        percentage = symmetric_mols_count / len(mols) * 100 if mols else 0.0
        print('{0}: {1} symmetric molecules ({2}%)'.format(name, symmetric_mols_count,
                                                        percentage))

In [13]:
# File stems of every dataset whose random split is analysed below.
dataset_names = [
    'logp_wo_logp_json_wo_averaging',
    'logd_Lip_wo_averaging',
    'logp_wo_logp_json_251_Lip_wo_averaging',
    'logd_251_logp_wo_logp_json_wo_averaging',
    'logp_wo_logp_json_logd_Lip_wo_averaging',
]
# Kept with a trailing slash: file names are appended by plain concatenation.
DATA_PATH = '../data/3_final_data/split_data/'

In [12]:
# Report the share of symmetric molecules in each part of every split.
for name in dataset_names:
    file_prefix = DATA_PATH + name
    data_train = pd.read_csv(file_prefix + '_train.csv')
    data_valid = pd.read_csv(file_prefix + '_validation.csv')
    data_test = pd.read_csv(file_prefix + '_test.csv')
    print(name)
    analyze_data_split_symmetry(dict(train=data_train, valid=data_valid, test=data_test))

logp_wo_logp_json_wo_averaging
train: 412 symmetric molecules (4.300177434505793%)
valid: 99 symmetric molecules (4.8198636806231745%)
test: 97 symmetric molecules (4.72479298587433%)
logd_Lip_wo_averaging
train: 25 symmetric molecules (0.7503001200480192%)
valid: 3 symmetric molecules (0.7194244604316548%)
test: 5 symmetric molecules (1.1990407673860912%)
logp_wo_logp_json_251_Lip_wo_averaging
train: 404 symmetric molecules (4.216678843544515%)
valid: 103 symmetric molecules (5.0146056475170395%)
test: 101 symmetric molecules (4.919629810034096%)
logd_251_logp_wo_logp_json_wo_averaging
train: 27 symmetric molecules (0.8103241296518607%)
valid: 2 symmetric molecules (0.4796163069544364%)
test: 4 symmetric molecules (0.9592326139088728%)
logp_wo_logp_json_logd_Lip_wo_averaging
train: 466 symmetric molecules (3.7818535951955856%)
valid: 84 symmetric molecules (3.180613404013631%)
test: 84 symmetric molecules (3.1818181818181817%)


# Symmetry-based splits

In [6]:
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split

In [7]:
# Directory with the final CSV datasets; same value as the DATASET_PATH
# assigned earlier in this notebook.
DATASET_PATH = "../data/3_final_data"

In [8]:
def train_test_validation_split(df, train_fraction, random_state=None):
    """Randomly split ``df`` into train, validation and test parts.

    The train part is whatever remains after ``1 - train_fraction`` of the
    rows is set aside; the set-aside rows are then divided in half between
    validation and test.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows to split.
    train_fraction : float
        Fraction of rows intended for the train part.
    random_state : int, numpy RandomState or None, optional
        Forwarded to both ``train_test_split`` calls so the split can be
        reproduced. The default ``None`` keeps the original unseeded
        behaviour.

    Returns
    -------
    tuple
        ``(train, validation, test)`` -- note the order -- each with its
        index reset to ``0..n-1``.
    """
    train_data, rest_data = train_test_split(
        df, test_size=1 - train_fraction, random_state=random_state)
    # train_test_split returns (first part, second part). The second part is
    # used as validation, so it receives the extra row when the remainder has
    # an odd number of rows.
    test_data, validation_data = train_test_split(
        rest_data, test_size=0.5, random_state=random_state)
    return (
        train_data.reset_index(drop=True),
        validation_data.reset_index(drop=True),
        test_data.reset_index(drop=True),
    )

In [9]:
# File stems (no ".csv" extension) of the datasets to split, grouped so that
# each group can be split with its own train fraction.
logp_datasets = [
    'logp_wo_logp_json_wo_averaging',
    'logp_wo_logp_json_251_Lip_wo_averaging',
    'logp_wo_logp_json_logd_Lip_wo_averaging',
]
logd_datasets = [
    'logd_Lip_wo_averaging',
    'logd_251_logp_wo_logp_json_wo_averaging',
]

In [10]:
# Symmetry-aware split of the logP datasets: molecules flagged as symmetric
# and the remaining molecules are split separately with the same fraction and
# then merged, so each part gets a similar share of symmetric molecules.
train_fraction = 0.7

# DataFrame.to_csv does not create missing parent directories, so make sure
# the output directory exists before the first write.
output_dir = os.path.join(DATASET_PATH, 'split_data_symmetry')
os.makedirs(output_dir, exist_ok=True)

for file in logp_datasets:
    data = pd.read_csv(os.path.join(DATASET_PATH, file+'.csv'))
    print(file, 'shape: ', data.shape)
    mols = [Chem.MolFromSmiles(s) for s in data.smiles]
    # Size of every group of atoms that share a canonical rank (ties kept).
    values_list = [pd.Series(Chem.CanonicalRankAtoms(mol, breakTies=False)).value_counts() for mol in mols]
    # Flag a molecule when all rank groups but at most one are even-sized, or
    # all but at most one are odd-sized with more than one atom. The mask is
    # built with an explicit bool dtype and the frame's own index so that `~`
    # and boolean row selection are well-defined.
    symmetry_list = pd.Series(
        [(len(values) - 1 <= len(values[values % 2 == 0]))
         or (len(values) - 1 <= len(values[(values % 2 == 1) & (values > 1)])) for values in values_list],
        index=data.index, dtype=bool)
    symmetry_data = data[symmetry_list]
    no_symmetry_data = data[~symmetry_list]
    train_symm, validation_symm, test_symm = train_test_validation_split(symmetry_data, train_fraction)
    train_no_symm, validation_no_symm, test_no_symm = train_test_validation_split(no_symmetry_data, train_fraction)
    # Merge the two groups and shuffle so symmetric rows are not all first.
    train = pd.concat([train_symm, train_no_symm], ignore_index=True)
    train = shuffle(train).reset_index(drop=True)
    validation = pd.concat([validation_symm, validation_no_symm], ignore_index=True)
    validation = shuffle(validation).reset_index(drop=True)
    test = pd.concat([test_symm, test_no_symm], ignore_index=True)
    test = shuffle(test).reset_index(drop=True)
    print('SPLITTED SHAPES:\n\ttrain: {0}\n\tvalidation: {1}\n\ttest: {2}\n'.format(train.shape, validation.shape, test.shape))

    # NOTE(review): to_csv is left at its default index=True, so each file
    # gets the row index as an extra unnamed first column -- confirm whether
    # readers rely on it before switching to index=False.
    train.to_csv(os.path.join(output_dir, file + '_train.csv'))
    validation.to_csv(os.path.join(output_dir, file + '_validation.csv'))
    test.to_csv(os.path.join(output_dir, file + '_test.csv'))

logp_wo_logp_json_wo_averaging shape:  (13688, 2)
SPLITTED SHAPES:
	train: (9580, 2)
	validation: (2055, 2)
	test: (2053, 2)

logp_wo_logp_json_251_Lip_wo_averaging shape:  (13688, 3)
SPLITTED SHAPES:
	train: (9580, 3)
	validation: (2055, 3)
	test: (2053, 3)

logp_wo_logp_json_logd_Lip_wo_averaging shape:  (17603, 4)
SPLITTED SHAPES:
	train: (12321, 4)
	validation: (2642, 4)
	test: (2640, 4)



In [11]:
# Check the symmetric-molecule share of the logP files written to
# split_data_symmetry. The trailing slash matters: names are concatenated.
DATA_PATH = '../data/3_final_data/split_data_symmetry/'
for name in logp_datasets:
    file_prefix = DATA_PATH + name
    data_train = pd.read_csv(file_prefix + '_train.csv')
    data_valid = pd.read_csv(file_prefix + '_validation.csv')
    data_test = pd.read_csv(file_prefix + '_test.csv')
    print(name)
    analyze_data_split_symmetry(dict(train=data_train, valid=data_valid, test=data_test))

logp_wo_logp_json_wo_averaging
train: 425 symmetric molecules (4.4363256784968685%)
valid: 92 symmetric molecules (4.476885644768856%)
test: 91 symmetric molecules (4.432537749634681%)
logp_wo_logp_json_251_Lip_wo_averaging
train: 425 symmetric molecules (4.4363256784968685%)
valid: 92 symmetric molecules (4.476885644768856%)
test: 91 symmetric molecules (4.432537749634681%)
logp_wo_logp_json_logd_Lip_wo_averaging
train: 443 symmetric molecules (3.5954873792711632%)
valid: 96 symmetric molecules (3.6336109008327027%)
test: 95 symmetric molecules (3.5984848484848486%)


In [12]:
# Symmetry-aware split of the logD datasets: molecules flagged as symmetric
# and the remaining molecules are split separately with the same fraction and
# then merged, so each part gets a similar share of symmetric molecules.
train_fraction = 0.8

# DataFrame.to_csv does not create missing parent directories, so make sure
# the output directory exists before the first write.
output_dir = os.path.join(DATASET_PATH, 'split_data_symmetry')
os.makedirs(output_dir, exist_ok=True)

for file in logd_datasets:
    data = pd.read_csv(os.path.join(DATASET_PATH, file+'.csv'))
    print(file, 'shape: ', data.shape)
    mols = [Chem.MolFromSmiles(s) for s in data.smiles]
    # Size of every group of atoms that share a canonical rank (ties kept).
    values_list = [pd.Series(Chem.CanonicalRankAtoms(mol, breakTies=False)).value_counts() for mol in mols]
    # Flag a molecule when all rank groups but at most one are even-sized, or
    # all but at most one are odd-sized with more than one atom. The mask is
    # built with an explicit bool dtype and the frame's own index so that `~`
    # and boolean row selection are well-defined.
    symmetry_list = pd.Series(
        [(len(values) - 1 <= len(values[values % 2 == 0]))
         or (len(values) - 1 <= len(values[(values % 2 == 1) & (values > 1)])) for values in values_list],
        index=data.index, dtype=bool)
    symmetry_data = data[symmetry_list]
    no_symmetry_data = data[~symmetry_list]
    train_symm, validation_symm, test_symm = train_test_validation_split(symmetry_data, train_fraction)
    train_no_symm, validation_no_symm, test_no_symm = train_test_validation_split(no_symmetry_data, train_fraction)
    # Merge the two groups and shuffle so symmetric rows are not all first.
    train = pd.concat([train_symm, train_no_symm], ignore_index=True)
    train = shuffle(train).reset_index(drop=True)
    validation = pd.concat([validation_symm, validation_no_symm], ignore_index=True)
    validation = shuffle(validation).reset_index(drop=True)
    test = pd.concat([test_symm, test_no_symm], ignore_index=True)
    test = shuffle(test).reset_index(drop=True)
    print('SPLITTED SHAPES:\n\ttrain: {0}\n\tvalidation: {1}\n\ttest: {2}\n'.format(train.shape, validation.shape, test.shape))

    # NOTE(review): to_csv is left at its default index=True, so each file
    # gets the row index as an extra unnamed first column -- confirm whether
    # readers rely on it before switching to index=False.
    train.to_csv(os.path.join(output_dir, file + '_train.csv'))
    validation.to_csv(os.path.join(output_dir, file + '_validation.csv'))
    test.to_csv(os.path.join(output_dir, file + '_test.csv'))

logd_Lip_wo_averaging shape:  (4166, 2)
SPLITTED SHAPES:
	train: (3332, 2)
	validation: (418, 2)
	test: (416, 2)

logd_251_logp_wo_logp_json_wo_averaging shape:  (4166, 3)
SPLITTED SHAPES:
	train: (3332, 3)
	validation: (418, 3)
	test: (416, 3)



In [13]:
# Check the symmetric-molecule share of the logD files written to
# split_data_symmetry. The trailing slash matters: names are concatenated.
DATA_PATH = '../data/3_final_data/split_data_symmetry/'
for name in logd_datasets:
    file_prefix = DATA_PATH + name
    data_train = pd.read_csv(file_prefix + '_train.csv')
    data_valid = pd.read_csv(file_prefix + '_validation.csv')
    data_test = pd.read_csv(file_prefix + '_test.csv')
    print(name)
    analyze_data_split_symmetry(dict(train=data_train, valid=data_valid, test=data_test))

logd_Lip_wo_averaging
train: 26 symmetric molecules (0.78031212484994%)
valid: 4 symmetric molecules (0.9569377990430622%)
test: 3 symmetric molecules (0.7211538461538461%)
logd_251_logp_wo_logp_json_wo_averaging
train: 26 symmetric molecules (0.78031212484994%)
valid: 4 symmetric molecules (0.9569377990430622%)
test: 3 symmetric molecules (0.7211538461538461%)
