In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
# Names of the transforms applied so far, in order. Each transform cell
# appends its own name; the joined list later names the output directory.
transforms = []

# Load the cleaned dataset from the current working directory.
data = pd.read_csv('./cleaned.csv')

# Every column except the first one is treated as a property to transform.
# NOTE(review): presumably the first column is 'SMILES' (the prefix step
# special-cases that name) — confirm against cleaned.csv.
properties = data.columns[1:]

### Log

In [3]:
# Register this step so it shows up in the output directory name.
transforms += ['log']

# Natural logarithm, applied element-wise to every property column.
# NOTE(review): zero or negative entries would come out as -inf / NaN —
# confirm the inputs are strictly positive.
log_values = np.log(data[properties])
data.loc[:, properties] = log_values

### Standardize

In [3]:
# Register this step so it shows up in the output directory name.
transforms.append('standardize')

# Z-score each property column in turn: subtract the column mean and divide
# by the column's sample standard deviation (pandas' default, ddof=1).
for property_name in properties:

    values = data.loc[:, property_name]

    mu, sigma = values.mean(), values.std()

    # A constant column has zero spread (or spread at floating-point rounding
    # level relative to its mean). Dividing by that would fill the column with
    # NaN/inf or amplified rounding noise, so fall back to a unit scale: the
    # column is then only centred. A NaN sigma fails this comparison and is
    # passed through unchanged.
    if sigma <= 10 * np.finfo(float).eps * abs(mu):
        sigma = 1.0

    data.loc[:, property_name] = (values - mu)/sigma

### Prefix

In [4]:
# Register this step so it shows up in the output directory name.
transforms += ['prefix']

# Tag every column with 'property_', leaving the 'SMILES' column name as is.
prefixed_columns = [
    column_name if column_name == 'SMILES' else 'property_' + column_name
    for column_name in data.columns
]

# Rename in place by replacing the frame's column labels.
data.columns = prefixed_columns

## Split and Save

In [5]:
# Row counts for the split: 8/10 of the rows for training and 1/10 for
# testing, both rounded down with integer division.
n_rows = len(data)
n_train = n_rows * 8 // 10
n_test = n_rows // 10

In [6]:
# Cut the frame into three consecutive, non-overlapping row ranges:
# the first n_train rows, the next n_test rows, and whatever remains.
# NOTE(review): rows are taken in file order with no shuffling — confirm that
# cleaned.csv is not sorted in a way that would bias the splits.
test_end = n_train + n_test
train = data.iloc[:n_train]
test = data.iloc[n_train:test_end]
validation = data.iloc[test_end:]

In [7]:
# The output directory is named after the applied transforms, in order
# (e.g. 'log_standardize_prefix' when the log, standardize and prefix cells
# have all been run).
directory_path = '_'.join(transforms)

# DataFrame.to_csv does not create missing parent directories, so make sure
# the target exists before writing. When no transform was recorded the name
# is empty; in that case the files go to the current working directory
# instead of the filesystem root ('/train.csv').
if directory_path:
    os.makedirs(directory_path, exist_ok=True)

# Write each split without the row index.
train.to_csv(os.path.join(directory_path, 'train.csv'), index=False)
test.to_csv(os.path.join(directory_path, 'test.csv'), index=False)
validation.to_csv(os.path.join(directory_path, 'validation.csv'), index=False)