In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm

import os

## Read XYZs

In [2]:
# Column labels: 'SMILES' followed by the property names.
labels = [
    'SMILES',
    'A', 'B', 'C',
    'mu', 'alpha',
    'homo', 'lumo', 'gap',
    'r2', 'zpve',
    'U0', 'U', 'H', 'G', 'Cv',
]

# Sample IDs collected as a set; each ID is listed exactly once
# (repeating an entry in a set has no effect).
bad_samples = {
    21725, 87037, 59827, 117523, 128113,
    129053, 129152, 129158, 130535, 6620,
    59818,
}

In [3]:
# File name of the CSV dataset, given relative to the working directory.
csv_fname = 'qm9.csv'

## Transform Data

In [4]:
# Load the dataset from the path configured in `csv_fname` rather than
# repeating the literal, so changing the configured name actually takes effect.
data = pd.read_csv(csv_fname)

In [5]:
# Shuffle all rows and renumber the index from 0.
# A fixed seed keeps the row order -- and therefore the positional
# train/validation/test slices taken later -- identical across kernel restarts.
SHUFFLE_SEED = 42
data = data.sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)

In [6]:
# Filter extreme outliers: keep only rows whose 'A' value lies strictly
# between 0 and 500 (rows where 'A' is NaN fail both comparisons and are dropped).
a_in_range = (data['A'] > 0) & (data['A'] < 500)
data = data[a_in_range]

In [7]:
# Separate SMILES and targets.
smiles = data['SMILES']
# Every column after the first is treated as a target.
# Take an explicit copy: the following cells assign into `y`, and writing to a
# slice of `data` raises SettingWithCopyWarning and leaves it ambiguous whether
# `data` is modified as well.
y = data.iloc[:, 1:].copy()

In [8]:
# Replace columns A, B and C by their natural logarithm, then rename them
# so the column names reflect the transformation.
log_columns = {'A': 'logA', 'B': 'logB', 'C': 'logC'}
for column in log_columns:
    y[column] = np.log(y[column])

y.rename(columns=log_columns, inplace=True)

In [9]:
# Normalize each target column: subtract the column mean, then divide by
# the column standard deviation.
# NOTE(review): mu/std are computed over all rows, i.e. before the
# train/validation/test split -- confirm this is intended.
mu, std = y.mean(), y.std()
centered = y - mu
norm_y = centered / std

## Create Splits

In [11]:
# 8/1/1 split sizes (integer division).
# `n_train` is 8/10 of the rows; `n_test` is 1/10 of the rows and is used as
# the length of the slice that directly follows the training rows.
n_samples = len(smiles)
n_train = n_samples * 8 // 10
n_test = n_samples // 10

In [12]:
# Split SMILES and normalized targets at the same positional boundaries:
#   [0, n_train)                -> train
#   [n_train, n_train + n_test) -> validation
#   [n_train + n_test, end)     -> test (receives any rounding remainder)
val_end = n_train + n_test

train_smiles = smiles.iloc[:n_train]
validation_smiles = smiles.iloc[n_train:val_end]
test_smiles = smiles.iloc[val_end:]

train_y = norm_y.iloc[:n_train]
validation_y = norm_y.iloc[n_train:val_end]
test_y = norm_y.iloc[val_end:]