In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm

import os

## Read XYZs

In [2]:
# Column names: the SMILES string first, followed by the 15 property columns.
labels = [
    'SMILES',
    'A', 'B', 'C',
    'mu', 'alpha',
    'homo', 'lumo', 'gap',
    'r2', 'zpve',
    'U0', 'U', 'H', 'G',
    'Cv',
]

# Sample ids flagged as bad (each id listed once, in ascending order).
# NOTE(review): not referenced by any cell visible here -- presumably used
# when reading the raw files; confirm against the rest of the notebook.
bad_samples = {
    6620, 21725, 59818, 59827,
    87037, 117523, 128113, 129053,
    129152, 129158, 130535,
}

In [3]:
# File name of the QM9 table in CSV form, relative to the notebook's working directory.
csv_fname = 'qm9.csv'

## Transform Data

In [2]:
# Load the QM9 table. The path comes from `csv_fname` (defined in the cell
# above) so the file name is configured in exactly one place.
data = pd.read_csv(csv_fname)

In [3]:
# Shuffle
# A fixed seed makes the shuffled row order identical on every
# Restart & Run All, so anything computed from the row order downstream
# is reproducible.
SHUFFLE_SEED = 42
data = data.sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)

In [None]:
# Filter bad SMILES
from rdkit.Chem import MolFromSmiles

# One boolean per row, in row order: True when MolFromSmiles returned an
# object for the string, False when it returned None.
valid_smiles = [
    mol is not None
    for mol in map(MolFromSmiles, data['SMILES'])
]

# Boolean-mask selection: keep only the rows flagged True.
data = data[valid_smiles]

In [5]:
# Filter Extreme Outliers
# Keep only rows whose 'A' value lies strictly between 0 and 500
# (both bounds exclusive); the lower bound also guarantees the later
# logarithm of 'A' is defined.
data = data[(data['A'] > 0) & (data['A'] < 500)]

In [6]:
# Separate SMILES and targets
smiles = data['SMILES']

# Every column after the first is treated as a target.
# NOTE(review): this assumes 'SMILES' is the first column of the CSV -- confirm.
# The explicit .copy() gives `y` its own data: later cells assign into and
# rename columns of `y`, and doing that on a slice of the filtered `data`
# frame triggers pandas' SettingWithCopyWarning / ambiguous view-vs-copy
# behaviour.
y = data.iloc[:, 1:].copy()

In [7]:
# Log of A, B, C
# Replace each of the three columns by its natural logarithm.
for col in ('A', 'B', 'C'):
    y[col] = np.log(y[col])

# Rename the transformed columns in place so the names reflect the log scale.
y.rename(columns={'A': 'logA', 'B': 'logB', 'C': 'logC'}, inplace=True)

In [8]:
# Normalize
# Per-column statistics of the targets; `mu` and `std` are kept as
# notebook-level variables alongside the normalized frame.
mu, std = y.mean(), y.std()

# Standardize every column: subtract its mean, then divide by its
# standard deviation.
norm_y = y.sub(mu).div(std)

In [14]:
# Save
# Write the SMILES strings and the normalized targets to separate CSV
# files, without the index column.
for table, out_name in ((smiles, 'smiles.csv'), (norm_y, 'norm_y.csv')):
    table.to_csv(out_name, index=False)

## Create Splits