In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm

from rdkit import Chem
from rdkit.Chem import AllChem

import os

## Read XYZs

In [None]:
# Column names of the property table: 'SMILES' followed by 15 property names.
labels = [
    'SMILES',
    'A', 'B', 'C',
    'mu', 'alpha',
    'homo', 'lumo', 'gap',
    'r2', 'zpve',
    'U0', 'U', 'H', 'G', 'Cv',
]

# Ids of samples flagged as bad (each id listed once, in ascending order).
# NOTE(review): presumably these are skipped while the XYZ files are parsed;
# the cell that consumes this set is not visible here — confirm.
bad_samples = {
    6620, 21725, 59818, 59827, 87037, 117523,
    128113, 129053, 129152, 129158, 130535,
}

In [None]:
# File name of the property table (CSV) used by this notebook.
csv_fname = "qm9.csv"

In [None]:
# Join the collected coordinate arrays into a single ndarray along a new
# leading axis (np.stack requires every element to have the same shape).
# NOTE(review): `coordinates` is not defined in any cell visible here;
# presumably it is a list built while parsing the XYZ files — confirm.
coordinates = np.stack(coordinates)

## Filter Data

In [None]:
# Load the property table. Use the shared `csv_fname` setting instead of
# repeating the literal path, so the file name is configured in one place.
data = pd.read_csv(csv_fname)

In [None]:
# Filter extreme outliers: keep only rows whose 'A' value lies strictly
# between 0 and 500. Rows where 'A' is missing fail both comparisons and are
# therefore excluded as well.
a_values = data['A']
mask = a_values.gt(0) & a_values.lt(500)

In [None]:
# Apply the outlier mask to the table and to the coordinate array together,
# so that row i of `data` still corresponds to coordinates[i] afterwards.
keep = mask.to_numpy()  # plain boolean ndarray for positional NumPy indexing

# Validate before rebinding anything: if this cell is executed a second time
# `data`/`coordinates` are already filtered while `mask` is not, and the two
# containers would otherwise silently drift apart or fail half-way through.
assert len(data) == len(coordinates) == len(keep), (
    "mask, data and coordinates must have the same number of rows; "
    "re-run the notebook from the top if this cell was already executed"
)

data = data.loc[mask]
coordinates = coordinates[keep]

## Transform

In [None]:
# Separate SMILES and targets
smiles = data['SMILES']
# Every column after the first one is treated as a target.
# Take an explicit copy: the following cells assign to and rename columns of
# `y`, and doing that on a slice of `data` triggers SettingWithCopyWarning and
# leaves it ambiguous whether `data` is modified as well.
y = data.iloc[:, 1:].copy()

In [None]:
# Replace A, B and C by their natural logarithm, then rename the columns so
# the transformation is visible in the column names.
log_names = {'A': 'logA', 'B': 'logB', 'C': 'logC'}

for column in log_names:
    y[column] = np.log(y[column])

y.rename(columns=log_names, inplace=True)

In [None]:
# Standardize every target column: subtract the column mean and divide by the
# column standard deviation.
# Note: `mu` here is the Series of column means, not the 'mu' target column.
mu = y.mean()
std = y.std()
norm_y = y.sub(mu).div(std)

In [None]:
# Store the per-column mean and standard deviation used for normalization in
# a two-column table ('mean', 'std'), one row per target, and write it to disk.
norm_statistics = pd.concat([mu, std], axis=1, keys=['mean', 'std'])
norm_statistics.to_csv('norm_statistics.csv')

## Shuffle

In [None]:
# Fixed seed so the shuffle order — and with it the row order of everything
# derived from `indices` — is identical after every Restart & Run All.
SHUFFLE_SEED = 42
rng = np.random.default_rng(SHUFFLE_SEED)

# Random ordering of the row positions 0 .. len(smiles) - 1.
indices = rng.permutation(len(smiles))

In [None]:
# Reorder all three containers with the same positional permutation so that
# their rows stay aligned with each other.
smiles_shuffled, norm_y_shuffled = (
    table.iloc[indices] for table in (smiles, norm_y)
)
coordinates_shuffled = coordinates[indices]

## Save

In [None]:
# Write the shuffled SMILES and normalized targets as CSV (without the index
# column) and the shuffled coordinates as a NumPy binary file.
for frame, path in (
    (smiles_shuffled, 'smiles.csv'),
    (norm_y_shuffled, 'norm_y.csv'),
):
    frame.to_csv(path, index=False)

np.save('coordinates.npy', coordinates_shuffled)

In [None]:
# Pairwise Euclidean distances between the points of each sample.
# Built from the *shuffled* coordinates: smiles.csv, norm_y.csv and
# coordinates.npy are all saved in shuffled order, so distances derived from
# the unshuffled array would not line up row for row with those files.
# Broadcasting the array against itself on axes 1 and 2 yields every pairwise
# difference; the norm over the last axis turns each difference into a distance.
# Any NaN or infinite distance is replaced by +inf.
# NOTE(review): assumes the array is (n_samples, n_points, n_dims) and that
# missing points are padded with NaN/inf — TODO confirm against the XYZ parser.
d = np.nan_to_num(
    np.linalg.norm(
        np.expand_dims(coordinates_shuffled, axis=1)
        - np.expand_dims(coordinates_shuffled, axis=2),
        axis=-1
    ),
    nan=np.inf, posinf=np.inf, neginf=np.inf
)

In [None]:
# Persist the pairwise distance array `d` as a NumPy binary file.
np.save(file='distances.npy', arr=d)