In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm

from rdkit import Chem
from rdkit.Chem import AllChem

import os

In [4]:
# Raw inputs; both paths are relative to the current working directory.
coordinates_file = 'raw/raw_coordinates.npy'  # array loaded with np.load
csv_file = 'raw/raw_qm9.csv'                  # table loaded with pd.read_csv

## Load Data

In [5]:
# Read the table, then split it into targets and the 'SMILES' column.
data = pd.read_csv(csv_file)

# Targets are every column except the first one (selected by position).
# NOTE(review): presumably column 0 is 'SMILES' — confirm against the CSV header.
y = data.iloc[:, 1:]
smiles = data.loc[:, 'SMILES']

# Array loaded from the .npy file; it is row-filtered with the same mask as the table.
coordinates = np.load(coordinates_file)

## Remove outliers

In [6]:
# Outlier screen on column 'A': keep only rows whose value lies strictly
# inside the open interval (0, 500).
mask = y['A'].gt(0) & y['A'].lt(500)

In [7]:
# Apply the same row mask to all three containers so they stay aligned.
# The names are rebound here, so run this cell once per kernel session:
# `mask` keeps its pre-filter length and no longer fits `coordinates` afterwards.
smiles = smiles[mask]
y = y[mask]
coordinates = coordinates[mask]

## Normalize

In [8]:
# Standardize each target column: subtract its mean and divide by its
# standard deviation (pandas default, ddof=1).
# `mu` and `std` stay available as Series for the statistics table built later.
mu, std = y.mean(), y.std()
norm_y = y.sub(mu, axis='columns').div(std, axis='columns')

In [9]:
# Collect the per-column mean and standard deviation side by side:
# one row per target column, with columns named 'mean' and 'std'.
norm_statistics = pd.concat([mu, std], axis=1, keys=['mean', 'std'])

## Save

In [10]:
# Write the processed dataset into the current working directory.
# `index=False` is the documented boolean for omitting the row index.
smiles.to_csv('smiles.csv', index=False)
norm_y.to_csv('norm_y.csv', index=False)
# NOTE(review): omitting the index here also drops the target names that label
# each statistics row; rows follow the column order of norm_y.csv. Confirm how
# readers consume this file before changing its layout.
norm_statistics.to_csv('norm_statistics.csv', index=False)
np.save('coordinates.npy', coordinates)