In [1]:
!pip install torch gpytorch dscribe ase



In [2]:
import sys

import pandas as pd
import matplotlib
%matplotlib inline

import torch
import gpytorch
import dscribe 
import numpy as np
import ase

sys.path.append("..")
from utils import helpers

pd.set_option('display.width',5000)
pd.set_option('display.max_columns',200)

In [3]:
# download file if it doesn't exist
# !curl -Lo ../df_full_split_ids_with_smiles_v15.json.tar.gz https://users.aalto.fi/ghoshk1/df_full_split_ids_with_smiles_v15.json.tar.gz
# !tar -xvzf ../df_full_split_ids_with_smiles_v15.json.tar.gz -C ..

# Use local file otherwise
json_file = "../df_full_split_ids_with_smiles_v15.json"

In [4]:
df_62k = pd.read_json(json_file, orient='split')

In [5]:
# compute HOMO_PBE (low fidelity) for all 62k molecules
get_level = helpers.get_level
df_62k['HOMO_PBE'] = df_62k.apply (lambda row: get_level(row, level_type='HOMO', subset='PBE+vdW_vacuum'), axis=1)

In [6]:
# get 5k subset which has high fidelity data
df_5k = df_62k[df_62k.energies_occ_gw_qzvp.notnull()]

In [7]:
# compute high fidelity levels for the 5k
df_5k['HOMO_GOWO'] = df_5k.apply(lambda row: get_level(row, level_type='HOMO', subset='GOWO_at_PBE0_qzvp'), axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [8]:
len(df_5k)

5239

# High-fidelity train and predict : 
Of the 5k molecules with high-fidelity energy levels, train on 4k and predict the remaining 1k

In [9]:
test_5k = df_5k[-1000:]
train_5k = df_5k[:-1000]
print(f"Number of training data = {len(train_5k)}, test data = {len(test_5k)}")

Number of training data = 4239, test data = 1000


In [62]:
import ase.io
from dscribe.descriptors import MBTR

# get all the atoms present in dataset
xyz2ase = helpers.xyz2ase
atoms_in_mols = train_5k['xyz_pbe_relaxed'].apply(lambda data: xyz2ase(data).numbers)
atom_set = set()
for atom in atoms_in_mols:
    atom_set = atom_set.union(set(atom))


mbtr = MBTR(
#     species=["H", "O"],
    species = list(atom_set),
    k1={
        "geometry": {"function": "atomic_number"},
        "grid": {"min": 0, "max": 8, "n": 100, "sigma": 0.1},
    },
    k2={
        "geometry": {"function": "inverse_distance"},
        "grid": {"min": 0, "max": 1, "n": 100, "sigma": 0.1},
        "weighting": {"function": "exponential", "scale": 0.5, "cutoff": 1e-3},
    },
    k3={
        "geometry": {"function": "cosine"},
        "grid": {"min": -1, "max": 1, "n": 100, "sigma": 0.1},
        "weighting": {"function": "exponential", "scale": 0.5, "cutoff": 1e-3},
    },
    periodic=False,
    normalization="l2_each",
)

In [None]:
train_X = train_5k['xyz_pbe_relaxed'].apply(lambda data: mbtr.create(xyz2ase(data)))
train_y = np.asarray(train_5k['HOMO_GOWO'].tolist())

In [43]:
a = np.asarray([1, 2, 1, 3, 4])

In [47]:
set(a)

{1, 2, 3, 4}