In [53]:
## 1. extract all slabs from OCP and CatHub (Mamun)
## 2. generate slab descriptors with DimeNet++
## 3. extract all products from OCP and CatHub (Mamun)
## 4. generate product descriptors with the chEMBL SMILES model
## 5. run experiments:
##    - multitask learner -> (cathub, ocp) x (xgboost) x (original 1024+1024, pca ncomponents, imr ncomponents)
##    - solves n^2 to n, solves descriptor generation for slab/surface

In [54]:
import json
import pickle
import numpy as np
from ocpmodels.datasets import SinglePointLmdbDataset 
import os
import ase.io
from ase.constraints import FixAtoms
from ase.build import add_adsorbate, molecule, surface
from pymatgen.ext.matproj import MPRester
from pymatgen.core.surface import generate_all_slabs, SlabGenerator
from pymatgen.io.ase import AseAtomsAdaptor
from ocpmodels.common.relaxation.ase_utils import OCPCalculator
from pprint import pprint
import pubchempy as pcp
from transformers import AutoTokenizer, AutoModel, pipeline
import pandas as pd
import ray
import torch
import random

In [55]:
# NOTE: pickle.load can execute arbitrary code stored in the file; only load
# pickles that come from a trusted source.
with open('datasets/ocp_reactions_info.pickle', 'rb') as pickle_file:
    loaded_list_data = pickle.load(pickle_file)

## Notes on this dataset:
## - it contains only the lowest-energy reaction info
## - it includes both the train and the validation records from OCP
## - the two can be separated using 'adslab_slab_key':
##     keys that appear in the train split      -> train data
##     keys that appear in the validation split -> validation data
##   after which the model can be trained
print(len(loaded_list_data))

446885


In [56]:
#####################################################################
## data from ocp
#####################################################################

In [57]:
# Peek at the first OCP record to see which fields each entry carries.
loaded_list_data[0]

{'bulk_id': 2085,
 'ads_id': 29,
 'bulk_mpid': 'mp-976273',
 'bulk_symbols': 'Hf6Ge4',
 'ads_symbols': '*COCH2O',
 'miller_index': (2, 1, 0),
 'shift': 0.022,
 'top': True,
 'adsorption_site': ((9.3, 4.58, 26.89),),
 'class': 1,
 'anomaly': 1,
 'adslab_slab_key': 'random2024607',
 'energy': -9.992999119999922}

In [58]:
#####################################################################
## product descriptors
#####################################################################

In [59]:
# Give every OCP record a product key: its adsorbate symbol string with the
# '*' markers removed (e.g. '*NO2NO2' -> 'NO2NO2').
for record in loaded_list_data:
    record['pkey'] = record['ads_symbols'].replace('*', '')

# Show the last record to confirm the new 'pkey' field is present.
pprint(loaded_list_data[-1])

# Distinct product keys found in the OCP records (order is not meaningful,
# since the values pass through a set).
list_pkey = list({record['pkey'] for record in loaded_list_data})

len(list_pkey)

{'ads_id': 67,
 'ads_symbols': '*NO2NO2',
 'adslab_slab_key': 'random1645311',
 'adsorption_site': ((6.08, 5.63, 18.81),),
 'anomaly': 0,
 'bulk_id': 2106,
 'bulk_mpid': 'mp-11329',
 'bulk_symbols': 'P4W2',
 'class': 2,
 'energy': 9.999100859999942,
 'miller_index': (1, 1, 1),
 'pkey': 'NO2NO2',
 'shift': 0.022,
 'top': True}


68

In [60]:
# Hand-maintained lookup from product key (ads_symbols with '*' removed) to the
# SMILES string that is later passed to the SMILES feature-extraction pipeline.
# Every pkey occurring in the data needs an entry here: the descriptor lookup
# further down indexes by pkey and would raise KeyError for a missing one.
# NOTE(review): many distinct keys share one SMILES (e.g. 'CH2O' and 'CHOH' are
# both 'C=O'; 'CH2CO', 'CHCOH', 'CCHOH' and 'CHCHO' are all 'C=C=O'), so those
# adsorbates are embedded from identical input. The mappings look like
# formula-level (isomer) matches rather than exact structures - confirm that
# this is intended.
d_pkey_vs_smiles = {
    'CH2CO': 'C=C=O',
    'COHCHO': 'C(=O)C=O',
    'OH2': 'O',
    'CH2O': 'C=O',
    'COCHO': 'C(=C=O)[O-]',
    'CHOH': 'C=O',
    'ONN(CH3)2': 'ONN(C)(C)',
    'OHCH2CH3': 'CCO',
    'NONH': 'N=N[O-]',
    'NO2': 'N(=O)[O-]',
    'CCHO': 'C#C[O-]',
    'OHCH3': 'CO',
    'NH': '[NH]',
    'COHCOH': 'C(=O)C=O',
    'OCH2CH3': 'CC[O-]',
    'NNO': '[N-]=[N+]=O',
    'CHCH2OH': 'C1CO1',
    'CHOCH2OH': 'CC(=O)O',
    'CHCOH': 'C=C=O',
    'NO': '[N]=O',
    'CHOHCHOH': 'CC(=O)O',
    'CH2OH': 'C[O-]',
    'NNH': '[NH+]#N',
    'CCH2': 'C#C',
    'OCH3': 'C[O-]',
    'CCH3': 'C=[CH]',
    'CH2CH3': 'C[CH2+]',
    'CHOHCH2': 'C1CO1',
    'COHCH2OH': 'CC(=O)O',
    'OCH2CHOH': 'CC(=O)O',
    'CN': '[C-]#N',
    'CHOHCH3': 'CC[O-]',
    'CCH2OH': 'C[C]=O',
    'CHOCHO': 'C(=O)C=O',
    'CHCH': 'C#C',
    'CH4': 'C',
    'COCH2O': 'C(=O)C=O',
    'CHOCHOH': 'CC(=O)[O-]',
    'CCH': 'C#[C-]',
    'CHOHCH2OH': 'C([CH]O)O',
    'N2': 'N#N',
    'ONNH2': 'NN=O',
    'N': '[N]',
    'OCHCH3': 'C1CO1',
    'C': '[C]',
    'ONH': 'N=O',
    'CHCO': 'C#C[O-]',
    'CCO': 'C1#CO1',
    'CH2CH2OH': 'CC[O-]',
    'NO3': '[N+](=O)([O-])[O-]',
    'O': '[O]',
    'NO2NO2': '[N+](=O)([N+](=O)[O-])[O-]',
    'CH2': '[CH2]',
    'CHCH2': 'C=[CH]',
    'OH': '[OH-]',
    'CC': '[C-]#[C+]',
    'NHNH': 'N=N',
    'H': '[H+]',
    'COHCHOH': 'CC(=O)[O-]',
    'CH3': '[CH3+]',
    'NH3': 'N',
    'OHNNCH3': 'C(=O)(N)N',
    'CHCHOH': 'C[C]=O',
    'COCH3': 'C[C]=O',
    'CCHOH': 'C=C=O',
    'OHNH2': 'NO',
    'COHCH3': 'C1CO1',
    'CHCHO': 'C=C=O'
}

In [61]:
# Hugging Face checkpoint used to turn SMILES strings into feature vectors.
CHEMBL_SMILES_CHECKPOINT = "mrm8488/chEMBL26_smiles_v2"

tokenizer = AutoTokenizer.from_pretrained(CHEMBL_SMILES_CHECKPOINT)
model = AutoModel.from_pretrained(CHEMBL_SMILES_CHECKPOINT)

# device: 0 selects the GPU, -1 keeps the pipeline on the CPU.
fe = pipeline(
    'feature-extraction',
    model=model,
    tokenizer=tokenizer,
    device=-1,
)

Some weights of the model checkpoint at mrm8488/chEMBL26_smiles_v2 were not used when initializing RobertaModel: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.decoder.bias', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [62]:
# Run every SMILES string through the feature-extraction pipeline and reduce
# each result to a single fixed-length vector per product key.
list_pkey = list(d_pkey_vs_smiles)
list_smiles = [d_pkey_vs_smiles[key] for key in list_pkey]
list_emb_1 = fe(list_smiles)

# Each pipeline result is indexed with [0] and averaged over axis 0
# (presumably the token axis - confirm against the pipeline output layout).
list_emb_2 = [np.mean(np.array(emb[0]), axis=0) for emb in list_emb_1]
arr_emb = np.array(list_emb_2)
print(arr_emb.shape)

# Row i of arr_emb belongs to list_pkey[i].
d_pkey_vs_desc = {key: arr_emb[row] for row, key in enumerate(list_pkey)}

(68, 768)


In [63]:
# Attach the product descriptor to every OCP record. Records with the same
# pkey receive the same array object, not a copy.
for record in loaded_list_data:
    record['pdesc'] = d_pkey_vs_desc[record['pkey']]

In [65]:
# Confirm that the first record now carries the added 'pkey' and 'pdesc' fields.
loaded_list_data[0].keys()

dict_keys(['bulk_id', 'ads_id', 'bulk_mpid', 'bulk_symbols', 'ads_symbols', 'miller_index', 'shift', 'top', 'adsorption_site', 'class', 'anomaly', 'adslab_slab_key', 'energy', 'pkey', 'pdesc'])

In [70]:
# loaded_list_data[0]

In [71]:
#####################################################################
## slab descriptors
#####################################################################

In [73]:
# Materials Project client.
# The API key is read from the MP_API_KEY environment variable so that the
# credential is never stored in the notebook. When the variable is unset,
# None is passed and MPRester is left to resolve the key from its own
# configuration (see the pymatgen MPRester documentation).
# NOTE(review): a key was previously committed in this cell; it should be
# treated as exposed and rotated.
m = MPRester(os.environ.get("MP_API_KEY"))



In [68]:
# DimeNet++ configuration and checkpoint used to build the OCP calculator.
config_yml_path = "ocp/configs/is2re/all/dimenet_plus_plus/dpp.yml"
checkpoint_path = "dimenetpp_all.pt"


def _build_ocp_calculator():
    """Construct the OCPCalculator from the config and checkpoint paths above."""
    return OCPCalculator(config_yml=config_yml_path, checkpoint=checkpoint_path)


# The first construction attempt can fail (the recorded run printed
# "No module named 'ocpmodels.models.'") while an immediate second attempt
# succeeds, so try once, report the error, and try exactly once more.
# A failure of the second attempt is not caught.
try:
    calc = _build_ocp_calculator()
except Exception as first_error:
    print(first_error)
    calc = _build_ocp_calculator()

No module named 'ocpmodels.models.'


In [72]:
# Display the calculator object to confirm that construction succeeded.
calc

<ocpmodels.common.relaxation.ase_utils.OCPCalculator at 0x7f56a82f0730>

In [74]:
# Directory holding the pickled slab records, split into 100k-record chunks.
folder_path = 'datasets/slabs/'

# Chunk files, in the order they are concatenated.
file_names = [
    "list_rinfo_0to100000.pkl",
    "list_rinfo_100000to200000.pkl",
    "list_rinfo_200000to300000.pkl",
    "list_rinfo_300000to400000.pkl",
    "list_rinfo_400000to500000.pkl"
]

# Concatenate all chunks into one list.
# NOTE: pickle.load can execute arbitrary code stored in the file; only load
# pickles that come from a trusted source.
# After the loop, `list_rinfo` still refers to the last chunk that was loaded.
list_rinfo_all = []
for chunk_name in file_names:
    with open(os.path.join(folder_path, chunk_name), 'rb') as chunk_file:
        list_rinfo = pickle.load(chunk_file)
    list_rinfo_all += list_rinfo

# Total number of records across all chunks.
len(list_rinfo_all)

443368

In [76]:
# Give every slab record a product key: its adsorbate symbol string with the
# '*' markers removed.
for record in list_rinfo_all:
    record['pkey'] = record['ads_symbols'].replace('*', '')

In [77]:
# Distinct product keys found in the slab records (order is not meaningful,
# since the values pass through a set).
list_pkey = list({record['pkey'] for record in list_rinfo_all})

len(list_pkey)

68

In [21]:
# Lookup from product key (ads_symbols with '*' removed) to the SMILES string
# that is passed to the SMILES feature-extraction pipeline.
# NOTE(review): this cell re-defines d_pkey_vs_smiles; the entries appear to be
# identical to the earlier definition in this notebook, so the two copies have
# to be kept in sync by hand (or one of them removed).
# NOTE(review): many distinct keys share one SMILES (e.g. 'CH2O' and 'CHOH' are
# both 'C=O'), so those adsorbates are embedded from identical input - confirm
# that this is intended.
d_pkey_vs_smiles = {
    'CH2CO': 'C=C=O',
    'COHCHO': 'C(=O)C=O',
    'OH2': 'O',
    'CH2O': 'C=O',
    'COCHO': 'C(=C=O)[O-]',
    'CHOH': 'C=O',
    'ONN(CH3)2': 'ONN(C)(C)',
    'OHCH2CH3': 'CCO',
    'NONH': 'N=N[O-]',
    'NO2': 'N(=O)[O-]',
    'CCHO': 'C#C[O-]',
    'OHCH3': 'CO',
    'NH': '[NH]',
    'COHCOH': 'C(=O)C=O',
    'OCH2CH3': 'CC[O-]',
    'NNO': '[N-]=[N+]=O',
    'CHCH2OH': 'C1CO1',
    'CHOCH2OH': 'CC(=O)O',
    'CHCOH': 'C=C=O',
    'NO': '[N]=O',
    'CHOHCHOH': 'CC(=O)O',
    'CH2OH': 'C[O-]',
    'NNH': '[NH+]#N',
    'CCH2': 'C#C',
    'OCH3': 'C[O-]',
    'CCH3': 'C=[CH]',
    'CH2CH3': 'C[CH2+]',
    'CHOHCH2': 'C1CO1',
    'COHCH2OH': 'CC(=O)O',
    'OCH2CHOH': 'CC(=O)O',
    'CN': '[C-]#N',
    'CHOHCH3': 'CC[O-]',
    'CCH2OH': 'C[C]=O',
    'CHOCHO': 'C(=O)C=O',
    'CHCH': 'C#C',
    'CH4': 'C',
    'COCH2O': 'C(=O)C=O',
    'CHOCHOH': 'CC(=O)[O-]',
    'CCH': 'C#[C-]',
    'CHOHCH2OH': 'C([CH]O)O',
    'N2': 'N#N',
    'ONNH2': 'NN=O',
    'N': '[N]',
    'OCHCH3': 'C1CO1',
    'C': '[C]',
    'ONH': 'N=O',
    'CHCO': 'C#C[O-]',
    'CCO': 'C1#CO1',
    'CH2CH2OH': 'CC[O-]',
    'NO3': '[N+](=O)([O-])[O-]',
    'O': '[O]',
    'NO2NO2': '[N+](=O)([N+](=O)[O-])[O-]',
    'CH2': '[CH2]',
    'CHCH2': 'C=[CH]',
    'OH': '[OH-]',
    'CC': '[C-]#[C+]',
    'NHNH': 'N=N',
    'H': '[H+]',
    'COHCHOH': 'CC(=O)[O-]',
    'CH3': '[CH3+]',
    'NH3': 'N',
    'OHNNCH3': 'C(=O)(N)N',
    'CHCHOH': 'C[C]=O',
    'COCH3': 'C[C]=O',
    'CCHOH': 'C=C=O',
    'OHNH2': 'NO',
    'COHCH3': 'C1CO1',
    'CHCHO': 'C=C=O'
}

In [21]:
# Checkpoint of the SMILES model used for the product descriptors.
SMILES_MODEL_NAME = "mrm8488/chEMBL26_smiles_v2"

tokenizer = AutoTokenizer.from_pretrained(SMILES_MODEL_NAME)
model = AutoModel.from_pretrained(SMILES_MODEL_NAME)

# device: 0 selects the GPU, -1 keeps the pipeline on the CPU.
fe = pipeline(
    'feature-extraction',
    model=model,
    tokenizer=tokenizer,
    device=-1,
)

In [78]:
# Embed each SMILES string and keep one averaged vector per product key.
list_pkey = list(d_pkey_vs_smiles)
list_smiles = [d_pkey_vs_smiles[key] for key in list_pkey]
list_emb_1 = fe(list_smiles)

# Each pipeline result is indexed with [0] and averaged over axis 0
# (presumably the token axis - confirm against the pipeline output layout).
list_emb_2 = [np.mean(np.array(emb[0]), axis=0) for emb in list_emb_1]
arr_emb = np.array(list_emb_2)
print(arr_emb.shape)

# Row i of arr_emb belongs to list_pkey[i].
d_pkey_vs_desc = {key: arr_emb[row] for row, key in enumerate(list_pkey)}

(68, 768)


In [79]:
# Attach the product descriptor to every slab record. Records with the same
# pkey receive the same array object, not a copy.
for record in list_rinfo_all:
    record['pdesc'] = d_pkey_vs_desc[record['pkey']]

In [81]:
# Confirm that the first slab record now carries 'slab', 'pkey' and 'pdesc'.
list_rinfo_all[0].keys()

dict_keys(['bulk_id', 'ads_id', 'bulk_mpid', 'bulk_symbols', 'ads_symbols', 'miller_index', 'shift', 'top', 'adsorption_site', 'class', 'anomaly', 'adslab_slab_key', 'energy', 'slab', 'pkey', 'pdesc'])

In [83]:
# list_rinfo_all[0]

In [21]:
# Compute a slab descriptor ('sdesc') for every record by running the OCP
# calculator on the record's slab and pooling the model's embeddings.
#
# NOTE(review): this loop reads `list_rinfo`, not `list_rinfo_all`. In the
# notebook as written, `list_rinfo` is whatever the pickle-loading loop last
# assigned, yet the recorded key listing for list_rinfo[-1] shows different
# fields ('sc', 'star', 'facet', 'nre', ...). The intended input presumably
# came from a cell that is no longer present - confirm before re-running.
for i, rinfo in enumerate(list_rinfo):
    if i % 500 == 0:
        print(i)  # coarse progress indicator
    ase_slab = rinfo['slab']
    adslab = ase_slab.copy()  # work on a copy; the stored slab is left as is

    ## set additional info
    # Tags are assigned purely by atom index: [0, 18) -> 0, [18, 27) -> 1,
    # [27, end) -> 2. Atoms tagged 0 are then frozen via FixAtoms.
    # NOTE(review): presumably 0 = sub-surface, 1 = surface, 2 = adsorbate, and
    # every slab is assumed to have exactly 27 slab atoms - confirm.
    tags = np.zeros(len(adslab))
    tags[18:27] = 1
    tags[27:] = 2
    adslab.set_tags(tags)
    cons = FixAtoms(indices=[atom.index for atom in adslab if (atom.tag == 0)])
    adslab.set_constraint(cons)
    adslab.center(vacuum=13.0, axis=2)
    adslab.set_pbc(True)

    ## calculator, energy, embeddings
    adslab.calc = calc
    # The energy value itself is not used afterwards; the call is kept because
    # the embeddings are read from the model right after it (presumably they
    # are populated by this forward pass - `embs` is not defined in this file).
    ads_energy = adslab.get_potential_energy()

    # Look the embeddings up once instead of three times.
    node_embs = calc.trainer.model.module.embs[0]
    min_ads_embs = torch.min(node_embs, dim=0).values
    mean_ads_embs = torch.mean(node_embs, dim=0)
    max_ads_embs = torch.max(node_embs, dim=0).values
    # sum_ads_embs = torch.sum(node_embs, dim=0)
    # print(min_ads_embs.shape, mean_ads_embs.shape, max_ads_embs.shape)

    # Descriptor layout: [min | mean | max], each reduced over axis 0.
    combined_embs = torch.cat((min_ads_embs, mean_ads_embs, max_ads_embs))
    # detach() + cpu() make the conversion work for tensors that track
    # gradients or live on a GPU; for plain CPU tensors the result is unchanged.
    rinfo['sdesc'] = combined_embs.detach().cpu().numpy()

0
500
1000
1500
2000
2500
3000
3500
4000
4500
5000
5500
6000
6500
7000
7500
8000
8500
9000
9500
10000
10500
11000


In [25]:
# Inspect the last record's fields; 'sdesc' is the key added by the descriptor loop.
list_rinfo[-1].keys()

dict_keys(['sc', 'slab', 'star', 'facet', 'pkey', 'pval', 're', 'nre', 'pdesc', 'sdesc'])

In [26]:
# Column names for the feature table: 768 slab-descriptor columns (s0..s767),
# 768 product-descriptor columns (p0..p767), then the target column 'energy'.
sdesc_columns = ['s' + str(idx) for idx in range(768)]
pdesc_columns = ['p' + str(idx) for idx in range(768)]
columns = [*sdesc_columns, *pdesc_columns, 'energy']
# Build one table row per record: flattened slab descriptor, flattened product
# descriptor, then the record's 'nre' value as the 'energy' target.
# A record that cannot be converted (e.g. a missing key) is reported with its
# index and left out of the table.
data = []
for i, rinfo in enumerate(list_rinfo):
    try:
        row = [
            *rinfo['sdesc'].flatten(),
            *rinfo['pdesc'].flatten(),
            rinfo['nre'],
        ]
    except Exception as err:
        print(i, err)
    else:
        data.append(row)
df = pd.DataFrame(data, columns=columns)

In [27]:
# Persist the feature table and show a short preview.
# The output directory is created first so that saving does not fail on a
# fresh checkout where 'v3/' does not exist yet.
os.makedirs('v3', exist_ok=True)
df.to_pickle('v3/cathub_df.pickle')
print(df.shape)
df.head(2)

(11257, 1537)


Unnamed: 0,s0,s1,s2,s3,s4,s5,s6,s7,s8,s9,...,p759,p760,p761,p762,p763,p764,p765,p766,p767,energy
0,-1.94954,-1.75761,-1.190355,-2.212169,0.684488,0.847683,-2.514778,-1.68597,-1.849821,-1.727471,...,0.912192,0.652321,0.835792,-2.122195,-0.916445,-1.751576,0.783765,-1.780947,-0.90128,-2.810391
1,-1.588849,-2.084145,-2.29249,-1.887519,-0.122827,-0.544024,-1.935229,-1.545543,-1.634236,-0.2579,...,0.133178,0.967194,0.530776,-0.38646,0.284255,-1.429858,1.077861,-0.831704,-0.738524,-4.468474
