In [2]:
# from ocdata.utils.vasp import write_vasp_input_files
import pickle
import pandas as pd 
import numpy as np
from ase.io import read
import re
import warnings
warnings.filterwarnings("ignore")

In [3]:
## Load the two lookup tables used throughout the notebook:
##   mappings             : sid -> metadata of the OC20 system (adslab)
##   slab_adslab_mappings : adslab sid -> sid of its bare slab
## NOTE: pickle executes arbitrary code on load; only use trusted files here.
with open("oc20_data_mapping.pkl", "rb") as mapping_file:
    mappings = pickle.load(mapping_file)

with open("mapping_adslab_slab.pkl", "rb") as slab_mapping_file:
    slab_adslab_mappings = pickle.load(slab_mapping_file)

In [4]:
## Adeesh data from the perspective paper
# Read the GemNet relaxation results, add the absolute deviation between the
# reference energy and the GemNet-T prediction as "error", and order the rows
# from the largest error to the smallest.
df = (
    pd.read_csv("gemnet-relaxations.csv", index_col=0)
    .assign(error=lambda frame: (frame["energy dE [eV]"] - frame["genmet_T_predE"]).abs())
    .sort_values(by="error", ascending=False)
)

In [5]:
# Look up the bulk composition string of every system in the results table.
# A comprehension replaces the former `for id in ...` loop so the builtin
# `id` is no longer shadowed by a loop variable left in the notebook namespace.
surfaces = [mappings[system_id]["bulk_symbols"] for system_id in df["ID"].values]
df["bulk_symbols"] = surfaces

# Restrict the analysis to rows whose material_type is 'nonmetals'.
nonmetals_df = df.query("material_type == 'nonmetals'")

### Pick Gallium Nitride and Silicon Phosphide as examples of semiconductors to investigate
### will check ISMEAR on the bare slabs because it is cheaper
### Use Gaussian smearing ISMEAR=0 with SIGMA=0.05

In [6]:
# Take the first GaN ('NGa') row of the nonmetals table; rows keep the order
# of df, which was sorted by descending error.
gan_rows = nonmetals_df.query("bulk_symbols=='NGa'")
sid = gan_rows.iloc[0]["ID"]
# Resolve the bare-slab sid that belongs to this adslab system.
slab_sid = slab_adslab_mappings[sid]
# Display the metadata of the selected system.
mappings[sid]

{'bulk_id': 315,
 'ads_id': 78,
 'bulk_mpid': 'mp-830',
 'bulk_symbols': 'NGa',
 'ads_symbols': '*NO3',
 'miller_index': (2, 1, 1),
 'shift': 0.062,
 'top': False,
 'adsorption_site': ((3.98, 2.11, 18.75),),
 'class': 2,
 'anomaly': 0}

In [7]:
# Load the last frame (index -1) of the original OC20 slab trajectory and
# show its stored potential energy.
slab_traj_path = f"/home/jovyan/shared-datasets/OC20/trajs/slabs/trajectories_all_02_05/{slab_sid}.traj"
atoms = read(slab_traj_path, index=-1)
atoms.get_potential_energy()

-493.1095156

In [8]:
# Load the last frame (index -1) of the OUTCAR in the GaN ismear_study
# directory (presumably the ISMEAR=0 / SIGMA=0.05 re-run -- confirm) and
# show its potential energy.
gan_outcar_path = "/home/jovyan/shared-scratch/kabdelma/oc20_data_quality_project/oc20_analysis/ismear_study/gan/OUTCAR"
new_atoms = read(gan_outcar_path, index=-1)
new_atoms.get_potential_energy()

-493.13173391

In [9]:
# Absolute difference between the energy of the original slab frame and the
# energy read from the OUTCAR.
original_energy = atoms.get_potential_energy()
rerun_energy = new_atoms.get_potential_energy()
np.abs(original_energy - rerun_energy)

0.022218309999971098

In [10]:
# VASP_FLAGS = {
#     "ismear":0,
#     "sigma":0.05,
#     "ibrion": 2,
#     "nsw": 2000,
#     "isif": 0,
#     "isym": 0,
#     "lreal": "Auto",
#     "ediffg": -0.03,
#     "symprec": 1e-10,
#     "encut": 350.0,
#     "laechg": True,
#     "lwave": False,
#     "ncore": 4,
#     "gga": "RP",
#     "pp": "PBE",
#     "xc": "PBE",
# }
# vasp_path = "/home/jovyan/shared-scratch/kabdelma/oc20_data_quality_project/oc20_analysis/ismear_study/gan"
# write_vasp_input_files(atoms, vasp_flags =VASP_FLAGS, outdir = vasp_path)

In [11]:
# Take the first 'Si4P8' row of the nonmetals table; rows keep the order of
# df, which was sorted by descending error.
sip_rows = nonmetals_df.query("bulk_symbols=='Si4P8'")
sid = sip_rows.iloc[0]["ID"]
# Resolve the bare-slab sid that belongs to this adslab system.
slab_sid = slab_adslab_mappings[sid]
# Display the metadata of the selected system.
mappings[sid]

{'bulk_id': 2542,
 'ads_id': 7,
 'bulk_mpid': 'mp-21065',
 'bulk_symbols': 'Si4P8',
 'ads_symbols': '*CHO',
 'miller_index': (2, 2, 1),
 'shift': 0.185,
 'top': False,
 'adsorption_site': ((0.32, 9.08, 25.76),),
 'class': 2,
 'anomaly': 1}

In [12]:
# Load the last frame (index -1) of the original OC20 slab trajectory and
# show its stored potential energy.
slab_traj_path = f"/home/jovyan/shared-datasets/OC20/trajs/slabs/trajectories_all_02_05/{slab_sid}.traj"
atoms = read(slab_traj_path, index=-1)
atoms.get_potential_energy()

-298.79240844

In [13]:
# Load the last frame (index -1) of the OUTCAR in the SiP ismear_study
# directory (presumably the ISMEAR=0 / SIGMA=0.05 re-run -- confirm) and
# show its potential energy.
sip_outcar_path = "/home/jovyan/shared-scratch/kabdelma/oc20_data_quality_project/oc20_analysis/ismear_study/sip/OUTCAR"
new_atoms = read(sip_outcar_path, index=-1)
new_atoms.get_potential_energy()

-298.79697237

In [14]:
# Absolute difference between the energy of the original slab frame and the
# energy read from the OUTCAR.
original_energy = atoms.get_potential_energy()
rerun_energy = new_atoms.get_potential_energy()
np.abs(original_energy - rerun_energy)

0.004563930000017535

In [7]:
# VASP_FLAGS = {
#     "ismear":0,
#     "sigma":0.05,
#     "ibrion": 2,
#     "nsw": 2000,
#     "isif": 0,
#     "isym": 0,
#     "lreal": "Auto",
#     "ediffg": -0.03,
#     "symprec": 1e-10,
#     "encut": 350.0,
#     "laechg": True,
#     "lwave": False,
#     "ncore": 4,
#     "gga": "RP",
#     "pp": "PBE",
#     "xc": "PBE",
# }
# vasp_path = "/home/jovyan/shared-scratch/kabdelma/oc20_data_quality_project/oc20_analysis/ismear_study/sip"
# write_vasp_input_files(atoms, vasp_flags =VASP_FLAGS, outdir = vasp_path)

In [42]:
# # Step 1: Extract unique elements from the bulk_symbols column of the nonmetal compounds
# nonmetals_elements = []
# for i in range(len(nonmetals_df)):
#     nonmetals_elements += list(np.unique(np.array(re.findall('[A-Z][a-z]?', f'{nonmetals_df.iloc[i]["bulk_symbols"]}'))))

# nonmetals_element_counts = []
# for i in range(len(nonmetals_df)):
#     nonmetals_element_counts.append(len(list(np.unique(np.array(re.findall('[A-Z][a-z]?', f'{nonmetals_df.iloc[i]["bulk_symbols"]}'))))))

# # Remove duplicates and sort the elements
# unique_nonmetals_elements = sorted(list(set(nonmetals_elements)))
# bulk_element_encoding = []
# for bulk in nonmetals_df["bulk_symbols"].values:
#     elements_encoding = []
#     for element in unique_nonmetals_elements:
#         if element in np.unique(np.array(re.findall('[A-Z][a-z]?', f'{bulk}'))):
#             elements_encoding.append(1)
#         else: 
#             elements_encoding.append(0)
#     bulk_element_encoding.append(elements_encoding)

# # Create a DataFrame from bulk_element_encoding with column names as unique_nonmetals_elements
# bulk_elements_encodings_df = pd.DataFrame(bulk_element_encoding, columns=unique_nonmetals_elements)
# # Reset the index of bulk_elements_encodings_df
# bulk_elements_encodings_df.reset_index(drop=True, inplace=True)
# # Concatenate bulk_elements_encodings_df with the existing DataFrame nonmetals_df
# nonmetals_df = pd.concat([nonmetals_df.reset_index(drop=True), bulk_elements_encodings_df], axis=1)
# nonmetals_df

In [41]:
# nonmetals_df.query("Ga==1 & N ==1")[20:50]
# nonmetals_df.query("Si==1 & P ==1")[20:50]