In [4]:
import pandas as pd
import numpy as np
import os
from pymatgen.ext.matproj import MPRester

# Load the CSV holding, per row, the mpid, the initial/final site and the
# vacancy formation energies of the initial and final site.
df = pd.read_csv("C:\\Users\\johns\\OneDrive\\Desktop\\Evac_NB_data.csv")

# Convenience handles on the path label and the two vacancy-formation-energy columns.
path, VFE_0, VFE_f = (df[column] for column in ("path", "Evac_00", "Evac_04"))

In [5]:
# The Materials Project API key is a credential and must never be written into
# the notebook: supply it through the MP_API_KEY environment variable before
# starting the kernel. Any key that has appeared in a shared copy of this
# notebook should be regenerated from the Materials Project dashboard.
if not os.environ.get('MP_API_KEY'):
    # Only warn here: the client is handed whatever os.environ.get() returns and
    # is left to report a missing key itself.
    print("Warning: MP_API_KEY is not set; export your Materials Project API key "
          "in the environment before fetching DOS data.")

def get_dos_data(mpids): # gets dos data from mpids from MP
    """Fetch the density of states (DOS) for each Materials Project ID.

    Parameters
    ----------
    mpids : iterable
        Material identifiers (hashable, e.g. ``"mp-149"``). Duplicates are
        allowed and each occurrence yields its own output record.

    Returns
    -------
    list of dict
        One ``{'mpid': mpid, 'dos': dos}`` record, in input order, for every
        entry of ``mpids`` whose DOS was returned by the API. Entries with no
        DOS, or whose request raised, are reported with ``print`` and skipped.

    Notes
    -----
    An mpid that occurs several times is downloaded only once per call; its
    records then share the same DOS object. Requests that raised are not
    remembered, so a later occurrence of the same mpid is tried again.
    """
    dos_data = []
    fetched = {}  # mpid -> API answer (DOS object or None) already obtained in this call
    with MPRester(os.environ.get('MP_API_KEY')) as mpr:
        for mpid in mpids:
            try:
                if mpid not in fetched:
                    fetched[mpid] = mpr.get_dos_by_material_id(mpid)
                dos = fetched[mpid]
                if dos is not None:
                    dos_data.append({'mpid': mpid, 'dos': dos})
                else:
                    print(f"Warning: No DOS data available for MPID {mpid}")
            except Exception as e:
                # Deliberately best-effort: one failing material must not abort the batch.
                print(f"Error fetching DOS for MPID {mpid}: {str(e)}")
    return dos_data

# Material IDs of the input table, in row order, as a plain Python list.
mpids = df.loc[:, 'mpid'].tolist()

# Download the DOS record of each listed material.
dos_data = get_dos_data(mpids=mpids)

Retrieving ElectronicStructureDoc documents: 100%|██████████| 1/1 [00:00<?, ?it/s]
Retrieving ElectronicStructureDoc documents: 100%|██████████| 1/1 [00:00<?, ?it/s]
Retrieving ElectronicStructureDoc documents: 100%|██████████| 1/1 [00:00<?, ?it/s]
Retrieving ElectronicStructureDoc documents: 100%|██████████| 1/1 [00:00<?, ?it/s]
Retrieving ElectronicStructureDoc documents: 100%|██████████| 1/1 [00:00<?, ?it/s]
Retrieving ElectronicStructureDoc documents: 100%|██████████| 1/1 [00:00<?, ?it/s]
Retrieving ElectronicStructureDoc documents: 100%|██████████| 1/1 [00:00<00:00, 1947.22it/s]
Retrieving ElectronicStructureDoc documents: 100%|██████████| 1/1 [00:00<?, ?it/s]
Retrieving ElectronicStructureDoc documents: 100%|██████████| 1/1 [00:00<00:00, 1096.26it/s]
Retrieving ElectronicStructureDoc documents: 100%|██████████| 1/1 [00:00<00:00, 1062.12it/s]
Retrieving ElectronicStructureDoc documents: 100%|██████████| 1/1 [00:00<?, ?it/s]
Retrieving ElectronicStructureDoc documents: 100%|███████

In [14]:
# Tabulate the downloaded records and save them as CSV without the index column.
# NOTE(review): the 'dos' column holds Python objects, so the CSV will contain
# their text form only - confirm this file is not meant to be reloaded as DOS data.
file_path = "C:\\Users\\johns\\Downloads\\dos_data_df.csv"
dos_data_df = pd.DataFrame(dos_data)
dos_data_df.to_csv(path_or_buf=file_path, index=False)

In [49]:
def extract_site_dos(dos_data, df):
    """Collect the projected DOS of the initial and final site of every row of ``df``.

    Parameters
    ----------
    dos_data : list of dict
        Records of the form ``{'mpid': ..., 'dos': ...}``. Each ``dos`` must
        expose ``structure`` (indexable by site number) and ``pdos`` (a mapping
        keyed by the sites of that structure).
        NOTE(review): this matches pymatgen's ``CompleteDos`` - confirm that is
        what the fetch step returns.
    df : pandas.DataFrame
        Needs an ``'mpid'`` column and a ``'path'`` column whose
        underscore-separated fields 1 and 2 are the integer indices of the
        initial and final site.
        NOTE(review): assumed to be zero-based indices into ``dos.structure``.

    Returns
    -------
    list of dict
        One ``{'mpid', 'initial_site_dos', 'final_site_dos'}`` record per row of
        ``df`` whose mpid has DOS data, in row order. A site DOS is ``None`` when
        ``pdos`` has no entry for that site. Rows without DOS data are reported
        with ``print`` and skipped.

    Raises
    ------
    IndexError, ValueError
        If ``path`` lacks the two site fields, they are not integers, or an
        index is outside the structure.
    """
    # Index the records once; the first record of an mpid wins, exactly as a
    # front-to-back search of dos_data would.
    record_by_mpid = {}
    for item in dos_data:
        record_by_mpid.setdefault(item["mpid"], item)

    site_dos_list = []
    for _, row in df.iterrows():
        mpid = row['mpid']
        path_parts = row['path'].split('_')
        initial_site = int(path_parts[1])
        final_site = int(path_parts[2])

        material_dos = record_by_mpid.get(mpid)
        if material_dos is None:
            print(f"No DOS data found for MPID {mpid}")
            continue

        dos = material_dos['dos']
        structure = dos.structure
        # Look the two sites up by index. Matching on species instead would
        # return another atom of the same element, which is wrong whenever the
        # initial and final site share a species.
        site_dos_list.append({
            'mpid': mpid,
            'initial_site_dos': dos.pdos.get(structure[initial_site]),
            'final_site_dos': dos.pdos.get(structure[final_site])
        })
    return site_dos_list

# Pick out the initial/final site DOS for every row of the input table.
site_dos_list = extract_site_dos(dos_data=dos_data, df=df)

# Tabulate the result and save it as CSV without the index column.
site_dos_df = pd.DataFrame(site_dos_list)
site_dos_df.to_csv(path_or_buf="C:\\Users\\johns\\Downloads\\site_dos_df_.csv", index=False)


In [69]:
# Read the vacancy formation energies under the name the rest of this cell
# uses, so the cell no longer depends on a variable left over from an earlier
# kernel session.
vacancy_energies_df = pd.read_csv('C:/Users/johns/Downloads/Evac_NB_data.csv') #get vacancy energies

# Match vacancy formation energies with mpids: mpid -> [Evac_00, Evac_04].
# NOTE(review): the mapping is keyed by mpid alone, so if an mpid occurs in
# several rows only the last row's energies are kept - confirm this is intended.
vacancy_energies_mapping = dict(zip(vacancy_energies_df['mpid'], vacancy_energies_df[['Evac_00', 'Evac_04']].values))

dataset = []
for entry in site_dos_list:
    mpid = entry['mpid']
    if mpid not in vacancy_energies_mapping:
        # Skip just this entry; stopping the loop here would silently drop
        # every entry that follows the first unmatched mpid.
        print(f"Warning: No vacancy formation energies found for MPID {mpid}")
        continue
    evac_initial, evac_final = vacancy_energies_mapping[mpid]
    dataset.append({
        'mpid': mpid,
        'initial_site_dos': entry['initial_site_dos'],
        'final_site_dos': entry['final_site_dos'],
        'Evac_00': evac_initial,  # Vacancy formation energy of initial site
        'Evac_04': evac_final     # Vacancy formation energy of final site
    })

dataset_df = pd.DataFrame(dataset)



In [70]:
# Convert each site-DOS column into an array holding one converted entry per dataset row.
initial_site_dos_array, final_site_dos_array = (
    np.array([np.array(site_dos) for site_dos in dataset_df[column]])
    for column in ('initial_site_dos', 'final_site_dos')
)

In [75]:
# Give both arrays a trailing axis of length one, then join them along axis 2.
# NOTE(review): axis=2 only exists if the arrays were 2-D before this cell - TODO confirm.
initial_site_dos_array = initial_site_dos_array[..., np.newaxis]
final_site_dos_array = final_site_dos_array[..., np.newaxis]
X_dos = np.concatenate([initial_site_dos_array, final_site_dos_array], axis=2)

# Targets: the two vacancy formation energies of every dataset row.
target_columns = ['Evac_00', 'Evac_04']
y_vacancy_energies = dataset_df[target_columns].to_numpy()

# Flatten the features to one row per sample and place the targets beside them.
X_surface_dos_df = pd.DataFrame(X_dos.reshape(len(X_dos), -1))
y_vacancy_energies_df = pd.DataFrame(y_vacancy_energies, columns=['Evac_00', 'Evac_04'])
training_data_df = pd.concat([X_surface_dos_df, y_vacancy_energies_df], axis=1)

training_data_df.to_csv(path_or_buf="C:\\Users\\johns\\Downloads\\training_data_df.csv", index=False)