In [None]:
import pandas as pd
import numpy as np
import os
from pymatgen.ext.matproj import MPRester
from scipy import interpolate
from scipy.interpolate import interp1d
import matplotlib.pyplot as plt
from pathlib import Path
import pickle
import os
from pymatgen.io.vasp.outputs import CompleteDos

# Load the source table from CSV.
# NOTE(review): this is an absolute, user-specific path; consider a configurable
# data directory so the notebook runs on other machines.
path = Path("C:\\Users\\johns\\Downloads\\EVAC_data_updated.csv")
df = pd.read_csv(path)
# Split the 'path' column on '_' and keep the 2nd and 3rd tokens as the
# initial and final site values (presumably site indices encoded in the path
# label -- confirm against the CSV). The previous line was not valid Python
# (`f[[d'initial_site', ...`), so the cell could not run at all.
df[['initial_site', 'final_site']] = df['path'].str.split('_', expand=True).iloc[:, 1:3]
df

In [None]:
# SECURITY: the Materials Project API key must not be stored in the notebook.
# The key that used to be hardcoded on this line should be treated as leaked
# and regenerated. The key is now taken from the MP_API_KEY environment
# variable and is requested interactively (without echo) only when that
# variable is unset or empty.
if not os.environ.get('MP_API_KEY'):
    from getpass import getpass
    os.environ['MP_API_KEY'] = getpass('Materials Project API key: ')
from pymatgen.electronic_structure.core import Spin


def _fill_missing_spin_channel(spin_up, spin_down):
    """Return ``(spin_up, spin_down)`` with a missing channel mirrored from the other.

    When exactly one channel is ``None`` it is replaced by ``-1 *`` the channel
    that is present. When both are present, or both are ``None``, the inputs
    are returned unchanged.
    """
    if spin_up is None and spin_down is not None:
        spin_up = -1 * spin_down
    elif spin_down is None and spin_up is not None:
        spin_down = -1 * spin_up
    return spin_up, spin_down


def get_site_dos_data(df):
    """Fetch site-projected DOS arrays for the initial and final site of every row.

    For each row of ``df`` the DOS of ``row['mpid']`` is requested through
    ``MPRester.get_dos_by_material_id`` (API key read from the ``MP_API_KEY``
    environment variable), and the site DOS of ``structure[initial_site]`` and
    ``structure[final_site]`` is extracted with ``dos.get_site_dos``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``'mpid'``, ``'initial_site'`` and
        ``'final_site'``; the two site columns must be convertible with ``int``.

    Returns
    -------
    list of dict
        One dict per row that could be processed, with the keys ``'mpid'``,
        ``'initial_site_energies'``, ``'initial_site_spin_up_densities'``,
        ``'initial_site_spin_down_densities'``, ``'final_site_energies'``,
        ``'final_site_spin_up_densities'``, ``'final_site_spin_down_densities'``
        and ``'row_position'`` (0-based position of the source row in ``df``).
        Energies have ``dos.efermi`` subtracted.

    Notes
    -----
    Rows that fail for any reason (bad site value, request error, missing DOS)
    are reported with ``print`` and skipped; no exception is propagated from
    the per-row work.
    """
    dos_data = []
    with MPRester(os.environ.get('MP_API_KEY')) as mpr:
        for row_position, (_, row) in enumerate(df.iterrows()):
            mpid = row['mpid']
            # Keep the raw values so the error report below can always show
            # them, even when the integer conversion itself is what fails.
            initial_index = row['initial_site']
            final_index = row['final_site']

            try:
                # Converted inside the try block so that one malformed row is
                # reported and skipped instead of aborting the whole run.
                initial_index = int(initial_index)
                final_index = int(final_index)

                dos = mpr.get_dos_by_material_id(mpid)
                if dos is None:
                    print(f"Warning: No DOS data available for MPID {mpid}")
                    continue

                # Separate names for the Site objects: the integer indices stay
                # intact and are what the error report prints.
                structure = dos.structure
                initial_site_obj = structure[initial_index]
                final_site_obj = structure[final_index]
                initial_dos = dos.get_site_dos(initial_site_obj)
                final_dos = dos.get_site_dos(final_site_obj)

                if initial_dos is None or final_dos is None:
                    print(f"no DOS data available for MPID {mpid} at the specified sites")
                    continue

                # Shift the energy axes so that the Fermi level sits at zero.
                initial_energies = initial_dos.energies - dos.efermi
                final_energies = final_dos.energies - dos.efermi

                # NOTE(review): a channel reported by the DOS object is stored
                # as-is, while a mirrored channel is the other one times -1.
                # Confirm this mixed sign convention is what downstream expects.
                initial_spin_up, initial_spin_down = _fill_missing_spin_channel(
                    initial_dos.densities.get(Spin.up),
                    initial_dos.densities.get(Spin.down),
                )
                final_spin_up, final_spin_down = _fill_missing_spin_channel(
                    final_dos.densities.get(Spin.up),
                    final_dos.densities.get(Spin.down),
                )

                dos_data.append({
                    'mpid': mpid,
                    'initial_site_energies': initial_energies,
                    'initial_site_spin_up_densities': initial_spin_up,
                    'initial_site_spin_down_densities': initial_spin_down,
                    'final_site_energies': final_energies,
                    'final_site_spin_up_densities': final_spin_up,
                    'final_site_spin_down_densities': final_spin_down,
                    # Lets a consumer map the entry back to its exact source
                    # row even when several rows share the same mpid.
                    'row_position': row_position,
                })
            except Exception as e:
                print(f"error fetching DOS for MPID {mpid}: {str(e)}")
                print(f"mpid: {mpid}, initial site: {initial_index}, final site: {final_index}")
    return dos_data

In [None]:
def modify_dos_graph(dos_energy, dos_density_up, dos_density_down, min_lim=-15, max_lim=15):
    """Pad and trim a DOS so that its energy axis spans exactly ``[min_lim, max_lim]``.

    The energy axis is extended on either side, in increments of the spacing
    between its first two points, until it reaches ``min_lim`` / ``max_lim``;
    the added points get zero density in both spin channels. Points outside
    ``[min_lim, max_lim]`` are then dropped, and the first and last remaining
    energies are overwritten with ``min_lim`` and ``max_lim``.

    Parameters
    ----------
    dos_energy : array-like
        Energy values; at least two points.
    dos_density_up, dos_density_down : array-like
        Densities of the two spin channels, same length as ``dos_energy``.
    min_lim, max_lim : number
        Bounds of the output energy window.

    Returns
    -------
    tuple of numpy.ndarray
        ``(energy, density_up, density_down)`` restricted to the window. The
        inputs are not modified.

    Raises
    ------
    ValueError
        If padding is required but the spacing ``dos_energy[1] - dos_energy[0]``
        is zero or negative (previously this case looped forever).
    IndexError
        If ``dos_energy`` has fewer than two points.
    """
    dos_energy = np.asarray(dos_energy)
    dos_density_up = np.asarray(dos_density_up)
    dos_density_down = np.asarray(dos_density_down)

    step = dos_energy[1] - dos_energy[0]

    needs_padding = dos_energy[0] > min_lim or dos_energy[-1] < max_lim
    if needs_padding and step <= 0:
        raise ValueError(
            "cannot pad the DOS energy axis: the spacing between the first two "
            f"energies must be positive, got {step}"
        )

    # Collect the padding energies by repeated subtraction/addition (the same
    # arithmetic as inserting one point at a time) and splice them in with a
    # single concatenation instead of reallocating the arrays per point.
    lead = []
    edge = dos_energy[0]
    while edge > min_lim:
        edge = edge - step
        lead.append(edge)
    lead.reverse()

    tail = []
    edge = dos_energy[-1]
    while edge < max_lim:
        edge = edge + step
        tail.append(edge)

    energy = np.concatenate((
        np.asarray(lead, dtype=dos_energy.dtype),
        dos_energy,
        np.asarray(tail, dtype=dos_energy.dtype),
    ))

    def _zero_pad(density):
        # Padded energies carry zero density; the channel's dtype is preserved.
        return np.concatenate((
            np.zeros(len(lead), dtype=density.dtype),
            density,
            np.zeros(len(tail), dtype=density.dtype),
        ))

    in_window = (energy >= min_lim) & (energy <= max_lim)
    modified_energy = energy[in_window]
    dos_density_modified_up = _zero_pad(dos_density_up)[in_window]
    dos_density_modified_down = _zero_pad(dos_density_down)[in_window]

    # Pin the end points to the exact limits so that a later interpolation over
    # [min_lim, max_lim] never falls outside the tabulated range.
    modified_energy[0] = min_lim
    modified_energy[-1] = max_lim

    return modified_energy, dos_density_modified_up, dos_density_modified_down

def interpolate_to_n_data(dos_energy, dos_density_up, dos_density_down, npoints=2000, min_lim=-15, max_lim=15):
    """Resample both spin channels onto an evenly spaced energy grid.

    The grid is ``np.linspace(min_lim, max_lim, npoints)``. Each channel is
    linearly interpolated from ``dos_energy``; grid points outside the range
    of ``dos_energy`` do not raise (``bounds_error=False``) and receive the
    interpolator's default fill value.

    Returns ``(grid, density_up_on_grid, density_down_on_grid)``.
    """
    grid = np.linspace(min_lim, max_lim, npoints)
    resampled = [
        interpolate.interp1d(dos_energy, channel, kind='linear', bounds_error=False)(grid)
        for channel in (dos_density_up, dos_density_down)
    ]
    return grid, resampled[0], resampled[1]


In [None]:
def modify_and_interpolate(dos_energy, dos_density_up, dos_density_down, npoints=2000, min_lim=-15, max_lim=15):
    """Run ``modify_dos_graph`` and feed its result to ``interpolate_to_n_data``.

    Returns the ``(energy, density_up, density_down)`` triple produced by
    ``interpolate_to_n_data`` for ``npoints`` grid points between ``min_lim``
    and ``max_lim``.
    """
    windowed = modify_dos_graph(dos_energy, dos_density_up, dos_density_down, min_lim, max_lim)
    return interpolate_to_n_data(*windowed, npoints, min_lim, max_lim)


In [None]:
def process_dos_data(df):
    """Build fixed-length DOS features and vacancy-energy labels for every DOS entry.

    Calls ``get_site_dos_data(df)``, passes the initial-site and final-site DOS
    of each returned entry through ``modify_and_interpolate`` (default grid),
    and attaches the values of the ``'Evac_00'`` (initial) and ``'Evac_04'``
    (final) columns of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns required by ``get_site_dos_data`` plus ``'mpid'``,
        ``'Evac_00'`` and ``'Evac_04'``.

    Returns
    -------
    list of dict
        One dict per DOS entry with the keys ``'mpid'``,
        ``'initial_training_energy_up'``, ``'initial_training_density_up'``,
        ``'initial_training_density_down'``, ``'final_training_energy_up'``,
        ``'final_training_density_up'``, ``'final_training_density_down'``,
        ``'initial_vacancy_energy'`` and ``'final_vacancy_energy'``.
    """
    processed_data = []
    dos_data = get_site_dos_data(df)

    for entry in dos_data:
        mpid = entry['mpid']

        initial_training_energy_up, initial_training_density_up, initial_training_density_down = modify_and_interpolate(
            entry['initial_site_energies'],
            entry['initial_site_spin_up_densities'],
            entry['initial_site_spin_down_densities'],
        )

        final_training_energy_up, final_training_density_up, final_training_density_down = modify_and_interpolate(
            entry['final_site_energies'],
            entry['final_site_spin_up_densities'],
            entry['final_site_spin_down_densities'],
        )

        # Prefer the exact source row when the entry records its position in
        # `df`. Looking rows up by mpid alone always yields the FIRST row of a
        # material, so every entry of a material that occurs in several rows
        # would receive the first row's energies.
        row_position = entry.get('row_position')
        if row_position is not None:
            initial_vacancy_energy = df['Evac_00'].iloc[row_position]
            final_vacancy_energy = df['Evac_04'].iloc[row_position]
        else:
            # Fallback for entries without position information: first row
            # with a matching mpid (ambiguous when an mpid is repeated).
            same_material = df['mpid'] == mpid
            initial_vacancy_energy = df.loc[same_material, 'Evac_00'].values[0]
            final_vacancy_energy = df.loc[same_material, 'Evac_04'].values[0]

        processed_data.append({
            'mpid': mpid,
            'initial_training_energy_up': initial_training_energy_up,
            'initial_training_density_up': initial_training_density_up,
            'initial_training_density_down': initial_training_density_down,
            'final_training_energy_up': final_training_energy_up,
            'final_training_density_up': final_training_density_up,
            'final_training_density_down': final_training_density_down,
            'initial_vacancy_energy': initial_vacancy_energy,
            'final_vacancy_energy': final_vacancy_energy
        })

    return processed_data

processed_data = process_dos_data(df)

In [None]:
# Flatten `processed_data` into two parallel lists: every processed entry
# contributes two samples, the initial site first and the final site second.
# Each feature is the pair [spin-up density, spin-down density]; each target
# is a one-element list holding the matching vacancy energy.
features = []
target = []
sample_keys = (
    ('initial_training_density_up', 'initial_training_density_down', 'initial_vacancy_energy'),
    ('final_training_density_up', 'final_training_density_down', 'final_vacancy_energy'),
)
for data_dict in processed_data:
    for up_key, down_key, energy_key in sample_keys:
        features.append([data_dict[up_key], data_dict[down_key]])
        target.append([data_dict[energy_key]])

In [None]:
# Persist the targets and features as one pickled dict.
# The key spellings (including the trailing space in 'featured_train ') are
# kept exactly, because the loading cell reads them back under the same names.
file_path = "C:\\Users\\johns\\Downloads\\trainingdata.pkl"

data = dict(zip(('targetd_train', 'featured_train '), (target, features)))

with open(file_path, 'wb') as handle:
    pickle.dump(data, handle)

In [None]:
# Read the pickled dict back from `file_path` (only unpickle files you trust).
with open(file_path, 'rb') as handle:
    loaded_data = pickle.load(handle)

target_training, features = (
    loaded_data[key] for key in ('targetd_train', 'featured_train ')
)
# Transpose each sample from [up, down] into a list of [up_i, down_i] pairs.
DOS_training = [list(map(list, zip(*item))) for item in features]
DOS_feat_training = np.array(DOS_training)
targets = np.array(target_training)
DOS_feat_training.shape

In [None]:
################################################################################################################

In [None]:
def process_dos_data_to_numpy(processed_training_data, dos_length=None):
    """Pack processed DOS entries into dense feature and target arrays.

    Every entry yields two consecutive samples: index ``2*i`` holds the
    initial-site densities/energy and index ``2*i + 1`` the final-site ones.

    Parameters
    ----------
    processed_training_data : sequence of dict
        Each dict must provide ``'initial_training_density_up'``,
        ``'initial_training_density_down'``, ``'initial_vacancy_energy'``,
        ``'final_training_density_up'``, ``'final_training_density_down'``
        and ``'final_vacancy_energy'``.
    dos_length : int, optional
        Number of points per density array. When omitted it is taken from the
        first entry's ``'initial_training_density_up'``; for empty input it
        falls back to 2000, the length used before this parameter existed.

    Returns
    -------
    features : numpy.ndarray
        Shape ``(2 * len(processed_training_data), 2, dos_length)``; axis 1 is
        (spin up, spin down).
    targets : numpy.ndarray
        Shape ``(2 * len(processed_training_data), 1)``.

    Raises
    ------
    ValueError
        Raised by NumPy when a density array does not have ``dos_length`` points.
    """
    if dos_length is None:
        if len(processed_training_data) > 0:
            dos_length = len(processed_training_data[0]['initial_training_density_up'])
        else:
            dos_length = 2000

    total_entries = len(processed_training_data) * 2
    features = np.zeros((total_entries, 2, dos_length))
    targets = np.zeros((total_entries, 1))

    for i, entry in enumerate(processed_training_data):
        features[i * 2] = [entry['initial_training_density_up'], entry['initial_training_density_down']]
        targets[i * 2] = entry['initial_vacancy_energy']
        features[i * 2 + 1] = [entry['final_training_density_up'], entry['final_training_density_down']]
        targets[i * 2 + 1] = entry['final_vacancy_energy']

    return features, targets

# Reuse the result already computed by the earlier
# `processed_data = process_dos_data(df)` cell. Calling process_dos_data(df)
# again here would repeat every remote DOS request for the same DataFrame.
processed_training_data = processed_data
features, targets = process_dos_data_to_numpy(processed_training_data)

In [None]:
def visualize_dos_array(pickle_file):
    """Plot spin-up and spin-down density against energy for each pickled sample.

    ``pickle_file`` is unpickled (only use trusted files) and iterated; each
    item is indexed as ``sample[:, 0]`` (energy), ``sample[:, 1]`` (spin-up
    density) and ``sample[:, 2]`` (spin-down density), and drawn in its own
    8x6 figure.
    """
    with open(pickle_file, 'rb') as handle:
        dos_array = pickle.load(handle)

    for sample_index, sample_data in enumerate(dos_array):
        energy, density_up, density_down = (sample_data[:, column] for column in range(3))

        plt.figure(figsize=(8, 6))

        # One curve per spin channel, spin up first.
        curves = ((density_up, 'spin up density'), (density_down, 'spin down density'))
        for density, label in curves:
            plt.plot(energy, density, label=label)

        plt.xlabel('Energy')
        plt.ylabel('Density')
        plt.title(f'DOS for Sample {sample_index}')
        plt.legend()
        plt.grid(True)
        plt.show()

# NOTE(review): this file name differs from the 'trainingdata.pkl' file written
# by the pickle.dump cell above -- confirm which file is meant here.
pickl = 'training_data.pkl'
visualize_dos_array(pickl)



In [None]:
# Short DOS: fetch one material and pull out its energy axis and spin channels.
# NOTE(review): `get_dos_data` is not defined in the visible part of this
# notebook -- confirm it is defined before this cell runs on a fresh kernel.
shortie = ["mp-17387"]
dos_data_short = get_dos_data(shortie)
test_dos = dos_data_short[0]
dos_energy_short = test_dos['energies']
# The spin-down channel is sign-flipped.
dos_density_up_short, dos_density_down_short = (
    test_dos['densities'][Spin.up],
    -1 * test_dos['densities'][Spin.down],
)

In [None]:
# Long DOS: same extraction for the first mpid listed in `df`.
# NOTE(review): `get_dos_data` is not defined in the visible part of this
# notebook -- confirm it is defined before this cell runs on a fresh kernel.
mpids = df['mpid'].tolist()[0:1]
dos_data = get_dos_data(mpids)
test_dos = dos_data[0]
dos_energy_long = test_dos['energies']
# The spin-down channel is sign-flipped.
dos_density_up_long, dos_density_down_long = (
    test_dos['densities'][Spin.up],
    -1 * test_dos['densities'][Spin.down],
)