In [10]:
import os
import glob
import time

from pymatgen.core import Structure
from pymatgen.io.vasp import Vasprun, Kpoints, Incar

import numpy as np
import pandas as pd

import utils.generic as gen_tools
from utils.parallel import parallelise
from utils.vasp.parser.outcar import Outcar

from utils.vasp.vasp import find_vasp_directories, parse_VASP_directory
from utils.vasp.vasp_database import parse_vasp_directory

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [11]:
class DatabaseGenerator():
    """Build and update a pickled pandas database of parsed VASP directories.

    Directories are discovered with ``find_vasp_directories``, parsed in
    parallel with ``parse_vasp_directory`` through ``parallelise``, and the
    resulting frames are concatenated and (optionally) pickled into
    ``parent_dir``.
    """

    # Archive patterns that cleanup must never delete (not user-overridable).
    _ALWAYS_KEEP_PATTERNS = (".tar.gz", ".tar.bz2", ".zip")

    def __init__(self,
                 parent_dir,
                 max_workers=16):
        """Store the default search/output directory and the worker count.

        Args:
            parent_dir: Directory searched by default and where pickles are written.
            max_workers: Passed as ``max_workers`` to every ``parallelise`` call.
        """
        self.parent_dir = parent_dir
        self.max_workers = max_workers

    def _parse_directories(self, directories, read_error_dirs, read_multiple_runs_in_dir):
        """Parse ``directories`` in parallel and concatenate the resulting frames."""
        return pd.concat(parallelise(parse_vasp_directory,
                                     [(directory,) for directory in directories],
                                     max_workers=self.max_workers,
                                     extract_error_dirs=read_error_dirs,
                                     parse_all_in_dir=read_multiple_runs_in_dir))

    @staticmethod
    def _job_dir_from_filepath(filepath):
        """Return the name of the calculation directory a filepath belongs to.

        A trailing ``OUTCAR`` path component is dropped first. This is done on
        path components because ``str.rstrip("/OUTCAR")`` strips a *set of
        characters* and would also eat the tail of names such as ``RUN_A``.
        """
        path = str(filepath).rstrip("/")
        if os.path.basename(path) == "OUTCAR":
            path = os.path.dirname(path)
        return os.path.basename(path)

    def build_database(self,
                       target_directory = None,
                       extract_directories = False,
                       tarball_extensions = (".tar.gz", "tar.bz2"),
                       read_error_dirs = False,
                       read_multiple_runs_in_dir = False,
                       cleanup = False,
                       keep_filenames_after_cleanup = [],
                       keep_filename_patterns_after_cleanup = [],
                       max_dir_count = None,
                       filenames_to_qualify=["vasp.log", "INCAR", "POTCAR", "CONTCAR", "KPOINTS", "OUTCAR", "vasprun.xml"],
                       all_present=False,
                       df_filename = None):
        """Find VASP directories, parse them and return the combined DataFrame.

        Args:
            target_directory: Directory to search; falls back to ``self.parent_dir``.
            extract_directories: Forwarded as ``extract_tarballs`` to the directory search.
            tarball_extensions: Forwarded to the directory search.
                NOTE(review): the second default lacks a leading dot
                ("tar.bz2"); left unchanged to keep the signature identical.
            read_error_dirs: Forwarded as ``extract_error_dirs`` to the parser.
            read_multiple_runs_in_dir: Forwarded as ``parse_all_in_dir`` to the parser.
            cleanup: When true, run ``gen_tools.cleanup_dir`` on every directory found.
            keep_filenames_after_cleanup: Filenames passed to cleanup as ``files``.
            keep_filename_patterns_after_cleanup: Patterns passed to cleanup as
                ``file_patterns``; tarball/zip patterns are always appended.
            max_dir_count: When set, parse in chunks of this many directories
                and pickle every chunk before combining them.
            filenames_to_qualify: Forwarded as ``filenames`` to the directory search.
            all_present: Forwarded to the directory search.
            df_filename: Suffix for per-chunk pickles; in the non-chunked path
                it only decides whether the database is pickled at all.

        Returns:
            pandas.DataFrame of all parsed directories (empty when none are found).
        """
        start_time = time.time()

        search_root = target_directory if target_directory else self.parent_dir
        dirs = find_vasp_directories(parent_dir = search_root,
                                     extract_tarballs = extract_directories,
                                     all_present = all_present,
                                     filenames = filenames_to_qualify,
                                     tarball_extensions = tarball_extensions)
        print(f"The total number of vasp directories that we are building the database out of is {len(dirs)}")

        # pd.concat raises on an empty sequence; return an empty frame instead.
        if len(dirs) == 0:
            return pd.DataFrame()

        if max_dir_count:
            pkl_filenames = []

            for i, chunks in enumerate(gen_tools.chunk_list(dirs, max_dir_count)):
                step_time = time.time()
                chunk_df = self._parse_directories(chunks, read_error_dirs, read_multiple_runs_in_dir)
                if df_filename:
                    db_filename = f"{i}_{df_filename}.pkl"
                else:
                    db_filename = f"{i}.pkl"
                chunk_path = os.path.join(self.parent_dir, db_filename)
                pkl_filenames.append(chunk_path)
                chunk_df.to_pickle(chunk_path)
                step_taken_time = np.round(time.time() - step_time ,3)
                print(f"Step {i}: {step_taken_time} seconds taken for {len(chunks)} parse steps")

            df = pd.concat([pd.read_pickle(partial_df) for partial_df in pkl_filenames])
            df.to_pickle(os.path.join(self.parent_dir, "vasp_database.pkl"))

        else:
            # Parse every directory in one pass (the original iterated an
            # undefined `chunks` here and raised NameError).
            df = self._parse_directories(dirs, read_error_dirs, read_multiple_runs_in_dir)
            # NOTE(review): df_filename only gates the save; the output name is fixed.
            if df_filename:
                df.to_pickle(os.path.join(self.parent_dir, "vasp_database.pkl"))

        end_time = time.time()
        elapsed_time = end_time - start_time

        if cleanup:
            # Copy the caller's lists so neither they nor the shared defaults are mutated.
            keep_files = list(keep_filenames_after_cleanup or [])
            keep_patterns = list(keep_filename_patterns_after_cleanup or [])
            # not optional - keep the tarballs/zips..
            keep_patterns.extend(pattern for pattern in self._ALWAYS_KEEP_PATTERNS
                                 if pattern not in keep_patterns)
            # Same calling convention as the parse step: one positional tuple
            # per directory, shared options as keyword arguments.
            # NOTE(review): assumes parallelise forwards keyword arguments to
            # the worker and that directory_path is cleanup_dir's first parameter.
            parallelise(gen_tools.cleanup_dir,
                        [(directory,) for directory in dirs],
                        max_workers=self.max_workers,
                        keep=True,
                        files=keep_files,
                        file_patterns=keep_patterns)

        print("Elapsed time:", np.round(elapsed_time,3), "seconds")

        return df

    def update_database(self,
                    new_calculation_directory,
                    existing_database_filename = "vasp_database.pkl",
                    extract_directories = True,
                    cleanup=False,
                    keep_filenames_after_cleanup = [],
                    keep_filename_patterns_after_cleanup = [],
                    max_dir_count = None,
                    df_filename = None):
        """Refresh rows of an existing database from newly parsed calculations.

        Rows are matched on the calculation directory name derived from the
        ``filepath`` column. For every matching row, non-null values from the
        new parse replace the stored ones. Rows that only exist in the new
        parse are not appended (left merge, as in the original code).

        Args:
            new_calculation_directory: Directory searched for new calculations.
            existing_database_filename: Pickle of the database to update.
            extract_directories, cleanup, keep_filenames_after_cleanup,
            keep_filename_patterns_after_cleanup, max_dir_count, df_filename:
                Forwarded unchanged to ``build_database``.

        Returns:
            pandas.DataFrame: the updated database. It is not written to disk.
        """
        # Load the base first: a chunked build may rewrite vasp_database.pkl.
        # NOTE: read_pickle executes arbitrary code; only load trusted files.
        base_df = pd.read_pickle(existing_database_filename)

        update_df = self.build_database(target_directory = new_calculation_directory,
                                        extract_directories = extract_directories,
                                        cleanup=cleanup,
                                        keep_filenames_after_cleanup = keep_filenames_after_cleanup,
                                        keep_filename_patterns_after_cleanup = keep_filename_patterns_after_cleanup,
                                        max_dir_count = max_dir_count,
                                        df_filename = df_filename)
        if update_df.empty:
            return base_df

        # NOTE(review): both frames must have a 'filepath' column - confirm
        # that the parser emits one.
        update_df["job_dir"] = [self._job_dir_from_filepath(path) for path in update_df["filepath"]]
        base_df["job_dir"] = [self._job_dir_from_filepath(path) for path in base_df["filepath"]]

        # One update row per job_dir keeps the left merge row-for-row aligned
        # with base_df; the last parsed row wins.
        latest_updates = update_df.drop_duplicates(subset="job_dir", keep="last")

        # Merge df1 and df2 based on the common dirname
        interm_df = base_df.merge(latest_updates, on='job_dir', suffixes=('_df1', '_df2'), how='left')

        # Loop through the columns and update them dynamically
        for column in base_df.columns:
            if column in ('filepath', 'job_dir'):
                continue
            new_name = f'{column}_df2'
            if new_name not in interm_df.columns:
                continue
            combined = interm_df[new_name].combine_first(interm_df[f'{column}_df1'])
            # The merge resets the index, so assign by position, not by label.
            base_df[column] = combined.to_numpy()

        return base_df.drop(columns=['job_dir'])

In [12]:
# Generator rooted at the sample directory, limited to two worker processes.
datagen = DatabaseGenerator(
    parent_dir="/root/personal_python_utilities/development/test_read_all_output_in_vaspdir",
    max_workers=2,
)

# A directory qualifies only if it contains an OUTCAR; because max_dir_count
# is set, parsing goes through the chunked branch of build_database.
df = datagen.build_database(
    filenames_to_qualify=["OUTCAR"],
    all_present=True,
    extract_directories=False,
    max_dir_count=2000,
)
df

The total number of vasp directories that we are building the database out of is 2
2 ['/root/personal_python_utilities/development/test_read_all_output_in_vaspdir/S11-RA110-S3-32-GB-a-0.4-b-0.3', '/root/personal_python_utilities/development/test_read_all_output_in_vaspdir/S5-RA100-S210-iP-site-32']
# Processes: 2
Processors available: 16
CPUs used: 2
Step 0: 1.751 seconds taken for 2 parse steps
Elapsed time: 1.756 seconds


Unnamed: 0,calc_start_time,consumed_time,structures,energy,energy_zero,forces,stresses,magmoms,scf_steps,scf_convergence,KPOINTS,INCAR,element_list,element_count,potcar_electron_count
0,2023-01-20 09:46:08,"{'cpu_time': None, 'user_time': None, 'system_...","[{""@module"": ""pymatgen.core.structure"", ""@clas...","[-325.27244404, 1009.42726936, -333.29286441, ...","[-325.27396603, 1009.42291646, -333.29301403, ...","[[[0.0006, -0.013028, -0.011505], [0.002869, 0...","[[[0.010789053860789178, -0.006894633066521508...","[[2.867, 2.646, 2.399, 2.501, 2.26, 2.301, 2.2...","[46, 120, 89, 20, 24, 39, 40, 19, 9, 5, 25, 30...","[True, False, True, True, True, True, True, Tr...",KSPACING: 0.5,"{'ALGO': 'Fast', 'AMIX': 0.01, 'AMIX_MAG': 0.1...",[Fe],[42],[8.0]
0,2023-12-10 23:05:48,"{'cpu_time': 743.871, 'user_time': 742.394, 's...","[{""@module"": ""pymatgen.core.structure"", ""@clas...",[-618.20201978],[-618.20537708],"[[[-0.001061, 0.002638, -0.000238], [-0.000907...","[[[-0.01331748918666658, -0.000115143359397152...","[[2.827, 2.827, 2.611, 2.603, 2.602, 2.598, 2....",[42],[True],KSPACING: 0.5,"{'ALGO': 'Fast', 'AMIX': 0.01, 'AMIX_MAG': 0.1...","[Fe, P]","[76, 1]","[8.0, 5.0]"


In [7]:
from utils.vasp.vasp import grab_electron_info

In [10]:
# Single calculation directory used for the spot checks in the following cells.
test_dir = "/root/personal_python_utilities/development/test_read_all_output_in_vaspdir/S5-RA100-S210-iP-site-32"

# grab_electron_info returns three values, unpacked here in the order returned.
element_list, element_count, electron_of_potcar = grab_electron_info(
    directory_path=test_dir,
    potcar_filename="POTCAR",
)
element_list

['Fe', 'P']

In [11]:
# Show the second value returned by grab_electron_info for test_dir.
element_count

[76, 1]

In [12]:
# Show the third value returned by grab_electron_info for test_dir.
electron_of_potcar

[8.0, 5.0]

In [13]:
from utils.vasp.vasp_database import parse_vasp_directory

In [17]:
# Parse the single test directory directly (no parallelise wrapper), with
# error-directory extraction and multi-run parsing both switched off.
parse_vasp_directory(
    test_dir,
    parse_all_in_dir=False,
    extract_error_dirs=False,
)

/root/personal_python_utilities/development/test_read_all_output_in_vaspdir/S5-RA100-S210-iP-site-32 {'ALGO': 'Fast', 'AMIX': 0.01, 'AMIX_MAG': 0.1, 'BMIX': 0.0001, 'BMIX_MAG': 0.0001, 'EDIFF': 1e-05, 'EDIFFG': -0.01, 'ENCUT': 400, 'GGA': 'Pe', 'IBRION': 2, 'ISIF': 2, 'ISMEAR': 1, 'ISPIN': 2, 'ISTART': 0, 'KPAR': 2, 'KSPACING': 0.5, 'LAECHG': True, 'LCHARGE': True, 'LORBIT': 10, 'LPLANE': False, 'LREAL': 'Auto', 'MAGMOM': [2.828, 2.828, 2.612, 2.604, 2.602, 2.598, 2.237, 2.244, 2.33, 2.332, 2.323, 2.335, 2.201, 2.206, 2.239, 2.241, 2.287, 2.267, 2.165, 2.172, 2.243, 2.208, 2.336, 2.348, 2.129, 2.148, 2.424, 2.361, 1.865, 2.427, 2.114, 2.275, 2.342, 2.353, 2.299, 2.164, 2.439, 1.869, 2.386, 2.428, 2.15, 2.161, 2.344, 2.342, 2.26, 2.228, 2.182, 2.185, 2.217, 2.225, 2.199, 2.21, 2.162, 2.163, 2.177, 2.17, 2.146, 2.145, 2.24, 2.242, 2.21, 2.211, 2.202, 2.201, 2.329, 2.332, 2.344, 2.34, 2.239, 2.241, 2.608, 2.61, 2.619, 2.615, 2.828, 2.824, -0.095], 'NCORE': 4, 'NELM': 400, 'NELMIN': 8, 'NS

Unnamed: 0,calc_start_time,consumed_time,structures,energy,energy_zero,forces,stresses,magmoms,scf_steps,scf_convergence,KPOINTS,INCAR,element_list,element_count,potcar_electron_count
0,2023-12-10 23:05:48,"{'cpu_time': 743.871, 'user_time': 742.394, 's...","[{""@module"": ""pymatgen.core.structure"", ""@clas...",[-618.20201978],[-618.20537708],"[[[-0.001061, 0.002638, -0.000238], [-0.000907...","[[[-0.01331748918666658, -0.000115143359397152...","[[2.827, 2.827, 2.611, 2.603, 2.602, 2.598, 2....",[42],[True],KSPACING: 0.5,"{'ALGO': 'Fast', 'AMIX': 0.01, 'AMIX_MAG': 0.1...","[Fe, P]","[76, 1]","[8.0, 5.0]"


In [5]:
# Display whatever DataFrame is currently bound to `df`.
# NOTE(review): this cell's execution count (In [5]) is lower than that of the
# cell which builds `df` (In [12]), so the output saved under it may come from
# an earlier run - re-run the notebook top to bottom to refresh it.
df

Unnamed: 0,calc_start_time,consumed_time,structures,energy,energy_zero,forces,stresses,magmoms,scf_steps,scf_convergence,KPOINTS,INCAR,element_list,element_count,potcar_electron_count
0,2023-01-20 09:46:08,"{'cpu_time': None, 'user_time': None, 'system_...","[{""@module"": ""pymatgen.core.structure"", ""@clas...","[-325.27244404, 1009.42726936, -333.29286441, ...","[-325.27396603, 1009.42291646, -333.29301403, ...","[[[0.0006, -0.013028, -0.011505], [0.002869, 0...","[[[0.010789053860789178, -0.006894633066521508...","[[2.867, 2.646, 2.399, 2.501, 2.26, 2.301, 2.2...","[46, 120, 89, 20, 24, 39, 40, 19, 9, 5, 25, 30...","[True, False, True, True, True, True, True, Tr...",KSPACING: 0.5,"{'ALGO': 'Fast', 'AMIX': 0.01, 'AMIX_MAG': 0.1...",,,
0,2023-11-05 05:39:31,"{'cpu_time': 852.419, 'user_time': 850.845, 's...","[{""@module"": ""pymatgen.core.structure"", ""@clas...",[-606.88776885],[-606.89089296],"[[[0.002666, -0.00722, 0.032352], [-0.002648, ...","[[[0.013119427379221336, -0.000407389538769054...","[[2.826, 2.826, 2.621, 2.618, 2.599, 2.599, 2....",[46],[True],KSPACING: 0.5,"{'ALGO': 'Fast', 'AMIX': 0.01, 'AMIX_MAG': 0.1...",,,
1,2023-11-06 01:59:15,"{'cpu_time': 1728.472, 'user_time': 1725.948, ...","[{""@module"": ""pymatgen.core.structure"", ""@clas...","[-617.90646062, -617.90692022, -617.90716935]","[-617.90758972, -617.90800659, -617.90823546]","[[[-0.004177, 0.003889, 0.013069], [-0.001966,...","[[[-0.01575292602734138, -0.000207055822020876...","[[2.816, 2.814, 2.602, 2.588, 2.587, 2.586, 2....","[67, 17, 19]","[True, True, True]",KSPACING: 0.9,"{'ALGO': 'Fast', 'AMIX': 0.01, 'AMIX_MAG': 0.1...",,,
2,2023-11-06 02:28:22,"{'cpu_time': None, 'user_time': None, 'system_...","[{""@module"": ""pymatgen.core.structure"", ""@clas...","[-618.17543242, -618.17958736, -618.18178849, ...","[-618.1785236, -618.18279683, -618.18523137, -...","[[[-0.00512, -0.00324, -0.000564], [-0.008373,...","[[[-0.013621662889878896, -0.00013892350896909...","[[2.828, 2.826, 2.614, 2.605, 2.603, 2.601, 2....","[46, 8, 16, 8, 8, 8, 8, 8, 8, 8, 13, 17, 14, 8...","[True, True, True, True, True, True, True, Tru...",KSPACING: 0.5,"{'ALGO': 'Fast', 'AMIX': 0.01, 'AMIX_MAG': 0.1...",,,
3,2023-11-06 03:53:51,"{'cpu_time': None, 'user_time': None, 'system_...","[{""@module"": ""pymatgen.core.structure"", ""@clas...","[-618.18248938, -618.18299501, -618.18138123, ...","[-618.18597965, -618.18649926, -618.18486849, ...","[[[-0.003883, -0.006159, 0.000484], [-0.003965...","[[[-0.013838886130181315, -0.00014635714627622...","[[2.828, 2.828, 2.614, 2.606, 2.603, 2.6, 2.23...","[45, 8, 8, 12, 8, 14, 8]","[True, True, True, True, True, True, True]",KSPACING: 0.5,"{'ALGO': 'Fast', 'AMIX': 0.01, 'AMIX_MAG': 0.1...",,,
4,2023-11-06 04:24:04,"{'cpu_time': None, 'user_time': None, 'system_...","[{""@module"": ""pymatgen.core.structure"", ""@clas...","[-618.18466271, -618.18500881, -618.18512696, ...","[-618.1882385, -618.18856267, -618.1886936, -6...","[[[-0.004074, -0.003373, 0.002102], [-0.00425,...","[[[-0.013750905818274213, -0.00015822225502590...","[[2.828, 2.828, 2.613, 2.605, 2.603, 2.6, 2.23...","[45, 8, 8, 8, 12, 13, 16, 8, 16, 8, 8, 8, 13, ...","[True, True, True, True, True, True, True, Tru...",KSPACING: 0.5,"{'ALGO': 'Fast', 'AMIX': 0.01, 'AMIX_MAG': 0.1...",,,
5,2023-11-06 05:19:20,"{'cpu_time': None, 'user_time': None, 'system_...","[{""@module"": ""pymatgen.core.structure"", ""@clas...","[-618.18636071, -618.18657559, -618.18667133, ...","[-618.19004718, -618.19022774, -618.19030674, ...","[[[-0.003288, -0.000916, 0.002239], [-0.003025...","[[[-0.013483107629945168, -0.00016367733395657...","[[2.827, 2.827, 2.612, 2.604, 2.603, 2.599, 2....","[43, 8, 8, 8, 13, 8, 8, 13, 13]","[True, True, True, True, True, True, True, Tru...",KSPACING: 0.5,"{'ALGO': 'Fast', 'AMIX': 0.01, 'AMIX_MAG': 0.1...",,,
6,2023-12-10 14:01:15,"{'cpu_time': None, 'user_time': None, 'system_...","[{""@module"": ""pymatgen.core.structure"", ""@clas...","[-618.1837052, -618.18413452, -618.18299209, -...","[-618.18744244, -618.18785052, -618.18676056, ...","[[[-0.002933, -0.000329, 0.001344], [-0.003077...","[[[-0.01395866693082045, -0.000170124812830018...","[[2.828, 2.828, 2.612, 2.605, 2.603, 2.6, 2.23...","[46, 8, 14, 13, 13, 13]","[True, True, True, True, True, True]",KSPACING: 0.5,"{'ALGO': 'Fast', 'AMIX': 0.01, 'AMIX_MAG': 0.1...",,,
7,2023-12-10 14:31:29,"{'cpu_time': None, 'user_time': None, 'system_...","[{""@module"": ""pymatgen.core.structure"", ""@clas...","[-618.17921036, -618.18000895, -618.1795725, -...","[-618.18312064, -618.18386785, -618.18341927, ...","[[[-0.002385, -0.000973, 0.002177], [-0.00213,...","[[[-0.013821640840609856, -0.00017599183135957...","[[2.828, 2.827, 2.612, 2.604, 2.603, 2.6, 2.24...","[47, 8, 8, 14, 8, 13, 16, 14, 8, 13, 13, 8, 13...","[True, True, True, True, True, True, True, Tru...",KSPACING: 0.5,"{'ALGO': 'Fast', 'AMIX': 0.01, 'AMIX_MAG': 0.1...",,,
8,2023-12-10 15:36:45,"{'cpu_time': None, 'user_time': None, 'system_...","[{""@module"": ""pymatgen.core.structure"", ""@clas...","[-618.16445764, -618.16504223, -618.16646459, ...","[-618.16856462, -618.16912448, -618.17054074, ...","[[[-0.009797, -0.015654, 0.000134], [-0.010073...","[[[-0.013633746451446159, -0.00018790063067277...","[[2.826, 2.826, 2.614, 2.606, 2.606, 2.601, 2....","[43, 8, 13, 15, 8, 8, 12, 13, 15, 16, 13, 17, ...","[True, True, True, True, True, True, True, Tru...",KSPACING: 0.5,"{'ALGO': 'Fast', 'AMIX': 0.01, 'AMIX_MAG': 0.1...",,,
