In [2]:
import os
import pickle

import lmdb
import numpy as np
from pymatgen.core import Element, Lattice, Structure

In [3]:
# Dataset generation

class Dataset():
  """
  Custom class for reading NOMAD dataset from MatSciML Zenodo

  Each record is stored as a pickled Python object under the ASCII
  encoding of ``str(index)``.
  """

  def __init__(self, lmdb_path, max_readers=1, transform=None, pre_transform=None):
    """
    Constructor for dataset: opens the LMDB file read-only and starts one
    read transaction that is reused by every later call.

    param: lmdb_path -> path to lmdb_file
    param: max_readers -> maximum number of concurrent read processes accessing lmdb file
    param: transform -> accepted for interface compatibility; currently unused
    param: pre_transform -> accepted for interface compatibility; currently unused
    """
    self.env = lmdb.open(lmdb_path,
        subdir=False,
        readonly=True,
        lock=False,
        readahead=False,
        meminit=False,
        max_readers=max_readers)
    self.txn = self.env.begin()

  def len(self):
    """
    Return the number of entries reported by the read transaction
    """
    return self.txn.stat()['entries']

  def __len__(self):
    """
    Allow ``len(dataset)`` as an alias for ``dataset.len()``
    """
    return self.len()

  def get(self, index):
    """
    Return a  datapoint

    param: index -> key of the record; it is looked up as ``str(index)`` in ASCII
    raises: IndexError when no record is stored under that key
    """
    key = f"{index}".encode("ascii")
    raw = self.txn.get(key)
    if raw is None:
      # Without this guard pickle.loads(None) fails with an unrelated TypeError.
      raise IndexError(f"no entry stored under key {index!r}")

    # NOTE(security): unpickling can execute arbitrary code, so only open
    # LMDB files that come from a trusted source.
    return pickle.loads(raw)

  def __getitem__(self, index):
    """
    Allow ``dataset[index]`` as an alias for ``dataset.get(index)``
    """
    return self.get(index)



In [4]:
# Path to the NOMAD LMDB file. Set the NOMAD_LMDB_PATH environment variable to
# point at your own copy; the fallback is the path used when this notebook was written.
lmdb_path = os.environ.get("NOMAD_LMDB_PATH", "/home/nawaf/n0w0f/material_db/nomad/all/data.lmdb")
dataset = Dataset(lmdb_path, 1)

In [5]:
# Show how many entries the opened LMDB file reports.
dataset.len()

138820

In [6]:
# Load the record stored under key "1" as a sample entry to explore.
mat_dict = dataset.get(1)

In [30]:

# Pull the raw structure fields out of the sample entry.
atom_species = mat_dict["properties"]["structures"]["structure_original"]["species_at_sites"]
lattice_vectors = mat_dict['properties']['structures']['structure_original']['lattice_vectors']
# NOTE(review): positions are read from the geometry-optimized structure while the
# species and lattice vectors come from structure_original -- confirm this mix is intended.
cartesian_pos = mat_dict['properties']['geometry_optimization']['structure_optimized']['cartesian_site_positions']
# Length multiplier, presumably metres -> angstrom (TODO confirm units).
# 1e10 is a float literal, so the annotation is float rather than int.
scale_factor : float = 1e10
lattice  = [[v * scale_factor for v in row] for row in lattice_vectors ]
atom_positions = [[v * scale_factor for v in row] for row in cartesian_pos]


In [25]:
from math import pi

def create_lattice(lattice_params : dict, scale_factor : float = 1e10):
    """
    Build a pymatgen ``Lattice`` from a dict of cell parameters.

    param: lattice_params -> mapping with the lengths "a", "b", "c" and the
           angles "alpha", "beta", "gamma"; the angles are multiplied by
           180/pi (radians -> degrees) before being handed to pymatgen
    param: scale_factor -> multiplier applied to the three lengths. Defaults to
           1e10 (presumably metres -> angstrom; confirm against the data source).
           It is an explicit parameter so the function no longer depends on a
           global defined in another notebook cell.
    return: the object returned by ``Lattice.from_parameters``
    raises: KeyError if any of the six parameters is missing
    """
    a, b, c = (lattice_params[key] * scale_factor for key in ("a", "b", "c"))
    alpha, beta, gamma = (
        lattice_params[key] * 180 / pi for key in ("alpha", "beta", "gamma")
    )
    return Lattice.from_parameters(a, b, c, alpha, beta, gamma)

In [15]:
# Re-load the record stored under key "1" (same lookup as the earlier sample cell).
mat_dict = dataset.get(1)

In [21]:
# Cartesian site positions taken from the entry's structure_original section.
cartesian_pos = mat_dict['properties']["structures"]["structure_original"]['cartesian_site_positions']

In [22]:
# Display the raw positions; the values are of order 1e-10, which looks like metres -- TODO confirm.
cartesian_pos

[[1.373914e-10, 1.373914e-10, 1.373914e-10],
 [4.1217410000000005e-10, 4.1217410000000005e-10, 1.373914e-10],
 [4.1217410000000005e-10, 1.373914e-10, 4.1217410000000005e-10],
 [1.373914e-10, 4.1217410000000005e-10, 4.1217410000000005e-10],
 [0.0, 0.0, 0.0],
 [0.0, 2.747828e-10, 2.747828e-10],
 [2.747828e-10, 0.0, 2.747828e-10],
 [2.747828e-10, 2.747828e-10, 0.0]]

In [26]:
def create_cif(mat_dict:dict):
    """
    Build a CIF string from the ``structure_original`` section of an entry.

    param: mat_dict -> entry dict; reads
           ``properties.structures.structure_original`` for
           ``species_at_sites``, ``cartesian_site_positions`` and
           ``lattice_parameters``
    return: CIF text produced by pymatgen's ``Structure.to(fmt="cif")``
    raises: KeyError if one of the fields above is missing
    """
    structure_original = mat_dict["properties"]["structures"]["structure_original"]
    atom_species = structure_original["species_at_sites"]
    cartesian_pos = structure_original['cartesian_site_positions']

    # Length multiplier, presumably metres -> angstrom (TODO confirm units).
    scale_factor : float = 1e10
    atom_positions = [
            [v * scale_factor for v in row] for row in cartesian_pos
        ]

    lattice = create_lattice(structure_original["lattice_parameters"])

    # Handle atom species which could be either symbols or atomic numbers
    atom_symbols = []
    for species in atom_species:
        if isinstance(species, int) and species != 0:
            atom_symbols.append(Element.from_Z(species).symbol)
        else:
            atom_symbols.append(species)

    # The positions are Cartesian. Structure interprets coords as fractional
    # unless told otherwise, so the flag is required for correct site placement.
    pymatgen_structure = Structure(
        lattice,
        species=atom_symbols,
        coords=atom_positions,
        coords_are_cartesian=True,
    )

    # Generate CIF content using pymatgen
    return pymatgen_structure.to(fmt="cif")
            

In [27]:
def prepare_dict(mat_dict:dict):
    """
    Flatten one entry into a JSON-friendly summary dict, including the
    band-gap edge energies of the first ``band_gap`` item.

    Note: a later cell in this notebook defines ``prepare_dict`` again, so
    this version is only in effect until that cell is executed.

    param: mat_dict -> entry dict as returned by ``Dataset.get``
    return: dict with the keys material_name, method, total_energy, fermi,
            chemical_formula, structural_info, electronic and cif
    """
    cif_text = create_cif(mat_dict)

    # Crystal-structure level information
    material = mat_dict['material']
    name = material['material_name']
    formula = material['chemical_formula_descriptive']
    symmetry = material['symmetry']
    structural_info = {
        "space_group_symbol": symmetry['space_group_symbol'],
        "crystal_system": symmetry['crystal_system'],
        "mass_density": mat_dict['properties']['structures']['structure_original']['mass_density'],
    }

    # Electronic-structure level information (taken from the electronic DOS section)
    dos = mat_dict['properties']['electronic']['dos_electronic']
    electronic_info = {
        "spin_polarized": dos['spin_polarized'],
        "energy_fermi": dos['energy_fermi'],
    }
    first_gap = dos['band_gap'][0]
    electronic_info["energy_highest_occupied"] = first_gap['energy_highest_occupied']
    electronic_info["energy_lowest_unoccupied"] = first_gap['energy_lowest_unoccupied']

    # Energies and calculation method
    energies = mat_dict['energies']
    summary = {
        "material_name": name,
        "method": None,  # placeholder keeps the key order; filled in below
        "total_energy": energies['total']['value'],
        "fermi": energies['fermi'],
        "chemical_formula": formula,
        "structural_info": structural_info,
        "electronic": electronic_info,
        "cif": cif_text,
    }
    summary["method"] = mat_dict['method']
    return summary
    
        

In [28]:
def prepare_dict(mat_dict:dict):
    """
    Flatten one entry into a JSON-friendly summary dict.

    The band-gap edge energies are not part of the ``electronic`` section in
    this version; only ``spin_polarized`` and ``energy_fermi`` are reported.

    param: mat_dict -> entry dict as returned by ``Dataset.get``
    return: dict with the keys material_name, method, total_energy, fermi,
            chemical_formula, structural_info, electronic and cif
    """
    cif_text = create_cif(mat_dict)

    # Crystal-structure level information
    material = mat_dict['material']
    name = material['material_name']
    formula = material['chemical_formula_descriptive']
    symmetry = material['symmetry']
    space_group = symmetry['space_group_symbol']
    crystal_system = symmetry['crystal_system']
    density = mat_dict['properties']['structures']['structure_original']['mass_density']

    # Electronic-structure level information (taken from the electronic DOS section)
    dos = mat_dict['properties']['electronic']['dos_electronic']
    spin_polarized = dos['spin_polarized']
    dos_fermi = dos['energy_fermi']

    # Energies and calculation method
    energies = mat_dict['energies']
    total = energies['total']['value']
    fermi_level = energies['fermi']
    calc_method = mat_dict['method']

    return {
        "material_name": name,
        "method": calc_method,
        "total_energy": total,
        "fermi": fermi_level,
        "chemical_formula": formula,
        "structural_info": {
            "space_group_symbol": space_group,
            "crystal_system": crystal_system,
            "mass_density": density,
        },
        "electronic": {
            "spin_polarized": spin_polarized,
            "energy_fermi": dos_fermi,
        },
        "cif": cif_text,
    }
    
        

In [30]:
import json
from tqdm import tqdm 

# Number of leading LMDB entries to export; raise it to export more of the dataset.
n_samples = 10

materials_list = []
# Length multiplier kept at module level because helper cells may read it as a
# global. 1e10 is a float literal, so the annotation is float rather than int.
scale_factor : float = 1e10

#loop through data points in lmdb (never past the number of stored entries)
for index in tqdm(range(min(n_samples, dataset.len()))):
    datapoint = dataset.get(index)
    data_dict = prepare_dict(datapoint)
    materials_list.append(data_dict)


# Write all collected summaries as a single JSON array.
output_file = "output.json"
with open(output_file, 'w') as json_file:
    json.dump(materials_list, json_file)





100%|██████████| 10/10 [00:00<00:00, 155.71it/s]
