In [1]:
import pandas as pd

In [7]:
# Load the Materials Project training CSV into a DataFrame.
# NOTE(review): the column labels recorded further down are data values
# (a slice string, 'Zr1 N1', 'Hexagonal', '0.0', ...), so the file appears to
# have no header row and pandas has consumed the first record as the header.
# Confirm, and consider header=None plus explicit column names.
df = pd.read_csv("/work/so87pot/structllm/materials_project/train.csv")

In [8]:
# Build a copy of the column labels with every space replaced by an underscore.
# df.columns itself is left untouched; only the new Index is bound to `columns`.
columns = df.columns.str.replace(' ', '_')

In [9]:
# Display the underscore-joined labels computed above.
columns

Index(['Zr_N_0_1_o_o_-_0_1_o_o_o_0_1_+_o_-_0_1_+_o_o_', 'Zr1_N1', 'Hexagonal',
       '0.0', '6.3537155', '-9.93008659', 'False', '-1.4879888125000011',
       '['P-6m2',_187]'],
      dtype='object')

In [11]:
# Display the frame's current column labels.
# NOTE(review): the recorded output shows no trailing whitespace on the first
# label, while the underscore-joined preview ends in '_'. This cell was
# presumably last run after the labels had been stripped; confirm with
# Restart & Run All.
df.columns 

Index(['Zr N 0 1 o o - 0 1 o o o 0 1 + o - 0 1 + o o', 'Zr1 N1', 'Hexagonal',
       '0.0', '6.3537155', '-9.93008659', 'False', '-1.4879888125000011',
       '['P-6m2', 187]'],
      dtype='object')

In [12]:
# Kept from the original cell: normalise the labels of the already-loaded frame.
df.columns = df.columns.str.strip()

# train.csv has no header row: pandas used the first record as column labels,
# which is why the slice column previously had to be addressed by a data value
# ('Zr N 0 1 o o - ...') and why that first record never reached the output.
# Re-read just the first column with header=None so every record is exported
# and no data value is hard-coded as a column name.
slice_column = pd.read_csv(
    "/work/so87pot/structllm/materials_project/train.csv",
    header=None,
    usecols=[0],
).iloc[:, 0]

# Write the slice strings as a single-column CSV with the header "slices".
slice_column.to_csv("/work/so87pot/structllm/materials_project/train_slice.csv", header=["slices"], index=False)

In [1]:
import lmdb
import pickle

from pymatgen.core import Structure, Lattice
import numpy as np

In [2]:
# Dataset generation

class Dataset():
  """
  Read-only accessor for an LMDB file of pickled records, as distributed with
  the MatSciML Zenodo datasets (written for NOMAD; the same layout is used in
  this notebook for the Materials Project split).

  Records are looked up by the ASCII encoding of their integer index
  ("0", "1", ...) and each stored value is unpickled on access.

  SECURITY: values are decoded with pickle, which can execute arbitrary code.
  Only open LMDB files that come from a trusted source.
  """

  def __init__(self, lmdb_path, max_readers=1, transform=None, pre_transform=None):
    """
    Open the LMDB file and start a read transaction.

    param: lmdb_path -> path to the lmdb data file (opened with subdir=False)
    param: max_readers -> maximum number of concurrent read processes accessing lmdb file
    param: transform -> accepted for signature compatibility; currently unused
    param: pre_transform -> accepted for signature compatibility; currently unused
    """
    # Read-only and lock-free: the file is never written through this object.
    self.env = lmdb.open(lmdb_path,
        subdir=False,
        readonly=True,
        lock=False,
        readahead=False,
        meminit=False,
        max_readers=max_readers)
    # A single read transaction is kept open and reused by get().
    self.txn = self.env.begin()

  def len(self):
    """
    Return the number of entries in the default database.

    Uses Environment.stat(), the documented call for default-database
    statistics (Transaction.stat() is documented as taking a db handle).
    NOTE(review): this counts every key, so any non-record metadata key
    stored in the file would be included -- confirm against the data.
    """
    return self.env.stat()['entries']

  def get(self, index):
    """
    Return the datapoint stored under `index`.

    raises: IndexError -> if no record exists for `index` (previously this
            surfaced as an opaque TypeError from pickle.loads(None))
    """
    key = f"{index}".encode("ascii")
    payload = self.txn.get(key)
    if payload is None:
      raise IndexError(f"no record stored under index {index!r}")

    # pickle.loads on file contents: see the SECURITY note on the class.
    return pickle.loads(payload)

  def close(self):
    """
    Close the underlying LMDB environment, releasing its file handle/mapping.
    The dataset must not be used after calling this.
    """
    self.env.close()



In [6]:
# Open the Materials Project test-split LMDB; the second positional argument
# is Dataset's max_readers (a single reader here).
lmdb_path = "/work/so87pot/material_db/materials_project/materials-project/base/test/data.lmdb"
dataset = Dataset(lmdb_path, 1)

In [7]:
# Number of entries the LMDB file reports.
dataset.len()

15456

In [11]:
# Fetch the record stored under index 2 and list the fields it carries.
site = dataset.get(2)
print(site.keys())


dict_keys(['formula_pretty', 'symmetry', 'structure', 'uncorrected_energy_per_atom', 'energy_per_atom', 'formation_energy_per_atom', 'is_stable', 'band_gap', 'efermi', 'is_metal', 'is_magnetic', 'fields_not_requested'])


In [50]:
# Look up the scalar targets of the record.
# NOTE(review): a notebook cell only renders its last expression, so just
# site['efermi'] is displayed; the four lookups above it merely check that
# the keys exist. Collect them in a dict/tuple if all values should be shown.
site['energy_per_atom']
site['formation_energy_per_atom']
site['is_stable']
site['band_gap']
site['efermi']


9.16940669

In [39]:
# Display the record's reduced formula string.
site['formula_pretty']

'SbIr'

'Hexagonal'

In [71]:
from pymatgen.core import Structure, Element
def create_cif(structure: "Structure") -> str:
    """
    Rebuild a pymatgen Structure from its lattice, species and Cartesian
    coordinates, and return it serialised as CIF text.

    param: structure -> object exposing `atomic_numbers`, `cart_coords` and
           `lattice` (a pymatgen Structure in this notebook, not a dict)
    return: CIF content as produced by `Structure.to(fmt="cif")`
    """
    lattice = structure.lattice
    cartesian_pos = structure.cart_coords

    # Species may arrive as atomic numbers or as symbols: non-zero integers are
    # mapped to element symbols, anything else is passed through unchanged.
    atom_symbols = []
    for species in structure.atomic_numbers:
        if isinstance(species, int) and species != 0:
            atom_symbols.append(Element.from_Z(species).symbol)
        else:
            atom_symbols.append(species)

    # The positions are Cartesian, so Structure must be told so explicitly;
    # its default (coords_are_cartesian=False) would read them as fractional
    # coordinates and write wrong atom positions into the CIF.
    pymatgen_structure = Structure(
        lattice,
        species=atom_symbols,
        coords=cartesian_pos,
        coords_are_cartesian=True,
    )

    return pymatgen_structure.to(fmt="cif")

In [74]:
            
def prepare_dict(mat_dict:dict):
    """
    Flatten one dataset record into a dictionary of plain fields.

    Reads the 'structure', 'symmetry' and scalar property entries of
    `mat_dict`, renders the structure as CIF via create_cif, and returns the
    collected values under fixed keys.
    """
    crystal = mat_dict['structure']
    cif_text = create_cif(crystal)

    # Scalar properties are copied over as-is.
    e_atom = mat_dict['energy_per_atom']
    e_form = mat_dict['formation_energy_per_atom']
    stable_flag = mat_dict['is_stable']
    gap = mat_dict['band_gap']
    fermi = mat_dict['efermi']

    # Symmetry information: crystal system from the record, space group and
    # formula from the structure object.
    system_name = mat_dict['symmetry'].crystal_system.value
    sg_info = crystal.get_space_group_info()
    formula = crystal.formula

    record = {}
    record["formation_energy_per_atom"] = e_form
    record["is_stable"] = stable_flag
    record["chemical_formula"] = formula
    record["space_group"] = sg_info
    record["crystal_type"] = system_name
    record["band_gap"] = gap
    record["efermi"] = fermi
    record["energy_per_atom"] = e_atom
    record["cif"] = cif_text
    return record
    

In [75]:
# Try prepare_dict on a single record (index 5) before running it over the
# whole dataset.
datapoint = dataset.get(5)
data_dict = prepare_dict(datapoint)
        

In [76]:
# Inspect the flattened record, including the generated CIF text.
data_dict

{'formation_energy_per_atom': -0.7402652036342595,
 'is_stable': False,
 'chemical_formula': 'Te2 Mo1 W2 Se4',
 'space_group': ('P-6m2', 187),
 'crystal_type': 'Hexagonal',
 'band_gap': 0.7511999999999999,
 'efermi': 2.09750388,
 'energy_per_atom': -7.077172788888889,
 'cif': "# generated using pymatgen\ndata_Te2Mo(WSe2)2\n_symmetry_space_group_name_H-M   'P 1'\n_cell_length_a   3.39225611\n_cell_length_b   3.39225611\n_cell_length_c   31.47442500\n_cell_angle_alpha   90.00000000\n_cell_angle_beta   90.00000000\n_cell_angle_gamma   120.00000216\n_symmetry_Int_Tables_number   1\n_chemical_formula_structural   Te2Mo(WSe2)2\n_chemical_formula_sum   'Te2 Mo1 W2 Se4'\n_cell_volume   313.66473487\n_cell_formula_units_Z   1\nloop_\n _symmetry_equiv_pos_site_id\n _symmetry_equiv_pos_as_xyz\n  1  'x, y, z'\nloop_\n _atom_site_type_symbol\n _atom_site_label\n _atom_site_symmetry_multiplicity\n _atom_site_fract_x\n _atom_site_fract_y\n _atom_site_fract_z\n _atom_site_occupancy\n  Te  Te0  1  0.00

In [None]:
def prep_data(lmdb_path:str,output_file:str)->None:
    """
    Convert every record of an LMDB dataset to a flat dictionary and write
    the resulting list to `output_file` as a single JSON document.

    param: lmdb_path -> path to the lmdb data file, opened via Dataset
    param: output_file -> destination JSON path (overwritten if it exists)
    """
    # Imported here because this notebook never imports json or tqdm at the
    # top level; without these the function fails with a NameError.
    import json
    try:
        from tqdm.auto import tqdm
    except ImportError:
        # The progress bar is cosmetic: fall back to plain iteration.
        def tqdm(iterable, **_kwargs):
            return iterable

    dataset = Dataset(lmdb_path, 1)

    # Loop through the data points in the lmdb, in index order.
    materials_list = [
        prepare_dict(dataset.get(index))
        for index in tqdm(range(dataset.len()))
    ]

    with open(output_file, 'w') as json_file:
        json.dump(materials_list, json_file)