In [1]:
import matbench_discovery.data
import matbench_discovery.energy

In [2]:
# Location of the MPtrj extxyz archive, looked up through matbench_discovery's DataFiles entry.
# NOTE(review): the recorded output of the next cell shows this resolving to a zip in a local
# cache directory; whether accessing `.path` triggers a download is not visible here — confirm.
mp_trj_path = matbench_discovery.data.DataFiles.mp_trj_extxyz.path

In [3]:
# Read the ASE Atoms frames out of the zip archive. This is slow: the recorded run took
# about six minutes for 145923 archive entries.
# NOTE(review): filename_to_info=True presumably copies each source file name into
# atoms.info (a `filename` column shows up in the table built later) — confirm.
structures = matbench_discovery.data.ase_atoms_from_zip(mp_trj_path, filename_to_info=True)

Reading ASE Atoms from zip_filename='/home/kna/.cache/matbench-discovery/mp/2024-09-03-mp-trj.extxyz.zip': 100%|██████████| 145923/145923 [06:07<00:00, 396.87it/s]


In [4]:
from pymatgen.core import Composition
from pymatgen.io.ase import AseAtomsAdaptor
from pymatgen.io.cif import CifWriter

def atoms_to_record(ase_atoms):
    """Turn one ASE Atoms frame into a flat dict suitable for a DataFrame row.

    The record holds the formation energy per atom (computed from the frame's
    ``mp2020_corrected_energy`` and its composition), the structure serialized
    as a CIF string, and every key/value pair found in ``ase_atoms.info``.

    Args:
        ase_atoms: ASE Atoms object whose ``info`` dict contains the key
            ``mp2020_corrected_energy``.

    Returns:
        dict: the computed fields followed by the entries of ``ase_atoms.info``.
        Entries from ``info`` are merged last, so they win on a key collision.
    """
    structure = AseAtomsAdaptor.get_structure(ase_atoms)
    energy_entry = {
        'composition': structure.composition,
        'energy': ase_atoms.info['mp2020_corrected_energy'],
    }
    record = {
        "formation_energy_per_atom": matbench_discovery.energy.get_e_form_per_atom(energy_entry),
        "cif": str(CifWriter(structure)),
    }
    # Carry the frame metadata along so each row stays traceable to its source.
    record.update(ase_atoms.info)
    return record

from multiprocessing import Pool

# Convert all frames to records in parallel across 20 worker processes;
# map() returns the results in the same order as `structures`.
with Pool(processes=20) as worker_pool:
    records = worker_pool.map(atoms_to_record, structures)

In [5]:
import pandas as pd
# Assemble the per-frame dicts into a single table, one row per record.
dataset_pd = pd.DataFrame.from_records(records)

In [6]:
# Preview the first rows to sanity-check the columns before splitting and writing.
dataset_pd.head()

Unnamed: 0,formation_energy_per_atom,cif,material_id,formula,task_id,calc_id,ionic_step,frame_id,mp2020_corrected_energy,filename
0,-2.904883,# generated using pymatgen\ndata_Mg14TiZnO16\n...,mp-1034899,Mg14 Ti1 Zn1 O16,mp-1034899,0,16,mp-1034899-0-16,-203.662848,mp-1034899.extxyz
1,-2.904936,# generated using pymatgen\ndata_Mg14TiZnO16\n...,mp-1034899,Mg14 Ti1 Zn1 O16,mp-1034899,0,14,mp-1034899-0-14,-203.66454,mp-1034899.extxyz
2,-2.904891,# generated using pymatgen\ndata_Mg14TiZnO16\n...,mp-1034899,Mg14 Ti1 Zn1 O16,mp-1034899,0,12,mp-1034899-0-12,-203.663116,mp-1034899.extxyz
3,-2.904586,# generated using pymatgen\ndata_Mg14TiZnO16\n...,mp-1034899,Mg14 Ti1 Zn1 O16,mp-1034899,0,10,mp-1034899-0-10,-203.653348,mp-1034899.extxyz
4,-2.904689,# generated using pymatgen\ndata_Mg14TiZnO16\n...,mp-1034899,Mg14 Ti1 Zn1 O16,mp-1034899,0,5,mp-1034899-0-5,-203.656655,mp-1034899.extxyz


In [7]:
from pathlib import Path
from sklearn.model_selection import train_test_split

# Hold out 10% of the rows for validation; the fixed random_state keeps the split reproducible.
# NOTE(review): this is a row-wise split, and the table holds several frames per material_id
# (see the preview above), so frames of one material can land in both train and val.
# If that leakage is not intended, split on material_id instead (e.g. GroupShuffleSplit).
train, val = train_test_split(dataset_pd, test_size=0.1, random_state=42)
print(len(train), len(val))
data_path = Path("..", "..", "data", "matbench_discovery_mp_trj_full")
# to_csv does not create missing directories; make sure the target exists so a fresh
# checkout does not fail here after the expensive steps above.
data_path.mkdir(parents=True, exist_ok=True)
# The ".gz" suffix makes pandas gzip-compress the output (compression is inferred from the name).
train.to_csv(data_path / "train.csv.gz")
val.to_csv(data_path / "val.csv.gz")

1422355 158040
