In [1]:
# modules for structure decoration
import pandas as pd
import os
import itertools
from tqdm.notebook import tqdm
import networkx as nx
import glob
from glob import iglob
from copy import deepcopy
from collections import defaultdict

from pymatgen.core import Composition, Structure
from pymatgen.analysis import local_env

In [2]:
# print(f"pymatgen version: {pymatgen.__version__}")
import pip
pip.main(["show","pymatgen"])

Please see https://github.com/pypa/pip/issues/5599 for advice on fixing the underlying issue.
To avoid this problem you can invoke Python with '-m pip' instead of running pip directly.


Name: pymatgen
Version: 2022.0.8
Summary: Python Materials Genomics is a robust materials analysis code that defines core object representations for structures and molecules with support for many electronic structure codes. It is currently the core analysis code powering the Materials Project (https://www.materialsproject.org).
Home-page: http://www.pymatgen.org
Author: Pymatgen Development Team
Author-email: ongsp@eng.ucsd.edu
License: MIT
Location: /home/jlaw/.conda/envs/crystals/lib/python3.8/site-packages
Requires: networkx, numpy, tabulate, plotly, palettable, spglib, monty, requests, matplotlib, uncertainties, scipy, sympy, pandas, ruamel.yaml
Required-by: pyxtal


0

In [3]:
os.chdir('..')
os.getcwd()

'/home/jlaw/projects/arpa-e/crystals/crystal-gnn-fork'

In [4]:
base_dir = "/projects/rlmolecule/shubham/file_transfer/decorations/relaxed"
icsd_structures_file = f"{base_dir}/icsd/icsd_energies.csv"
# hypothetical_structures_file = f"{base_dir}/{hypo_type}_hypotheticals/relaxed_energies.csv"

# Read energy data
icsd_df = pd.read_csv(icsd_structures_file)

In [5]:
icsd_df.head()

Unnamed: 0,composition,id,energyperatom
0,Hf1N1,icsd_183420,-9.854916
1,Hf1N1,icsd_183419,-9.854908
2,Hf3N4,icsd_097997,-9.821796
3,Hf1N1,icsd_167875,-9.663267
4,Hf1O2,icsd_060903,-9.62478


In [6]:
icsd_df.id.nunique()

16445

In [8]:
structures = {}
for icsd_strc_file in tqdm(glob.glob(f"{base_dir}/icsd/structures/*.cif")):
    struc = Structure.from_file(icsd_strc_file, primitive=True)
    structures[icsd_strc_file.replace('.cif','')] = struc.as_dict()

  0%|          | 0/16445 [00:00<?, ?it/s]



In [12]:
structures = {os.path.basename(key): val for key, val in structures.items()}
print(list(structures.keys())[:10])

['icsd_085777', 'icsd_173786', 'icsd_056827', 'icsd_078912', 'icsd_076029', 'icsd_072295', 'icsd_076967', 'icsd_000429', 'icsd_279616', 'icsd_624016']


In [13]:
working_dir = "inputs"
os.makedirs(working_dir, exist_ok=True)

In [14]:
# now store them in a single file
import json
import gzip

# https://pymatgen.org/usage.html#side-note-as-dict-from-dict
out_file = f"{working_dir}/icsd_structures.json.gz"
with gzip.open(out_file, 'w') as out:
    out.write(json.dumps(structures, indent=2).encode())

In [15]:
# read them back
with gzip.open(out_file, 'r') as f:
    structures_dict = json.loads(f.read().decode())

new_structures = {}
for key, structure_dict in structures_dict.items():
    new_structures[key] = Structure.from_dict(structure_dict)

In [20]:
# now do the same thing for the unrelaxed and relaxed structures
for hypo_type in ('zintl', 'battery'):
    for dft_status in ('relaxed', 'unrelaxed'):
        print(hypo_type, dft_status)
        structures = {}
        file_type = "CONTCAR" if hypo_type == "battery" and dft_status == "relaxed" else "POSCAR"
        search_str = f"{base_dir}/{hypo_type}_hypotheticals/{dft_status}_original/{file_type}_*"
        for strc_file in tqdm(glob.glob(search_str)):
            struc = Structure.from_file(strc_file, primitive=True)
            strc_id = os.path.basename(strc_file).replace(file_type+"_","")
            structures[strc_id] = struc.as_dict()
            
        out_file = f"{working_dir}/{hypo_type}_{dft_status}_structures.json.gz"
        print(f"writing to {out_file}")
        with gzip.open(out_file, 'w') as out:
            out.write(json.dumps(structures, indent=2).encode())

zintl relaxed


  0%|          | 0/11028 [00:00<?, ?it/s]

writing to inputs/zintl_relaxed_structures.json.gz
zintl unrelaxed


  0%|          | 0/11028 [00:00<?, ?it/s]

writing to inputs/zintl_unrelaxed_structures.json.gz
battery relaxed


  0%|          | 0/67840 [00:00<?, ?it/s]

writing to inputs/battery_relaxed_structures.json.gz
battery unrelaxed


  0%|          | 0/67840 [00:00<?, ?it/s]

writing to inputs/battery_unrelaxed_structures.json.gz
