In [1]:
import os
# Show the entries of the current working directory (the default path for
# os.listdir), so the data folder read by the cells below can be checked.
os.listdir(os.curdir)

['.ipynb_checkpoints',
 '2022 Qiskit Hackathon Korea.ipynb',
 '21조조.pptx',
 'acs.chemrev.0c00868.pdf',
 'asd.py',
 'dsgdb9nsd.xyz',
 'dsgdb9nsd.xyz.tar.bz2',
 'output.json',
 'preprocess.ipynb',
 'preprocess_notebook.ipynb',
 'qm9.db',
 'qm9tut',
 'schnettest.ipynb',
 '~$21조조.pptx']

In [2]:
import regex as re

def _numeric_then_name(filename):
    """Sort key: the integer formed by all digits in the name, then the name.

    The pattern is a raw string so that ``\D`` reaches the regex engine
    unchanged; in a plain string literal it is an invalid escape sequence.
    Raises ValueError (from ``int('')``) if the name contains no digit.
    """
    return int(re.sub(r"\D", "", filename)), filename


# File names in 'dsgdb9nsd.xyz', ordered by their embedded number
# (ties broken by the name itself).
ordered_files = sorted(os.listdir('dsgdb9nsd.xyz'), key=_numeric_then_name)

In [3]:
import tarfile
import tempfile
import numpy as np

from ase.io.extxyz import read_xyz
from ase.units import Debye, Bohr, Hartree, eV

# One (property name, multiplier) pair per property, in the order the values
# are zipped against the property line by the parsing loop. The multiplier is
# the factor the raw parsed value is multiplied by.
_PROPERTY_UNITS = [
    ("rotational_constant_A", 1.0),
    ("rotational_constant_B", 1.0),
    ("rotational_constant_C", 1.0),
    ("dipole_moment", Debye),
    ("isotropic_polarizability", Bohr ** 3),
    ("homo", Hartree),
    ("lumo", Hartree),
    ("gap", Hartree),
    ("electronic_spatial_extent", Bohr ** 2),
    ("zpve", Hartree),
    ("energy_U0", Hartree),
    ("energy_U", Hartree),
    ("enthalpy_H", Hartree),
    ("free_energy", Hartree),
    ("heat_capacity", 1.0),
]

# Ordered list of property names.
available_properties = [name for name, _factor in _PROPERTY_UNITS]

# Mapping from property name to its multiplier.
units = dict(_PROPERTY_UNITS)

# Fresh temporary directory whose name ends in "gdb9".
# NOTE(review): nothing visible in this notebook reads files from tmpdir and
# it is never removed - confirm whether it is still needed.
tmpdir = tempfile.mkdtemp(suffix="gdb9")

# Zero-based positions of every file in ordered_files.
irange = np.arange(len(ordered_files), dtype=int)

# One-based molecule indices (still NumPy integers at this point).
all_atoms = [index for index in irange + 1]

# Filled by the parsing loop with one record dict per parsed file.
all_quantum_properties = list()

def _parse_xyz_lines(lines):
    """Build one molecule record from the lines of a single xyz file.

    Layout read from ``lines``:
      * line 1: the atom count;
      * line 2: the property values, after dropping the first two
        whitespace-separated fields;
      * the next ``n_atoms`` lines: one atom each, element symbol followed by
        three coordinates;
      * last line: the formula is the second '/'-separated segment of the
        first whitespace-separated token.

    Uses the notebook-level ``available_properties`` and ``units``.

    Returns a dict with keys 'symbol', 'n_atoms', 'coordinates' (a list of
    ``[element, x, y, z]`` rows) and, when the property line is long enough
    to reach it, 'energy_U0' (a one-element list holding the scaled value).
    """
    n_atoms = int(lines[0])
    record = {
        'symbol': lines[-1].split()[0].split('/')[1],
        'n_atoms': n_atoms,
    }

    coordinates = []
    # Coordinates start on the third line and run for n_atoms lines.
    for j in range(2, n_atoms + 2):
        # Some lines write exponents as "*^"; rewrite them so float() accepts
        # the value.
        fields = lines[j].replace("*^", "e").split()
        # Keep the element symbol as a single item. list(symbol) would split
        # a multi-character symbol such as "Cl" into separate characters.
        coordinates.append([fields[0]] + [float(value) for value in fields[1:4]])
    record['coordinates'] = coordinates

    property_values = lines[1].split()[2:]
    for name, raw_value in zip(available_properties, property_values):
        # Only energy_U0 is kept in the record.
        if name == 'energy_U0':
            record[name] = [float(raw_value) * units[name]]

    return record


# NOTE(review): only the first two files are parsed here, so the progress
# message below can never fire - confirm whether the full set is intended.
for i in range(2):
    xyzfile = os.path.join('dsgdb9nsd.xyz', ordered_files[i])

    if (i + 1) % 10000 == 0:
        print("Parsed: {:6d} / 133885".format(i + 1))

    # Read everything first so the file is closed before parsing starts.
    with open(xyzfile, "r") as f:
        lines = f.readlines()

    all_quantum_properties.append(_parse_xyz_lines(lines))


In [4]:
# Convert every index to a built-in int (they start out as NumPy integers).
all_atoms = [int(index) for index in all_atoms]

In [5]:
# Preview the first two parsed records.
all_quantum_properties[0:2]

[{'symbol': 'CH4',
  'n_atoms': 5,
  'coordinates': [['C', -0.0126981359, 1.0858041578, 0.0080009958],
   ['H', 0.002150416, -0.0060313176, 0.0019761204],
   ['H', 1.0117308433, 1.4637511618, 0.0002765748],
   ['H', -0.540815069, 1.4475266138, -0.8766437152],
   ['H', -0.5238136345, 1.4379326443, 0.9063972942]],
  'energy_U0': [-1101.4877900833399]},
 {'symbol': 'H3N',
  'n_atoms': 4,
  'coordinates': [['N', -0.0404260543, 1.0241077531, 0.0625637998],
   ['H', 0.0172574639, 0.0125452063, -0.0273771593],
   ['H', 0.9157893661, 1.3587451948, -0.0287577581],
   ['H', -0.5202777357, 1.3435321258, -0.7755426124]],
  'energy_U0': [-1538.147731526762]}]

In [6]:
# Pair each molecule index with its parsed record; zip stops at the shorter
# of the two sequences.
dictionary = {
    index: record
    for index, record in zip(all_atoms, all_quantum_properties)
}

In [7]:
import json

# Serialise the index -> record mapping to 'output.json' in the working
# directory, overwriting any existing file.
with open('output.json', mode='w') as out_file:
    json.dump(dictionary, out_file)