# Parse the platinum-group metals (PGM) dataset

In [1]:
import bz2
import collections
import json
import os
import re
import tarfile
import urllib.request

import numpy

## Download data

In [1]:
# Fetch the supplemental-material archive and store it in the working
# directory under the name the later cells open.
url = 'https://journals.aps.org/prx/supplemental/10.1103/PhysRevX.3.041035/pgm.tar.bz2'
urllib.request.urlretrieve(url, filename='pgm.tar.bz2')

('pgm.tar.bz2', <http.client.HTTPMessage at 0x104894c88>)

## The parser and the writer

In [2]:
class Parser(object):
    """Parse one calculation from an ``aflowlib_entry.out`` / CONTCAR pair.

    The parsed values are kept in ``self.properties``: every ``name=value``
    pair of the entry file (values stay strings), plus two derived keys:

    * ``'basis_vectors'`` -- three lattice vectors, already multiplied by the
      CONTCAR scaling factor.
    * ``'atoms'`` -- list of ``(label, fractional_position)`` tuples.
    """

    def __init__(self, aflowout_file, contcar_file):
        """Read both files and cross-check the atomic positions.

        Args:
            aflowout_file: binary file-like object; its whole content is
                decoded as ASCII and split into ``name=value`` fields
                separated by ``|``.
            contcar_file: binary file-like object read line by line
                (ASCII) in the layout described in the body below.

        Raises:
            ValueError: a non-empty entry field has no ``=``.
            KeyError: the entry file lacks ``compound`` or ``positions``.
            AssertionError: the CONTCAR coordinate mode is not ``Direct``, or
                the positions in the entry file disagree with the CONTCAR
                (the properties are printed before re-raising in that case).
        """
        self.properties = {}
        for field in aflowout_file.read().decode('ascii').split('|'):
            field = field.strip()
            if not field:
                # Tolerate a leading/trailing '|' or a blank field.
                continue
            # Split on the first '=' only, so values may contain '='.
            name, separator, value = field.partition('=')
            if not separator:
                raise ValueError('expected "name=value", got %r' % field)
            self.properties[name] = value

        # CONTCAR layout as consumed here: line 1 is skipped, line 2 is the
        # scaling factor, lines 3-5 are the lattice vectors, line 6 is skipped
        # (presumably the per-species atom counts -- TODO confirm), line 7 is
        # the coordinate mode, then one position line per atom.
        contcar_file.readline()
        scaling = float(self._read_line(contcar_file))
        lattice = [
            [float(x) * scaling for x in self._read_line(contcar_file).split()]
            for _ in range(3)
        ]
        contcar_file.readline()
        mode = self._read_line(contcar_file)
        if mode != 'Direct':
            # Explicit raise (not ``assert``) so the check survives ``-O``.
            raise AssertionError('expected Direct coordinates, got %r' % mode)

        # Expand the compound formula into one label per atom.  A symbol is
        # an uppercase letter followed by lowercase letters, so adjacent
        # symbols without an explicit count ("PtRh") are kept apart; a run of
        # lowercase-only letters is still accepted as a single label.
        atoms = []
        for symbol, count in re.findall(r'([A-Z][a-z]*|[a-z]+)(\d*)', self['compound']):
            atoms.extend([symbol] * (int(count) if count else 1))

        fractional = [
            [float(x) for x in self._read_line(contcar_file).split()]
            for _ in atoms
        ]

        self.properties['basis_vectors'] = lattice
        self.properties['atoms'] = list(zip(atoms, fractional))

        # Consistency check: fractional position -> Cartesian via the lattice
        # must match the coordinates listed in the entry file.
        basis_transposed = numpy.array(lattice).T
        try:
            for pos_outf, (_, frac) in zip(self['positions'].split(';'), self['atoms']):
                # The first comma-separated field of each entry is dropped;
                # the remaining fields are taken as the coordinates.
                coord = numpy.array([float(x) for x in pos_outf.split(',')[1:]]).flatten()
                coord_calculated = numpy.dot(basis_transposed, numpy.array(frac)).flatten()
                if not numpy.allclose(coord, coord_calculated, atol=3e-2):
                    raise AssertionError(
                        'position mismatch: %r vs %r' % (coord, coord_calculated))
        except AssertionError:
            # Dump everything parsed so far to help locate the bad entry.
            print(self.properties)
            raise

    @staticmethod
    def _read_line(contcar_file):
        """Return the next line of *contcar_file*, ASCII-decoded and stripped."""
        return contcar_file.readline().decode('ascii').strip()

    def __getitem__(self, item):
        """Return the property stored under *item* (``KeyError`` if absent)."""
        return self.properties[item]

    def __iter__(self):
        """Iterate over ``(name, value)`` pairs of all properties."""
        yield from self.properties.items()

    def get_json(self):
        """Return a JSON-serialisable summary of the calculation.

        ``'calculation_details'`` is the live ``self.properties`` dict, not a
        copy.  Raises ``KeyError`` if either enthalpy field is missing.
        """
        return {
            'basis_matrix': self['basis_vectors'],
            'atom_labels': [label for label, _ in self['atoms']],
            'atom_positions_fractional': numpy.array(
                [position for _, position in self['atoms']]).tolist(),

            'enthalpy_formation': float(self['enthalpy_formation']),
            'enthalpy_formation_atom': float(self['enthalpy_formation_atom']),

            'calculation_details': self.properties
        }

def convert_to_txt(data, txt_fn):
    """Write a list of calculation records to *txt_fn* as plain text.

    Args:
        data: iterable of dicts with the keys ``'enthalpy_formation'``,
            ``'basis_matrix'`` (three 3-vectors),
            ``'atom_positions_fractional'`` (sequence of 3-vectors),
            ``'atom_labels'`` (sequence of str) and
            ``'calculation_details'`` (JSON-serialisable).
        txt_fn: path of the output file; it is overwritten.

    Records are numbered from 1.  Lines are joined with ``'\\n'`` and no
    trailing newline is written; empty *data* produces an empty file.
    """
    lines = []
    for index, entry in enumerate(data, start=1):
        cell = entry['basis_matrix']
        # Signed triple product (a x b) . c of the three cell vectors.
        volume = numpy.dot(numpy.cross(cell[0], cell[1]), cell[2])

        lines.append('--- %d ---' % index)
        lines.append('Formation enthalpy (eV): %f' % entry['enthalpy_formation'])
        lines.append('Cell Volume: %f' % volume)
        lines.append('Coordinates:')
        lines.extend('%f %f %f' % tuple(row) for row in entry['atom_positions_fractional'])
        lines.append('Cell:')
        lines.extend('%f %f %f' % tuple(row) for row in cell)
        lines.append('Atoms:')
        lines.append(' '.join(entry['atom_labels']))
        lines.append('Calculation Details:')
        lines.append(json.dumps(entry['calculation_details']))

    # Context manager guarantees the handle is closed even if write() fails.
    with open(txt_fn, 'w') as f:
        f.write('\n'.join(lines))

## Load dataset in tar format

In [3]:
# Open the downloaded archive for reading (compression is detected
# automatically in mode 'r') and list every member it contains.
data_file = tarfile.open('pgm.tar.bz2', mode='r')
all_files = data_file.getmembers()

## Convert bz2 to XYZ-like format

In [4]:
import posixpath

from IPython.display import clear_output

# Parsed calculations grouped by system name, i.e. the first path component
# of each archive member.
all_calculations = collections.defaultdict(list)

processed = 0
for file_info in all_files:
    if not (file_info.isfile() and 'aflowlib_entry.out' in file_info.name):
        continue

    system_name = file_info.name.split('/')[0]
    # Tar member names always use '/', so the sibling CONTCAR path is built
    # with posixpath; os.path.join would insert '\\' on Windows and the
    # member lookup would fail.
    contcar_fn = posixpath.join(posixpath.dirname(file_info.name), 'CONTCAR.relax.bz2')

    # Both members are bz2-compressed inside the archive.  The context
    # managers close the tar member handles and the decompressors after each
    # entry instead of leaving them open for the garbage collector.
    with data_file.extractfile(file_info) as raw_aflowout, \
            data_file.extractfile(contcar_fn) as raw_contcar, \
            bz2.BZ2File(raw_aflowout, 'r') as aflowout_file, \
            bz2.BZ2File(raw_contcar, 'r') as contcar_file:
        all_calculations[system_name].append(
            Parser(aflowout_file, contcar_file).get_json()
        )

    processed += 1
    if processed % 100 == 0:
        # Replace the previous progress line instead of stacking output.
        clear_output()
        print('Processed', processed, 'files.')

Processed 39200 files.


In [5]:
# Write one text file per system into Data/.  makedirs(exist_ok=True) creates
# the directory if needed without the check-then-create race of
# os.path.exists() followed by os.mkdir().
os.makedirs('Data/', exist_ok=True)
for key, value in all_calculations.items():
    convert_to_txt(value, 'Data/%s.txt' % key)

    cd Data
    zip -9 ptgtm.zip *.txt