Skip to content

Commit

Permalink
Loading CIF files (#3)
Browse files Browse the repository at this point in the history
* Add implementation of read_cif

* Add implementation of CIFTransformer

* Add pandas to requirements

* Simplify logic in read_cif function

* Add missing tests for kristal.io module

* Simplify and update tests

* Add missing docstrings
  • Loading branch information
dexter2206 authored and magdalenakrzus committed Dec 19, 2018
1 parent a4c0c5d commit bbc5b70
Show file tree
Hide file tree
Showing 4 changed files with 184 additions and 1 deletion.
37 changes: 37 additions & 0 deletions kristal/io/__init__.py
@@ -0,0 +1,37 @@
"""Main file of kristal.io package."""
from pkg_resources import resource_filename
import lark
from kristal.io.transform import CIFTransformer

# The CIF grammar ships inside the package; read it once at import time so
# that every call to read_cif can reuse the same grammar text.
_GRAMMAR_PATH = resource_filename('kristal', 'io/cifgrammar.lark')
with open(_GRAMMAR_PATH) as grammar_file:
    GRAMMAR = grammar_file.read()

def read_cif(cif_path, transformer=None):
    """Parse a CIF file and return the transformed parse tree.

    :param cif_path: path of the CIF file to be read. Can be given
     as a `str` object or `pathlib.Path` object.
    :type cif_path: `str` or `pathlib.Path`
    :param transformer: an optional transformer object used to transform
     the parsed tree. If not given, a default
     :py:class:`kristal.io.transform.CIFTransformer` is used.
    :type transformer: a subclass of `lark.Transformer`.
    :returns: whatever ``transformer.transform`` returns for the parsed tree.
     With the default transformer this is a mapping name -> datablock
     holding information read from all datablocks present in the CIF file.
    :rtype: A mapping `str` -> :py:class:`kristal.io.transform.DataBlock`
     (for the default transformer).

    The `DataBlock` objects have the following attributes:

    - name: name of the datablock
    - loops: list of all loops
    - entries: non-loop dataitems

    The non-loop dataitems are stored as :py:class:`pandas.Series` (can be
    accessed like a dictionary). The loops are stored as
    :py:class:`pandas.DataFrame`.
    """
    active_transformer = CIFTransformer() if transformer is None else transformer

    with open(cif_path) as cif:
        content = cif.read()

    # Build an Earley parser for the packaged grammar, starting at the
    # top-level `cif` rule, and parse the whole file content in one go.
    tree = lark.Lark(GRAMMAR, parser='earley', start='cif').parse(content)
    return active_transformer.transform(tree)
60 changes: 60 additions & 0 deletions kristal/io/transform.py
@@ -0,0 +1,60 @@
"""Features related to transformation of CIF grammar tree."""
from collections import namedtuple
from itertools import islice
import lark
import pandas as pd

# Result record for a single CIF datablock: its name, the list of its loops
# and its non-loop entries (field order matters for positional construction).
DataBlock = namedtuple('DataBlock', 'name loops entries')


class CIFTransformer(lark.Transformer):
    """Lark Transformer for CIF grammar tree.

    Each method handles the grammar rule of the same name and receives the
    list of already-transformed children of that rule as ``matches``.
    """
    # pylint: disable=no-self-use

    def cif(self, matches):
        """Transformation of cif nonterminal.

        :param matches: transformed datablocks (objects with a ``name``).
        :returns: a dict mapping datablock name -> datablock. If two
         datablocks share a name, the later one replaces the earlier one.
        """
        return {block.name: block for block in matches}

    def datablock(self, matches):
        """Transformation of datablock nonterminal.

        :param matches: the datablock name followed by its items. Every item
         is a tree: one whose ``data`` is ``'loop'`` carries the loop as its
         first child; any other tree carries a tag as its first child and
         the corresponding value as its second child.
        :returns: a :py:class:`DataBlock` with the name, the list of loops
         and a :py:class:`pandas.Series` of non-loop values indexed by tag.
        """
        name = str(matches[0])
        loops = []
        tags = []
        values = []
        for item in matches[1:]:
            if item.data == 'loop':
                loops.append(item.children[0])
            else:
                tags.append(item.children[0])
                values.append(item.children[1])
        entries = pd.Series(values, index=tags)
        return DataBlock(name, loops, entries)

    def datablock_heading(self, matches):
        """Transform datablock heading into a plain string."""
        return str(matches[0])

    def loop(self, matches):
        """Transform loop into pandas.DataFrame.

        :param matches: two trees; the children of the first are the column
         names, the children of the second are the values in row-major order.
        :returns: a ``lark.Tree`` named ``'loop'`` whose only child is the
         DataFrame, so that ``datablock`` can tell loops from other items.
        :raises ValueError: if the header is empty or the number of values
         is not divisible by the number of columns.
        """
        columns = matches[0].children
        values = matches[1].children
        width = len(columns)
        # An empty header would otherwise surface as ZeroDivisionError from
        # the modulo below; report it like any other malformed loop.
        if width == 0 or len(values) % width:
            raise ValueError('Invalid number of items in loop {}'.format(columns))
        rows = [
            tuple(values[start:start + width])
            for start in range(0, len(values), width)
        ]
        loop_df = pd.DataFrame(rows, columns=columns)
        return lark.Tree('loop', [loop_df])

    def integer(self, matches):
        """Transform integer: convert the first matched item to ``int``."""
        return int(matches[0])

    def float(self, matches):
        """Transform float: convert the first matched item to ``float``."""
        return float(matches[0])

    def string(self, matches):
        """Transform string: convert the first matched item to ``str``."""
        return str(matches[0])

    def tag(self, matches):
        """Transform tag: convert the first matched item to ``str``."""
        return str(matches[0])
2 changes: 1 addition & 1 deletion setup.py
Expand Up @@ -14,7 +14,7 @@
license='MIT',
description=__doc__,
long_description=LONG_DESCRIPTION,
install_requires=['numpy', 'lark-parser'],
install_requires=['numpy', 'pandas', 'lark-parser'],
author='Magdalena Krzuś, Konrad Jałowiecki',
author_email='magdalena.krzus@gmail.com',
packages=find_packages(exclude=['tests']),
Expand Down
86 changes: 86 additions & 0 deletions tests/test_io.py
@@ -0,0 +1,86 @@
"""Test cases for kristal.io package."""
from lark import Token, Tree
import pandas as pd
import pytest
from kristal.io import read_cif, GRAMMAR
from kristal.io.transform import CIFTransformer

@pytest.fixture(name='transformer', scope='session')
def create_transformer():
    """Provide a default CIFTransformer, created once per test session."""
    default_transformer = CIFTransformer()
    return default_transformer

@pytest.fixture(name='lark')
def patch_lark(mocker):
    """Patch the lark module used by kristal.io and return the mock."""
    lark_mock = mocker.patch('kristal.io.lark')
    return lark_mock

@pytest.fixture(name='mock_open')
def patch_mock_open(mocker):
    """Patch open in kristal.io with a mock whose files read 'test_content'."""
    fake_open = mocker.mock_open(read_data='test_content')
    return mocker.patch('kristal.io.open', fake_open)

def test_string_transform(transformer):
    """CIFTransformer.string should turn a string token into a plain str."""
    token = Token('NON_BLANK_STRING', 'Bozon')
    transformed = transformer.string([token])
    assert transformed == 'Bozon'

@pytest.mark.parametrize(
    'float_str,expected',
    [
        ('21.37', 21.37),
        ('2.1e-2', 0.021),
        ('-5e3', -5000),
        ('-0.3', -0.3),
    ])
def test_float_transform(transformer, float_str, expected):
    """CIFTransformer.float should convert float tokens to their value."""
    token = Token('FLOAT', float_str)
    transformed = transformer.float([token])
    assert transformed == expected

def test_loop_transform_correct_input(transformer):
    """CIFTransformer.loop should wrap a well-formed loop as a DataFrame."""
    header_tree = Tree('loop_header', ['x', 'y', 'label'])
    body_tree = Tree('loop_body', [1.2, 2.0, 'a', -3.1, -2.8, 'b'])
    # Two rows of three values each, in the order given by the loop body.
    expected_rows = [[1.2, 2.0, 'a'], [-3.1, -2.8, 'b']]
    expected_df = pd.DataFrame(expected_rows, columns=['x', 'y', 'label'])

    actual_tree = transformer.loop([header_tree, body_tree])

    assert actual_tree.data == 'loop'
    actual_df = actual_tree.children[0]
    assert actual_df.equals(expected_df)

def test_loop_raises_on_incorrect_input(transformer):
    """CIFTransformer.loop function should raise ValueError on incorrect input.

    Criterion for the input to be invalid is that number of elements in
    loop_body is not divisible by number of elements in header.
    """
    # Five values cannot be split evenly across three columns.
    header_tree = Tree('loop_header', ['a', 'b', 'c'])
    body_tree = Tree('loop_body', [1, 5, 5, 6, 2])

    with pytest.raises(ValueError, match='Invalid number of items in loop'):
        transformer.loop([header_tree, body_tree])

@pytest.mark.usefixtures('mock_open')
def test_read_cif_uses_lark(lark, mocker):
    """The read_cif function should build Lark with the expected arguments."""
    dummy_transformer = mocker.Mock()
    read_cif('BENZEN01.cif', transformer=dummy_transformer)
    lark.Lark.assert_called_once_with(GRAMMAR, parser='earley', start='cif')

@pytest.mark.usefixtures('lark')
def test_read_cif_opens_file(mocker, mock_open):
    """The read_cif function should open exactly the path it was given."""
    cif_path = 'BENZEN01.cif'
    read_cif(cif_path, transformer=mocker.Mock())
    mock_open.assert_called_once_with('BENZEN01.cif')

@pytest.mark.usefixtures('mock_open')
def test_read_cif_parses_content(lark, mocker):
    """The read_cif function should parse the file content using Lark."""
    read_cif('BENZEN01.cif', transformer=mocker.Mock())
    # Calling the mocked Lark class again yields the same parser mock that
    # read_cif obtained, so its `parse` records the call made inside read_cif.
    parser_mock = lark.Lark()
    parser_mock.parse.assert_called_once_with('test_content')

@pytest.mark.usefixtures('mock_open')
def test_read_cif_returns_transformed_tree(lark, mocker):
    """The read_cif function should return the transformer's output."""
    transformer = mocker.Mock()
    result = read_cif('BENZEN01.cif', transformer=transformer)

    # The mocked parser returns the same object on every call, so calling it
    # here reproduces the tree that read_cif passed to the transformer.
    parsed_tree = lark.Lark().parse()
    transformer.transform.assert_called_once_with(parsed_tree)
    # Checked last: calling transform() again would add a second call.
    assert result == transformer.transform()

0 comments on commit bbc5b70

Please sign in to comment.