In [4]:
from chemdataextractor import Document
from chemdataextractor.model import Compound
from chemdataextractor.doc import Paragraph, Heading

In [5]:
# Build a two-element document by hand: a heading that names the compound
# and a paragraph that reports its capacity.
title = Heading(u'Synthesis of 2,4,6-trinitrotoluene (3a)')
body = Paragraph(u'The procedure was followed to yield a pale yellow solid (c.p. 240 mmol g-1)')
d = Document(title, body)

In [6]:
# Bare last expression of the cell: shows the document just constructed.
d

In [7]:
# Baseline extraction before any custom parser is installed: the serialized
# record contains only 'names', 'labels' and 'roles' for the compound.
d.records.serialize()

[{'names': ['2,4,6-trinitrotoluene'], 'labels': ['3a'], 'roles': ['product']}]

In [8]:
from chemdataextractor.model import BaseModel, StringType, ListType, ModelType

class Capacity(BaseModel):
    """A capacity entry made of a value and its units, both stored as strings."""
    value = StringType()  # the quantity as text, e.g. '240'
    units = StringType()  # the units as text, e.g. 'mmol g-1'
    
# Add a new list-valued field to the imported Compound model so that a
# compound record can carry a list of Capacity entries.
Compound.capacities = ListType(ModelType(Capacity))

In [13]:
import re
from chemdataextractor.parse import R, I, W, Optional, merge, join
from chemdataextractor.parse.common import lbrct, dt, rbrct

# Grammar for a capacity mention such as "c.p. 240 mmol g-1".
#
# The regex patterns are raw strings: `\.` and `\d` are not valid Python
# string escapes, and writing them in a normal literal triggers an
# invalid-escape warning on Python 3 (DeprecationWarning since 3.6,
# SyntaxWarning in 3.12). The raw literals carry exactly the same characters.

# Label introducing the value: "c.p." (dots optional, case-insensitive) or the
# two tokens "uptake capacity". Parentheses spell out that `+` binds tighter
# than `|`. `.hide()` is applied to the whole alternative.
prefix = (R(r'^c\.?p\.?$', re.I) | (I(u'uptake') + I(u'capacity'))).hide()

# Two tokens, "mmol" then "g-1", tagged 'units' and combined by the join action.
units = (I(u'mmol') + I(u'g-1'))(u'units').add_action(join)

# An integer or decimal number token, tagged 'value'.
value = R(r'^\d+(\.\d+)?$')(u'value')

# Full phrase: label, number, units; tagged 'cp'.
cp = (prefix + value + units)(u'cp')

In [14]:
from chemdataextractor.parse.base import BaseParser
from chemdataextractor.utils import first

class CpParser(BaseParser):
    """Parser whose root grammar is ``cp``; emits a Compound per match."""

    root = cp

    def interpret(self, result, start, end):
        """Yield one Compound holding the capacity found in ``result``.

        The ``value`` and ``units`` children of ``result`` are read via XPath,
        taking the first text node of each. ``start`` and ``end`` are accepted
        but not used here.
        """
        value_text = first(result.xpath('./value/text()'))
        units_text = first(result.xpath('./units/text()'))
        capacity = Capacity(value=value_text, units=units_text)
        yield Compound(capacities=[capacity])

In [15]:
# Class-level assignment: this REPLACES the parser list on the Paragraph class
# with a single CpParser instance (it does not append to the existing list).
# NOTE(review): being a class attribute, this presumably affects every
# Paragraph processed afterwards in this kernel — confirm that is intended.
Paragraph.parsers = [CpParser()]

In [16]:
# Build the same heading + paragraph document again, now that
# Paragraph.parsers has been reassigned, and rebind `d` to it.
title = Heading(u'Synthesis of 2,4,6-trinitrotoluene (3a)')
body = Paragraph(u'The procedure was followed to yield a pale yellow solid (c.p. 240 mmol g-1)')
d = Document(title, body)

In [17]:
# Serialize the records of the rebuilt document: the compound record now also
# carries a 'capacities' list with the parsed value and units.
d.records.serialize()

[{'names': ['2,4,6-trinitrotoluene'],
  'labels': ['3a'],
  'roles': ['product'],
  'capacities': [{'value': '240', 'units': 'mmol g-1'}]}]