In [9]:
from chemdataextractor import Document
from chemdataextractor.model import Compound
from chemdataextractor.doc import Paragraph, Heading

In [15]:
# Minimal two-element document: a heading that names the compound (with its
# label) and a paragraph whose parenthetical mentions how it was made.
tnt_heading = Heading(u'Synthesis of 2,4,6-trinitrotoluene (3a)')
tnt_paragraph = Paragraph(u'The procedure was followed to yield a pale yellow solid (syntheses microwave-assisted mechanochemically)')
d = Document(tnt_heading, tnt_paragraph)

# Bare last expression so the notebook displays the document.
d

In [16]:
# Serialize the records extracted from `d` so they display as plain Python data.
# NOTE(review): the recorded output below contains 'synthesis_routes', a field
# that is only added to Compound (and parsed for) in later cells. This output
# therefore looks like it came from an out-of-order run; confirm with
# Restart Kernel -> Run All.
d.records.serialize()

[{'names': ['2,4,6-trinitrotoluene'],
  'labels': ['3a'],
  'roles': ['product'],
  'synthesis_routes': [{'synthesis': 'mechanochemically',
    'descriptor': 'microwave-assisted'}]}]

In [17]:
# Second example: the heading names a material and its abbreviation; the
# paragraph lists reagents and amounts.
mof_heading = Heading(u'Preparation of MOF-5/COF (M5C).')
mof_paragraph = Paragraph(u'MOF-5-NH2 (0.20 g), 0.5 g (3.96 mmol) of melamine, 0.5 g (3.73 mmol) of terephthaldehyde, 25 mL of DMSO, and 5 mL of distilled water')
e = Document(mof_heading, mof_paragraph)

# Last expression: show the serialized records for this document.
e.records.serialize()

[{'names': ['MOF-5 / COF', 'M5C'], 'roles': ['product']}]

In [18]:
from chemdataextractor.model import BaseModel, StringType, ListType, ModelType

class SynthesisRoute(BaseModel):
    """Model describing one synthesis route of a compound.

    All three fields are declared as plain string fields.
    """
    # Synthesis method text; SynthParser.interpret fills this from the
    # './synthesis' node of the parse result.
    synthesis = StringType()
    # Qualifier text; SynthParser.interpret fills this from the
    # './descriptor' node of the parse result.
    descriptor = StringType()
    # NOTE(review): nothing visible in this notebook ever assigns `prefix`
    # (the `prefix` grammar element is hidden) -- confirm whether it is needed.
    prefix = StringType()
# Attach the new model to Compound as a list-valued field named
# `synthesis_routes`, so compound records can carry SynthesisRoute entries.
Compound.synthesis_routes = ListType(ModelType(SynthesisRoute))

In [19]:
import re
from chemdataextractor.parse import R, I, W, Optional, merge

# Trigger word that must precede a synthesis phrase: forms of "synthesis",
# "prepare" or "produce". Wrapped in .hide(), so it is required for a match
# but is not meant to show up in the parse result.
prefix = (
    R(u'^[Ss]ynthe((si(s|sed|zed|se|ze))|tic|ses)?$')
    | R(u'^[Pp]repar(ation|ed|e)?$')
    | R(u'^[Pp]roduc(tion|e|ed)?$')
).hide()

# "microwave", an optional hyphen/dash token, then "assisted"; the matched
# tokens are merged into one 'descriptor' element.
# - The dash class is a raw string: '\-' in an ordinary literal is an invalid
#   escape sequence (the compiled pattern itself is unchanged).
# - 'assisted' is anchored like every other pattern here, so tokens that merely
#   contain "assisted" are not accepted.
descriptor = (
    R(u'^[Mm]icrowave$', re.I)
    + Optional(R(r'^[\-‐‑⁃‒–—―−－⁻]$'))
    + R(u'^assisted$')
)(u'descriptor').add_action(merge)

# Case-insensitive match on the literal "microwave-assisted".
# NOTE(review): this only helps if the tokenizer keeps "microwave-assisted" as
# a single token -- confirm; the recorded output suggests it is split.
micro = I(u'microwave-assisted')

# The synthesis method itself. Grouping is written out explicitly ('+' binds
# tighter than '|'), which is exactly how the unparenthesised form evaluated:
#   1. optional `micro` followed by solvothermal/hydrothermal(ly), or
#   2. electro-/sono-/mechano-chemical(ly), or
#   3. `micro` on its own.
synthesis = (
    (Optional(micro) + R(u'^[SsHh](olv|ydr)othermal(ly)?$'))
    | R(u'^[eEmMsS](lectr|on|echan)ochemical(ly)?$')
    | micro
)(u'synthesis')

# Full phrase: trigger word, optional descriptor, then the synthesis method,
# in that order.
sr = (prefix + Optional(descriptor) + synthesis)(u'sr')

In [20]:
from chemdataextractor.parse.base import BaseParser
from chemdataextractor.utils import first

class SynthParser(BaseParser):
    """Parser that turns matches of the `sr` grammar into Compound records."""

    # Grammar element this parser matches against.
    root = sr

    def interpret(self, result, start, end):
        """Yield one Compound holding a single SynthesisRoute.

        :param result: parse result queried with XPath for the 'synthesis'
            and 'descriptor' text nodes.
        :param start: unused here.
        :param end: unused here.
        :returns: generator yielding one Compound.
        """
        method_text = first(result.xpath('./synthesis/text()'))
        descriptor_text = first(result.xpath('./descriptor/text()'))
        route = SynthesisRoute(synthesis=method_text, descriptor=descriptor_text)
        yield Compound(synthesis_routes=[route])

In [21]:
# Make SynthParser the only parser used for Paragraph elements. This is set on
# the Paragraph class itself, so it applies to every Paragraph from here on.
Paragraph.parsers = [SynthParser()]

# Rebuild `d` with a different example sentence. The closing parenthesis of
# the Document(...) call was missing, which made this cell a SyntaxError.
# NOTE(review): this sentence puts the method word ("solvothermal") before
# "synthesis", while the `sr` grammar expects the trigger word first -- it does
# not look like `sr` can match here; confirm whether that is intended.
d = Document(
    Heading(u'Synthesis of 2,4,6-trinitrotoluene (3a)'),
    Paragraph(u'The procedure was followed to yield a pale yellow solid (solvothermal synthesis °C)')
)

d.records.serialize()