In [59]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
import logging
import re

from chemdataextractor import Document
from chemdataextractor.model import Compound
from chemdataextractor.doc import Paragraph, Heading

from chemdataextractor.parse.cem import cem, chemical_label, lenient_chemical_label, solvent_name
from chemdataextractor.parse.common import lbrct, dt, rbrct
from chemdataextractor.utils import first
from chemdataextractor.model import Compound
from chemdataextractor.parse.actions import merge, join
from chemdataextractor.parse.base import BaseParser
from chemdataextractor.parse.elements import W, I, R, Optional, Any, OneOrMore, Not, ZeroOrMore

In [60]:
# Sample document: a single Paragraph describing the synthesis and yield of the
# linker H6DBDBD. The backslash continuations sit inside the string literal, so
# the leading whitespace of each continuation line is part of the paragraph text.
# NOTE(review): an identical Document is assigned to `a` again in a later cell,
# after Paragraph.parsers is set; nothing in the visible cells reads this first one.
a = Document(
    Paragraph(u' The precipitated solids were filtered off and repeatedly washed with hot methanol \
              and then dried in a vacuum oven at 100 °C to give the pure organic \
              linker 5,5′-((3′,5′-dicarboxy-[1,1′-biphenyl]-3,5-dicarbonyl)bis(azanediyl))diisophthalic acid (H6DBDBD)\
              as a white solid (0.52 g, 77.8% yield)')
)

In [61]:
from chemdataextractor.model import BaseModel, StringType, ListType, ModelType

# LinkerYield is a model derived from BaseModel.
class LinkerYield(BaseModel):
    """Yield reported for a linker: the numeric value and its unit, both kept as strings."""
    # Numeric part of the yield as matched text (e.g. '77.8' in the sample output).
    yield_value = StringType()
    # Unit token (e.g. '%'); declared with contextual=True.
    units = StringType(contextual=True)
    
# Attach a `linkers` field (a list of LinkerYield models) to Compound so that
# Compound(linkers=[...]) can be constructed by the parser defined below.
Compound.linkers = ListType(ModelType(LinkerYield))

In [62]:
import re
from chemdataextractor.parse import R, I, W, Optional, merge
from chemdataextractor.parse.common import lbrct, dt, rbrct

In [129]:
# --- Token-level elements -------------------------------------------------
# Regex patterns containing backslashes are written as raw strings so that
# `\d` and `\.` reach the regex engine unchanged instead of relying on
# Python passing unknown escape sequences through (deprecated behaviour).

# A single '$' or '%' token, tagged as 'units'.
units = (R(u'^[$%]$'))('units').add_action(merge)
# The word "linker" (case-insensitive), hidden from the result.
# NOTE(review): `identifier`, `linker` and `abrv` are not referenced by
# `ly_phrase` in this cell.
identifier = I(u'linker').hide()
linker = ((cem))(u'linker').add_action(merge)
abrv = (Optional(lbrct) + I(u'H6DBDBD') + Optional(rbrct))(u'abrv').add_action(merge)
# An integer or decimal number, optionally wrapped in brackets; the matched
# tokens are merged into one 'yield_value' string.
yield_value = (Optional(lbrct) + R(r'^\d+(\.\d+)?$') + Optional(rbrct))(u'yield_value').add_action(merge)
# A single punctuation token: ':', ';', '.' or ','.
delim = R(r'^[:;\.,]$')
# Token expected immediately before the number: "yield", "of" or a comma.
prefix = (I('yield') | I('of') | R('^,$')).hide()
# Core yield expression: prefix + number + unit, tagged 'ly'.
ly = (prefix + yield_value + units)(u'ly')

# --- Phrase-level patterns ------------------------------------------------
# Bracketed content that contains neither a yield expression nor a closing bracket.
bracket_any = lbrct + OneOrMore(Not(ly) + Not(rbrct) + Any()) + rbrct

# Optional chemical mention / label, optional filler, then the yield expression.
cem_ly_phrase = (
    Optional(cem)
    + Optional(chemical_label)
    + Optional(lenient_chemical_label)
    + Optional(I('having')).hide()
    + Optional(delim).hide()
    + Optional(bracket_any).hide()
    + Optional(delim).hide()
    + Optional(lbrct)
    + ly
    + Optional(rbrct)
)('top_phrase')

# "... to give/afford/yield/obtain <chemical> ... <yield>" style phrases.
# NOTE(review): the first alternative requires the literal token "defective"
# before "to give"; this looks carried over from another grammar - confirm.
to_give_ly_phrase = (
    Optional((
        I('defective') + I('to') + (I('give') | I('afford') | I('yield') | I('obtain'))
        | I('affording') | I('afforded') | I('gave') | I('yielded')
    )).hide()
    + Optional(dt).hide()
    + (cem | chemical_label | lenient_chemical_label)
    + Optional(ZeroOrMore(Not(ly) + Not(cem) + Any())).hide()
    + ly
)('ly_phrase')

# "<chemical> is/are/was [afforded|obtained|yielded] ... <yield>" style phrases.
# NOTE(review): "defective" is also accepted in the verb slot here - confirm intent.
obtained_ly_phrase = (
    (cem | chemical_label | lenient_chemical_label)
    + (I('defective') | I('is') | I('are') | I('was')).hide()
    + Optional((I('afforded') | I('obtained') | I('yielded'))).hide()
    + Optional(ZeroOrMore(Not(ly) + Not(cem) + Any())).hide()
    + ly
)('ly_phrase')

# Root pattern: alternatives are tried in the order listed.
ly_phrase = cem_ly_phrase | to_give_ly_phrase | obtained_ly_phrase

In [130]:
from chemdataextractor.parse.base import BaseParser
from chemdataextractor.utils import first

class LinkerParser(BaseParser):
    """Parser that turns a match of `ly_phrase` into a Compound carrying a LinkerYield."""

    root = ly_phrase

    def interpret(self, result, start, end):
        """Build one Compound record from a matched phrase.

        :param result: Parse result element for the matched phrase; queried with
            XPath relative to the phrase root.
        :param start: Not used by this method.
        :param end: Not used by this method.
        :returns: Generator yielding a single Compound whose ``linkers`` holds one
            LinkerYield, plus ``names``/``labels`` when a ``cem`` child is present.
        """
        compound = Compound(
            linkers=[
                LinkerYield(
                    # './' keeps the search relative to the phrase root;
                    # text() selects the text of the matched node.
                    yield_value=first(result.xpath('./ly/yield_value/text()')),
                    units=first(result.xpath('./ly/units/text()')),
                )
            ]
        )
        # Attach the chemical name/label before yielding. The record is yielded
        # exactly once, fully populated; previously the same object was also
        # yielded earlier, before names and labels had been set.
        cem_el = first(result.xpath('./cem'))
        if cem_el is not None:
            compound.names = cem_el.xpath('./name/text()')
            compound.labels = cem_el.xpath('./label/text()')
        yield compound

In [131]:
# Replace Paragraph's class-level parser list with a single LinkerParser
# instance; whatever was listed there before is dropped.
Paragraph.parsers = [LinkerParser()]

In [132]:
# Same sample text as the earlier cell, constructed again now that
# Paragraph.parsers has been set to [LinkerParser()]. The backslash
# continuations are inside the string literal, so each continuation line's
# leading whitespace is part of the paragraph text.
a = Document(
    Paragraph(u' The precipitated solids were filtered off and repeatedly washed with hot methanol \
              and then dried in a vacuum oven at 100 °C to give the pure organic \
              linker 5,5′-((3′,5′-dicarboxy-[1,1′-biphenyl]-3,5-dicarbonyl)bis(azanediyl))diisophthalic acid (H6DBDBD)\
              as a white solid (0.52 g, 77.8% yield)')
)

In [133]:
# Serialize the records extracted from document `a`; as the cell's last
# expression, the result is displayed as the cell output.
a.records.serialize()

[{'names': ['5,5′-((3′,5′-dicarboxy-[1,1′-biphenyl]-3,5-dicarbonyl)bis(azanediyl))diisophthalic acid'],
  'linkers': [{'yield_value': '77.8', 'units': '%'}]}]