Introducing 'was' into a statemement

In [32]:
from chemdataextractor import Document
from chemdataextractor.model import Compound
from chemdataextractor.doc import Paragraph, Heading

In [33]:
d = Document(
    Paragraph(u'The CO2 uptake capacity of TMOF-1 was 1.2 mmol g-1') #deleted -6.8 mmol to reduce complexity
)
#The word 'was' is the key difference in this sentence and my parser is not recognizing it

In [34]:
d

In [35]:
d.records.serialize()

[{'capacities': [{'value': '1.2', 'units': 'mmolg-1'}]}]

In [36]:
from chemdataextractor.model import BaseModel, StringType, ListType, ModelType

class Capacity(BaseModel):
    value = StringType()
    units = StringType()
    
Compound.capacities = ListType(ModelType(Capacity))

In [37]:
import re
from chemdataextractor.parse import R, I, W, Optional, merge
from chemdataextractor.parse.common import lbrct, dt, rbrct

prefix = (Optional(I(u'uptake')) + Optional(I(u'capacity')) + Optional(I('was'))).hide()
units = (I(u'mmol') + I('g-1'))(u'units').add_action(merge)
value = R(u'^\d+(\.\d+)?$')(u'value')
cp = (prefix + value + units)(u'cp')

In [38]:
#bracket_any = lbrct + OneOrMore(Not(cp) + Not(rbrct) + Any()) + rbrct
#delim = R('^[:;\.,]$')
#cem_cp_phrase = (Optional(cem) + Optional(I('having')).hide() + Optional(delim).hide() + Optional(bracket_any).hide() + Optional(delim).hide() + Optional(lbrct) + cp + Optional(rbrct))('cp_phrase')

In [39]:
from chemdataextractor.parse.base import BaseParser
from chemdataextractor.utils import first

class CpParser(BaseParser):
    root = cp

    def interpret(self, result, start, end):
        compound = Compound(
            capacities=[
                Capacity(
                    value=first(result.xpath('./value/text()')),
                    units=first(result.xpath('./units/text()'))
                )
            ]
        )
        yield compound

In [40]:
Paragraph.parsers = [CpParser()]

In [41]:
d = Document(
    Paragraph(u'The CO2 uptake capacity of TMOF-1 was 1.2 mmol g-1') #deleted -6.8 mmol to reduce complexity
)

In [42]:
d.records.serialize()

[{'capacities': [{'value': '1.2', 'units': 'mmolg-1'}]}]

Chemical name recognition with value recognition

In [43]:
c = Document(
    Paragraph(u'The CO2 uptake capacity of TMOF-1 was 1.2 mmol g-1') #deleted -6.8 mmol to reduce complexity
)
#The word 'was' is the key difference in this sentence and my parser is not recognizing it

In [44]:
c

In [45]:
c.records.serialize()

[{'capacities': [{'value': '1.2', 'units': 'mmolg-1'}]}]

In [46]:
from chemdataextractor.model import BaseModel, StringType, ListType, ModelType

class Capacity(BaseModel):
    value = StringType()
    units = StringType()
    
Compound.capacities = ListType(ModelType(Capacity))

In [47]:
import re
from chemdataextractor.parse import R, I, W, Optional, merge
from chemdataextractor.parse.common import lbrct, dt, rbrct

prefix = (Optional(I(u'uptake')) + Optional(I(u'capacity')) + Optional(I('was'))).hide()
units = (I(u'mmol') + I('g-1'))(u'units').add_action(merge)
value = R(u'^\d+(\.\d+)?$')(u'value')
cp = (prefix + value + units)(u'cp')

In [55]:
from chemdataextractor.parse.cem import cem, chemical_label, lenient_chemical_label, solvent_name
from chemdataextractor.parse.elements import W, I, R, Optional, Any, OneOrMore, Not, ZeroOrMore

bracket_any = lbrct + OneOrMore(Not(cp) + Not(rbrct) + Any()) + rbrct
delim = R('^[:;\.,]$')
cem_cp_phrase = (Optional(cem) + Optional(chemical_label) + Optional(lenient_chemical_label) + Optional(I('having')).hide() + Optional(delim).hide() + Optional(bracket_any).hide() + Optional(delim).hide() + Optional(lbrct) + cp + Optional(rbrct))('cp_phrase')
to_give_cp_phrase = ((I('to') + (I('give') | I('afford') | I('yield') | I('obtain')) | I('affording') | I('afforded') | I('gave') | I('yielded')).hide() + Optional(dt).hide() + (cem | chemical_label | lenient_chemical_label) + ZeroOrMore(Not(cp) + Not(cem) + Any()).hide() + cp)('cp_phrase')
obtained_cp_phrase = ((cem | chemical_label | lenient_chemical_label) + (I('is') | I('are') | I('was')).hide() + (I('afforded') | I('obtained') | I('yielded')).hide() + ZeroOrMore(Not(cp) + Not(cem) + Any()).hide() + cp)('cp_phrase')

cp_phrase = cem_cp_phrase | to_give_cp_phrase | obtained_cp_phrase

In [61]:
from chemdataextractor.parse.base import BaseParser
from chemdataextractor.utils import first

class cem_CpParser(BaseParser):
    root = cp_phrase

    def interpret(self, result, start, end):
        compound = Compound(
            capacities=[
                Capacity(
                    value=first(result.xpath('./value/text()')),
                    units=first(result.xpath('./units/text()'))
                )
            ]
        )
        cem_el = first(result.xpath('./cem'))
        if cem_el is not None:
            compound.names = cem_el.xpath('./name/text()')
            compound.labels = cem_el.xpath('./label/text()')
        yield compound

In [62]:
Paragraph.parsers = [cem_CpParser()]

In [64]:
c = Document(
    Paragraph(u'The CO2 uptake capacity of TMOF-1 was 1.2 mmol g-1') #deleted -6.8 mmol to reduce complexity
)

In [65]:
c.records.serialize()

[{'names': ['TMOF-1'], 'capacities': [{}]}]