In [50]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
import logging
import re

from chemdataextractor import Document
from chemdataextractor.model import Compound
from chemdataextractor.doc import Paragraph, Heading

from chemdataextractor.parse.cem import cem, chemical_label, lenient_chemical_label, solvent_name
from chemdataextractor.parse.common import lbrct, dt, rbrct
from chemdataextractor.utils import first
from chemdataextractor.model import Compound
from chemdataextractor.parse.actions import merge, join
from chemdataextractor.parse.base import BaseParser
from chemdataextractor.parse.elements import W, I, R, Optional, Any, OneOrMore, Not, ZeroOrMore

In [51]:
b = Document(
    Paragraph(u'The product of c6h6 yielded methanol')
)

In [52]:
b.records.serialize()

[]

In [53]:
a = Document(
    Paragraph(u' The precipitated solids were filtered off and repeatedly washed with hot methanol \
              and then dried in a vacuum oven at 100 °C to give the pure organic \
              linker 5,5′-((3′,5′-dicarboxy-[1,1′-biphenyl]-3,5-dicarbonyl)bis(azanediyl))diisophthalic acid (H6DBDBD)\
              as a white solid (0.52 g, 77.8% yield)')
)

In [54]:
a.records.serialize()

[{'linkers': [{'yield_value': '100'}]},
 {'linkers': [{'yield_value': '0.52'}]},
 {'linkers': [{'yield_value': '77.8'}]}]

In [55]:
from chemdataextractor.model import BaseModel, StringType, ListType, ModelType

#Linker class is created with the BaseModel class inherited
class Linker(BaseModel):
    linker = StringType() 
    abrv = StringType()
    yield_value = StringType()
    
Compound.linkers = ListType(ModelType(Linker))

In [61]:
import re
from chemdataextractor.parse import R, I, W, Optional, merge
from chemdataextractor.parse.common import lbrct, dt, rbrct
units = (Optional(I(u'g'))).add_action(merge)
identifier = I(u'linker').hide()
linker = ((cem))(u'linker').add_action(merge)
abrv = (Optional(lbrct) + I(u'H6DBDBD') + Optional(rbrct))(u'abrv').add_action(merge)
yield_value = (Optional(lbrct) + R(u'^\d+(\.\d+)?$') + Optional(rbrct))(u'yield').add_action(merge)
delim = R('^[:;\.,]$')
prefix = (I('yield') + I('of')).hide()
ol = (identifier + linker + Optional(abrv) + Optional(delim) + Optional(prefix))(u'ol')
#the identifier makes it so that it cannot find the yield easily

In [66]:
from chemdataextractor.parse.base import BaseParser
from chemdataextractor.utils import first

class LinkerParser(BaseParser):
    root = ol

    def interpret(self, result, start, end):
        compound = Compound(
            linkers=[
                Linker(
                    linker =first(result.xpath('./linker/text()')), #./ means it is searching relative
                    abrv =first(result.xpath('./abrv/text()')), #text() selects the text nodes
                )
            ]
        )
        yield compound

In [67]:
Paragraph.parsers = [LinkerParser()]

In [68]:
d = Document(
    Paragraph(u'organic \
              linker 5,5′-((3′,5′-dicarboxy-[1,1′-biphenyl]-3,5-dicarbonyl)bis(azanediyl))diisophthalic acid (H6DBDBD)\
              with a yield of 77)')
)

In [69]:
d.records.serialize()

[{'linkers': [{'linker': '5,5′-((3′,5′-dicarboxy-[1,1′-biphenyl]-3,5-dicarbonyl)bis(azanediyl))diisophthalic acid',
    'abrv': 'H6DBDBD'}]}]