In [1]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
import logging
import re

from chemdataextractor import Document
from chemdataextractor.model import Compound
from chemdataextractor.doc import Paragraph, Heading

from chemdataextractor.parse.cem import cem, chemical_label, lenient_chemical_label, solvent_name
from chemdataextractor.parse.common import lbrct, dt, rbrct
from chemdataextractor.utils import first
from chemdataextractor.model import Compound
from chemdataextractor.parse.actions import merge, join
from chemdataextractor.parse.base import BaseParser
from chemdataextractor.parse.elements import W, I, R, Optional, Any, OneOrMore, Not, ZeroOrMore

In [2]:
a = Document(
    Paragraph(u' The precipitated solids were filtered off and repeatedly washed with hot methanol \
              and then dried in a vacuum oven at 100 °C to give the pure organic \
              linker 5,5′-((3′,5′-dicarboxy-[1,1′-biphenyl]-3,5-dicarbonyl)bis(azanediyl))diisophthalic acid (H6DBDBD)\
              as a white solid (0.52 g, 77.8% yield)')
)

In [3]:
a.records.serialize()

[{'names': ['methanol']},
 {'names': ['5,5′-((3′,5′-dicarboxy-[1,1′-biphenyl]-3,5-dicarbonyl)bis(azanediyl))diisophthalic acid']}]

In [104]:
from chemdataextractor.model import BaseModel, StringType, ListType, ModelType

#Linker class is created with the BaseModel class inherited
class Linker(BaseModel):
    linker = StringType() 
    abrv = StringType()
    yield_value = ListType(ModelType(Linker))

class LinkerYield(BaseModel):
    yield_value = ListType(ModelType(Linker))
    
    
Compound.linkers = ListType(ModelType(Linker))

In [105]:
units = (Optional(I(u'g'))).add_action(merge)
identifier = I(u'linker').hide()
linker = ((cem))(u'linker').add_action(merge)
abrv = (Optional(lbrct) + I(u'H6DBDBD') + Optional(rbrct))(u'abrv').add_action(merge)
yield_value = (Optional(lbrct) + R(u'^\d+(\.\d+)?$') + Optional(rbrct))(u'yield_value').add_action(merge)
delim = R('^[:;\.,]$')
prefix = (I('yield') + I('of')).hide()
ol = (identifier + linker + Optional(abrv) + Optional(delim) + Optional(prefix))(u'ol')

In [110]:
from chemdataextractor.parse.base import BaseParser
from chemdataextractor.utils import first

class LinkerParser(BaseParser):
    root = ol

    def interpret(self, result, start, end):
        compound = Compound()
        
        l = Linker(
                    linker =first(result.xpath('./linker/text()')), #./ means it is searching relative
                    abrv =first(result.xpath('./abrv/text()')), #text() selects the text nodes
                )
        
        for linkers in result.xpath('./linker/text()'):
            product_info = LinkerYield(
            yield_value = first(result.xpath('./yield_value/text()'))
            )
            l.yield_value.append(product_info)

        yield compound

In [111]:
Paragraph.parsers = [LinkerParser()]

In [112]:
d = Document(
    Paragraph(u'organic \
              linker 5,5′-((3′,5′-dicarboxy-[1,1′-biphenyl]-3,5-dicarbonyl)bis(azanediyl))diisophthalic acid (H6DBDBD)\
              with a yield of 77)')
)

In [113]:
d.records.serialize() #Nonetype object is not iterable means there is no data result.xpath('.linker/text')

TypeError: 'NoneType' object is not iterable