In [15]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
import logging
import re

from chemdataextractor import Document
from chemdataextractor.model import Compound
from chemdataextractor.doc import Paragraph, Heading

from chemdataextractor.parse.cem import cem, chemical_label, lenient_chemical_label, solvent_name
from chemdataextractor.parse.common import lbrct, dt, rbrct
from chemdataextractor.utils import first
from chemdataextractor.model import Compound, Capacity
from chemdataextractor.parse.actions import merge, join
from chemdataextractor.parse.base import BaseParser
from chemdataextractor.parse.elements import W, I, R, Optional, Any, OneOrMore, Not, ZeroOrMore

Step 1: Extracting only topology

In [16]:
# Build a one-paragraph Document from the example sentence about TMOF-3.
d = Document(
    Paragraph(
        u'Another exciting example is found in the application of an organosulfonate-based MOF, TMOF-3, with a defective pcu topology'
    )
)

In [17]:
# Bare expression: asks the notebook to display the Document built in the previous cell.
d

In [18]:
# Serialize the records extracted from the document; the echoed result is a list of
# dicts, one per record (here only chemical names are found).
d.records.serialize()

[{'names': ['organosulfonate']}, {'names': ['TMOF-3']}]

In [19]:
from chemdataextractor.model import BaseModel, StringType, ListType, ModelType

# Topology is a new record model: it subclasses BaseModel and declares its fields
# as class attributes.
class Topology(BaseModel):
    """A framework topology mention attached to a compound record."""

    # Full topology name (e.g. "diamondoid"); flagged as contextual.
    full = StringType(contextual=True)
    # Abbreviated topology code (e.g. "pcu" or "dia").
    abrv = StringType()


# Expose the model on Compound as a list-valued ``topologies`` field.
Compound.topologies = ListType(ModelType(Topology))

In [20]:
import re
from chemdataextractor.parse import R, I, W, Optional, merge
from chemdataextractor.parse.common import lbrct, dt, rbrct

# Topology grammar. Each element matches one token; the name given in the trailing
# call (u'abrv', u'full', u'tp') becomes the tag of that node in the parse result.

# The keyword "topology" (any case); hidden, so it is consumed but not reported.
identifier = I(u'topology').hide()
# Abbreviated net code. The codes are alternatives, not a sequence: chaining
# Optional(dia) + Optional(pcu) with merge would glue "dia pcu" into "diapcu" and
# could also match nothing at all. 'dia' is matched case-insensitively (I), 'pcu'
# case-sensitively (W).
topology = I(u'dia')(u'abrv') | W(u'pcu')(u'abrv')
# Full topology name; the u'full' tag is what the './full/text()' lookup reads.
full = I(u'diamondoid')(u'full')
# A mention is a full name (optionally followed by a code) or a code on its own,
# optionally followed by the word "topology". Requiring at least one of them keeps
# tp from matching zero tokens or the bare word "topology".
tp = (((full + Optional(topology)) | topology) + Optional(identifier))(u'tp')

In [21]:
from chemdataextractor.parse.base import BaseParser
from chemdataextractor.utils import first

class TopParser(BaseParser):
    """Parser that turns every match of the ``tp`` grammar into a Compound record."""

    root = tp

    def interpret(self, result, start, end):
        """Yield one Compound holding a single Topology built from a ``tp`` match.

        ``result`` is the parse-result element for the match; ``start`` and ``end``
        are accepted but not used here.
        """
        # './' searches relative to the matched element; 'text()' selects text nodes.
        full_name = first(result.xpath('./full/text()'))
        abbreviation = first(result.xpath('./abrv/text()'))
        topology_record = Topology(full=full_name, abrv=abbreviation)
        yield Compound(topologies=[topology_record])

# NOTE: this is a bare module-level string, not a docstring. It follows the class
# definition at column 0, so it is not attached to TopParser or interpret(). As the
# last expression in the cell, the notebook echoes it back as the cell output.
'''The yield statement suspends function’s execution and sends a value back to the caller, but retains enough state to enable
function to resume where it is left off. When resumed, the function continues execution immediately after 
the last yield run. This allows its code to produce a series of values over time, rather than 
computing them at once and sending them back like a list.'''

'The yield statement suspends function’s execution and sends a value back to the caller, but retains enough state to enable\nfunction to resume where it is left off. When resumed, the function continues execution immediately after \nthe last yield run. This allows its code to produce a series of values over time, rather than \ncomputing them at once and sending them back like a list.'

In [22]:
# Replace the parser list on the Paragraph class with a single TopParser instance.
# This is a class-level assignment, so it applies to every Paragraph parsed
# afterwards, not just to one instance.
Paragraph.parsers = [TopParser()]

In [23]:
# Rebuild the Document from the same example sentence so that it is parsed with the
# parser list assigned to Paragraph.parsers.
d = Document(
    Paragraph(
        u'Another exciting example is found in the application of an organosulfonate-based MOF, TMOF-3, with a defective pcu topology'
    )
)

In [24]:
# Serialize the extracted records; the echoed result now carries the 'topologies'
# field with the abbreviation found in the sentence.
d.records.serialize()

[{'topologies': [{'abrv': 'pcu'}]}]

Step 2: Extracting Chemical Name and Topology

In [3]:
# Step 2 starts from the same example sentence, wrapped in a one-paragraph Document.
d = Document(
    Paragraph(
        u'Another exciting example is found in the application of an organosulfonate-based MOF, TMOF-3, with a defective pcu topology'
    )
)

In [4]:
# Serialize the extracted records as a list of dicts; the echoed result contains
# chemical names only.
d.records.serialize()

[{'names': ['organosulfonate']}, {'names': ['TMOF-3']}]

In [10]:
import re
from chemdataextractor.parse import R, I, W, Optional, merge
from chemdataextractor.parse.common import lbrct, dt, rbrct

# Topology grammar. Each element matches one token; the name given in the trailing
# call (u'abrv', u'full', u'tp') becomes the tag of that node in the parse result.

# The keyword "topology" (any case); hidden, so it is consumed but not reported.
identifier = I(u'topology').hide()
# Abbreviated net code. The codes are alternatives, not a sequence: chaining
# Optional(dia) + Optional(pcu) with merge would glue "dia pcu" into "diapcu" and
# could also match nothing at all. 'dia' is matched case-insensitively (I), 'pcu'
# case-sensitively (W).
topology = I(u'dia')(u'abrv') | W(u'pcu')(u'abrv')
# Full topology name, tagged u'full'.
full = I(u'diamondoid')(u'full')
# A mention is a full name (optionally followed by a code) or a code on its own,
# optionally followed by the word "topology". Requiring at least one of them keeps
# tp from matching zero tokens, which the Not(tp) look-aheads in the phrase
# grammars rely on.
tp = (((full + Optional(topology)) | topology) + Optional(identifier))(u'tp')

In [11]:
from chemdataextractor.parse.cem import cem, chemical_label, lenient_chemical_label, solvent_name
from chemdataextractor.parse.elements import W, I, R, Optional, Any, OneOrMore, Not, ZeroOrMore

# Phrase grammars that tie a chemical mention to a topology mention. Every phrase is
# tagged 'top_phrase'. The Not(tp) guards below are only meaningful while tp cannot
# match an empty token sequence.

# A bracketed aside: lbrct, then one or more tokens that neither start a tp match
# nor are rbrct, then rbrct.
bracket_any = lbrct + OneOrMore(Not(tp) + Not(rbrct) + Any()) + rbrct
# One punctuation token: colon, semicolon, full stop or comma. Raw string, so the
# backslash reaches the regex engine instead of being an invalid string escape.
delim = R(r'^[:;\.,]$')

# Optional chemical mention and labels, optional hidden "having" / punctuation /
# bracketed aside, then the topology, optionally wrapped in brackets.
cem_tp_phrase = (
    Optional(cem)
    + Optional(chemical_label)
    + Optional(lenient_chemical_label)
    + Optional(I('having')).hide()
    + Optional(delim).hide()
    + Optional(bracket_any).hide()
    + Optional(delim).hide()
    + Optional(lbrct)
    + tp
    + Optional(rbrct)
)('top_phrase')

# Hidden verb ("to give/afford/yield/obtain", "affording", "afforded", "gave",
# "yielded"), optional hidden determiner, a chemical mention or label, any hidden
# tokens that start neither a tp nor a cem match, then the topology.
to_give_tp_phrase = (
    (
        I('to') + (I('give') | I('afford') | I('yield') | I('obtain'))
        | I('affording')
        | I('afforded')
        | I('gave')
        | I('yielded')
    ).hide()
    + Optional(dt).hide()
    + (cem | chemical_label | lenient_chemical_label)
    + ZeroOrMore(Not(tp) + Not(cem) + Any()).hide()
    + tp
)('top_phrase')

# A chemical mention or label, hidden "is/are/was" + "afforded/obtained/yielded",
# any hidden tokens that start neither a tp nor a cem match, then the topology.
obtained_tp_phrase = (
    (cem | chemical_label | lenient_chemical_label)
    + (I('is') | I('are') | I('was')).hide()
    + (I('afforded') | I('obtained') | I('yielded')).hide()
    + ZeroOrMore(Not(tp) + Not(cem) + Any()).hide()
    + tp
)('top_phrase')

# Try the phrase patterns in order. They are deliberately not wrapped in
# Optional(...): an Optional always succeeds (possibly consuming no tokens), so an
# optional first alternative would keep the later ones from ever being tried.
tp_phrase = cem_tp_phrase | to_give_tp_phrase | obtained_tp_phrase

In [12]:
from chemdataextractor.parse.base import BaseParser
from chemdataextractor.utils import first

class cem_TopParser(BaseParser):
    """Parser that links a topology mention to the chemical named in the same phrase.

    The root grammar is ``tp_phrase``; each match is a 'top_phrase' element whose
    children include the 'tp' element and, when one was matched, a 'cem' element.
    """

    root = tp_phrase

    def interpret(self, result, start, end):
        """Yield one Compound per phrase match.

        The Compound always carries one Topology. Names and labels are filled in
        only when the phrase contains a chemical mention. ``start`` and ``end`` are
        accepted but not used here.
        """
        compound = Compound(
            topologies=[
                Topology(
                    # The values live under the 'tp' child of the phrase element,
                    # so the lookup has to go through './tp/'.
                    full=first(result.xpath('./tp/full/text()')),
                    abrv=first(result.xpath('./tp/abrv/text()'))
                )
            ]
        )
        # Chemical mention of this phrase, or None if the phrase has none.
        cem_el = first(result.xpath('./cem'))
        if cem_el is not None:
            compound.names = cem_el.xpath('./name/text()')
            compound.labels = cem_el.xpath('./label/text()')
        yield compound

In [13]:
# Parse the example sentence into a fresh Document.
# NOTE(review): no visible cell assigns cem_TopParser to Paragraph.parsers, so this
# Document is parsed with whatever parsers Paragraph currently holds. Confirm that
# cem_TopParser is registered before relying on the output of this section.
c = Document(
    Paragraph(
        u'Another exciting example is found in the application of an organosulfonate-based MOF, TMOF-3, with a defective pcu topology'
    )
)

In [14]:
# Serialize the records extracted from the document as a list of dicts.
# NOTE(review): the echoed result contains names only and no 'topologies' entry.
c.records.serialize()

[{'names': ['organosulfonate']}, {'names': ['TMOF-3']}]