In [1]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
import logging
import re

from chemdataextractor import Document
from chemdataextractor.model import Compound
from chemdataextractor.doc import Paragraph, Heading

from chemdataextractor.parse.cem import cem, chemical_label, lenient_chemical_label, solvent_name
from chemdataextractor.parse.common import lbrct, dt, rbrct
from chemdataextractor.utils import first
from chemdataextractor.model import Compound
from chemdataextractor.parse.actions import merge, join
from chemdataextractor.parse.base import BaseParser
from chemdataextractor.parse.elements import W, I, R, Optional, Any, OneOrMore, Not, ZeroOrMore

In [2]:
# First example: a one-paragraph document whose sentence names a MOF
# ("TMOF-3") together with its topology ("pcu").
d = Document(Paragraph(
    u'Another exciting example is found in the application of an organosulfonate-based MOF, TMOF-3, with a defective pcu topology'
))

In [3]:
# Baseline: show the serialized records for `d`; in notebook order this runs
# before any custom parser is assigned to Paragraph.
d.records.serialize()

[{'names': ['organosulfonate']}, {'names': ['TMOF-3']}]

In [4]:
from chemdataextractor.model import BaseModel, StringType, ListType, ModelType

class Topology(BaseModel):
    """Record model for a topology mention; a subclass of BaseModel.

    Fields:
        full: string field for the full topology name.
        abrv: string field for the topology abbreviation, declared with
            ``contextual=True``.
    """
    full = StringType()
    # Author's note: with contextual=True the abbreviation is put in the
    # serialized dictionary, but it then shows up in every dictionary.
    abrv = StringType(contextual=True)
    
# Add a `topologies` field to the Compound model: a list of Topology models.
Compound.topologies = ListType(ModelType(Topology))

In [5]:
import re
from chemdataextractor.parse import R, I, W, Optional, merge
from chemdataextractor.parse.common import lbrct, dt, rbrct

# Optional trailing keyword "topology"; matched but hidden from the result.
identifier = Optional(I(u'topology')).hide()

# Topology abbreviation: either "pcu" or "dia", emitted under the `abrv` tag.
topology = (I(u'pcu') | I(u'dia'))(u'abrv').add_action(merge)

# Optional full topology name. The result name is set on the token itself and
# the named token is then made optional, so a match is emitted as a `full`
# element that directly holds the text selected by './tp/full/text()' in the
# parser. Previously the name was set on the Optional wrapper instead.
# NOTE(review): this assumes Optional does not re-tag its child with its own
# name -- confirm against the installed ChemDataExtractor version.
full = Optional(I(u'diamondoid')(u'full'))

# Complete topology expression: [full name] abbreviation [the word "topology"].
tp = (full + topology + identifier)(u'tp')

In [6]:
from chemdataextractor.parse.cem import cem, chemical_label, lenient_chemical_label, solvent_name
from chemdataextractor.parse.elements import W, I, R, Optional, Any, OneOrMore, Not, ZeroOrMore

# A bracketed span: opening bracket, one or more tokens that are neither the
# start of a topology expression nor a closing bracket, then a closing bracket.
bracket_any = lbrct + OneOrMore(Not(tp) + Not(rbrct) + Any()) + rbrct

# A single punctuation token: colon, semicolon, full stop or comma.
# Raw string so that `\.` reaches the regex engine without Python treating it
# as an (invalid) string escape; the pattern itself is unchanged.
delim = R(r'^[:;\.,]$')

# Pattern 1: optional compound name/labels, optional hidden filler ("having",
# punctuation, a bracketed span), then the topology, optionally in brackets.
cem_tp_phrase = (
    Optional(cem)
    + Optional(chemical_label)
    + Optional(lenient_chemical_label)
    + Optional(I('having')).hide()
    + Optional(delim).hide()
    + Optional(bracket_any).hide()
    + Optional(delim).hide()
    + Optional(lbrct)
    + tp
    + Optional(rbrct)
)('top_phrase')

# Pattern 2: optional hidden lead-in ("defective to give/afford/yield/obtain",
# or "affording"/"afforded"/"gave"/"yielded"), optional hidden determiner, a
# compound name or label, hidden filler tokens, then the topology.
to_give_tp_phrase = (
    Optional((
        I('defective') + I('to') + (I('give') | I('afford') | I('yield') | I('obtain'))
        | I('affording') | I('afforded') | I('gave') | I('yielded')
    )).hide()
    + Optional(dt).hide()
    + (cem | chemical_label | lenient_chemical_label)
    + Optional(ZeroOrMore(Not(tp) + Not(cem) + Any())).hide()
    + tp
)('top_phrase')

# Pattern 3: a compound name or label, a hidden linking word ("defective",
# "is", "are", "was"), an optional hidden verb, hidden filler tokens, then the
# topology.
obtained_tp_phrase = (
    (cem | chemical_label | lenient_chemical_label)
    + (I('defective') | I('is') | I('are') | I('was')).hide()
    + Optional((I('afforded') | I('obtained') | I('yielded'))).hide()
    + Optional(ZeroOrMore(Not(tp) + Not(cem) + Any())).hide()
    + tp
)('top_phrase')

# The alternatives are tried in this order.
tp_phrase = cem_tp_phrase | to_give_tp_phrase | obtained_tp_phrase

In [7]:
from chemdataextractor.parse.base import BaseParser
from chemdataextractor.utils import first

class cem_TpParser(BaseParser):
    """Parser whose root grammar is ``tp_phrase``.

    Each parse result is converted by :meth:`interpret` into a Compound that
    carries one Topology entry.
    """
    root = tp_phrase

    def interpret(self, result, start, end):
        """Build and yield a single Compound from a ``tp_phrase`` result.

        :param result: parse result element; queried with relative XPath.
        :param start: not used by this method.
        :param end: not used by this method.
        :returns: generator yielding exactly one Compound.
        """
        # Relative XPath ('./'); text() selects the text nodes of the element.
        full_name = first(result.xpath('./tp/full/text()'))
        abbreviation = first(result.xpath('./tp/abrv/text()'))

        record = Compound(
            topologies=[Topology(full=full_name, abrv=abbreviation)]
        )

        # Names and labels are only filled in when the match contains a cem
        # element; only the first such element is used.
        matched_cem = first(result.xpath('./cem'))
        if matched_cem is not None:
            record.names = matched_cem.xpath('./name/text()')
            record.labels = matched_cem.xpath('./label/text()')

        yield record

In [8]:
# Class-level assignment: this replaces the parser list on the Paragraph class
# itself (not on one instance) with a list holding only cem_TpParser.
Paragraph.parsers = [cem_TpParser()]

In [9]:
# Same sentence as the first example with "organosulfonate-based" taken out,
# to simplify the test case.
d = Document(Paragraph(
    u'Another exciting example is found in the application of a MOF, TMOF-3, with a defective pcu topology'
))

In [10]:
# Show the serialized records for the simplified document; in notebook order
# this runs after Paragraph.parsers was set to [cem_TpParser()].
d.records.serialize()

[{'names': ['TMOF-3'], 'topologies': [{'abrv': 'pcu'}]}]

In [194]:
# TODO: keep iterating on the regex until it recognizes the topology.