In [97]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
import logging
import re

from chemdataextractor import Document
from chemdataextractor.model import Compound
from chemdataextractor.doc import Paragraph, Heading

from chemdataextractor.parse.cem import cem, chemical_label, lenient_chemical_label, solvent_name
from chemdataextractor.parse.common import lbrct, dt, rbrct
from chemdataextractor.utils import first
from chemdataextractor.model import Compound
from chemdataextractor.parse.actions import merge, join
from chemdataextractor.parse.base import BaseParser
from chemdataextractor.parse.elements import W, I, R, Optional, Any, OneOrMore, Not, ZeroOrMore

In [98]:
# Example: compound name (TMOF-3) followed later in the sentence by "pcu topology".
d = Document(Paragraph(
    u'Another exciting example is found in the application of an organosulfonate-based MOF, TMOF-3, with a defective pcu topology'
))

In [99]:
# Example: spelled-out topology name plus bracketed abbreviation, with the
# compound name (ZIF-8) appearing only at the end of the sentence.
c = Document(Paragraph(
    u'On further milling, the kat-phase further rearranges into the dense diamondoid (dia) topology polymorph of ZIF-8.'
))

In [100]:
# Example: three compound names and three topology mentions in one sentence.
# Author's observation: CDE does not even pick up a chemical name in this
# sentence, for reasons not yet understood.
e = Document(Paragraph(
    u'Although ZIF-8 of tbo topology is quite different from MOF-5 of pcu topology, HKUST-1 could be considered as a net of pcu underlying topology.'
))

In [101]:
e.records.serialize()  # recorded output is an empty list: nothing was extracted from this sentence

[]

In [102]:
# NOTE(review): the recorded output already has a 'topologies' entry, yet the
# Topology model and the custom parser are only defined in later cells.
# Presumably this relies on kernel state from an earlier run - re-check the
# result after Restart Kernel -> Run All.
c.records.serialize()

[{'names': ['diamondoid', 'dia'], 'topologies': [{'abrv': 'dia'}]}]

In [103]:
# NOTE(review): as with the previous cell, the recorded output contains
# 'topologies' before that field is defined further down in the notebook;
# confirm this result on a fresh kernel.
d.records.serialize()

[{'names': ['TMOF-3'], 'topologies': [{'abrv': 'pcu'}]}]

In [104]:
from chemdataextractor.model import BaseModel, StringType, ListType, ModelType

# Topology subclasses CDE's BaseModel so it can be nested inside Compound records.
class Topology(BaseModel):
    """Net topology of a compound.

    Fields:
        full: spelled-out topology name (the grammar captures e.g. 'diamondoid').
        abrv: short topology code (the grammar captures e.g. 'pcu', 'dia').
    """
    full = StringType()
    # Author's observation: with contextual=True the abbreviation shows up in
    # the serialized record, but it then appears in every record.
    abrv = StringType(contextual=True)


# Register the new model as a list-valued field on Compound.
Compound.topologies = ListType(ModelType(Topology))

In [105]:
import re
from chemdataextractor.parse import R, I, W, Optional, merge
from chemdataextractor.parse.common import lbrct, dt, rbrct


# Optional literal word "topology" after the code; marked with .hide().
identifier = Optional(I(u'topology')).hide()

# Optional spelled-out name. The u'full' name is the tag that the parser's
# './tp/full/text()' xpath query looks up.
full = Optional(I(u'diamondoid'))(u'full')

# Known topology codes, matched case-insensitively via I(...) and tagged
# u'abrv'; the merge action is attached to the tagged element.
topology = (
    I(u'pcu')
    | I(u'dia')
    | I(u'kat')
    | I(u'SCU')
    | I(u'tbo')
    | I(u'dia-a')
)(u'abrv')
topology.add_action(merge)

# Complete topology mention: [full name] code [the word "topology"].
tp = (full + topology + identifier)(u'tp')

In [106]:
from chemdataextractor.parse.cem import cem, chemical_label, lenient_chemical_label, solvent_name
from chemdataextractor.parse.elements import W, I, R, Optional, Any, OneOrMore, Not, ZeroOrMore

# A run of tokens opened by lbrct and closed by rbrct that contains no
# topology match; it is only ever used wrapped in Optional(...).hide().
bracket_any = lbrct + OneOrMore(Not(tp) + Not(rbrct) + Any()) + rbrct

# A single punctuation token out of  : ; . ,
# Raw string on purpose: '\.' is not a valid Python string escape, so the
# non-raw spelling only worked because Python passes unknown escapes through,
# and it triggers an invalid-escape warning on newer interpreters. The pattern
# text handed to R() is unchanged.
delim = R(r'^[:;\.,]$')

# Pattern 1: optional compound name / label, optional hidden filler
# ("having", punctuation, a bracketed aside), then the topology mention,
# optionally wrapped in brackets.
cem_tp_phrase = (
    Optional(cem)
    + Optional(chemical_label)
    + Optional(lenient_chemical_label)
    + Optional(I('having')).hide()
    + Optional(delim).hide()
    + Optional(bracket_any).hide()
    + Optional(delim).hide()
    + Optional(lbrct)
    + tp
    + Optional(rbrct)
)('top_phrase')

# Pattern 2: optional lead-in verb, optional dt, a compound name / label, any
# tokens that are neither a topology nor another cem, then the topology.
# '+' binds tighter than '|', so the lead-in is either the three-token
# sequence "defective to <verb>" or one of the single verb forms; the
# parentheses below only make that existing grouping explicit.
# NOTE(review): 'defective' at the start of that sequence looks unintended -
# confirm what the lead-in was meant to match.
to_give_tp_phrase = (
    Optional(
        (I('defective') + I('to') + (I('give') | I('afford') | I('yield') | I('obtain')))
        | I('affording')
        | I('afforded')
        | I('gave')
        | I('yielded')
    ).hide()
    + Optional(dt).hide()
    + (cem | chemical_label | lenient_chemical_label)
    + Optional(ZeroOrMore(Not(tp) + Not(cem) + Any())).hide()
    + tp
)('top_phrase')

# Pattern 3: compound name / label, a required linking word
# (defective / is / are / was), an optional verb, filler tokens, then the
# topology mention.
obtained_tp_phrase = (
    (cem | chemical_label | lenient_chemical_label)
    + (I('defective') | I('is') | I('are') | I('was')).hide()
    + Optional(I('afforded') | I('obtained') | I('yielded')).hide()
    + Optional(ZeroOrMore(Not(tp) + Not(cem) + Any())).hide()
    + tp
)('top_phrase')

# Root grammar used by the parser: any one of the three patterns.
tp_phrase = cem_tp_phrase | to_give_tp_phrase | obtained_tp_phrase

In [107]:
from chemdataextractor.parse.base import BaseParser
from chemdataextractor.utils import first

class cem_TpParser(BaseParser):
    """Parser that turns a ``tp_phrase`` match into a Compound record."""
    root = tp_phrase

    def interpret(self, result, start, end):
        """Build one Compound from a matched phrase.

        :param result: parse result for the phrase; queried with xpath.
        :param start: not used by this method.
        :param end: not used by this method.
        :returns: generator yielding a single Compound that carries one
            Topology, plus names and labels when a ``cem`` child is present.
        """
        # './' keeps each query relative to the matched phrase and text()
        # selects text nodes; first() takes the first hit of each query.
        full_name = first(result.xpath('./tp/full/text()'))
        abbreviation = first(result.xpath('./tp/abrv/text()'))
        compound = Compound(
            topologies=[Topology(full=full_name, abrv=abbreviation)]
        )

        # Names and labels are only copied over when a <cem> child was matched.
        cem_el = first(result.xpath('./cem'))
        if cem_el is not None:
            compound.names = cem_el.xpath('./name/text()')
            compound.labels = cem_el.xpath('./label/text()')
        yield compound

In [108]:
# Replace the parser list on the Paragraph class itself (not on an instance)
# with a single instance of the topology parser.
Paragraph.parsers = [cem_TpParser()]

In [109]:
# Sentence taken from Ding et al.
# NOTE(review): the original note here read "took out organosulfate to
# simplify things", but the sentence still contains "organosulfate-based" -
# confirm which wording was intended.
a = Document(Paragraph(
    u'Another exciting example is found in the application of an organosulfate-based MOF, TMOF-3, with a defective pcu topology'
))

In [110]:
a.records.serialize()  # recorded output: TMOF-3 paired with abrv 'pcu'

[{'names': ['TMOF-3'], 'topologies': [{'abrv': 'pcu'}]}]

In [111]:
# Sentence taken from Julien et al.
# Open question from the author: how should cases be handled where the chemical
# name comes before or after the first mention of a topology?
b = Document(Paragraph(
    u'ZIF-8 polymorph with kat-topology and its rapid conversion to the thermodynamically-stable and non-porous diatopology'
))

In [112]:
b.records.serialize()  # recorded output has a single record (ZIF-8 / 'kat'); 'diatopology' produces nothing

[{'names': ['ZIF-8'], 'topologies': [{'abrv': 'kat'}]}]

In [113]:
# Sentence fragment taken from Safaei et al.; it has a bracketed aside
# between the name and the topology.
c = Document(Paragraph(
    u'by more bridging by TCPB4- linkers (Fig. 1B) in a SCU topology'
))

In [114]:
c.records.serialize()  # recorded output pairs the name 'TCPB4-' with abrv 'SCU'

[{'names': ['TCPB4-'], 'topologies': [{'abrv': 'SCU'}]}]

In [115]:
# Sentence taken from Kim et al.
d = Document(Paragraph(
    u'It is well known that MOF-53 and its isoreticular structures of pcu topology11 † can be obtained by interconnecting the [Zn4O(COO)6] SBUs as a 6-c octahedral node using various rigid linear organic ligands as a 2-c linker.'
))

In [116]:
d.records.serialize()  # author's note: the compound is not recognized when the name is just 'MOF-5'

[{'names': ['MOF-53'], 'topologies': [{'abrv': 'pcu'}]}]

In [117]:
# Shortened test sentence with HKUST-1 directly in front of "tbo topology".
# Author's idea: HKUST-1 might need to be added to the cem dictionary.
e = Document(Paragraph(
    u'Although HKUST-1 of tbo topology is quite different from'
))

In [118]:
e.records.serialize()  # recorded output is an empty list: no record is produced for the HKUST-1 sentence

[]

In [119]:
# Example with a bracketed formula as the compound and a hyphenated topology
# code ("dia-a").
f = Document(Paragraph(
    u'The network [Cd4(SPh)6](SPh)2 (SPh = benzenethiolate) is an example having dia-a topology, where the tetrahedral MOP consisting of four corner-linked 4-c [Cu(I)(SPh)4] nodes in a tetrahedral'
))

In [120]:
f.records.serialize()  # misidentifies the compound: 'benzenethiolate' is returned, but it should be [Cd4(SPh)6](SPh)2

[{'names': ['benzenethiolate'], 'topologies': [{'abrv': 'dia-a'}]}]