In [95]:
from chemdataextractor import Document
from chemdataextractor.model import Compound
from chemdataextractor.doc import Paragraph, Heading

In [96]:
# Build a two-element document: a heading that names the compound and its
# label "(3a)", followed by a paragraph whose parenthetical mentions the
# topology as "diamondoid dia topology".
d = Document(
    Heading(u'Synthesis of 2,4,6-trinitrotoluene (3a)'),
    Paragraph(u'The procedure was followed to yield a pale yellow solid (diamondoid dia topology)')
)

In [97]:
# Bare expression: the notebook displays the Document built in the previous cell.
d

In [98]:
# Serialize the records extracted from the document. With no topology model
# or parser defined yet, the result carries only names, labels and roles;
# the "diamondoid dia" text in the paragraph is not captured.
d.records.serialize()

[{'names': ['2,4,6-trinitrotoluene'], 'labels': ['3a'], 'roles': ['product']}]

In [150]:
from chemdataextractor.model import BaseModel, StringType, ListType, ModelType

class Topology(BaseModel):
    """Topology information attached to a compound record.

    Topology subclasses BaseModel and declares two string fields:

    full -- the spelled-out topology name (e.g. 'diamondoid').
    abrv -- the abbreviated topology code (e.g. 'dia').
    """

    # NOTE(review): contextual=True presumably lets ChemDataExtractor share
    # this value between related records -- confirm against the library docs.
    full = StringType(contextual=True)
    abrv = StringType()


# Register the new model on Compound so each compound record can carry a
# list of Topology entries under the 'topologies' key.
Compound.topologies = ListType(ModelType(Topology))

In [160]:
import re
from chemdataextractor.parse import R, I, W, Optional, merge
from chemdataextractor.parse.common import lbrct, dt, rbrct

# Grammar for a topology mention of the form: top <full name> [<abbreviation>]

# Leading keyword 'top'; .hide() keeps it out of the parse result
# (presumably matched case-insensitively by I -- confirm in the CDE docs).
prefix = I(u'top').hide()

# Spelled-out topology name. The tag u'full' is the element name that the
# parser's interpret() method later looks up with the XPath './full/text()'.
full = I(u'diamondoid')(u'full')

# Abbreviated topology code, tagged u'abrv' and looked up the same way
# with './abrv/text()'.
abrv = I(u'dia')(u'abrv').add_action(merge)

# Complete phrase: hidden keyword, mandatory full name, optional abbreviation.
tp = (prefix + full + Optional(abrv))(u'tp')

In [161]:
from chemdataextractor.parse.base import BaseParser
from chemdataextractor.utils import first

class TopParser(BaseParser):
    """Parser that converts a match of the ``tp`` grammar into a Compound."""

    # Grammar element this parser is rooted at.
    root = tp

    def interpret(self, result, start, end):
        """Yield one Compound holding a single Topology built from ``result``.

        result -- parse result for a ``tp`` match; queried with XPath
                  expressions relative to it ('./' means relative to this
                  node, 'text()' selects the text nodes).
        start, end -- not used by this implementation.
        """
        # first() takes the leading XPath hit for each tagged element.
        full_name = first(result.xpath('./full/text()'))
        abbreviation = first(result.xpath('./abrv/text()'))

        topology = Topology(full=full_name, abrv=abbreviation)
        yield Compound(topologies=[topology])

'''The yield statement suspends the execution of a function and sends a value back to the caller, while retaining
enough state for the function to resume where it left off. When resumed, the function continues executing
immediately after the last yield that ran. This lets the code produce a series of values over time, rather than
computing them all at once and returning them together, as a list would.'''

'The yield statement suspends the execution of a function and sends a value back to the caller, while retaining\nenough state for the function to resume where it left off. When resumed, the function continues executing\nimmediately after the last yield that ran. This lets the code produce a series of values over time, rather than\ncomputing them all at once and returning them together, as a list would.'

In [162]:
# Replace the parser list on the Paragraph class with a single TopParser.
# NOTE(review): this is a class-level assignment, so it presumably applies to
# every Paragraph created afterwards in this kernel and discards whatever
# parsers were configured before -- confirm that is intended.
Paragraph.parsers = [TopParser()]

In [163]:
# Rebuild the document (rebinding `d`) with the parenthetical reworded to
# "top diamondoid dia", i.e. keyword, full name, abbreviation -- the order
# the `tp` grammar expects.
d = Document(
    Heading(u'Synthesis of 2,4,6-trinitrotoluene (3a)'),
    Paragraph(u'The procedure was followed to yield a pale yellow solid (top diamondoid dia)')
)

In [164]:
# Serialize again: the compound record now also carries a 'topologies' entry
# holding the full name and abbreviation found in the paragraph.
d.records.serialize()

[{'names': ['2,4,6-trinitrotoluene'],
  'labels': ['3a'],
  'roles': ['product'],
  'topologies': [{'full': 'diamondoid', 'abrv': 'dia'}]}]