In [6]:
import logging
import re
import pandas as pd
import urllib
import time

import chemdataextractor as cde
from chemdataextractor import Document
from chemdataextractor.reader import acs,base,cssp,HtmlReader,NlmXmlReader,PdfReader,RscHtmlReader,XmlReader
from chemdataextractor.model import Compound, UvvisSpectrum, UvvisPeak, BaseModel, StringType, ListType, ModelType
from chemdataextractor.parse.common import hyphen,lbrct, dt, rbrct
from chemdataextractor.parse.base import BaseParser
from chemdataextractor.utils import first

from chemdataextractor.parse.actions import strip_stop, merge, join
from chemdataextractor.parse.elements import W, I, T, R, Optional, ZeroOrMore, OneOrMore, Or, And, Not, Any
from chemdataextractor.parse.cem import chemical_name,cem, chemical_label, lenient_chemical_label, solvent_name
from chemdataextractor.doc import Paragraph, Sentence, Caption, Figure,Table, Heading
from chemdataextractor.doc.table import Table, Cell

In [8]:
# VOC Parser

class Voc(BaseModel):
    value = StringType()
    units = StringType()

Compound.voc_pattern = ListType(ModelType(Voc))

common_text = R('(\w+)?\D(\D+)+(\w+)?').hide()
units = (W(u'V') | I(u'v') | I(u'volt') | I(u'volts'))(u'units').add_action(merge)
value = R(u'\d+(\.\d+)?')(u'value')

abbrv_prefix = (I(u'Voc') | I(u'voc') | I(u'VOC')).hide()
words_pref = (I(u'open') + I(u'circuit') + I(u'voltage')).hide()
hyphanated_pref = (I(u'open') + I(u'-') + I('circuit') + I(u'voltage')).hide()
joined_range = R('^[\+\-–−]?\d+(\.\d+)?[\-–−~∼˜]\d+(\.\d+)?$')('value').add_action(merge)
spaced_range = (R('^[\+\-–−]?\d+(\.\d+)?$') + Optional(units).hide() + (R('^[\-–−~∼˜]$') +
                                                                        R('^[\+\-–−]?\d+(\.\d+)?$') | R('^[\+\-–−]\d+(\.\d+)?$')))('value').add_action(merge)
to_range = (R('^[\+\-–−]?\d+(\.\d+)?$') + Optional(units).hide() + (I('to') +
                                                                    R('^[\+\-–−]?\d+(\.\d+)?$') | R('^[\+\-–−]\d+(\.\d+)?$')))('value').add_action(join)

# prefix = Optional(I('a')).hide() + (Optional(lbrct) + W('Voc')|W('VOC') + Optional(rbrct) | I('open') + Optional(I('circuit')) + Optional((I('voltage')))
#                                     ).hide() + Optional(lbrct + W('Voc') + rbrct) + Optional(W('=') | I('of') | I('was') | I('is') | I('at')).hide() + Optional(I('in') + I('the') + I('range') + Optional(I('of')) | I('about') | ('around') | I('V')).hide()
# Original code from SAM

prefix = Optional(I('a')).hide() + (Optional(lbrct) + I('Voc') + Optional(rbrct) | Optional(I('open')) + Optional(I('circuit')) + Optional((I('voltage')))
                                    ).hide() + Optional(lbrct + W('Voc') + rbrct) + Optional(W('=') | W('¼') | I('of') | I('was') | I('is') | I('at')).hide() + Optional(I('in') + I('the') + I('range') + Optional(I('of')) | I('about') | ('around') | I('V')).hide()

voc_first  = (words_pref + (Optional(lbrct) + abbrv_prefix + Optional(rbrct)) + ZeroOrMore(common_text) + Optional(lbrct) + value + Optional(rbrct) + units)(u'voc')
voc_second = (prefix + Optional(lbrct) + value + Optional(rbrct) + units)(u'voc')
voc_pattern = voc_first | voc_second

class VocParser(BaseParser):
    root = voc_pattern

    def interpret(self, result, start, end):
        compound = Compound(
            voc_pattern=[
                Voc(
                    value=first(result.xpath('./value/text()')),
                    units=first(result.xpath('./units/text()'))
                )
            ]
        )
        yield compound

Sentence.parsers.append(VocParser())
Paragraph.parsers.append(VocParser())

In [23]:
#Testing on Individual Sentence Test 1

d1 = Paragraph('for annealed device with 93% RR, JSC = 6.28 mA cm−2, VOC = 0.6 V, FF= 39.4%, PCE= 1.8%; for not-annealed device with 90.7% RR, JSC = 3.27 mA cm−2, VOC = 0.62 V, FF= 37.0%, PCE= 0.9%; for annealed device with 90.7% RR, JSC = 3.07 mA cm−2,VOC = 0.61 V, FF= 32.1%, PCE= 0.7%')

d2 = Paragraph('A PCE of 6.3% with Voc of 0.70 V, a shortcircuit current density (Jsc) of 13.1 mA cm-2, and a decent FF of 0.69 was obtained for devices containing the 1:1.5 blend as the photoactive layer (thickness of 90 nm) and processed with the additive, under AM 1.5G simulated solar light illumination(100 mW cm-2)')

d3 = Paragraph('Under optimal processing condition, the PCE of device based on P1 was 2.78% with a Voc of 0.85 V, a Jsc of 8.65 mA/cm2 and an FF of 37.8 %')

d4 = Paragraph('P1 was investigated and the best device performance was obtained in an inverted device structure and a PCE value of 2.78% with a Voc of 0.85 V were achieved.')

d5 = Paragraph('We obtained the highest device performance by using TiOx as a multifunctional interlayer between the photoactive layer and the Al electrode. Figure 4 shows the J-V curves of an optimized device measured under AM 1.5G irradiation 100 mA/cm2 and dark conditions; its efficiency parameters are as follow: Voc=0.77 V, Jsc=9.10 mA/cm2, FF=0.55, and PCE=3.80%.')

d6 = Paragraph('For the best-performing device, a PCE as high as 9.48% was achieved with a Voc of 0.80 V, a Jsc of 17.46 mA/cm2, and a fill factor (FF) of 67.9%')

d7 = Paragraph('Two novel naphtho[1,2-c:5,6-c]bis(1,2,5-thiadiazole) and alkoxylphenyl substituted benzodithiophene based copolymers were developed as the donor materials for polymer solar cells and the  best device performance was achieved by P1, with an open-circuit voltage of 0.85 V,a short-circuit current density of 8.65 mA•cm−2, a fill factor of 37.8%, and a power conversion efficiency of 2.78%.')

d8 = Paragraph('PSC devices with different D/A ratios (P1/PC71BM, w/w) were fabricated to optimize the D/A ratio of the blend.Table 1 showed the photovoltaic parameters of the resulting devices under the illumination of AM 1.5G (100 mW/cm2) with different D/A ratios (1∶1, 1∶2 and 1∶3). It is clear that the optimal D/A ratio of the blend is 1∶2, and a PCE of 1.30% was obtained with a Voc of 0.92 V, a Jsc of 4.81 mA/cm2, and an FF of 29.48%. Compared with PBDT-DTNT with the same device structure (Voc＝0.80 V), a much improved Voc was obtained in P1 based devices.')

d9 = Paragraph('A maximum PCE of 4.1%, an open-circuit voltage (Voc) of 0.84 V, a short-circuit current (Jsc) of 9.8 mA cm-2, and a fill factor (FF) of 49.5% could be achieved based on a PBDTTPD:PC71BM ratio of 1:2 with a thin active-layer thickness of 90 nm')

d10 = Paragraph('Devices with 3 vol% DIO gave the highest PCE of 5.53% with a Voc of 0.98 V, a Jsc of 8.12 mA cm−2, and a FF of 69.5%.')



print(d1.records.serialize()) # Recognizes all values no compounds given
print(d2.records.serialize())  # Recognizes Value no Compound given
print(d3.records.serialize()) # Recognizes Value but not compound
print(d4.records.serialize())   # Recognizes Value but not compound
print(d5.records.serialize())  # Recognizes Compounds and Value
print(d6.records.serialize())  # Recognizes Value not clear compound given
print(d7.records.serialize())  # Recognizes Compounds but not value
print(d8.records.serialize())  # Recognizes first value with no compound and 2nd                                 # value w/ Compound
print(d9.records.serialize())   # Recognizes Compound and value
print(d10.records.serialize())  # Recognizes value but not compound

[{'voc_pattern': [{'value': '0.6', 'units': 'V'}]}, {'voc_pattern': [{'value': '0.62', 'units': 'V'}]}, {'voc_pattern': [{'value': '0.61', 'units': 'V'}]}]
[{'voc_pattern': [{'value': '0.70', 'units': 'V'}]}]
[{'voc_pattern': [{'value': '0.85', 'units': 'V'}]}]
[{'voc_pattern': [{'value': '0.85', 'units': 'V'}]}]
[{'names': ['TiOx']}, {'names': ['Al']}, {'voc_pattern': [{'value': '0.77', 'units': 'V'}]}]
[{'voc_pattern': [{'value': '0.80', 'units': 'V'}]}]
[{'names': ['naphtho[1,2-c:5,6-c]bis(1,2,5-thiadiazole)']}, {'names': ['alkoxylphenyl']}, {'names': ['benzodithiophene']}]
[{'voc_pattern': [{'value': '0.92', 'units': 'V'}]}, {'names': ['PBDT']}, {'voc_pattern': [{'value': 'Voc＝0.80', 'units': 'V'}]}]
[{'names': ['PBDTTPD']}, {'voc_pattern': [{'value': '0.84', 'units': 'V'}]}]
[{'voc_pattern': [{'value': '0.98', 'units': 'V'}]}]
