In [35]:
import logging
import re
import pandas as pd
import urllib
import time

import chemdataextractor as cde
from chemdataextractor import Document
from chemdataextractor.reader import acs,base,cssp,HtmlReader,NlmXmlReader,PdfReader,RscHtmlReader,XmlReader
from chemdataextractor.model import Compound, UvvisSpectrum, UvvisPeak, BaseModel, StringType, ListType, ModelType
from chemdataextractor.parse.common import hyphen,lbrct, dt, rbrct
from chemdataextractor.parse.base import BaseParser
from chemdataextractor.utils import first

from chemdataextractor.parse.actions import strip_stop, merge, join
from chemdataextractor.parse.elements import W, I, T, R, Optional, ZeroOrMore, OneOrMore, Or, And, Not, Any
from chemdataextractor.parse.cem import chemical_name,cem, chemical_label, lenient_chemical_label, solvent_name
from chemdataextractor.doc import Paragraph, Sentence, Caption, Figure,Table, Heading
from chemdataextractor.doc.table import Table, Cell
import os
import pickle
from chemdataextractor.doc import Paragraph

In [36]:
#PCE Parser

class Pce(BaseModel):
    value = StringType()
    units = StringType()

Compound.pce_pattern = ListType(ModelType(Pce))

# prefix = abbrv_prefix | words_pref | hyphanated_pref
common_text = R('(\w+)?\D(\D+)+(\w+)?').hide()
units = (W(u'%') | I(u'percent'))(u'units')
value = R(u'\d+(\.\d+)?') | R(u'\b([0-9]|[1-9][0-9]|100)\b') | (u'value')
value = R(u'\d+(\.\d+)?')(u'value')
# value now contains ranged values

#abbrv_prefix = (I(u'PCE') | I(u'PCEs') | I(u'pce') | R('x{\u03B7}')).hide() #greek letter eta for pce 
abbrv_prefix = (I(u'PCE') | I(u'PCEs') | I(u'pce')).hide() 
# words_pref and hyphanated_pref are the samething , just one hyphen diff
words_pref = (I(u'power') + I(u'conversion') + I(u'efficiency')).hide()
hyphanated_pref = (I(u'power') + I(u'-') + I('conversion') + I(u'efficiency') | I(u'efﬁciency')).hide()

 
joined_range = R('^[\+\-–−]?\d+(\.\d+)?[\-–−~∼˜]\d+(\.\d+)?$')('value').add_action(merge)
spaced_range = (R('^[\+\-–−]?\d+(\.\d+)?$') + Optional(units).hide() + (R('^[\-–−~∼˜]$') + R('^[\+\-–−]?\d+(\.\d+)?$') | R('^[\+\-–−]\d+(\.\d+)?$')))('value').add_action(merge)
to_range = (R('^[\+\-–−]?\d+(\.\d+)?$') + Optional(units).hide() + (I('to') +
                                                                    R('^[\+\-–−]?\d+(\.\d+)?$') | R('^[\+\-–−]\d+(\.\d+)?$')))('value').add_action(join)

# prefix = Optional(I('a')).hide() + (Optional(lbrct) + W('PCEs') + Optional(rbrct) | I('power') + Optional(I('conversion')) + Optional((I('efficiency') | I('range'))) + Optional((I('temperature') | I('range')))
#                                     ).hide() + Optional(lbrct + W('PCE') + rbrct) + Optional(W('=') | I('of') | I('was') | I('is') | I('at')).hide() + Optional(I('in') + I('the') + I('range') + Optional(I('of')) | I('about') | ('around') | I('%')).hide()

prefix = Optional(I('a')).hide() + (Optional(lbrct) + abbrv_prefix + Optional(rbrct) | I('power') + Optional(I('conversion')) + Optional((I('efficiency') | I(u'efﬁciency') | I('range') | words_pref)) + Optional((I('temperature') | I('range')))).hide() + Optional(lbrct + W('PCE') + rbrct) + Optional (W('thus')) + Optional (W('reached')) + Optional (W('result')) + Optional (W('up')) + Optional(W('=') | W('¼') | I('of') | I('was') | I('is') | I('average') | I('high') | I('at') | I('to')).hide() + Optional(I('in') + I('the') + I('range') + Optional(I('of')) | I('about') | ('around') | I('%')).hide()

# the combination of different components to form the pce pattern
pce_first  = (words_pref + (Optional(lbrct) + abbrv_prefix + Optional(rbrct)) + ZeroOrMore(common_text) + value + units)(u'pce')
# pce_first = (prefix + ZeroOrMore(common_text) + value + units)(u'pce')
pce_second = (prefix + value + units)(u'pce')

# pce_pattern defined here to be included in the PceParser
pce_pattern = pce_first | pce_second

class PceParser(BaseParser):
    root = pce_pattern

    def interpret(self, result, start, end):
        compound = Compound(
            pce_pattern=[
                Pce(
                    value=first(result.xpath('./value/text()')),
                    units=first(result.xpath('./units/text()'))
                )
            ]
        )
        yield compound
    
Sentence.parsers.append(PceParser())
Paragraph.parsers.append(PceParser())







#FF Parser 
class Ff(BaseModel):
    value = StringType()
    units = StringType()

Compound.ff_pattern = ListType(ModelType(Ff))

common_text = R('(\w+)?\D(\D+)+(\w+)?').hide()
units = (W(u'%') | I(u'percent'))(u'units') 
#units = (W(u'%'|W(u' ')|I(u'percent')))('units').add_action(merge)

value = R(u'\d+(\.\d+)?')(u'value')
value = R(u'\d+(\.\d+)?')| R(u'\b([0-9]|[1-9][0-9]|100)\b') | (u'value')

abbrv_prefix = (I(u'FF') | I(u'ff')).hide()
words_pref = (I(u'fill') | I(u'fill') + I(u'factor')).hide()
hyphanated_pref = (I(u'fill') | I(u'fill') + I(u'-') + I('factor')).hide()
joined_range = R('^[\+\-–−]?\d+(\.\d+)?[\-–−~∼˜]\d+(\.\d+)?$')('value').add_action(merge)
spaced_range = (R('^[\+\-–−]?\d+(\.\d+)?$') + Optional(units).hide() + (R('^[\-–−~∼˜]$') +
                                                                        R('^[\+\-–−]?\d+(\.\d+)?$') | R('^[\+\-–−]\d+(\.\d+)?$')))('value').add_action(merge)
to_range = (R('^[\+\-–−]?\d+(\.\d+)?$') + Optional(units).hide() + (I('to') +
                                                                    R('^[\+\-–−]?\d+(\.\d+)?$') | R('^[\+\-–−]\d+(\.\d+)?$')))('value').add_action(join)

prefix = Optional(I('a')).hide() + (Optional(lbrct) + W('FF') + Optional(rbrct) | I('fill') | I('ﬁll') + Optional(I('factor'))
                                    ).hide() + Optional(lbrct + W('FF') + rbrct) + Optional(W('=') | W('¼') | W(';') | W(',') | I('of') | I('was') | I('is') | I('at')).hide() + Optional(I('in') + I('the') + I('range') + Optional(I('of')) | I('about') | I('average') | I('to') |I('around')| I ('%')).hide()

ff_first  = (words_pref + (Optional(lbrct) + abbrv_prefix + Optional(rbrct)) + ZeroOrMore(common_text) + value + Optional(units))(u'ff')
ff_second = (prefix + value + Optional(units))(u'ff')
ff_third = (abbrv_prefix + prefix + value)(u'ff')
ff_pattern = ff_first|ff_second|ff_third



class FfParser(BaseParser):
    root = ff_pattern

    def interpret(self, result, start, end):
        compound = Compound(
            ff_pattern=[
                Ff(
                    value=first(result.xpath('./value/text()')),
                    units=first(result.xpath('./units/text()'))
                )
            ]
        )
        yield compound

def parse_ff(list_of_sentences):
    
    #Takes a list of sentences and parses for quantified PCE
    #information and relationships to chemicals/chemical labels
    

    Sentence.parsers.append(FfParser())

    cde_senteces = [Sentence(sent).records.serialize()
                    for sent in list_of_sentences]
    return cde_senteces

Sentence.parsers.append(FfParser())
Paragraph.parsers.append(FfParser())






# VOC Parser

class Voc(BaseModel):
    value = StringType()
    units = StringType()

Compound.voc_pattern = ListType(ModelType(Voc))

common_text = R('(\w+)?\D(\D+)+(\w+)?').hide()
units = (W(u'V') | I(u'v') | I(u'volt') | I(u'volts'))(u'units').add_action(merge)
value = R(u'\d+(\.\d+)?')| R(u'\b([0-9]|[1-9][0-9]|100)\b') |(u'value')
value = R(u'\d+(\.\d+)?')(u'value')
abbrv_prefix = (I(u'Voc') | I(u'voc') | I(u'VOC')).hide()
words_pref = (I(u'open') + I(u'circuit') + I(u'voltage')).hide()
hyphanated_pref = (I(u'open') + I(u'-') + I('circuit') + I(u'voltage')).hide()
joined_range = R('^[\+\-–−]?\d+(\.\d+)?[\-–−~∼˜]\d+(\.\d+)?$')('value').add_action(merge)
spaced_range = (R('^[\+\-–−]?\d+(\.\d+)?$') + Optional(units).hide() + (R('^[\-–−~∼˜]$') +
                                                                        R('^[\+\-–−]?\d+(\.\d+)?$') | R('^[\+\-–−]\d+(\.\d+)?$')))('value').add_action(merge)
to_range = (R('^[\+\-–−]?\d+(\.\d+)?$') + Optional(units).hide() + (I('to') +
                                                                    R('^[\+\-–−]?\d+(\.\d+)?$') | R('^[\+\-–−]\d+(\.\d+)?$')))('value').add_action(join)

# prefix = Optional(I('a')).hide() + (Optional(lbrct) + W('Voc')|W('VOC') + Optional(rbrct) | I('open') + Optional(I('circuit')) + Optional((I('voltage')))
#                                     ).hide() + Optional(lbrct + W('Voc') + rbrct) + Optional(W('=') | I('of') | I('was') | I('is') | I('at')).hide() + Optional(I('in') + I('the') + I('range') + Optional(I('of')) | I('about') | ('around') | I('V')).hide()
# Original code from SAM

prefix = Optional(I('a')).hide() + (Optional(lbrct) + I('Voc') + Optional(rbrct) | Optional(I('open')) + Optional(I('circuit')) + Optional((I('voltage')))
                                    ).hide() + Optional(lbrct + W('Voc') + rbrct) + Optional(W('=') | W('¼') | I('of') | I('was') | I('is') | I('at')).hide() + Optional(I('in') + I('the') + I('range') + Optional(I('of')) | I('about') | I('average') | ('around') | I('V')).hide()

voc_first  = (words_pref + (Optional(lbrct) + abbrv_prefix + Optional(rbrct)) + ZeroOrMore(common_text) + Optional(lbrct) + value + Optional(rbrct) + units)(u'voc')
voc_second = (prefix + Optional(lbrct) + value + Optional(rbrct) + units)(u'voc')
voc_pattern = voc_first | voc_second

class VocParser(BaseParser):
    root = voc_pattern

    def interpret(self, result, start, end):
        compound = Compound(
            voc_pattern=[
                Voc(
                    value=first(result.xpath('./value/text()')),
                    units=first(result.xpath('./units/text()'))
                )
            ]
        )
        yield compound

Sentence.parsers.append(VocParser())
Paragraph.parsers.append(VocParser())

In [None]:
# paperparser https://github.com/paper-parser/paper-parser dev notebooks and example -- they are also doing solar cell stuff, the source code uses chemdataextractor

# chemdataextractor https://github.com/mcs07/ChemDataExtractor --> examples inlcude tg, melting point and others, check their regex

# combine both of them

# add samples (unqiue samples, representatives) to a new python file and keep them so its easier to find
# dont parse documents since it is messy

# go to the text csv file, find out 100~ papers from them, from the text content, take the previous 400 words, and thats the training data for this case

# 

In [37]:
import os
import pandas as pd

final = pd.DataFrame(columns = ["REF_NO","PCE_AVERAGE", "PCE_MAX", "VOC", "FF"])

folder_path = "C:/Users/walid/OneDrive/Documents/GitHub/AutoDataMining/Named Entity Recognition/one_compound"
files = os.listdir(folder_path)    

In [44]:
row = 80 #manually need to change this value
# Indexing: [Start #:End #] This is 0 based, so 0:20 does the first 20, 20:40 the next 20, and so on
for x in files[80:100]:
    print(row)
    f = open(folder_path + "/" + x, 'rb') #Extracting pdf article from One Compound folder
    doc = Document.from_file(f) 
    para = doc.elements # Outputs list of paragaph objects
    sentence_records = []
    for p in para: # stores all sentences from paragraphs
        for s in p:
            sentence_records.append(s.records.serialize())
    pce_values = []
    voc_values = []
    ff_values = []

    for x in sentence_records:
        for y in x:
            for z in y:
                if z == 'voc_pattern':
                    for a in y.values():
                        for b in a:
                            counter = 0
                            for c in b.values():
                                if counter % 2 == 0:
                                    voc_values.append(c)
                                counter += 1
                if z == 'pce_pattern':
                    for a in y.values():
                        for b in a:
                            counter = 0
                            for c in b.values():
                                if counter % 2 == 0:
                                    pce_values.append(c)
                                counter += 1
                if z == 'ff_pattern':
                    for a in y.values():
                        for b in a:
                            counter = 0
                            for c in b.values():
                                if counter % 2 == 0:
                                    ff_values.append(c)
                                counter += 1
    
    pce_final = []
    voc_final = []
    ff_final = []

    for item in pce_values:
        try:
            float(item)
            pce_final.append(item)
        except ValueError: 
            print("")

    for item in voc_values:
        try:
            float(item)
            voc_final.append(item)
        except ValueError: 
            print("")

    for item in ff_values:
        try:
            float(item)
            ff_final.append(item)
        except ValueError: 
            print("")
    
    pce_values = list(map(float, pce_final))
    voc_values = list(map(float, voc_final))
    ff_values = list(map(float, ff_final))

    final.at[row, "REF_NO"] = os.path.basename(f.name)[0:os.path.basename(f.name).find(" ")]

    if len(pce_values) > 0:
        final.at[row, "PCE_AVERAGE"] = sum(pce_values) / len(pce_values)
        final.at[row, "PCE_MAX"] = max(pce_values)
    else:
        final.at[row, "PCE_AVERAGE"] = None
        final.at[row, "PCE_MAX"] = None

    final.at[row, "VOC"] = voc_values

    final.at[row, "FF"] = ff_values
    row += 1

final.to_csv('NER_Output.csv')
print(final)

80



















81













82














83
























84


85










86









87
















88









89












90










91







92


















93





94

















95















96





97



98
















99












   REF_NO PCE_AVERAGE PCE_MAX  \
0       1         2.3     2.3   
1      10     1.53333     2.4   
2     102       9.278      10   
3     103        4.45     6.3   
4     110       3.215       6   
..    ...         ...     ...   
95    379       1.832    2.04   
96     38        8.04     9.4   
97     39         5.5     5.5   
98    392        5.36     7.4   
99    393       6.748    10.6   

                                                  VOC  \
0                               [0.525, 0.525, 0.525]   
1      [0.57, 0.61, 0.62, 0.57, 0.56, 0.6, 0.6, 0.85]   
2                                         [0.84, 0.8]   
3     [0.43, 0.74, 0.76, 60.0, 60.0, 2.0, 20.0, 80.0]   
4   [0.85, 0.92, 0.85,

In [69]:
saeki_data = pd.read_csv('C:/Users/walid/OneDrive/Documents/GitHub/AutoDataMining/Named Entity Recognition/Saekis_manual_dataset.csv')
final = pd.read_csv('C:/Users/walid/OneDrive/Documents/GitHub/AutoDataMining/Named Entity Recognition/NER_Output.csv')

print(saeki_data.columns['Ref. No'])
#saeki_data.loc[:,2] = saeki_data.iloc[:,2][1:]

comparison = pd.DataFrame(columns = ["REF_NO","PCE_AVERAGE", "PCE_MAX", "VOC", "FF"])

#for x in range(len(final)):
    #toCompare = saeki_data.loc[saeki_data[,2] == final[x, 'REF_NO']]
    #comparison.at[x, "REF_NO"] = toCompare.iloc[:,2]
    #comparison.at[x, "PCE_AVERAGE"] = toCompare['PCE_ave'] == final[x, "PCE_AVERAGE"]
    #comparison.at[x, "PCE_MAX"] = toCompare['PCE_max'] == final[x, "PCE_MAX"]
    #comparison.at[x, "VOC"] = any(i == toCompare['Voc'] for i in final[x, "VOC"])
    #comparison.at[x, "FF"] = any(i == toCompare['FF'] for i in final[x, "FF"])

#print("PCE_AVERAGE:" + comparison.groupby('PCE_AVERAGE').count())
#print("PCE_MAX:" + comparison.groupby('PCE_MAX').count())
#print("VOC:" + comparison.groupby('VOC').count())
#print("FF:" + comparison.groupby('FF').count())

IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices

In [None]:
##Examples

#1: Article S1

sentence = "FIG. 3. Color online The I-V curves of the optimized pBTTT:PC71BM cell in the dark and under calibrated 1 sun illumination PCE=2.34%, Jsc=9.37 mA/cm2, Voc=0.525 V, FF=0.48."
output = "[{'names': ['1)Color online(cid']}, {'ff_pattern': [{'value': '0.48'}]}, {'voc_pattern': [{'value': '0.525', 'units': 'V'}]}]"
# FF: 1, VOC: 1, PCE:0 #PCE value in another sentence

sentence = " This cell had a Jsc of 9.37 mA/cm2,aVoc of 0.525 V, and a FF of 0.48 for an overall PCE of 2.3%" 
output = "[{'pce_pattern': [{'value': '2.3', 'units': '%'}]}, {'ff_pattern': [{'value': '0.48'}]}, {'voc_pattern': [{'value': '0.525', 'units': 'V'}]}]"
# FF: 1, VOC: 1, PCE: 1

#2: Article S2 
sentence = "The most efficient devices, obtained using 1,8diiodooctane, had an average power-conversion efficiency of 5.1% under 100 mW/cm2, with short-circuit current Jsc ) 15.7 mA/cm2, fill factor FF ) 0.53, and open-circuit voltage Voc ) 0.61 V, i.e., an∼10% higher efficiency than that obtained with the use of 1,8-octanedithiol"
output = "[{'names': ['1,8- diiodooctane']}, {'names': ['1,8-octanedithiol']}, {'ff_pattern': [{'value': '0.53'}]}, {'voc_pattern': [{'value': '0.61', 'units': 'V'}]}]"
# FF: 1, VOC: 1, PCE: 0 #PCE value in table 

#3: Article S5
sentence = "J–V curves, a signiﬁcant increase in Voc is clearly observed from PBDTTT–E to PBDTTT–CF. AVoc as high as 0.76 V was observed indevicesbasedonPBDTTT–CF" #Does not count -CF as apart of name
output = "[{'names': ['PBDTTT']}, {'voc_pattern': [{'value': '0.76', 'units': 'V'}]}]"
#FF: 0, VOC: 1, PCE: 0

#special case 1
 = ".CombinedwithitshighJsc andﬁll factor (FF), a high PCE of 7.38+0.4% (a 5% device variation), measured in more than 75 devices, was achieved in the PBDTTT– CF system, the highest measured PCE being 7.73%." # output did not pick up ranged values 
sentence = "In the recent work26, higher values of Voc are observed when ﬂuorine, an atom of high electron afﬁnity, is introduced to the thieno[3,4-b]thiophene unit, a PCE of 6.1% having been demonstrated26"
output = "[{'names': ['ﬂuorine']}, {'names': ['thieno[3,4-b]thiophene']}, {'pce_pattern': [{'value': '6.1', 'units': '%'}]}]"
#FF: 0, VOC: 0, PCE: some  
#Special case 2 = Fill factor value was on a figure

#4: Article s6
sentence = " Figure 2a shows the typical I-V curve under illumination of AM 1.5G (100 mW/ cm2) of the devices with Voc 0.68V, Jsc 12.7 mA/cm2, and FF 55%. "
output = "[{'ff_pattern': [{'value': '55', 'units': '%'}]}, {'voc_pattern': [{'value': '0.68', 'units': 'V'}]}]"
#FF: 1, VOC: 1, PCE:0
sentence = "The average power conversion efﬁciency (PCE) of 100 devices reached 4.7 %, and for the best device, a PCE of 5.1% was observed." #Special Case 3: Should add average PCE to regex
output = "[{'pce_pattern': [{'value': '5.1', 'units': '%'}]}]" 
#FF:0, VOC:0, PCE:1 (MAX PCE Value)

#5:  Article s7
sentence = "showed the thickness dependence of photovoltaic performance (Supplementary Figs 3 and 4 and Supplementary Tables 3 and 4). Notably, the PCE reached 10.1% (JSC =19.4 mAcm–2, VOC =0.708 V, FF=73.4%), with an average of 9.77%, for the inverted PC71BM cell with an active layer thickness of ∼290 nm, which is one of the highest PCEs observed in a single-junction cell" #Special case 3 
output = "[{'names': ['PC71BM']}, {'ff_pattern': [{'value': '73.4', 'units': '%'}]}, {'voc_pattern': [{'value': '0.708', 'units': 'V'}]}]"
#FF:1, VOC: 1, PCE:0
Sentence = "It is interesting to note that PCEs close to 10% were also observed for the inverted PC61BM cell with a thickness of 280 nm (PCE=9.80% (average 9.55%), JSC =18.2 mA cm–2, VOC =0.729 V, FF=73.9%)"
output = "[{'names': ['PC61BM']}, {'pce_pattern': [{'value': '9.80', 'units': '%'}]}, {'ff_pattern': [{'value': '73.9', 'units': '%'}]}, {'voc_pattern': [{'value': '0.729', 'units': 'V'}]}]"
#FF: 1, VOC: 1, PCE:1 (ALL values are very close to Saeki's values)

#6: Article s8 
sentence = "The enhanced performance of DBFI-EDOT devices is largely a result of the enhanced Voc  (0.93 V) compared to the PC 71B M cells (0.67 V)." 
output = "[{'names': ['DBFI- EDOT']}], [{'voc_pattern': [{'value': '0.93', 'units': 'V'}]}, {'voc_pattern': [{'value': '0.67', 'units': 'V'}]}]"
#FF:0, VOC:1, PCE:0
sentence = "the best performance seen in nonfullerene polymer solar cells to date. For comparison the reference optimized PC 71B M:PSEHTT photodiodes, gave the best PCE of 5.62% with an average PCE of 5.52% ± 0.09%"
output = " [{'pce_pattern': [{'value': '5.62', 'units': '%'}]}, {'pce_pattern': [{'value': '5.52', 'units': '%'}]}]"
#FF:0, Voc:0, PCE:1 (both average and max)

#7: Article s9
sentence = " Therefore, to the best of our knowledge, in demonstrating high PCEs of B9% with a high VOC of B1V,smallEloss and a small energy offset at the same time, PNOz4T is regarded as quite a unique narrow bandgap polymer."
output = "[{'pce_pattern': [{'value': 'B9', 'units': '%'}]}, {'voc_pattern': [{'value': 'B1', 'units': 'V'}]}]"
#FF:0, VOC:1, PCE:1 (Wrong FF extreacted, extracted 0.59 instead of 0.64)
#B = squiggly line 

#8: Article s10
#FF:0, VOC: 0,PCE:0
#None of Saeki's values could be found in article

#9: Article s13
sentence = "A PCE of about 7.4% has been achieved from... "
output = "[{'names': ['PC71BM¼ phenyl-C71- butyric acid methyl ester']}, {'pce_pattern': [{'value': '7.4', 'units': '%'}]}]"
#PCE:1
sentence = "The fill factor, at the same time, increases to 69%"
output = "FF values of 59.23 and 50.52"
#FF:0
sentence = "...Voc has increases to 0.75 V as expected..."
output = "[{'voc_pattern': [{'value': '0.75', 'units': 'V'}]}, {'voc_pattern': [{'value': '0.58', 'units': 'V'}]}]"
#Saeki's VOC = 0.74 v.s 0.75 given here therefore FF:1

#10: Article s15
sentence = "When illuminated with simulated solar light (AM1.5 Global, light intensity about 100 W m22), the PCz : PDI (1 : 4) cell displayed an Isc of 0.26 mA cm22,aVoc of 0.71 V, an FF of 0.37, and an efficiency of 0.63%, which to the best of our knowledge is the highest efficiency yet reported for a polymerbased solar cell not incorporating PCBM."
output = "[{'pce_pattern': [{'value': '0.6', 'units': '%'}]}]"
#PCE:0 #Saeki's value = 0.07

sentence = "When illuminated with simulated solar light (AM1.5 Global, light intensity about 100 W m22), the PCz : PDI (1 : 4) cell displayed an Isc of 0.26 mA cm22,aVoc of 0.71 V, an FF of 0.37, and an efficiency of 0.63%, which to the best of our knowledge is the highest efficiency yet reported for a polymerbased solar cell not incorporating PCBM. "
output = "[{'names': ['PCz']}, {'names': ['PDI']}, {'names': ['PCBM']}, {'ff_pattern': [{'value': '0.37'}]}, {'voc_pattern': [{'value': '0.71', 'units': 'V'}]}]"
#FF:1, VOC:1

#11: Article s17
sentence = "The PCDTBT/PC70BM solar cells reproducibly yield JSC¼10.6 mA cm22, VOC¼0.88 V, FF¼0.66 and h e¼6.1%." 
output = "[{'pce_pattern': [{'value': '6.1', 'units': '%'}]}, {'ff_pattern': [{'value': '0.66', 'units': '%'}]}, {'voc_pattern': [{'value': '0.88', 'units': 'V'}]}]"
#FF: 1, VOC: 1, PCE: 1

#12:Article s21
sentence = "page 1087 last paragraph above table"
output = "[{'pce_pattern': [{'value': '0.61', 'units': '%'}]}, {'ff_pattern': [{'value': '29', 'units': '%'}]}, {'voc_pattern': [{'value': '0.41', 'units': 'V'}]}]"
#FF: 1, VOC: 1, PCE: 1

#13: Article s22
sentence = "The optimized cells had PDPP3T:[70]PCBM in a 1:2 weight ratio and provided Voc ) 0.65 V, Jsc ) 11.8 mA/cm2, and FF ) 0.60, resulting in an η value of 4.7%"
output = "[{'names': ['[70]PCBM']}, {'ff_pattern': [{'value': '0.60'}]}, {'voc_pattern': [{'value': '0.65', 'units': 'V'}]}]"
#FF:1, Voc:1, PCE:0 , Special Case 4 PCE was represented by greek letter

#14: Article s23
sentence = "The devices prepared using chlorobenzene as solvent show higher performance with g = 2.5%, Jsc = 6.8 mA/cm2, Voc = 0.79 Volt, and FF = 47%. The devices prepared by using ODCB as solvent demonstrate the best performance with g = 3.2%," #Special case 4
output = "['chlorobenzene']}, {'ff_pattern': [{'value': '47', 'units': '%'}]}, {'voc_pattern': [{'value': '0.79', 'units': 'Volt'}]}]", "[{'pce_pattern': [{'value': '3.2', 'units': '%'}]}]" #(Seperate sentence PCE)
#FF: 1, VOC: 1, PCE: 1


#15: Article s24
sentence = "Figure 4 shows the J-V curves of an optimized device measured under AM 1.5G irradiation 100 mA/cm2 and dark conditions; its efﬁciency parameters are as follow: Voc=0.77 V, Jsc=9.10 mA/cm2, FF =0.55, and PCE=3.80%."
output = "[{'pce_pattern': [{'value': '3.80', 'units': '%'}]}, {'ff_pattern': [{'value': '0.55'}]}, {'voc_pattern': [{'value': '0.77', 'units': 'V'}]}]"
#FF: 1, VOC: 1, PCE: 1

#16: Article s27
sentence = "The highest PCE of PSCs based on the PDTP-DTDPP(Bu):PC70BM system reached 2.71%, which was lower than the4-5%oftheP3HT:PCBMdevices.Thiscouldbemainly attributed to the lower VOC (∼0.4 V)." 
output = "[{'voc_pattern': [{'value': '0.4', 'units': 'V'}]}]"
#FF:0. PCE:0, VOC:1

#17: Article s30
sentence = "blend ratio of 1:3 exhibited a high short-circuit current of 10.87 mA/cm2 and a power conversion efficiency of 2.27%"
output = "[{'pce_pattern': [{'value': '2.27', 'units': '%'}]}]" #FF and Voc are found in table 
#PCE: 1 

#18: Article s31
sentence = "The device exhibits a Voc of 0.56 V and a ﬁll factor of 63.3%, which yields an impressive PCE of 5.30%" #PCE in dif. sentence
output = "[{'pce_pattern': [{'value': '5.30', 'units': '%'}]}, {'ff_pattern': [{'value': '63.3', 'units': '%'}]}, {'voc_pattern': [{'value': '0.56', 'units': 'V'}]}]"
#FF: 1, VOC: 1, PCE: 1

#19: Article s32
sentence = "The champion result reached 6.58%, with a Voc of 0.70 V, a Jsc of 14.7 mA/cm2, and an FF of 0.64, which is the highest value so far for polymer solar cells."
output = "[{'names': ['PBDTTT']}, {'pce_pattern': [{'value': '6.58', 'units': '%'}]}]" #PCE output 
"[{'ff_pattern': [{'value': '0.64'}]}, {'voc_pattern': [{'value': '0.70', 'units': 'V'}]}]""
#FF: 1, VOC: 1, PCE: 1

#20: Article s33
sentence = "p.g 21364 1st paragraph left side"
output = "[{'pce_pattern': [{'value': '3.64', 'units': '%'}]}]" #PCE output
#VOC & FF from table 
#PCE: 1






