In [1]:
import sys
import re
import os
import pandas as pd
sys.path.append("../automaterialsdata/")

from chemdataextractor_parsers import NumericalProperty, OPVPropertyParser
from chemdataextractor_parsers import OPVMaterials, OPVMaterialsParser
from chemdataextractor_parsers import sentence_preprocessing

from chemdataextractor import Document
from chemdataextractor.model import Compound,ListType, ModelType
from chemdataextractor.doc import Paragraph, Sentence

Compound.opv_materials = ListType(ModelType(OPVMaterials))
Compound.opv_property = ListType(ModelType(NumericalProperty))
Sentence.parsers.append(OPVMaterialsParser())
Paragraph.parsers.append(OPVMaterialsParser())
Sentence.parsers.append(OPVPropertyParser())
Paragraph.parsers.append(OPVPropertyParser())

In [2]:
saeki_papers_path = "/Users/hanghu/Desktop/msedatapipeline/JPCL2018-papers/_PDF"
saeki_papers = [x for x in os.listdir(saeki_papers_path) if x.endswith('pdf') ]
saeki_papers = sorted(saeki_papers, key=lambda x: int(re.match(r'[0-9]+', x)[0]))

saeki_papers[:10]

['1 PBTTT-C14.pdf',
 '2 Blouin_et_al-2007-Advanced_Materials.pdf',
 '3 PCPDTBT.pdf',
 '4 PffBT4T.pdf',
 '5 PBDTTT-CF.pdf',
 '6 PSBTBT.pdf',
 '7 PNTz4T.pdf',
 '8 PSEHTT.pdf',
 '9 PNOz4T.pdf',
 '10 P3HT.pdf']

In [3]:
def create_empty_opv_property_dataframe():
    colnames = ["Paper", "Material", "Property", "Value", "Unit"]
    return pd.DataFrame(columns=colnames)

def count_record_fields(r):
    counter = {'opv_materials': 0, 'opv_property': 0}
    for entry in r:
        entry_name = list(entry.keys())[0]
        counter[entry_name] += 1
        
    return counter['opv_materials'], counter['opv_property']

def check_record_row(row):
    if (row[2] == 'PCE' or row[2] == 'FF') and float(row[3]) > 100:
        return False
    
    return True 

In [4]:
def merge_records_to_dataframe(article_id, records):
    output_df = create_empty_opv_property_dataframe()
    colnames = list(output_df.columns)
    rows = set() 
    for r in records:
        material   = 'unknown'
        properties = []
        for entry in r:
            if 'opv_materials' in entry.keys():
                material = entry['opv_materials'][0]['name']
            else:
                property_i = entry['opv_property'][0]
                property_i['name'] = OPVPropertyParser.find_unique_name(property_i['name'])
                properties.append([property_i['name'], property_i['value'], property_i.get('unit', 'N/A')])

        for p in properties:
            p_row = tuple([article_id, material] + p)
            if check_record_row(p_row):
                rows.add(p_row)

    for r in rows:
        temp_df = pd.DataFrame(data=[list(r)], columns=colnames)
        output_df = pd.concat([output_df, temp_df], ignore_index=True)


    return output_df

In [9]:
output_df = create_empty_opv_property_dataframe()

for article in saeki_papers:
    print('* parsing ', article)
    f = open(os.path.join(saeki_papers_path, article), 'rb') #Extracting pdf article from One Compound folder
    try:
        doc = Document.from_file(f) 
    except:
        continue
    para = doc.elements # Outputs list of paragaph objects
    sentence_records = []
    for p in para: # stores all sentences from paragraphs
        i = 0
        useprev  = False
        while i < len(p):
            s = sentence_preprocessing(p[i])
            if useprev: s = sentence_preprocessing(p[i-1]) + s
            r = s.records.serialize()
            nMat, nP = count_record_fields(r)
            
            if nMat > 1 or nP == 0 or (nMat == 0 and nP == 0):
                i += 1
                useprev = False
            elif (nMat == 1 and nP > 0):
                sentence_records.append(r)
                i += 1
                useprev = False
            else:
                if (not useprev):
                    useprev = True
                    continue
                sentence_records.append(r)
                i += 1
                useprev = False
    temp_df = merge_records_to_dataframe(article, sentence_records)
    output_df = pd.concat([output_df, temp_df], ignore_index=True)
    
    print(' ... obtained ', len(temp_df), " records, total number of records now is ", len(output_df))
    
output_df.to_csv("../data/saeki_first_full_output_.csv")

* parsing  1 PBTTT-C14.pdf
 ... obtained  7  records, total number of records now is  7
* parsing  2 Blouin_et_al-2007-Advanced_Materials.pdf
 ... obtained  7  records, total number of records now is  14
* parsing  3 PCPDTBT.pdf
 ... obtained  4  records, total number of records now is  18
* parsing  4 PffBT4T.pdf
 ... obtained  10  records, total number of records now is  28
* parsing  5 PBDTTT-CF.pdf
 ... obtained  12  records, total number of records now is  40
* parsing  6 PSBTBT.pdf
 ... obtained  6  records, total number of records now is  46
* parsing  7 PNTz4T.pdf
 ... obtained  18  records, total number of records now is  64
* parsing  8 PSEHTT.pdf
 ... obtained  4  records, total number of records now is  68
* parsing  9 PNOz4T.pdf
 ... obtained  21  records, total number of records now is  89
* parsing  10 P3HT.pdf
 ... obtained  23  records, total number of records now is  112
* parsing  11 PQT-12.pdf
 ... obtained  6  records, total number of records now is  118
* parsing 

 ... obtained  6  records, total number of records now is  577
* parsing  87 Hendriks_et_al-2013-Angewandte_Chemie_(International_ed._in_English).pdf
 ... obtained  1  records, total number of records now is  578
* parsing  88 ja511984q.pdf
 ... obtained  9  records, total number of records now is  587
* parsing  89 ja2089662.pdf
 ... obtained  1  records, total number of records now is  588
* parsing  90Dou_et_al-2013-Advanced_Materials.pdf
 ... obtained  10  records, total number of records now is  598
* parsing  91 c3cc47868h.pdf
 ... obtained  10  records, total number of records now is  608
* parsing  92 c1ee01213d.pdf
 ... obtained  5  records, total number of records now is  613
* parsing  93 ja4101003.pdf
 ... obtained  6  records, total number of records now is  619
* parsing  94 c4ta04118f.pdf
 ... obtained  0  records, total number of records now is  619
* parsing  95 pj201696a.pdf
 ... obtained  17  records, total number of records now is  636
* parsing  96 Zhou_et_al-2012-

 ... obtained  5  records, total number of records now is  1258
* parsing  167 Zhang_et_al-2015-Advanced_Materials.pdf
 ... obtained  11  records, total number of records now is  1269
* parsing  168 cm3017006.pdf
 ... obtained  17  records, total number of records now is  1286
* parsing  169 C6PY00640J.pdf
 ... obtained  4  records, total number of records now is  1290
* parsing  170 art%3A10.1007%2Fs11426-016-0349-7.pdf
 ... obtained  15  records, total number of records now is  1305
* parsing  171 C6TC02915A.pdf
 ... obtained  6  records, total number of records now is  1311
* parsing  172 Huo_et_al-2015-Advanced_Materials.pdf
 ... obtained  10  records, total number of records now is  1321
* parsing  173 Liu_et_al-2017-Advanced_Materials.pdf
 ... obtained  20  records, total number of records now is  1341
* parsing  174 1-s2.0-S1566119914003541-main.pdf
 ... obtained  13  records, total number of records now is  1354
* parsing  175 C3CP54022G.pdf
 ... obtained  1  records, total num

 ... obtained  0  records, total number of records now is  1807
* parsing  244 c3py00391d.pdf
 ... obtained  4  records, total number of records now is  1811
* parsing  245 cm302861s.pdf
 ... obtained  8  records, total number of records now is  1819
* parsing  246 ma300060z.pdf
 ... obtained  5  records, total number of records now is  1824
* parsing  247 Wang_et_al-2014-Macromolecular_Chemistry_and_Physics.pdf
 ... obtained  8  records, total number of records now is  1832
* parsing  248 1-s2.0-S1566119914005345-main.pdf
 ... obtained  17  records, total number of records now is  1849
* parsing  249 c5py00305a.pdf
 ... obtained  15  records, total number of records now is  1864
* parsing  250 c3tc32087a.pdf
 ... obtained  0  records, total number of records now is  1864
* parsing  251 c4ta05445h.pdf
 ... obtained  3  records, total number of records now is  1867
* parsing  252 c4ta01479k.pdf
 ... obtained  2  records, total number of records now is  1869
* parsing  253 c5ta05096k.pdf

 ... obtained  6  records, total number of records now is  2395
* parsing  324 ja1110915.pdf
 ... obtained  16  records, total number of records now is  2411
* parsing  325 ja311700u.pdf
 ... obtained  4  records, total number of records now is  2415
* parsing  326 ma202764v.pdf
 ... obtained  7  records, total number of records now is  2422
* parsing  327 C3EE41948G.pdf
 ... obtained  13  records, total number of records now is  2435
* parsing  328 c3ra47098a.pdf
 ... obtained  7  records, total number of records now is  2442
* parsing  329 Cheng_et_al-2012-Chemistry_-_An_Asian_Journal.pdf
 ... obtained  6  records, total number of records now is  2448
* parsing  330 Bronstein_et_al-2011-Macromolecular_Rapid_Communications.pdf
 ... obtained  2  records, total number of records now is  2450
* parsing  331 c2cc33718e.pdf
 ... obtained  6  records, total number of records now is  2456
* parsing  332 ma302390z.pdf
 ... obtained  1  records, total number of records now is  2457
* parsing  

 ... obtained  8  records, total number of records now is  3043
* parsing  404 ACS Appl. Mater. Interfaces 2017, 9, 24011.pdf
 ... obtained  7  records, total number of records now is  3050
* parsing  405 ACS Appl. Mater. Interfaces 2017, 9, 24020.pdf
 ... obtained  24  records, total number of records now is  3074
* parsing  406 Adv. Energy Mater. 2016, 6, 1600148.pdf
 ... obtained  8  records, total number of records now is  3082
* parsing  407 Adv. Energy Mater. 2016, 6, 1600430.pdf
 ... obtained  12  records, total number of records now is  3094
* parsing  408 Adv. Energy Mater. 2017, 7, 1601138.pdf
 ... obtained  4  records, total number of records now is  3098
* parsing  409 Adv. Energy Mater. 2017, 1702166.pdf
 ... obtained  13  records, total number of records now is  3111
* parsing  410 Adv. Funct. Mater. 2016, 26, 226.pdf
 ... obtained  4  records, total number of records now is  3115
* parsing  411 Adv. Funct. Mater. 2017, 27, 1701491.pdf
 ... obtained  6  records, total num

 ... obtained  19  records, total number of records now is  3681
* parsing  475 Organic Electronics 46 (2017) 192.pdf
 ... obtained  17  records, total number of records now is  3698
* parsing  476 Phys. Chem. Chem. Phys., 2016, 18, 8389.pdf
 ... obtained  14  records, total number of records now is  3712
* parsing  477 Polym Int 2017; 66, 1206.pdf
 ... obtained  3  records, total number of records now is  3715
* parsing  478 Polym. Chem., 2016, 7, 164.pdf
 ... obtained  6  records, total number of records now is  3721
* parsing  479 Polym. Chem., 2016, 7, 2329.pdf
 ... obtained  9  records, total number of records now is  3730
* parsing  480 Polym. Chem., 2017, 8, 2334.pdf
 ... obtained  7  records, total number of records now is  3737
* parsing  481 Polym. Chem., 2017, 8, 2979.pdf
 ... obtained  12  records, total number of records now is  3749
* parsing  482 Polym. Chem., 2017, 8, 3622.pdf
 ... obtained  6  records, total number of records now is  3755
* parsing  483 Polym. Chem., 2

In [10]:
len(output_df[output_df['Material'] != 'unknown'])

2563

In [7]:
output_df.head(5)

Unnamed: 0,Paper,Material,Property,Value,Unit
0,1 PBTTT-C14.pdf,unknown,PCE,2.3,%
1,1 PBTTT-C14.pdf,pBTTT,PCE,2.3,%
2,1 PBTTT-C14.pdf,pBTTT:PC-71-BM,JSC,9.37,mA/cm2
3,1 PBTTT-C14.pdf,pBTTT:PC-71-BM,VOC,0.525,V
4,1 PBTTT-C14.pdf,unknown,VOC,0.525,V
