In [1]:
#Import Statements
import os
import json
import pandas as pd
import csv
import requests
import inspect

#Imports from Chemdataextractor
from chemdataextractor.doc import Document, Heading, Paragraph, Sentence
from chemdataextractor.nlp.tokenize import ChemWordTokenizer
from chemdataextractor.nlp.pos import ChemCrfPosTagger

#Imports from Natural Language Toolkit
from nltk import sent_tokenize
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk import pos_tag

In [2]:
doi_index = []
text = []

In [3]:
#Importing all the articles and their previous labels
#In the future you would just use the articles classified as having data by the machine learning classifier
pathacs = '/Users/juliachotoo/ScrapyArticles/scrapedjson/acs2/'

for filename in os.listdir(pathacs):
    with open(pathacs + filename, 'r'):
        resultacs = json.load(open(pathacs + filename, mode='r'))
        doi_index.append(resultacs['doi'])
        text.append(resultacs['text'])

pathspr = '/Users/juliachotoo/ScrapyArticles/scrapedjson/spr2/'

for filename in os.listdir(pathspr):
    with open(pathspr + filename, 'r'):
        resultspr = json.load(open(pathspr + filename, mode='r'))
        doi_index.append(resultspr['doi'])
        text.append(resultspr['text'])

pathlabels = '/Users/juliachotoo/ScrapyArticles/datalabels.csv'
labels = []

with open(pathlabels, 'r') as csvfile:
    resultlabels = csv.reader(csvfile, delimiter=' ', quotechar='|')
    for row in resultlabels:
        labels.append(', '.join(row))

labels[0] = labels[0].replace('\ufeff', '')

In [4]:
df = pd.DataFrame({'data': text, 'labels': labels})
df = df[~df['labels'].str.contains('2')]
#print(df.loc[0:9])

In [5]:
#Import NIST API with adsorbate names
host = 'http://dirac.nist.gov/adsorption.nist.gov'
URL = host + '/srd205/api/gases.json'
gas_library = json.loads(requests.get(URL).content.decode('utf-8'))

In [21]:
#print(gas_library)

In [6]:
#only scaled the text classification to one article
doc = Document(text[0])

In [7]:
#looks for all the chemical names in the article (Chemdataextractor)
names = doc.cems

In [8]:
print(names)

[Span('CH4', 606, 609), Span('11282-11285CODEN', 10347, 10363), Span('CO2', 6530, 6533), Span('nitro', 8345, 8350), Span('2,6-DNT', 8467, 8474), Span('triphenylene-2,6,10-tricarboxylic acid (H3TTCA)(7)', 1541, 1591), Span('CH4', 8900, 8903), Span('1,4-dioxane', 1643, 1654), Span('TTCA', 2607, 2611), Span('TNT', 8689, 8692), Span('N2', 4203, 4205), Span('CO2', 5927, 5930), Span('CO2', 6200, 6203), Span('CO2', 6440, 6443), Span('CO2', 6518, 6521), Span('CH4', 6522, 6525), Span('CH4', 4949, 4952), Span('N2', 729, 731), Span('In(NO3)3·H2O', 1597, 1609), Span('2,4-DNT', 8694, 8701), Span('CO2', 297, 300), Span('CO2', 8985, 8988), Span('CH4', 5065, 5068), Span('CO2', 5098, 5101), Span('CO2', 1413, 1416), Span('N2', 5823, 5825), Span('2,4-DNT', 7841, 7848), Span('CO2', 652, 655), Span('N2', 5735, 5737), Span('CO2', 8891, 8894), Span('N2', 4958, 4960), Span('DMF', 3516, 3519), Span('N2', 6454, 6456), Span('N2', 6534, 6536), Span('H2O', 3524, 3527), Span('CO2', 474, 477), Span('nitro', 1473, 14

In [9]:
print(names[0:2])

[Span('CH4', 606, 609), Span('11282-11285CODEN', 10347, 10363)]


In [8]:
print(names[0].__dict__)

{'text': 'DMF', 'start': 1352, 'end': 1355}


In [9]:
print(names[9].text)

N2


In [8]:
#Cross-referencing names from article with NIST API
gas_names = []
for y in gas_library:
    for x in y['Synonyms']:
        gas_names.append(x)

In [9]:
#printing the names that have been successfully cross-referenced
cross_ref = []
for x in gas_names:
    for i in range(len(names)):
        if names[i].text.lower() == x.lower():
            #print(names[i].text)
            cross_ref.append(names[i])
print(cross_ref)

[Span('CO2', 474, 477), Span('CO2', 9018, 9021), Span('CO2', 5525, 5528), Span('CO2', 1413, 1416), Span('CO2', 6399, 6402), Span('CO2', 5711, 5714), Span('CO2', 577, 580), Span('CO2', 652, 655), Span('CO2', 700, 703), Span('CO2', 4944, 4947), Span('CO2', 6000, 6003), Span('CO2', 6530, 6533), Span('CO2', 8891, 8894), Span('CO2', 8985, 8988), Span('CO2', 5098, 5101), Span('CO2', 297, 300), Span('CO2', 6440, 6443), Span('CO2', 6132, 6135), Span('CO2', 6200, 6203), Span('CO2', 5927, 5930), Span('CO2', 6518, 6521), Span('CO2', 5731, 5734), Span('CO2', 4835, 4838), Span('CO2', 5806, 5809), Span('CO2', 4592, 4595), Span('CO2', 5056, 5059), Span('CO2', 5448, 5451), Span('CO2', 4651, 4654), Span('N,N-dimethylformamide', 1358, 1379), Span('H2', 4794, 4796), Span('H2', 4453, 4455), Span('hydrochloric acid', 1614, 1631), Span('CH4', 606, 609), Span('CH4', 8989, 8992), Span('CH4', 721, 724), Span('CH4', 5249, 5252), Span('CH4', 6445, 6448), Span('CH4', 6522, 6525), Span('CH4', 4949, 4952), Span('CH

In [10]:
#spliting the text from the article into sentences using NLTK
sentences = sent_tokenize(text[0])

In [11]:
#finding the cross-referenced chemical names that are found in the sentences
important_sent =[]
for x in sentences:
    for i in range(len(cross_ref)):
        if cross_ref[i].text in x:
            important_sent.append(x)

In [12]:
#sentences with chemical names
def remove_duplicate(alist):
    return list(set(alist))
chem_sent = remove_duplicate(important_sent)
print(chem_sent)

['The result of single X-ray structural analysis reveals that 1 crystallizes in the R3̅c space group, which adopts a In3(CO2)8 cluster as a secondary building unit (SBU; see Figure S1a in the Supporting Information, SI).', '(9) The pores of 1 are filled with disordered Me2NH2+, DMF, and H2O molecules, and the solvent-accessible volume calculated using PLATON(10) is 48.1%.', 'The maximum fluorescent intensity of 1 was reduced by 82.2% upon exposure to 2 mM methanol solutions of TNP [quenching percentage = (I0 – I)/I0 × 100%, where I0 and I are fluorescent intensities of 1 before and after exposure to the nitroaromatic explosives].', 'As shown in Figure 2, N2 adsorption measurement for 1 at 77 K and 1 atm revealed a reversible type I isotherm with saturated N2 uptake of 247 cm3 g–1 (STP), a characteristic of microporous materials, corresponding to a Brunauer–Emmett–Teller surface area of 726.8 m2 g–1.', 'H2 adsorption isotherm of 1 measured at 77 K demonstrated an uptake of 167 cm3 g–1 (

In [13]:
#only sentences with chemical names and one of words defined below
words = ['adsorption', 'sorption', 'adsorb', 'adsorbent', 'adsorbate', 'loading']
iso_sent = []
for x in chem_sent:
    for word in words:
        if word in x:
            iso_sent.append(x)
iso_sent = remove_duplicate(iso_sent)
print(iso_sent)

['Furthermore, the adsorption isotherm of CO2 also shows a typical type I curve, with the amounts of CO2 uptake increasing abruptly at the beginning and then gradually reaching a plateau of 221 cm3 g–1 (STP) at 195 K and 1 atm.Figure 2.', 'Interestingly, 1 shows highly selective gas adsorption for CO2 over CH4 and N2 at 273 and 298 K. The CO2 uptakes at 1 atm reach 105.2 cm3 g–1 (4.69 mmol g–1, 20.6 wt %) at 273 K and 69.0 cm3 g–1 (3.08 mmol g–1, 13.5 wt %) at 298 K. For comparison, the CH4 and N2 uptakes are 30.2/18.3 and 7.0/4.3 cm3 g–1, respectively, under the same conditions (Figures 3a and S5 in the SI).', '(6)Herein, we report a luminescent microporous MOF, (Me2NH2)6[In10(TTCA)12]·24DMF·15H2O (1; TTCA = triphenylene-2,6,10-tricarboxylate and DMF = N,N-dimethylformamide), that exhibits highly selective CO2 adsorption over CH4 and N2 and selective sensing of the nitro explosive 2,4,6-trinitrophenol (TNP).Solvothermal reaction of triphenylene-2,6,10-tricarboxylic acid (H3TTCA)(7) wi

In [23]:
#the rest of this code is trying to parse the sentences above
#nothing below has successfully parsed the sentences
stop_words = set(stopwords.words('english'))

word_tokens = ''

filtered_sentence = []

for x in iso_sent:
    word_tokens = word_tokenize(x)

    filtered_sentence = [w for w in word_tokens if not w in stop_words]



    for w in word_tokens:
        if w not in stop_words:
            filtered_sentence.append(w)
    

In [30]:
sentence = iso_sent[1]
nouns = [token for token, pos in pos_tag(word_tokenize(sentence)) if pos.startswith('NNP')]
nouns

['CO2', 'CH4', 'N2', 'K.', 'CO2', 'K', 'K.', 'CH4', 'N2', 'S5', 'SI']

In [18]:
#extracting figures in text
keyword_figs = ['figure', 'fig', 'fig.', 'figures']
fig = []
for x in iso_sent:
    for y in keyword_figs:
        start_index = x.lower().find(y)
        int(start_index)
        end_index = start_index + (len(y) + 3)
        fig.append(x[start_index:end_index])

In [19]:
#figures found in text, there are repeats 
correct_fig = []
for z in fig:
    if len(z) > 3:
        correct_fig.append(z)
def hasNumbers(inputString):
    return any(char.isdigit() for char in inputString)
for z in correct_fig:
    if hasNumbers(z) == False:
        correct_fig.remove(z)
print(correct_fig)

['Figure 2.', 'Figures 3', 'Figures 3a', 'Figures S7', 'Figure 2,', 'Figure S9', 'Figure']


In [20]:
#tokenizing certain sentence in the text using Chemdataextractor

cwt = ChemWordTokenizer()
sent = Sentence(iso_sent[0])
s = sent.pos_tagged_tokens
print(s)

cpt = ChemCrfPosTagger()


[('Furthermore', 'RB'), (',', ','), ('the', 'DT'), ('adsorption', 'NN'), ('isotherm', 'NN'), ('of', 'IN'), ('CO2', 'NN'), ('also', 'RB'), ('shows', 'VBZ'), ('a', 'DT'), ('typical', 'JJ'), ('type', 'NN'), ('I', 'CD'), ('curve', 'NN'), (',', ','), ('with', 'IN'), ('the', 'DT'), ('amounts', 'NNS'), ('of', 'IN'), ('CO2', 'NN'), ('uptake', 'NN'), ('increasing', 'VBG'), ('abruptly', 'RB'), ('at', 'IN'), ('the', 'DT'), ('beginning', 'NN'), ('and', 'CC'), ('then', 'RB'), ('gradually', 'RB'), ('reaching', 'VBG'), ('a', 'DT'), ('plateau', 'NN'), ('of', 'IN'), ('221', 'CD'), ('cm3', 'NN'), ('g', 'NN'), ('–', 'HYPH'), ('1', 'CD'), ('(', '-LRB-'), ('STP', 'NN'), (')', '-RRB-'), ('at', 'IN'), ('195', 'CD'), ('K', 'NN'), ('and', 'CC'), ('1', 'CD'), ('atm.Figure', 'NN'), ('2', 'CD'), ('.', '.')]


In [21]:
#tokenizing certain sentence in text using NLTK

pos = []
pos = nltk.pos_tag(iso_sent[0].split())
    
print(pos)

[('Furthermore,', 'IN'), ('the', 'DT'), ('adsorption', 'NN'), ('isotherm', 'NN'), ('of', 'IN'), ('CO2', 'NNP'), ('also', 'RB'), ('shows', 'VBZ'), ('a', 'DT'), ('typical', 'JJ'), ('type', 'NN'), ('I', 'PRP'), ('curve,', 'VBP'), ('with', 'IN'), ('the', 'DT'), ('amounts', 'NNS'), ('of', 'IN'), ('CO2', 'NNP'), ('uptake', 'VBP'), ('increasing', 'VBG'), ('abruptly', 'RB'), ('at', 'IN'), ('the', 'DT'), ('beginning', 'NN'), ('and', 'CC'), ('then', 'RB'), ('gradually', 'RB'), ('reaching', 'VBG'), ('a', 'DT'), ('plateau', 'NN'), ('of', 'IN'), ('221', 'CD'), ('cm3', 'NNS'), ('g–1', 'JJ'), ('(STP)', 'VBP'), ('at', 'IN'), ('195', 'CD'), ('K', 'NNP'), ('and', 'CC'), ('1', 'CD'), ('atm.Figure', 'NN'), ('2.', 'CD')]


In [22]:
pos2 = []
pos2 = nltk.pos_tag(iso_sent[3].split())
    
print(pos2)

[('(3)', 'NN'), ('Until', 'IN'), ('now,', 'FW'), ('only', 'RB'), ('a', 'DT'), ('few', 'JJ'), ('porous', 'JJ'), ('MOFs', 'NNP'), ('have', 'VBP'), ('shown', 'VBN'), ('highly', 'RB'), ('selective', 'JJ'), ('adsorption', 'NN'), ('of', 'IN'), ('CO2', 'NNP'), ('over', 'IN'), ('other', 'JJ'), ('gases', 'NNS'), ('such', 'JJ'), ('as', 'IN'), ('CH4', 'NNP'), ('and', 'CC'), ('N2,(4)', 'NNP'), ('and', 'CC'), ('the', 'DT'), ('construction', 'NN'), ('of', 'IN'), ('viable', 'JJ'), ('CO2-capture', 'JJ'), ('MOF', 'NNP'), ('materials', 'NNS'), ('that', 'WDT'), ('can', 'MD'), ('exhibit', 'VB'), ('high', 'JJ'), ('CO2', 'NNP'), ('selectivity', 'NN'), ('over', 'IN'), ('CH4', 'NNP'), ('and', 'CC'), ('N2', 'NNP'), ('still', 'RB'), ('remains', 'VBZ'), ('a', 'DT'), ('challenge.', 'NN')]
