In [53]:
# To use this notebook, you need to have the following installed in your conda env
# conda install -c conda-forge nb_conda_kernels
# conda install -c anaconda-nb-extensions nb_conda
# conda install -c anaconda ipython

# Then check that the notebook kernel is set to your conda env. Otherwise things won't work.

This notebook shows how to perform text mining with a corpus whose articles have already been parsed and saved as article classes. We can do this for manuscripts corresponding to the CSD, provided that we know the DOI of the article. The first step is to check whether the article is present in the corpus. Once that is done, we can load the article and perform text analysis.

This tutorial notebook assumes the corpus is located at a given path, but this can easily be adjusted for a new corpus path.

In [54]:
import glob, os

# Directory where the pickle files of the article classes are
# stored. Loading a pickle gives back an article that has already
# been parsed, so the article does not need to be loaded from
# scratch every time. This directory can also be copied over to
# any local machine to move the corpus around; in that case set the
# CSD_CORPUS_PICKLE_PATH environment variable (or edit the default
# below) so that it points to the copy.
corpus_pickle_path = os.environ.get(
    'CSD_CORPUS_PICKLE_PATH',
    '/media/storage/nandy/Documents/HJKGroup/CSDCorpus/pickles/',
)

In [55]:
# Next, we should have a DOI of interest. I will use
# a random DOI corresponding to a Fe complex
# at 10.1039/C3DT53221F --> corresponding to refcode VITQIA

doi_of_interest = '10.1039/C3DT53221F'

# NOTE: the files in the corpus are case sensitive, so a DOI
# may only be found under its lowercase form. We therefore
# search for the DOI as given first and fall back to
# doi_of_interest.lower(). If neither brings up a hit,
# then we do not have the article in the corpus.

# check if this DOI is in our corpus
pickle_of_doi = []
for doi_variant in (doi_of_interest, doi_of_interest.lower()):
    # os.path.join avoids a doubled '/' when corpus_pickle_path
    # already ends with a separator; glob.escape keeps any glob
    # metacharacters ('[', '*', '?') inside a DOI from acting as
    # wildcards. The trailing '*' still matches the file extension.
    search_pattern = os.path.join(corpus_pickle_path, glob.escape(doi_variant) + '*')
    pickle_of_doi = glob.glob(search_pattern)
    if pickle_of_doi:
        break
print('------ pickle file ------')
print(pickle_of_doi)

# If the above prints a non-empty list, we have the DOI of interest.

------ pickle file ------
['/media/storage/nandy/Documents/HJKGroup/CSDCorpus/pickles//10.1039/C3DT53221F.pkl']


In [56]:
# Next, we should load in the pickle file of interest.
# Before we do this, let's have all of our text mining tools
# imported for use.
from text_mining_tools.full_text_mine import execute_query
from text_mining_tools.query import Query
from text_mining_tools.article import Article
import pickle

# Fail with a readable message (instead of a bare IndexError)
# when the DOI search did not return any pickle file.
if not pickle_of_doi:
    raise FileNotFoundError('No pickle file was found for the DOI of interest; '
                            'the article is not in the corpus.')

# below, we load in the article class of the corresponding DOI.
# The "with" block closes the file handle once loading is done.
# NOTE: pickle.load can execute arbitrary code, so only load
# pickle files from a corpus that you trust.
with open(pickle_of_doi[0], 'rb') as pickle_file:
    loaded_article_class = pickle.load(pickle_file)
print(loaded_article_class)

<text_mining_tools.article.Article object at 0x7fcea07747c0>


In [57]:
# The contents of an article class may not be obvious.
# Listing its attribute names shows what is available.
print(vars(loaded_article_class).keys())

dict_keys(['doi', 'prefix', 'getter', 'basepath', 'elsevier_key', 'f', 'original_f', 'article_type', 'title', 'journal_name', 'authors', 'publication_year', 'abstract', 'abstract_sentences', 'citation_dict', 'section_name_dict', 'section_text_dict', 'section_text_dict_sentences', 'table_caption_dict', 'figure_captions', 'table_dict', 'full_paper', 'full_paper_sentences'])


In [58]:
# With the attribute names known, we can access the parts of the
# article class that might be useful for text mining.
# The full text is under full_paper, and the paper tokenized
# by sentences is under full_paper_sentences.

fields_to_show = (
    ('authors', 'authors'),
    ('abstract', 'abstract'),
    ('figure captions', 'figure_captions'),
    ('full_paper', 'full_paper'),
)
for label, attribute_name in fields_to_show:
    print(label, getattr(loaded_article_class, attribute_name))

# When a field is not present or is "False", that means
# we could not automatically parse the manuscript to fill
# the key of interest.

authors ['Xiao Ming Lu', 'Guo Wang', 'Yi Feng Cheng']
abstract [CuII(phen)(HIMC−)(H2O)]·[CuII(phen)(HIMC−)(NO3−)]·NO3−·H2O (1) and [CuII(2,2′-bipy)(HIMC−)]·NO3−·xH2O (2) (phen = 1,10-phenanthroline, 2,2′-bipy = 2,2′-bipyridine, HIMC− = 1H-imidazole-4-carboxylate acid anion) have been synthesized at 180 °C, of which the HIMC− is produced by an in situ decarboxylation from H3IDC (1H-imidazole-4,5-dicarboxylic acid) in a one-pot hydrothermal reaction. The anticancer activity experiments in vitro show that 1 exhibited excellent activities against A549, Bel-7402 and HCT-8 cancer cells and is even better than the clinical anticancer drug 5-Fu (5-fluorouracil), while 2 shows little response toward the cancer cells. The single crystal X-ray diffraction indicated that complex 1 possess a co-planar [CuII(N–N)(HIMC−)] coordination geometry. The IR, elemental analysis and solid-state luminescent spectra of complexes 1 and 2 indicated that the composition of these two complexes are similar, whereas

In [59]:
# Here we text mine the abstract of the paper above for
# certain keywords. The dummy keyword is "luminescent".
from text_mining_tools.full_text_mine import VADER_analysis
keywords = ['luminescent']
sentences = loaded_article_class.abstract_sentences
kw_in_list, polarity_list = VADER_analysis(sentences,keywords)

# Walk through the abstract sentence by sentence and report the
# polarity of each one, printing the sentence itself only when
# its index is listed in kw_in_list.
for idx, sentence in enumerate(sentences):
    if idx not in kw_in_list:
        print('!!!! No keywords matched in this sentence. Polarity: ', polarity_list[idx], sep='')
        continue
    print('====sentence contains at least one kw=====')
    print(sentence)
    print('==== VADER SENTIMENT ON SENTENCE! =====')
    print(polarity_list[idx])

!!!! No keywords matched in this sentence. Polarity: 0.0
!!!! No keywords matched in this sentence. Polarity: -0.836
!!!! No keywords matched in this sentence. Polarity: 0.0
====sentence contains at least one kw=====
the ir, elemental analysis and solid state luminescent spectra of complexes 1 and 2 indicated that the composition of these two complexes are similar, whereas the 2,2′ bipy in complex 2 replaced phen in complex 1. the calculation by the gaussian 03 program illustrated that the decrease in the energy gaps between π* π from the free to the coordinated ligand for 2,2′ bipy and phen (δe) are 5.3 ev to 4.0 ev and 4.8 ev to 4.5 ev separately, and the relative changes of the gibbs free energies (δg) for complex 1 and 2 decomposing into free cu2+ and ligands are about 0 kcal mol−1 and 7 kcal mol−1 respectively, which revealed that it is more stable when 2,2′ bipy is coordinated with cuii than phen, and 1 is easier to disassociate into free cu2+ than 2. by relating the δe, δg, lumi

In [60]:
# There may be situations where you want to
# populate a new article class given a DOI.
# This can be done as follows:

# Make a temporary directory (if it does not exist yet)
# to download the articles into; this directory is
# basically a new corpus. exist_ok=True makes the cell
# safe to re-run and avoids the check-then-create race
# of a separate os.path.exists test.
os.makedirs('temporary_dir', exist_ok=True)
basepath = os.getcwd()+'/temporary_dir/'

# Using JP's Chemical Science paper to
# download the article and create an article class
temp_article = Article(doi='10.1039/c7sc01247k',basepath=basepath)

ATTEMPTING DOWNLOAD! 10.1039/c7sc01247k


In [61]:
# Under the hood, the following is happening
# for article downloads:
#
# from articledownloader.articledownloader import ArticleDownloader
# downloader = ArticleDownloader()
# downloader.get_html_from_doi('10.1039/c7sc01247k')

In [62]:
# We can check that the download happened by listing
# the files saved under the DOI prefix directory.
os.listdir('temporary_dir/10.1039/')

['c7sc01247k.html']

In [63]:
# Then, we can do a full analysis of
# the article to break it into sections.
# This should take a couple of seconds.
# NOTE(review): the return value is not used, so this presumably
# fills in the attributes of temp_article in place -- confirm
# against text_mining_tools.article.
temp_article.full_analysis()

In [64]:
# The article is now populated and ready
# for further analysis! The sections can
# be analyzed. As a quick check, print the
# authors stored on the article class.
print(temp_article.authors)

['Heather J. Kulik', 'Jon Paul Janet']


In [65]:
# Now that we are done, it is best to store the article
# as a pickle file so that we do not have to repeat this
# process each time for the same article.

# Some of the article classes can get pretty large, and
# the recursion limit needs to be raised to pickle them.
# NOTE(review): a limit this high removes Python's guard against
# runaway recursion; per the sys.setrecursionlimit documentation a
# too-high limit can crash the interpreter. Consider a smaller
# value if the articles allow it.
import sys
sys.setrecursionlimit(1000000)

import pickle

# DOI prefix, i.e. the part of the DOI before the first '/'.
prefix = temp_article.doi.split('/')[0]
pickle_path = 'temporary_dir/pickles/'+str(temp_article.doi)+'.pkl'

# Create every missing parent directory of the target file in one
# call. This covers 'temporary_dir/pickles/<prefix>' and also DOIs
# that contain more than one '/', and it is safe to re-run.
os.makedirs(os.path.dirname(pickle_path), exist_ok=True)

with open(pickle_path,'wb') as f:
    pickle.dump(temp_article,f)