In [1]:
from platform import python_version
import os, shutil
import numpy as np
print(os.getcwd())
# if os.path.exists('corpus_example'):
#     shutil.rmtree('corpus_example')
# os.mkdir('corpus_example')
print(python_version())

/Users/adityanandy/Desktop/mine_test
3.6.1


In [2]:
'''
These are the two major classes for doing full
text mining. The Query class is used to get papers.
The article class is used to mine papers that have
been downloaded. You can use the article class without
any thing from the query class, should you already 
have the papers that you want to mine downloaded in
HTML form. If you are mining a small number of texts
(i.e. some ACS texts), you can do this directly.
''' 
from text_mining_tools.full_text_mine import execute_query
from text_mining_tools.query import Query
from text_mining_tools.article import Article

In [3]:
'''
Let's first use the query wrapper, which comes
from full_text_mine. This wrapper just makes sure
the query gets pickled, in case we want to access 
it later and forget to save it.

We need to set a basepath. This is where corpus is stored.

Elsevier key is needed if you want to also search the
Elsevier database. By default, we work off of crossref 
which is an open API that doesn't require a key.

journal_limit is the set of journals you want to 
query. 

number_of_results is how many results you want per
journal

query_name is what you want the pickle file to be
saved as after you perform the query.

keywords = what you're searching for.
'''
### Let's try to find JP's paper in chemical science and analyze it. DOI: 10.1039/c7sc01247k. 
### We are doing this WITHOUT explicit knowledge of the DOI!

keywords = ['predicting electronic structure with artificial neural networks']
journal_limit = ['chem_sci']
query_name = 'my_results'
basepath = '/Users/adityanandy/Desktop/mine_test/corpus_example/'
my_query = execute_query(basepath, keywords, elsevier_key=None,journal_limit=journal_limit, \
                        number_of_results=10, query_name=False, automate_download=True)

THESE ARE THE QUERIES ['predicting%20electronic%20structure%20with%20artificial%20neural%20networks']
mapping_to_return chem_sci
Now querying predicting%20electronic%20structure%20with%20artificial%20neural%20networks in chemical_science
------------- only querying 10.1039 ------------
['10.1039/c8sc02648c', '10.1039/c2sc20177a', '10.1039/c4sc03321c', '10.1039/c2sc20688a', '10.1039/c7sc01247k', '10.1039/c8sc02339e', '10.1039/c4sc00603h', '10.1039/d0sc01171a', '10.1039/c9sc05043d', '10.1039/c2sc21018e']
                  doi           journal       issn  \
0  10.1039/c8sc02648c  chemical_science  2041-6539   
1  10.1039/c2sc20177a  chemical_science  2041-6539   
2  10.1039/c4sc03321c  chemical_science  2041-6539   
3  10.1039/c2sc20688a  chemical_science  2041-6539   
4  10.1039/c7sc01247k  chemical_science  2041-6539   
5  10.1039/c8sc02339e  chemical_science  2041-6539   
6  10.1039/c4sc00603h  chemical_science  2041-6539   
7  10.1039/d0sc01171a  chemical_science  2041-6539   
8  10.

In [4]:
# By default, the query class constructs the article classes.
# We can take one and analyze it.
my_article = my_query.article_dict['10.1039/c7sc01247k']

In [5]:
# There are a few routines in the article class. You can look
# at each of these routines to obtain what you want, but the
# majority of the functionality of the article should already 
# be there. Let's populate all the info we can get about this 
# article.
my_article.full_analysis()

In [6]:
#### What do I get? Let's take a look.
print(my_article.authors)
print(my_article.journal_name)
print(my_article.section_name_dict)
print(my_article.abstract)

['Jon Paul Janet', 'Heather J. Kulik']
chemical_science
{1: '1. Introduction', 2: '2. Methods', 3: '3. Results and discussion', 4: '4. Conclusions', 5: 'Acknowledgements', 6: 'References'}
High-throughput computational screening has emerged as a critical component of materials discovery. Direct density functional theory (DFT) simulation of inorganic materials and molecular transition metal complexes is often used to describe subtle trends in inorganic bonding and spin-state ordering, but these calculations are computationally costly and properties are sensitive to the exchange–correlation functional employed. To begin to overcome these challenges, we trained artificial neural networks (ANNs) to predict quantum-mechanically-derived properties, including spin-state ordering, sensitivity to Hartree–Fock exchange, and spin-state specific bond lengths in transition metal complexes. Our ANN is trained on a small set of inorganic-chemistry-appropriate empirical inputs that are both maximally 

In [7]:
# Ok, I want to do sentiment analysis on the abstract.
# What can I do?
from text_mining_tools.full_text_mine import VADER_analysis
keyword = 'neural network' # This argument can also be a list...
keywords = ['ANN','machine learning']
kw_in_list, polarity_list = VADER_analysis(my_article.abstract_sentences,keywords)

In [8]:
print([val for i, val in enumerate(my_article.abstract_sentences) if i in kw_in_list])
print([polarity_list[i] for i, val in enumerate(my_article.abstract_sentences) if i in kw_in_list])

['the ann also outperforms other machine learning models (i.e., support vector regression and kernel ridge regression), demonstrating particularly improved performance in transferability, as measured by prediction errors on the diverse test set.']
[0.5709]


In [9]:
print(len(my_article.abstract_sentences),len(polarity_list))

9 9


In [10]:
# Alright, it's time to go head on into the paper.
# Let's mine the full text of the paper for the keyword ANN and 
# extract all sentences that contain the word.
keyword = 'bond length'

kw_in_list, polarity_list = VADER_analysis(my_article.full_paper_sentences,keyword)

print([val for i, val in enumerate(my_article.full_paper_sentences) if i in kw_in_list])
print([polarity_list[i] for i, val in enumerate(my_article.full_paper_sentences) if i in kw_in_list])
print('----- Average sentiment for '+str(keyword)+':', 
      np.mean([polarity_list[i] for i, val in enumerate(my_article.full_paper_sentences) if i in kw_in_list]),'-----')

['To begin to overcome these challenges, we trained artificial neural networks (ANNs) to predict quantum mechanically derived properties, including spin state ordering, sensitivity to Hartree Fock exchange, and spin state specific bond lengths in transition metal complexes.', 'In Section 3, we provide the Results and discussion on the trained neural networks for spin state ordering, spin state exchange sensitivity, and bond length prediction on both training set representative complexes and diverse experimental complexes.', 'We address overfitting using dropout, wherein robustness of the fit is improved by zeroing out nodes in the network with an equal probability, pdrop, at each stage of training (5% for spin state splitting, 15% for HF exchange sensitivity, and 30% for bond lengths, selected by trial and error).', 'The parameter with the lowest average prediction error is used to select the best regularization parameter: 5 × 10−4 for spin state splitting, 10−2 for HF exchange sensiti

In [11]:
#### OK, now let's try to do this with an ACS pub (my ACS Catal: 10.1021/acscatal.9b02165)
keywords = ['machine learning accelerates stable metal oxo formation']
journal_limit = ['acs_catal']
query_name = 'my_results'
my_query = execute_query(basepath, keywords, elsevier_key=None,journal_limit=journal_limit, \
                        number_of_results=10, query_name=False, automate_download=True)

THESE ARE THE QUERIES ['machine%20learning%20accelerates%20stable%20metal%20oxo%20formation']
mapping_to_return acs_catal
Now querying machine%20learning%20accelerates%20stable%20metal%20oxo%20formation in acs_catalysis
------------- only querying 10.1021 ------------
['10.1021/acscatal.9b02165', '10.1021/acscatal.9b05248', '10.1021/cs501707h', '10.1021/acscatal.8b05080', '10.1021/acscatal.5b01847', '10.1021/acscatal.9b02531', '10.1021/cs502129m', '10.1021/cs401032p', '10.1021/acscatal.7b01939', '10.1021/acscatal.8b04357']
                        doi        journal       issn  \
0  10.1021/acscatal.9b02165  acs_catalysis  2155-5435   
1  10.1021/acscatal.9b05248  acs_catalysis  2155-5435   
2         10.1021/cs501707h  acs_catalysis  2155-5435   
3  10.1021/acscatal.8b05080  acs_catalysis  2155-5435   
4  10.1021/acscatal.5b01847  acs_catalysis  2155-5435   
5  10.1021/acscatal.9b02531  acs_catalysis  2155-5435   
6         10.1021/cs502129m  acs_catalysis  2155-5435   
7         10.10

In [12]:
nandy_article = Article(basepath=basepath,doi='10.1021/acscatal.9b02165')

ACS pub. Not downloading, you can download manually! 10.1021/acscatal.9b02165


In [13]:
nandy_article.g(get_full_paper = False)

TypeError: populate_full_paper() got an unexpected keyword argument 'get_full_paper'

In [None]:
#### What do I get this time? Let's take a look.
print(nandy_article.authors)
print(nandy_article.journal_name)
print(nandy_article.section_name_dict)
print(nandy_article.abstract)

In [None]:
nandy_article.full_paper