## <font color=mediumpurple>Before Doing Anything, Import the Necessary Modules

In [6]:
# Import these modules

# Step 2 Bibcode
import requests
import json

# Step 4 urllib
import urllib.request
from urllib.error import HTTPError

#Step 5 html encoder
import html

# Step 6 bibtex
import re
import logging
from bs4 import BeautifulSoup  
from html.entities import name2codepoint  
from urllib.request import quote, Request, urlopen

# **A Guide to creating citations using only a DOI**
*Note: This Notebook seeks to retrieve citations for papers as 3 outputs; JSON, BibTeX and HTML. The citations in this notebook are retrieving the following information on a paper: Title, Authors, Journal Name, Volume Number, Page Range, Year, Hyperlink(s) to the Article and DOI of the Article. If you are looking for more detailed citations or other outputs then please refer to step 3 after completing steps 1 & 2!*

<font color=blue>Step 1. Get DOI of the paper you want to cite</font> <br>
<font color=red>Step 2. Use DOI to search for the Bibcode -- Enter DOI in prompt provided -- Output will be the Bibcode for the paper</font> <br>
<font color=green>Step 3. The Bibcode will automatically be populated for you to search in ADS method. The Output is customizable, all formats possible. This method includes links to paper by DOI url & ADS url</font> <br>
<font color=magenta>Note. (This May Happen to Some Users) Error! My paper was not in ADS! There is no Bibcode for the paper! -- If there is no Bibcode then please Move to Step 4</font> <br>
<font color=purple>Step 4. Use DOI to search in Urllib method -- your DOI is already populated from step 1 in Urllib Method -- Output will be the full citation as a plain text/JSON output. Output is customizable as HTML through Step 5, or in BibTeX format through Step 6</font> <br>
<font color=teal>Step 5. The full JSON output will be populated from the Urllib method to encode it as HTML</font> <br>
<font color=maroon>Step 6. The DOI is already populated and you will recieve the full BibTeX citation for the paper using the GScholar Method</font> <br>

### *All Done!* Now you have the citation for your paper in 3 different formats!

## <font color=blue>Step 1.</font>

<font color=blue>You can retrieve the DOI for your paper in many different ways.
1. The DOI is a unique alphanumeric string assigned by the International DOI Foundation, to identify content and provide a persistent link to its location on the Internet. It is written in the general format of '10.1000/xyz123'
2. The DOI should be written on the top left or top right corner of your paper, it is written as 'DOI:10.1000/xyz123'
3. The DOI should be listed in the details or citation section on the publishers website where you have found your paper
4. The DOI may also be written as a link, next to the papers information on the publishers website, the link is written as https://doi.org/10.1000/xyz123 or https://dx.doi.org/10.1000/xyz123
5. In order to use this notebook please type in "ENTER HERE" spots the DOI in the '10.1000/xyz123' format, *NOT* as hyperlink</font>

In [7]:
# Enter your token here: You need this for using step 3
# This is where the token goes, between the "" 
token="gx43LyUuTTD0zoTWx8qKpWbWi3euTmx7FCM3fJjY"

## <font color=red>Step 2. Retrive Bibcode</font> 

In [35]:
def get_citeproc_authors(cpd_author):
    if cpd_author is None:
        return None
    names = []
    for author in cpd_author:
        try:
            family = author['family'].title()
        except KeyError:
            name = author['name']
            names.append(name)
            continue
        try:
            given = author['given']
        except KeyError:
            # This author has first name
            names.append(family)
            continue
        initials = given.split()
        initials[0] = '{}.'.format(initials[0][0])
        initials = ' '.join(initials)
        names.append('{} {}'.format(initials, family))
    return ', '.join(names)

def parse_citeproc_json(citeproc_json):
    """Parse the provided JSON into a Ref object."""
    
    cpd = json.loads(citeproc_json)
    try:
        if cpd['type'] != 'article-journal':
            return None
    except KeyError:
        return None

    authors = get_citeproc_authors(cpd.get('author', ''))
    title = cpd.get('title', '').replace('\n', '')
    journal = cpd.get('container-title', '')
    volume = cpd.get('volume', '')
    page_start, page_end = cpd.get('page', ''), ''
    if page_start and '-' in page_start:
        page_start, page_end = page_start.split('-')
    article_number = cpd.get('article-number', '')
    doi = cpd.get('DOI', '')
    url = cpd.get('URL', '')
    try:
        year = cpd['issued']['date-parts'][0][0]
    except (KeyError, IndexError):
        year = None
        
    try:
        bibcode = cpd.get('bibcode', '')
    except (KeyError, IndexError):
        bibcode = None
        
# # =============================================================================
# #   OUTPUT
# # =============================================================================
    ref = [authors, 
        title, 
        journal, 
        volume,
        year, 
        page_start, 
        page_end, 
        doi,
        url, 
        article_number,
        citeproc_json]
    return ref 

def get_citeproc_json_from_doi(doi):
    base_url = 'http://dx.doi.org/'
    url = base_url + doi
    req = urllib.request.Request(url)
    req.add_header('Accept', 'application/citeproc+json')
    try:
        with urllib.request.urlopen(req) as f:
            citeproc_json = f.read().decode()
    except HTTPError as e:
        if e.code == 404:
            raise ValueError('DOI not found.')
        raise
    return citeproc_json

def get_source_from_doi(doi):
    citeproc_json = get_citeproc_json_from_doi(doi)
    ref = parse_citeproc_json(citeproc_json)
    return ref

In [36]:
# DOI Example HITRAN Paper: = 10.1016/j.jqsrt.2017.06.038
# DOI Example Superscript Title: = 10.1103/PhysRevA.85.032515
# NO, NO2, N2O PAPER: = 10.1016/j.jqsrt.2019.04.040
# EAMON H2O 2020 PAPER: = 10.1016/j.jqsrt.2019.106711

doi = input("Enter DOI Here: ")
doi_fetched = get_source_from_doi(doi)

ads_token = '4PtdkIDyxpjjZ1JDYJ4HI59VXJuJD98tACcrnfPv'
token     = ads_token

rdoi = doi

rdoi_bs = rdoi.replace("\\", "%2F")    # Remove backslash and replace with URL code for backslash
rdoi_fs = rdoi_bs.replace("/", "%2F")  # Remove forwardslash and replace with URL code for backslash

rurl = requests.get("https://api.adsabs.harvard.edu/v1/search/query?q=doi:"+rdoi_fs,\
                 params={"q":"*:*", "fl": "*", "rows":2000},
                 headers={'Authorization': 'Bearer ' + token})

todos          = json.loads(rurl.text)
todos_response = todos.get('response', '')

#Bibcode = (todos_response['docs']['bibcode'])
Bibcode =  (todos_response['docs'][0]['bibcode']) 

print(Bibcode)

Enter DOI Here: fwejf


ValueError: DOI not found.

## <font color=green>Step 3. ADS Method</font>

### <font color=green>Note before using ADS Method</font>
  1. Exporting using bibcodes require two things.
       - A Bibcode Number (which you got from step 2)<br>
       - A Token--You will need to know what a Token is. It must be used whenever you want to access the ADS database. A token can only be used once you have an account on NASA/ADS https://ui.adsabs.harvard.edu/. Once you have an account click on 'Account' in the top right hand corner, then click on 'Customize Settings' on the dropdown menu. In 'Customize Settings' there is a panel to the left of the screen, if you scroll down that panel you will see 'API Token'. Click on 'API Token' and then click on 'generate a new key'. <br>
         - You are technically using ADS's API when you are using this method. So for any questions/concerns please refer to the NASA/ADS API Information tool on GitHub https://github.com/adsabs/adsabs-dev-api#access-settings <br>
  2. The benefits of this method are the endless choices to customize your citation output.
     - You can get more information such as... the abstract, copyright, citation count, author affiliation, keywords, publication category and arXiv e-print number, etc.<br>
     - You can search more than 1 bibcode at a time<br>
     - You have more output options such as... EndNote, ProCite, RIS (Refman), RefWorks, MEDLARS, AASTeX, Icarus, MNRAS, Solar Physics (SoPh), DC (Dublin Core) XML, REF-XML, REFABS-XML, VOTables and RSS<br>
     - This notebook does not display examples of all of these output format options, if you are interested in any of these choices or extra features please refer to http://adsabs.github.io/help/actions/export <br>
  3. The first option is to retrieve a citation where the output is in JSON format<br>
  4. The second option is to retrieve a citation where the output is in html format<br>
  6. The third option is to retrieve a citation where the output is in LaTeX format</font>
  
*<font color=green>Overall you need to make an account on ADS in order to use this method. </font>*

*<font color=green>If you do not want to make an account then use the BibTeX citation from step 6 and if you want, use steps 4 & 5 to retrieve html and JSON citation formats, in steps 4 & 5 you only need to enter the DOI to retrieve citations </font>*

*<font color=green>However there are many benefits to using the ADS method, your citation output is completely customizable! So if your willing and you have your Bibcode then its recommended to use this method!</font>*

<font color=green>After running the cell below and then the print cell you will recieve an HTML reference with the characters &, <, >, and “ included

In [27]:
# HTML with the characters &, <, >, and “ included
payload = {"bibcode": ["{}".format(Bibcode)], # 2012PhRvA..85c2515Z
           "sort": "first_author asc",
           "format":
           '''{"ref_json": {"authors": "%I",
              "title": "%T",
              "journal": "%J",
              "volume": "%V",
              "start-page": "%p",
              "end-page": "%P",
              "year": %Y,
              "doi": "%d",
              "bibcode": "%u"}}'''
              }
r = requests.post("https://api.adsabs.harvard.edu/v1/export/custom", \
                 headers={"Authorization": "Bearer " + token, "Content-type": "application/json"}, \
                 data=json.dumps(payload))
response_json = r.json()
ref_json = json.loads(response_json['export'])['ref_json']

In [28]:
authors = print('authors:', ref_json['authors'])
print('title:', ref_json['title'])
print('journal:', ref_json['journal'])
print('volume:', ref_json['volume'])
print('start-page:', ref_json['start-page'])
print('end-page:', ref_json['end-page'])
print('year:', ref_json['year'])
print('doi:', ref_json['doi'])
print('bibcode:', ref_json['bibcode'])

authors: Ulenikov, O. N., E. S. Bekhtereva, O. V. Gromova, K. B. Berezkin, V.-M. Horneman, C. Sydow, C. Maul, and S. Bauerecker
title: First high resolution analysis of the 3ν<SUB>2</SUB> and 3ν<SUB>2</SUB> -ν<SUB>2</SUB> bands of <SUP>32</SUP>S<SUP>16</SUP>O<SUB>2</SUB>
journal: Journal of Quantitative Spectroscopy and Radiative Transfer
volume: 202
start-page: 1
end-page: 5
year: 2017
doi: 10.1016/j.jqsrt.2017.07.012
bibcode: https://ui.adsabs.harvard.edu/abs/2017JQSRT.202....1U


<font color=green>After running the cell below and then the print cell you will recieve a BibTeX reference

In [30]:
# BibTeX Reference
payload = {"bibcode": ["{}".format(Bibcode)],
           "sort": "first_author asc",
           "format": 
           '''{"ref_json": {"encoder": "%ZEncoding:latex\\bibitem",
              "title": "%T",
              "journal": "%J",
              "volume": "%V",
              "start-page": "%p",
              "end-page": "%P",
              "year": %Y,
              "authors": "%I",
              "doi": "%d",
              "bibcode": "%u"}}'''
              }
r = requests.post("https://api.adsabs.harvard.edu/v1/export/custom", \
                 headers={"Authorization": "Bearer " + token, "Content-type": "application/json"}, \
                 data=json.dumps(payload))
response_json = r.json()
ref_json = json.loads(response_json['export'])['ref_json']

JSONDecodeError: Invalid \escape: line 2 column 65 (char 101)

In [None]:
print('authors:', ref_json['authors'])
print('title:', ref_json['title'])
print('journal:', ref_json['journal'])
print('volume:', ref_json['volume'])
print('start-page:', ref_json['start-page'])
print('end-page:', ref_json['end-page'])
print('year:', ref_json['year'])
print('doi:', ref_json['doi'])
print('bibcode:', ref_json['bibcode'])

<font color=green>After running the cell below and then the print cell you will recieve an HTML reference with the characters &, <, >, and “ converted to & amp; & lt; & gt; and & quot; respectively.

In [17]:
# HTML with the characters &, <, >, and “ are converted to &amp;, &lt;, &gt;, and &quot;, respectively.
payload = {"bibcode": ["{}".format(Bibcode)],
           "sort": "first_author asc",
           "format": 
           '''{"ref_json": {"encoder": "%ZEncoding:html<P>",
              "authors": "%I",
              "title": "%T",
              "journal": "%J",
              "volume": "%V",
              "start-page": "%p",
              "end-page": "%P",
              "year": %Y,
              "doi": "%d",
              "bibcode": "%u"}}'''
              }
r = requests.post("https://api.adsabs.harvard.edu/v1/export/custom", \
                 headers={"Authorization": "Bearer " + token, "Content-type": "application/json"}, \
                 data=json.dumps(payload))
response_json = r.json()
ref_json = json.loads(response_json['export'])['ref_json']

In [31]:
print('authors:', ref_json['authors'])
print('title:', ref_json['title'])
print('journal:', ref_json['journal'])
print('volume:', ref_json['volume'])
print('start-page:', ref_json['start-page'])
print('end-page:', ref_json['end-page'])
print('year:', ref_json['year'])
print('doi:', ref_json['doi'])
print('bibcode:', ref_json['bibcode'])

authors: Ulenikov, O. N., E. S. Bekhtereva, O. V. Gromova, K. B. Berezkin, V.-M. Horneman, C. Sydow, C. Maul, and S. Bauerecker
title: First high resolution analysis of the 3ν<SUB>2</SUB> and 3ν<SUB>2</SUB> -ν<SUB>2</SUB> bands of <SUP>32</SUP>S<SUP>16</SUP>O<SUB>2</SUB>
journal: Journal of Quantitative Spectroscopy and Radiative Transfer
volume: 202
start-page: 1
end-page: 5
year: 2017
doi: 10.1016/j.jqsrt.2017.07.012
bibcode: https://ui.adsabs.harvard.edu/abs/2017JQSRT.202....1U


## <font color=purple>Step 4. Urllib method
 <font color=purple>If you did not have a bibcode or you want a plain text reference then use this method 

In [32]:
#doi = input("Enter doi Here: ")
doi_fetched = get_source_from_doi('{}'.format(doi))

# Below are the parameters for searching your citation
# if you would like to add or change anything then refer to the initial code above to make your changes
reference = (doi_fetched[0],doi_fetched[1],doi_fetched[2],doi_fetched[3],doi_fetched[4],doi_fetched[5],doi_fetched[6],doi_fetched[7],doi_fetched[8])

print ('Authors:', doi_fetched[0], '')
print ('Title:', doi_fetched[1], '')
print ('Journal:', doi_fetched[2], '')
print ('Volume:', doi_fetched[3], '')
print ('Year:', doi_fetched[4], '')
print ('Page Start:', doi_fetched[5], '')
print ('Page End:', doi_fetched[6], '')
print ('DOI:', doi_fetched[7], '')
print ('URL:', doi_fetched[8], '')

Authors: O. Ulenikov, E. Bekhtereva, O. Gromova, K. Berezkin, V. Horneman, C. Sydow, C. Maul, S. Bauerecker 
Title: First high resolution analysis of the 3ν2 and3ν2−ν2bands of 32S16O2 
Journal: Journal of Quantitative Spectroscopy and Radiative Transfer 
Volume: 202 
Year: 2017 
Page Start: 1 
Page End: 5 
DOI: 10.1016/j.jqsrt.2017.07.012 
URL: http://dx.doi.org/10.1016/j.jqsrt.2017.07.012 


## <font color=teal>Step 5. Encoding JSON in HTML <br>
<font color=teal>Reference is populated from the Urllib Method

In [20]:
# This will give you a plain text reference with no html codes
def escape2(s, quote=True):
    s = s.replace("}$", "</SUP>")
    s = s.replace("$^{", "<SUP>") 
    s = s.replace("{", "")
    s = s.replace("}", "")
    s = s.replace("\"", "")
    #s = s.replace("'", "")
    s = s.replace("''", "")
    #s = s.replace(" ", "")
    s = s.replace(",,", "")
    s = s.replace(", ,", "")
    return s
print(escape2("{}".format(reference)))

('Y. Lin, A. V. Akimov', 'Dependence of Nonadiabatic Couplings with Kohn–Sham Orbitals on the Choice of Density Functional: Pure vs Hybrid', 'The Journal of Physical Chemistry A', '120', 2016, '9028', '9041', '10.1021/acs.jpca.6b09660', 'http://dx.doi.org/10.1021/acs.jpca.6b09660')


In [13]:
# I am populating our output from the urllib method
# This will replace ("""& < " '> """ ) with (&amp; &lt; &quot; &#x27; &gt;)
s = html.escape( """& < " '> """ ) 
html.escape(s) #(s, quote=True)
html.escape("{}".format(reference))

'(&#x27;S. Bin Zhang, D. L. Yeager&#x27;, &quot;Complex-scaled multireference configuration-interaction method to study Be and Be-like cations&#x27; (B, C, N, O, Mg) Auger resonances1s2s22p1,3Po&quot;, &#x27;Physical Review A&#x27;, &#x27;85&#x27;, 2012, &#x27;&#x27;, &#x27;&#x27;, &#x27;10.1103/physreva.85.032515&#x27;, &#x27;http://dx.doi.org/10.1103/PhysRevA.85.032515&#x27;)'

## <font color=maroon>Step 6. BibTeX citation</font>

In [21]:
"""Library to query Google Scholar.
Call the method query with a string which contains the full search
string. Query will return a list of citations.
"""

GOOGLE_SCHOLAR_URL = "https://scholar.google.com"
HEADERS = {'User-Agent': 'Mozilla/5.0'}

FORMAT_BIBTEX = 4
FORMAT_ENDNOTE = 3
FORMAT_REFMAN = 2
FORMAT_WENXIANWANG = 5


logger = logging.getLogger(__name__)

# we are using query in our code
def query(searchstr, outformat=FORMAT_BIBTEX, allresults=False):
    """Query google scholar.
    This method queries google scholar and returns a list of citations.
    Parameters
    ----------
    searchstr : str
        the query
    outformat : int, optional
        the output format of the citations. Default is bibtex.
    allresults : bool, optional
        return all results or only the first (i.e. best one)
    Returns
    -------
    result : list of strings
        the list with citations
    """
    logger.debug("Query: {sstring}".format(sstring=searchstr))
    searchstr = '/scholar?q='+quote(searchstr)
    url = GOOGLE_SCHOLAR_URL + searchstr
    header = HEADERS
    header['Cookie'] = "GSP=CF=%d" % outformat
    request = Request(url, headers=header)
    response = urlopen(request)
    html = response.read()
    html = html.decode('utf8')
    # grab the links
    tmp = get_links(html, outformat)

    # follow the bibtex links to get the bibtex entries
    result = list()
    if not allresults:
        tmp = tmp[:1]
    for link in tmp:
        url = GOOGLE_SCHOLAR_URL+link
        request = Request(url, headers=header)
        response = urlopen(request)
        bib = response.read()
        bib = bib.decode('utf8')
        result.append(bib)
    return result

def get_links(html, outformat):
    """Return a list of reference links from the html.
    Parameters
    ----------
    html : str
    outformat : int
        the output format of the citations
    Returns
    -------
    List[str]
        the links to the references
    """
    if outformat == FORMAT_BIBTEX:
        refre = re.compile(r'<a href="https://scholar.googleusercontent.com(/scholar\.bib\?[^"]*)')
    elif outformat == FORMAT_ENDNOTE:
        refre = re.compile(r'<a href="https://scholar.googleusercontent.com(/scholar\.enw\?[^"]*)"')
    elif outformat == FORMAT_REFMAN:
        refre = re.compile(r'<a href="https://scholar.googleusercontent.com(/scholar\.ris\?[^"]*)"')
    elif outformat == FORMAT_WENXIANWANG:
        refre = re.compile(r'<a href="https://scholar.googleusercontent.com(/scholar\.ral\?[^"]*)"')
    reflist = refre.findall(html)
    # escape html entities
    reflist = [re.sub('&(%s);' % '|'.join(name2codepoint), lambda m:
                      chr(name2codepoint[m.group(1)]), s) for s in reflist]
    return reflist

In [23]:
class Bibtex(object):
    """ Convert doi number to bibtex entries."""
    def __init__(self, doi=None, title=None):
        """
        Input doi number ou title (actually any text/keyword.)
        Returns doi, encoded doi, and doi url or just the title.
        """
        _base_url = "http://dx.doi.org/"
        self.doi = doi
        self.title = title
        self.bibtex = None
        #if doi:
            #self._edoi = parse.quote(doi)
            #self.url = _base_url + self._edoi  # Encoded doi.
        #else:
            #self.url = None
# Beautiful Soup is a Python library for pulling data out of HTML and XML files
    def _soupfy(self, url):
        """Returns a soup object."""
        html = request.urlopen(url).read()
        self.soup = BeautifulSoup(html, 'html.parser')
        return self.soup
    
    
    def getGScholar(self):
        """Get bibtex entry from doi using Google database."""
        bib = query(self.doi, 4)[0]
        bib = bib.split('\n') 
        self.bibtex = '\n'.join(bib[0:-1]) #-9
        return self.bibtex

def main(argv=None):
    if argv is None:
        argv = sys.argv

    #args = parse_args(argv[1:])

    doi = args.positional
    method = args.method

    def allfailed():
        """All failed message+google try."""
        bold, reset = "\033[1m", "\033[0;0m"
        bib.getGScholar()
        url = bold + bib.url + reset
        msg = """Unable to resolve this DOI using database
        \nTry opening, \n\t{0}\nand download it manually.
        \n...or if you are lucky check the Google Scholar search below:
        \n{1}
        """.format(url, bib.bibtex)
        return msg

    bib = Bibtex(doi=doi)

In [24]:
doi = '{}'.format(doi)       
bib = Bibtex(doi)
bib = bib.getGScholar()
print(bib)

@article{lin2016dependence,
  title={Dependence of Nonadiabatic Couplings with Kohn--Sham Orbitals on the Choice of Density Functional: Pure vs Hybrid},
  author={Lin, Yuhan and Akimov, Alexey V},
  journal={The Journal of Physical Chemistry A},
  volume={120},
  number={45},
  pages={9028--9041},
  year={2016},
  publisher={ACS Publications}
}


#  <font color=orange> Authors List Fix
<font color=orange>This last part is in case you would like to change the authors list generated from ADS. ADS provides many options for formatting the authors list but HITRAN has a specific format that ADS does not provide. The next cell fixes the first name from the ADS output to what HITRAN uses

In [25]:
#s = "de Ghellinck d'Elseghem Vaernewijck, X."
#s = "Zhang, S. B."
#s = "Zhang-Zow, Sh."
s = input("Enter First Author Name Here: ")
words = s.split(',') 
string =[] 
for word in words: 
    string.insert(-1, word) 
print(" ".join(string)) 

Enter First Author Name Here: Akimov, Alexey V
 Alexey V Akimov
