<a href="https://colab.research.google.com/github/dani-lbnl/mudit/blob/main/Springer_Extract_New.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [16]:
# set up drive
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
import urllib.request as request
import json


class SpringerScraper():
    def __init__(
            self,
            query_text,
            start=0,
            year=2020,
            apikey="d0fce19bf0a92a57dd448bd7f0c2301f",
            max_return=100):
        """
        :param query_text: query words
        :param start: start page number
        :param year: year of published papers
        :param apikey: Springer APIKEY
        :param max_return: max returned number of papers
        """
        self.apikey = apikey
        self.start = start
        self.max_return = max_return
        self.year = year
        self.query_text = query_text.replace(" ", '%20')

    def FindDois(self):
        """
        :return: <list> of <str> list of DOIs
        """
        url = "http://api.springernature.com/meta/v2/json?&q={}%20type:Journal%20year:{}&s={}&p={}&api_key={}".format(
            self.query_text, self.year, self.start, self.max_return, self.apikey)
        # url = "http://api.springernature.com/openaccess/json?q=title:%22lithium%22metal%22battery%22type:Journal%20year:{}&s={}&p={}&api_key={}".format(self.year, self.start, self.max_return, self.apikey)
        # url = "http://api.springernature.com/openaccess/json?q=title:{}%20type:Journal%20year:{}&s={}&p={}&api_key={}".format(self.query_text,self.year, self.start, self.max_return, self.apikey)

        
        crawl_content = request.urlopen(url).read()
        Content = json.loads(crawl_content.decode('utf8'))
        Dois = []
        for i in range(self.max_return):
            Dois.append(Content['records'][i]['doi'])
        return Dois

    def FindingXml(self, papers):
        """
        :param papers: list of DOIS
        :return: url addresses of papers
        """
        DoiUrls = []
        for doi in papers:
            doiUrls = "https://api.springernature.com/meta/v2/jats?q=doi:{}&p=2&api_key={}".format(
                doi, self.apikey)
            DoiUrls.append(doiUrls)
        return DoiUrls


def download_doi(doi):
    """
    :param doi: DOI
    :return:
    """
    WebContent = request.urlopen(doi).read()
    with open(r'springer_{}.xml'.format(doi), 'wb') as f:
        f.write(WebContent)
        f.close()
    return

In [4]:
year_list = list(range(2010, 2022)) # from 2010 to 2021
start_list= [1,101,201,301,401,501,601,701,801,901] # five iterations

total_url = []
for y in year_list:
  print("****Year*****:",y)
  for s in start_list:
    print("Start:",s)
    ss = SpringerScraper("lithium battery", start = s,year = y)
    try:
      doi_list = ss.FindDois()
      url_doi=ss.FindingXml(doi_list)
      if len(total_url)==0:
        total_url = url_doi
      else:
        total_url+= url_doi
    except:
      print("Error raised; going back")
      print("start: ",s)
      break


****Year*****: 2010
Start: 1
Start: 101
Start: 201
Start: 301
Start: 401
Start: 501
Start: 601
Error raised; going back
start:  601
****Year*****: 2011
Start: 1
Start: 101
Start: 201
Start: 301
Start: 401
Start: 501
Start: 601
Start: 701
Start: 801
Error raised; going back
start:  801
****Year*****: 2012
Start: 1
Start: 101
Start: 201
Start: 301
Start: 401
Start: 501
Start: 601
Start: 701
Start: 801
Start: 901
****Year*****: 2013
Start: 1
Start: 101
Start: 201
Start: 301
Start: 401
Start: 501
Start: 601
Start: 701
Start: 801
Start: 901
****Year*****: 2014
Start: 1
Start: 101
Start: 201
Start: 301
Start: 401
Start: 501
Start: 601
Start: 701
Start: 801
Start: 901
****Year*****: 2015
Start: 1
Start: 101
Start: 201
Start: 301
Start: 401
Start: 501
Start: 601
Start: 701
Start: 801
Start: 901
****Year*****: 2016
Start: 1
Start: 101
Start: 201
Start: 301
Start: 401
Start: 501
Start: 601
Start: 701
Start: 801
Start: 901
****Year*****: 2017
Start: 1
Start: 101
Start: 201
Start: 301
Start: 401
S

In [23]:
total_url[0]

'https://api.springernature.com/meta/v2/jats?q=doi:10.1186/1741-7015-8-89&p=2&api_key=d0fce19bf0a92a57dd448bd7f0c2301f'

In [26]:
# replace pam with jats

import re
url_list_v2 = []
for a in total_url:
  url_list_v2.append(re.sub("jats", 'pam',a))


In [27]:
url_list_v2[0]

'https://api.springernature.com/meta/v2/pam?q=doi:10.1186/1741-7015-8-89&p=2&api_key=d0fce19bf0a92a57dd448bd7f0c2301f'

## saving

In [6]:
num_list = []
for i in range(1,11401):
  num_list.append(i)


In [20]:
def download_doi(doi,num_list):
    """
    :param doi: DOI
    :return:
    """

    save_list = []
    WebContent = request.urlopen(doi).read()
    with open(r'/content/drive/My Drive/LBNL/Springer_10k_abs/springer_{}.xml'.format(num_list), 'wb') as f:
        save_list.append(WebContent)   
        f.write(WebContent)
        f.close()
    return

In [21]:
!pwd

/content


In [None]:
for i in range(11400):
  download_doi(url_list_v2[i],num_list[i])