In [1]:
#Import Statements
import scrapy
from urllib.request import urlopen #to open the urls that the dois are put into
import json
import requests
import logging

#for the ArticleItem section
from scrapy.item import Item, Field
from scrapy.selector import Selector
from scrapy.spiders import Spider

#for the spiders 
from scrapy import Spider
from scrapy.http import TextResponse #defines what response is in xpath

#to run the spider in Jupyter notebook, have to restart the kernel each time to run it
from scrapy.settings import Settings
from scrapy.crawler import CrawlerProcess

#Running spiders imports 
from twisted.internet import reactor
from scrapy.utils.project import get_project_settings
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging

In [2]:
# Quick check of the publisher-routing logic on a hard-coded sample of DOIs.
# Intended sample: ACS, Elsevier, Springer, AIChE Journal (plus a second Springer DOI).
# Each DOI is resolved to citeproc JSON through dx.doi.org; the publisher reported
# there decides which full-text URL is built, printed and fetched.
# The AIChE Journal DOI has no branch below and is reported as 'wrong publisher'.
doi_list = ["10.1021/ja302991b", "10.1016/j.micromeso.2012.01.033", "10.1007/s10450-012-9423-1", "10.1002/aic.690470520", "10.1007/s10450-013-9527-2"]

# Ask the DOI resolver for bibliographic metadata rather than the landing page.
# The header never changes, so it is built once instead of on every iteration.
headers = {'Accept': 'application/citeproc+json'}

for d in doi_list:
    test_url = 'http://dx.doi.org/{0}'.format(d)
    bib_info = json.loads(requests.get(test_url, headers=headers).content)
    # .get() so that metadata without a 'publisher' key falls through to the
    # 'wrong publisher' branch instead of raising KeyError.
    publisher = bib_info.get('publisher')

    if publisher == 'American Chemical Society (ACS)':
        doi_acs = bib_info.get('DOI')
        full_url_acs = 'http://pubs.acs.org/doi/full/{0}'.format(doi_acs)
        print(full_url_acs)
        # The context manager closes the HTTP response even if read() fails.
        with urlopen(full_url_acs) as response_acs:
            content_acs = response_acs.read()

    elif publisher == 'Elsevier BV':
        doi_el = bib_info.get('DOI')
        # Elsevier articles are reached through the DOI resolver itself.
        full_url_el = 'http://dx.doi.org/{0}'.format(doi_el)
        print(full_url_el)
        with urlopen(full_url_el) as response_el:
            content_el = response_el.read()

    elif publisher == 'Springer Nature':
        doi_spr = bib_info.get('DOI')
        full_url_spr = 'http://link.springer.com/article/{0}'.format(doi_spr)
        print(full_url_spr)
        with urlopen(full_url_spr) as response_spr:
            content_spr = response_spr.read()

    else:
        print('wrong publisher')

http://pubs.acs.org/doi/full/10.1021/ja302991b
http://dx.doi.org/10.1016/j.micromeso.2012.01.033
http://link.springer.com/article/10.1007/s10450-012-9423-1
wrong publisher
http://link.springer.com/article/10.1007/s10450-013-9527-2


In [4]:
#An Item class declares the fields that the spiders should scrape.
#These are the fields I want from each article.
#The spiders defined later tell Scrapy where to find each field on each publisher's website.

class ArticleItem(scrapy.Item):
    """Container for the data scraped from a single article page.

    Every attribute is a Scrapy ``Field``; the spiders fill them in by key
    (``item['title'] = ...``) inside their ``parse`` callbacks.
    """

    # Bibliographic fields
    title = scrapy.Field()
    authors = scrapy.Field()
    doi = scrapy.Field()

    # Content fields
    abstract = scrapy.Field()
    text = scrapy.Field()
    figures = scrapy.Field()

In [None]:
# Per-publisher state: the most recently built full-text URL, and the list of
# all full-text URLs collected so far (ACS and Springer).
full_url_acs, full_url_spr = '', ''
# Two distinct list objects - one per publisher.
full_url_acs_lst, full_url_spr_lst = [], []

In [None]:
# Work out which publisher each DOI in doi_list.txt belongs to and collect the
# matching full-text URL in that publisher's list (used as the spider's start_urls).
# Only ACS and Springer Nature are handled here; any other publisher is reported
# as 'wrong publisher'.
# To change which DOIs are processed, change the file in the Bash loop rather
# than editing this code.

# doi_list.txt holds one DOI per line. The context manager closes the file, and
# str.strip() removes the trailing newline without needing the `re` module
# (which this notebook never imports). Blank lines are skipped so they are not
# sent to the resolver as an empty DOI.
with open('doi_list.txt') as dois:
    doi_lst = [line.strip() for line in dois if line.strip()]

# Ask the DOI resolver for bibliographic metadata rather than the landing page.
headers = {'Accept': 'application/citeproc+json'}

for d in doi_lst:
    test_url = 'http://dx.doi.org/{0}'.format(d)
    bib_info = json.loads(requests.get(test_url, headers=headers).content)
    # .get() so that metadata without a 'publisher' key falls through to the
    # 'wrong publisher' branch instead of raising KeyError.
    publisher = bib_info.get('publisher')

    if publisher == 'American Chemical Society (ACS)':
        doi_acs = bib_info.get('DOI')
        full_url_acs = 'http://pubs.acs.org/doi/full/{0}'.format(doi_acs)
        # Fetch the page once up front; the response is closed as soon as it is read.
        with urlopen(full_url_acs) as response_acs:
            content_acs = response_acs.read()

        # NOTE: this appends to a list created in an earlier cell, so re-running
        # this cell without re-running that one adds every URL a second time.
        full_url_acs_lst.append(full_url_acs)

    elif publisher == 'Springer Nature':
        doi_spr = bib_info.get('DOI')
        full_url_spr = 'http://link.springer.com/article/{0}'.format(doi_spr)
        with urlopen(full_url_spr) as response_spr:
            content_spr = response_spr.read()

        full_url_spr_lst.append(full_url_spr)

    else:
        print('wrong publisher')

In [None]:
#Spider for ACS 
class ArticleSpider(scrapy.Spider):
    """Spider that scrapes article fields from ACS full-text pages.

    The start URLs are taken from ``full_url_acs_lst``, which must already be
    populated when this class is defined.
    """

    name = 'ArticleSpider'  # Name the spider anything
    # Bare domain names only: Scrapy expects domains here, not URLs with a
    # scheme or trailing slash.
    allowed_domains = ["pubs.acs.org"]
    start_urls = full_url_acs_lst

    def parse(self, response):
        """Build one ``ArticleItem`` from an ACS article page and yield it.

        The XPath expressions say where each field lives on the page; they are
        easy to find with the browser's inspect tool. If an expression returns
        no text, append ``/text()`` after the class/id predicate.

        Args:
            response: the downloaded article page.

        Yields:
            ArticleItem: every field holds the list returned by ``.extract()``.
        """
        item = ArticleItem()
        item['title'] = response.xpath('//span[@class="hlFld-Title"]/text()').extract()
        item['authors'] = response.xpath('//div[@id="authors"]/text()').extract()
        item['doi'] = response.xpath('//div[@id="doi"]/text()').extract()
        # extract() must be *called*; without the parentheses the bound method
        # itself was stored in the item instead of the abstract text.
        item['abstract'] = response.xpath('//p[@class="articleBody_abstractText"]/text()').extract()
        # No /text() on the next two: the matched elements themselves are
        # extracted, markup included.
        item['text'] = response.xpath('//div[@class="hlFld-Fulltext"]').extract()
        item['figures'] = response.xpath('//img[@alt="figure"]').extract()
        yield item

In [None]:
#Spider for Springer Nature
class ArticleSpiderSpr(scrapy.Spider):
    """Spider that scrapes article fields from Springer Nature article pages.

    The start URLs are taken from ``full_url_spr_lst``, which must already be
    populated when this class is defined.
    """

    name = 'ArticleSpiderSpr'
    # Bare domain names only: Scrapy expects domains here, not URLs with a scheme.
    allowed_domains = ["link.springer.com"]
    start_urls = full_url_spr_lst

    def parse(self, response):
        """Build one ``ArticleItem`` from a Springer article page and yield it.

        Args:
            response: the downloaded article page.

        Yields:
            ArticleItem: each scraped field holds the list returned by
            ``.extract()``; ``abstract`` is always an empty list.
        """
        item = ArticleItem()
        item['title'] = response.xpath('//h1[@class="ArticleTitle"]/text()').extract()
        item['authors'] = response.xpath('//span[@class="authors__name"]/text()').extract()
        item['doi'] = response.xpath('//span[@id="doi-url"]/text()').extract()
        # No abstract selector is defined for Springer pages; the field is left empty.
        item['abstract'] = []
        # descendant::text() collects every text node below the body container.
        item['text'] = response.xpath('//div[@id="body"]/descendant::text()').extract()
        # The matched figure containers themselves are extracted, markup included.
        item['figures'] = response.xpath('//div[@class="MediaObject"]').extract()
        yield item

In [None]:
#How to run the spiders
#Bash loop will do this for each DOI and create a unique JSON file name for each
def run():
    """Crawl with both article spiders and block until they have finished.

    Loads the project settings, sets FEED_FORMAT to 'jsonlines' and FEED_URI to
    'result6.jl', schedules ``ArticleSpider`` and ``ArticleSpiderSpr`` on a
    single ``CrawlerRunner``, and stops the Twisted reactor once the joined
    crawl deferred fires (whether it succeeded or failed).
    """
    crawl_settings = get_project_settings()
    feed_options = (
        ('FEED_FORMAT', 'jsonlines'),
        ('FEED_URI', 'result6.jl'),
    )
    for option_name, option_value in feed_options:
        crawl_settings.set(option_name, option_value)

    configure_logging()
    crawler_runner = CrawlerRunner(crawl_settings)

    # Schedule the ACS spider first, then the Springer one.
    for spider_cls in (ArticleSpider, ArticleSpiderSpr):
        crawler_runner.crawl(spider_cls)

    all_crawls_done = crawler_runner.join()
    all_crawls_done.addBoth(lambda _: reactor.stop())

    # Blocks here until every crawling job is finished and the reactor is stopped.
    reactor.run()


if __name__ == '__main__':
    run()