In [1]:
import requests
from scrapy.http import TextResponse

# Browser-like request headers, so the bot is served the same page a browser
# would get.
headers = {
    "Connection": "keep-alive",
    "Cache-Control": "max-age=0",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.131 Safari/537.36",
    # Only advertise encodings that requests decodes transparently; "sdch" is
    # not one of them, so a server honouring it would hand back an unreadable body.
    "Accept-Encoding": "gzip,deflate",
    "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.6,en;q=0.4,zh-TW;q=0.2",
}

# Article used to exercise the loaders defined further down.
ARTICLE_URL = 'http://www.sciencedirect.com/science/article/pii/S0167923616301580'

# Seconds to wait for the server; requests has no default timeout and would
# otherwise block this cell for as long as the server stays silent.
REQUEST_TIMEOUT = 30

r = requests.get(ARTICLE_URL, headers = headers, timeout = REQUEST_TIMEOUT)

# Wrap the downloaded page in a Scrapy response so it can be queried with XPath.
response = TextResponse(r.url, body = r.text, encoding = 'utf-8')

# there is a response we need to handle
response

<200 http://www.sciencedirect.com/science/article/pii/S0167923616301580>

In [2]:
# Elsevier

# XPaths for the journal home page; load_source reads them by these keys.
source = {
    'issn': '//*[@class="issn keyword"]/span/text()',
    'chief_editor': '//*[@id="Title"]//span[@class="nowrap"]/text()',
    'publication_title': '//*[@id="Title"]//h1[@itemprop="name"]/text()',
    'description': '//*[@class="publication-description"]//p',
    'coverimage': '//*[@id="Title"]//img[@class="cover-img"]/@src',
}

# XPath selecting the href of every article title link in the article list.
document_url = '//ol[@class="articleList results"]//a[@class="cLink artTitle S_C_artTitle "]/@href'


# -------------------
# Version 1 of the article page layout.

# Article field XPaths (read by load_document).
document = {
    'title': '//*[@class="svTitle"]/text()',
    'abstract': '//div[@class="abstract svAbstract "]/p/text()',
    'date': '//*[@class="articleDates"]/dd/text()',
    'submission_path': '//*[@class="volIssue"]/a/text()',
    'dp': '//*[@class="volIssue"]/text()',
}

# Keyword text nodes (read by load_keyword).
keyword = '//*[@class="svKeywords"]/span/text()'

# Author XPaths (read by load_author).
# 'auth' selects one node per author; 'fn', 'ln', 'email', 'fid' and 'href' are
# evaluated relative to that node. 'address', 'vitae' and 'avatar' are
# templates: their %s is filled with an id taken from the author node and the
# result is evaluated against the whole page.
author = {
    'auth': '//ul[@class="authorGroup noCollab svAuthor"]/li',
    'fn': 'a[@class="authorName svAuthor"]/@data-fn',
    'ln': 'a[@class="authorName svAuthor"]/@data-ln',
    'email': 'a[@class="auth_mail"]/@href',
    'fid': 'a[@class="intra_ref auth_aff"]/@id',
    'address': '//*[@id="%s"]/span/text()',
    'href': 'span/a[@class="authorVitaeLink"]/@href',
    'vitae': '//p[@id="%s"]/text()',
    'avatar': '//div[@id="%shidden"]//img/@src',
}



# ----------------


# Version 0 of the article page layout.

# Article field XPaths (same keys as `document`).
document0 = {
    'title': '//*[@class="article-title"]/text()',
    'abstract': '//div[@class="abstract abstract-type-author"]/div/text()',
    'date': '//*[@class="article-history-dates"]/text()',
    'submission_path': '//*[@class="journal-volume"]/a/text()',
    'dp': '//*[@class="journal-volume"]/text()',
}

# Keyword text nodes.
keyword0 = '//*[@class="keyword"]/text()'

# Author XPaths (read by load_author0).
# 'auth' selects the author nodes; 'name', 'address', 'href' and 'email' are
# evaluated relative to such a node. 'vitae' is a template whose %s is filled
# with an id taken from 'href' and evaluated against the whole page.
author0 = {
    'auth': '//*[@class="author-group"]',
    'name': './/a/@data-related-url',
    'address': './/*[@class="affiliation__text"]/text()',
    'href': './/*[@class="footnote-ref"]/@href',
    'vitae': '//*[@id="%s"]/dd/text()',
    'email': './/*[@class="author-email"]/@href',
}

# XPaths for the values used to link items together: the journal title and the
# article title in each of the two page layouts.
relationship = {
    'publication_title': '//*[@id="Title"]//h1[@itemprop="name"]/text()',
    'title': '//*[@class="svTitle"]/text()',
    'title0': '//*[@class="article-title"]/text()',
}


# Single registry bundling every XPath group defined above under its own name.
Xpath = {
    'source': source,
    'document': document,
    'author': author,
    'keyword': keyword,
    'document0': document0,
    'author0': author0,
    'keyword0': keyword0,
    'relationship': relationship,
    'document_url': document_url,
}


In [3]:
# Rebind every XPath group from the registry to a module-level name of its own.
(source, document, keyword, author,
 document0, keyword0, author0,
 document_url, relationship) = (
    Xpath[group] for group in (
        'source', 'document', 'keyword', 'author',
        'document0', 'keyword0', 'author0',
        'document_url', 'relationship',
    )
)

In [6]:
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

from scrapy import Item, Field


class DocumentItem(Item):
    """One scraped article.

    load_document fills every field except publication_title, which
    parse_document adds afterwards.
    """

    # identification
    title = Field()
    publication_title = Field()
    coverpage_url = Field()      # URL of the response the article was read from
    submission_path = Field()

    # content
    abstract = Field()

    # article history
    submission_date = Field()
    revision_date = Field()
    accepted_date = Field()
    online_date = Field()
    publication_date = Field()

    # pagination
    fpage = Field()              # first page
    lpage = Field()              # last page
    pages = Field()              # page count: lpage - fpage + 1


class KeywordItem(Item):
    """One keyword of an article."""

    # title of the article the keyword was found on (set by parse_document)
    title = Field()

    keyword = Field()


class SourceItem(Item):
    """A journal ("source") home page, as filled in by load_source."""

    home_url = Field()           # URL of the response the journal was read from
    publication_title = Field()
    issn = Field()
    chief_editor = Field()
    description = Field()
    coverimage = Field()

    # NOTE(review): load_source never sets this field; presumably an article
    # title used for linking, as on the other items -- confirm.
    title = Field()

class AuthorItem(Item):
    """One author of an article, as filled in by load_author / load_author0."""

    # title of the article the author was found on (set by parse_document)
    title = Field()

    # name
    fname = Field()
    lname = Field()

    # contact and affiliation
    email = Field()
    address = Field()
    institution = Field()

    # biography
    vitae = Field()
    avatar = Field()


In [18]:

import re
from scrapy.loader import ItemLoader
from scrapy.loader.processors import Join, TakeFirst

from dateutil.parser import parse

def getdate(dates):
    """Sort article-history strings into a dict of parsed dates.

    Each string in `dates` is checked against the markers below, in order; for
    the first marker it contains, the text after the last "<marker> " is handed
    to `parse` and stored under that marker's key. Strings containing no marker
    are ignored.

    Returns a dict with the keys 'submission_date', 'revision_date',
    'accepted_date' and 'online_date'; a key whose marker never occurred is None.
    """
    markers = (
        ('Received', 'submission_date'),
        ('Revised', 'revision_date'),
        ('Accepted', 'accepted_date'),
        ('Available online', 'online_date'),
    )
    found = {field: None for _, field in markers}
    for text in dates:
        for marker, field in markers:
            if marker in text:
                found[field] = parse(text.split(marker + ' ')[-1])
                break
    return found


# Matches a single markup tag. "[^>]" also matches line breaks (which "." does
# not), so a tag whose attributes wrap onto another line is still removed.
# Compiled once here instead of on every call.
_TAG_RE = re.compile(r'<[^>]*>')


def cleanhtml(raw_html):
    """Return `raw_html` with every `<...>` tag removed.

    Only the tags are dropped; the text between them, including entities such
    as `&amp;`, is returned unchanged.
    """
    return _TAG_RE.sub('', raw_html)


#############

def load_source(response, source):
    """Build the item loader for a journal ("source") home page.

    Args:
        response: response holding the journal home page.
        source: dict of XPath strings with the keys 'issn', 'chief_editor',
            'publication_title', 'coverimage' and 'description'.

    Returns:
        An ItemLoader for a SourceItem; call `load_item()` on it to obtain
        the item.
    """
    l = ItemLoader(item = SourceItem(), response = response)
    l.default_output_processor = TakeFirst()
    l.add_xpath("issn",source['issn'])
    l.add_xpath('chief_editor', source['chief_editor'])
    l.add_xpath('publication_title', source['publication_title'])
    l.add_xpath('coverimage', source['coverimage'])
    # The description XPath selects whole <p> elements: join them into one
    # string, then strip the markup.
    l.add_xpath('description', source['description'], Join(), cleanhtml)
    l.add_value('home_url', response.url)
    return l



def load_document(response, document):
    """Build the item loader for one article page.

    Args:
        response: response holding the article page.
        document: dict of XPath strings with the keys 'abstract', 'title',
            'submission_path', 'date' and 'dp'.

    Returns:
        An ItemLoader for a DocumentItem. Dates, publication date and page
        numbers are added only when they can be read from the page; a page
        without them still yields a loader with the remaining fields.

    Raises:
        Whatever `getdate` raises when a history date cannot be parsed.
    """
    l = ItemLoader(item = DocumentItem(), response = response)
    l.default_output_processor = TakeFirst()
    l.add_value('coverpage_url', response.url)
    l.add_xpath('abstract', document['abstract'])
    l.add_xpath('title', document['title'])
    l.add_xpath('submission_path', document['submission_path'])

    # handle dates: the first matched text node is a ', '-separated history
    date_texts = l.get_xpath(document['date'])
    if date_texts:
        d = getdate(date_texts[0].split(', '))
        for field in ('submission_date', 'revision_date',
                      'accepted_date', 'online_date'):
            l.add_value(field, d[field])

    # 'dp' text is split on ', ': the second-to-last piece is read as the
    # publication date, the last piece ends with the page range.
    dp_texts = l.get_xpath(document['dp'])
    date_page = dp_texts[0].split(', ') if dp_texts else []

    if len(date_page) >= 2:
        try:
            l.add_value('publication_date', parse(date_page[-2]))
        except (ValueError, OverflowError):
            # not a date after all: leave the field unset
            pass

    # handle pages: "<first>–<last>" with an en dash, or with a hyphen when
    # no en dash is present
    page_tokens = date_page[-1].split() if date_page else []
    if page_tokens:
        pages = page_tokens[-1]
        for dash in ('–', '-'):
            if dash in pages:
                bounds = pages.split(dash)
                try:
                    fp, lp = int(bounds[0]), int(bounds[1])
                except ValueError:
                    # non-numeric page labels: leave the page fields unset
                    break
                l.add_value('fpage', fp)
                l.add_value('lpage', lp)
                l.add_value('pages', lp-fp+1)
                break

    # mark it down, with source's publication_title
    return l



def load_keyword(response, keyword):
    """Yield one KeywordItem loader per text node matched by `keyword`.

    `keyword` is an XPath string evaluated against `response`. From each
    matched string only the part before the first ';' is stored, under the
    'keyword' field.
    """
    raw_keywords = response.xpath(keyword).extract()
    for raw in raw_keywords:
        loader = ItemLoader(item = KeywordItem(), response = response)
        loader.default_output_processor = TakeFirst()
        loader.add_value('keyword', raw.partition(';')[0])
        yield loader



def load_author(response,author):
    """Yield one AuthorItem loader per author node (version-1 page layout).

    Args:
        response: response holding the article page.
        author: dict of XPath strings. 'auth' selects the author nodes; 'fn',
            'ln', 'email', 'fid' and 'href' are evaluated relative to each
            node; 'address', 'vitae' and 'avatar' are templates whose %s is
            filled with an id read from the node.

    Yields:
        An ItemLoader with 'fname' and 'lname', plus whichever of 'email',
        'address', 'institution', 'vitae' and 'avatar' are present on the
        page. Missing optional data is skipped.

    Raises:
        IndexError: an author node has no first-name or last-name attribute.
    """
    auths = response.xpath(author['auth'])
    for auth in auths:
        l = ItemLoader(item = AuthorItem(), response = response)
        l.default_output_processor = TakeFirst()

        # author's first name and last name
        fn = auth.xpath(author['fn']).extract()[0]
        ln = auth.xpath(author['ln']).extract()[0]
        l.add_value('fname', fn)
        l.add_value('lname', ln)

        # author's email: the href minus its first 7 characters ("mailto:")
        emails = auth.xpath(author['email']).extract()
        if emails:
            l.add_value('email', emails[0][7:])

        # author's address and institution; the id drops its first character
        # before it is used to look the address up
        fids = auth.xpath(author['fid']).extract()
        if fids:
            address = l.get_xpath(author['address'] % fids[0][1:])
            if address:
                l.add_value('address', address)
                # The institution is the first ', '-separated part naming a
                # university. It is looked up afresh for every author, so an
                # address without one never inherits the previous author's.
                institution = next(
                    (part for part in address[0].split(', ') if 'niversity' in part),
                    None)
                if institution is not None:
                    l.add_value('institution', institution)

        # author's vitae and avatar share the same link id
        hrefs = auth.xpath(author['href']).extract()
        if hrefs:
            href = hrefs[0][1:]
            vitae = response.xpath(author['vitae'] % href).extract()
            if vitae:
                l.add_value('vitae', fn+' '+ln+vitae[0])
            avatar = response.xpath(author['avatar'] % href).extract()
            if avatar:
                l.add_value('avatar', avatar[0])

        yield l




def load_author0(response, author):
    """Yield one AuthorItem loader per author node (version-0 page layout).

    Args:
        response: response holding the article page.
        author: dict of XPath strings. 'auth' selects the author nodes;
            'name', 'email', 'address' and 'href' are evaluated relative to
            each node; 'vitae' is a template whose %s is filled with an id
            read from 'href'.

    Yields:
        An ItemLoader with 'fname' and 'lname', plus whichever of 'email',
        'address', 'institution' and 'vitae' are present on the page.
        Missing optional data is skipped.

    Raises:
        IndexError: an author node has no name attribute.
    """
    auths = response.xpath(author['auth'])
    for auth in auths:
        l = ItemLoader(item = AuthorItem(), response = response)
        l.default_output_processor = TakeFirst()

        # add author's fname and lname: only the last two '&'-separated pieces
        # of the attribute are used -- the text after 'first-name=' in the
        # last one and after 'last-name=' in the one before it
        name = auth.xpath(author['name']).extract()[0].split('&')[-2:]
        fn = name[-1].split('first-name=')[-1]
        ln = name[0].split('last-name=')[-1]

        l.add_value('fname', fn)
        l.add_value('lname', ln)

        # add author's email: the href minus its first 7 characters ("mailto:")
        emails = auth.xpath(author['email']).extract()
        if emails:
            l.add_value('email', emails[0][7:])

        # add author's institution and address
        address = auth.xpath(author['address']).extract()
        if address:
            l.add_value('address', address)
            # The institution is the first ', '-separated part naming a
            # university. It is looked up afresh for every author, so an
            # address without one never inherits the previous author's.
            institution = next(
                (part for part in address[0].split(', ') if "niversity" in part),
                None)
            if institution is not None:
                l.add_value('institution', institution)

        # add author's vitae; the link drops its first character before it is
        # used as the id to look up
        hrefs = auth.xpath(author['href']).extract()
        if hrefs:
            vitae = response.xpath(author['vitae'] % hrefs[0][1:]).extract()
            if vitae:
                l.add_value('vitae', fn+ ' ' +ln+vitae[0])

        yield l


In [33]:
def parse_document(response):
    """Yield the document, keyword and author items of one article page.

    The page layout is detected by which author XPath matches: `author`
    (version 1) is tried first, then `author0` (version 0). The document item
    is yielded first, followed by one item per keyword and one per author,
    each tagged with the article title. When neither layout matches, a
    warning with the URL is printed and nothing is yielded.

    The journal title comes from `response.meta['publication_title']`. For
    version-1 pages a response without that entry (for example one that is
    not tied to a request) falls back to 'Decision Support System'; for
    version-0 pages the entry is required.

    Raises:
        IndexError: the article title is not found on the page.
        AttributeError, KeyError: a version-0 page whose response carries no
            'publication_title' in its meta.
    """
    if len(response.xpath(author['auth'])) != 0:
        try:
            publication_title = response.meta['publication_title']
        except (AttributeError, KeyError):
            publication_title = 'Decision Support System'
        layout = (document, keyword, author, load_author, relationship['title'])
    elif len(response.xpath(author0['auth'])) != 0:
        publication_title = response.meta['publication_title']
        layout = (document0, keyword0, author0, load_author0, relationship['title0'])
    else:
        # The backslash is doubled so that it is printed literally; "\(" on
        # its own is not a valid escape sequence.
        print(" $ No Authors $   ", response.url, " <-----   LOOK HERE! ~\\('o ')")
        return

    document_xpaths, keyword_xpath, author_xpaths, author_loader, title_xpath = layout

    doc_loader = load_document(response, document_xpaths)
    doc_loader.add_value('publication_title', publication_title)
    yield doc_loader.load_item()

    # keywords and authors carry the article title so they can be linked back
    title = response.xpath(title_xpath).extract()[0]
    for keyword_loader in load_keyword(response, keyword_xpath):
        keyword_loader.add_value('title', title)
        yield keyword_loader.load_item()
    for person_loader in author_loader(response, author_xpaths):
        person_loader.add_value('title', title)
        yield person_loader.load_item()


In [34]:
# Run the parser over the `response` fetched in the first cell and print every
# item it yields (the document first, then its keywords and authors).
for i in parse_document(response):
    print(i)

{'abstract': "This study explores the role of norms in employees' compliance "
             'with an organizational information security policy (ISP). '
             'Drawing upon norm activation theory, social norms theory, and '
             'ethical climate literature, we propose a model to examine how '
             'ISP-related personal norms are developed and then activated to '
             "affect employees' ISP compliance behavior. We collected our data "
             'through Amazon Mechanical Turk for hypothesis testing. The '
             'results show that ISP-related personal norms lead to ISP '
             'compliance behavior, and the effect is strengthened by '
             'ISP-related ascription of personal responsibility. Social norms '
             'related to ISP (including injunctive and subjective norms), '
             'awareness of consequences, and ascription of personal '
             'responsibility shape personal norms. Social norms related to ISP '
     