In [2]:
# we need headers to disguise our bot as a browser

headers = {
    "Connection": "keep-alive",
    "Cache-Control": "max-age=0",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.131 Safari/537.36",
    "Accept-Encoding": "gzip,deflate,sdch",
    "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.6,en;q=0.4,zh-TW;q=0.2",
}

import requests
from scrapy.http import TextResponse

r = requests.get('http://journals.sagepub.com/doi/full/10.1177/0018726716650730', 
                 headers = headers)

response = TextResponse(r.url, body = r.text, encoding = 'utf-8')

# there is a response we need to handle
response

<200 http://journals.sagepub.com/doi/full/10.1177/0018726716650730>

In [3]:
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

from scrapy import Item, Field


class DocumentItem(Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    abstract = Field()

    publication_date = Field()
    submission_date = Field()
    online_date = Field()
    revision_date = Field()
    accepted_date = Field()

    title = Field()
    coverpage_url = Field()
    fpage = Field()
    lpage = Field()
    pages = Field()
    submission_path = Field()

    publication_title = Field()


class KeywordItem(Item):
    keyword = Field()

    title = Field()


class SourceItem(Item):
    publication_title = Field()
    chief_editor = Field()
    issn = Field()
    description = Field()
    home_url = Field()
    coverimage = Field()

    title = Field()

class AuthorItem(Item):
    institution = Field()
    email = Field()
    avatar = Field()
    vitae = Field()
    fname = Field()
    lname = Field()
    address = Field()

    title = Field()

In [4]:
# sagepub
document = dict(
    title = './/div[@class="publicationContentTitle"]/h1/text()',
    abstract = './/div[@class="abstractSection abstractInFull"]/p/text()',
    submission_path = './/div[@class="articleJournalNavTitle"]/text()',
    pages = './/div[@class="Article information"]/div/text()',
    dates = './/div[@class="published-dates"]/text()'
    )

In [43]:
from dateutil.parser import parse
from scrapy.loader import ItemLoader
from scrapy.loader.processors import Join, TakeFirst

def load_document(response, document):
    l = ItemLoader(item = DocumentItem(), response = response)
    l.default_output_processor = TakeFirst()
    
    l.add_value('coverpage_url', response.url)
    l.add_xpath('abstract', document['abstract'])
    l.add_value('title', (l.get_xpath(document['title'])[0]).replace('\n', '').strip())
    l.add_value('submission_path', l.get_xpath(document['submission_path'])[0].replace('\n', '').strip())

    # handle dates
    try:
        dates =[i.replace('\n', '').replace(';', '').strip() for i in response.xpath(document['dates']).extract()[-2:]]
        d = [parse(i) for i in dates]
        l.add_value('online_date', d[0])
        l.add_value('publication_date', d[1])
    except:
        pass

    # handle pages
    try:
        pages = response.xpath(document['pages']).extract()[0].strip().split('\n')[-1].strip().split(':')[-1]
        if '–' in pages:
            fp = int(pages.split('–')[0])
            lp = int(pages.split('–')[1])
        elif '-' in pages:
            fp = int(pages.split('-')[0])
            lp = int(pages.split('-')[1])
        l.add_value('fpage', fp)
        l.add_value('lpage', lp)
        l.add_value('pages', lp-fp+1)
    except:
        pass

    # mark it down, with source's publication_title
    return l

In [44]:
l = load_document(response, document)
l.load_item()

IndexError: list index out of range

In [None]:
keyword = 


def load_keyword(response, keyword):
    for k in response.xpath(keyword).extract():
        k1 = k.split(';')[0]
        l = ItemLoader(item = KeywordItem(), response = response)
        l.default_output_processor = TakeFirst()
        l.add_value('keyword', k1)
        yield l


        

In [8]:
author =dict(
    auth = './/div[@class="contribDegrees"]',
    name = './/a[@class="entryAuthor"]/text()',
    email = './/a[@class="email"]/@href',
    institution = './/div[@class="artice-info-affiliation"]/text()'
    )


In [11]:

def load_author(response,author):
    auths = response.xpath(author['auth'])
    for auth in auths:
        l = ItemLoader(item = AuthorItem(), response = response)
        l.default_output_processor = TakeFirst()

        # author's first name and last name
        name = auth.xpath(author['name']).extract()[0].split()
        fn = name[0]
        ln = name[-1]
        l.add_value('fname', fn)
        l.add_value('lname', ln)

        # author's email
        try:
            email = auth.xpath(author['email']).extract()[0][7:]
            l.add_value('email', email)
        except:
            pass

        # author's institution
        try:
            institution = auth.xpath(author['institution']).extract()[0]
            l.add_value('institution', institution)
        except:
            pass
        yield l



In [12]:
t = load_author(response, author)
for i in t:
    print(i.load_item())

{'email': 'a.dy@lboro.ac.uk',
 'fname': 'Angela',
 'institution': 'Loughborough University London, UK, ',
 'lname': 'Dy'}
{'email': 'susan.marlow@nottingham.ac.uk',
 'fname': 'Susan',
 'institution': 'University of Nottingham, UK, ',
 'lname': 'Marlow'}
{'email': 'lee.martin@warwick.ac.uk',
 'fname': 'Lee',
 'institution': 'University of Warwick, UK, ',
 'lname': 'Martin'}


In [13]:
response.url

'http://journals.sagepub.com/doi/full/10.1177/0018726716650730'

# Source

In [1]:
# we need headers to disguise our bot as a browser

headers = {
    "Connection": "keep-alive",
    "Cache-Control": "max-age=0",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.131 Safari/537.36",
    "Accept-Encoding": "gzip,deflate,sdch",
    "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.6,en;q=0.4,zh-TW;q=0.2",
}

import requests
from scrapy.http import TextResponse

r = requests.get('https://us.sagepub.com/en-us/nam/human-relations/journal200870', 
                 headers = headers)

response = TextResponse(r.url, body = r.text, encoding = 'utf-8')

# there is a response we need to handle
response

<200 https://us.sagepub.com/en-us/nam/human-relations/journal200870>

In [74]:
source = dict(
    issn = '//span[@class="margin-right"]/text()',
    chief_editor = '//td[@class="journal-contributor-member"]/a/text()',
    publication_title = './/h1[@class="heading-large heading-spacing--small"]/text()',
    description = './/div[@class="field field-name-field-website-configuration field-type-text-long field-label-hidden"]',
    coverimage = './/img[@class="sage-thumbnail-width-150px lazy"]/@data-original'
    )

In [81]:
import re
from scrapy.loader.processors import Join, TakeFirst, Join


# this function is used to strip the html tags
def cleanhtml(raw_html):
    cleanr = re.compile('<.*?>')
    cleantext = re.sub(cleanr, '', raw_html)
    return cleantext


def load_source(response, source):
    website_url = 'https://us.sagepub.com' 
    l = ItemLoader(item = SourceItem(), response = response)
    l.default_output_processor = TakeFirst()
    l.add_value("issn", response.xpath(source['issn']).extract()[1].split()[-1])
    l.add_value('chief_editor', response.xpath(source['chief_editor']).extract()[0])
    l.add_xpath('publication_title', source['publication_title'])
    l.add_value('coverimage', website_url + l.get_xpath(source['coverimage'])[0])
    l.add_xpath('description', './/div[@class="field-item even"]', Join(), cleanhtml, lambda x: x.replace('\n', '').replace('  ', '').strip())
    l.add_value('home_url', response.url)
    publication_title = l.get_xpath(source['publication_title'])
    return l

In [83]:
l = load_source(response, source)
l.load_item()

{'chief_editor': 'Nick Turner',
 'coverimage': 'https://us.sagepub.com/sites/default/files/styles/sage_thumbnail_width_150px/feed/79366_HUM.jpg',
 'description': '2015\xa0Impact Factor: 2.619 2015\xa0Ranking:\xa037/192 in '
                'Management\xa0|\xa04/95 in Social Sciences, '
                'Interdisciplinary2016 Release of Journal Citation Reports, '
                'Source: 2015 Web of Science Data Visit the Human Relations '
                'website\xa0for more informationHuman Relations has had a long '
                'tradition of bringing social science disciplines together in '
                'order to understand the character and complexity of human '
                'problems. We publish incisive investigations from an '
                'international network of leading scholars in management, '
                'psychology, sociology, politics, anthropology and '
                'economics.Human Relations seeks high quality research papers '
                'that 

# More URLs

### Home Page

In [2]:
# we need headers to disguise our bot as a browser

headers = {
    "Connection": "keep-alive",
    "Cache-Control": "max-age=0",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.131 Safari/537.36",
    "Accept-Encoding": "gzip,deflate,sdch",
    "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.6,en;q=0.4,zh-TW;q=0.2",
}

import requests
from scrapy.http import TextResponse

r = requests.get('https://us.sagepub.com/en-us/nam/human-relations/journal200870', 
                 headers = headers)

response = TextResponse(r.url, body = r.text, encoding = 'utf-8')

# there is a response we need to handle
response

<200 https://us.sagepub.com/en-us/nam/human-relations/journal200870>

In [49]:
doc_prefix_url = 'http://journals.sagepub.com'

### Volume

In [4]:
r = requests.get('http://journals.sagepub.com/toc/huma/1', 
                 headers = headers)

response = TextResponse(r.url, body = r.text, encoding = 'utf-8')

# there is a response we need to handle
response

<200 http://journals.sagepub.com/toc/huma/1>

In [51]:
document_url = './/a[@class="ref nowrap"]/@href'

In [40]:
[ base_url + i for i in response.xpath(article_url).extract()]

['http://journals.sagepub.com/doi/full/10.1177/0018726716678367',
 'http://journals.sagepub.com/doi/full/10.1177/0018726716676431',
 'http://journals.sagepub.com/doi/full/10.1177/0018726716673442',
 'http://journals.sagepub.com/doi/full/10.1177/0018726716674063',
 'http://journals.sagepub.com/doi/full/10.1177/0018726716670226',
 'http://journals.sagepub.com/doi/full/10.1177/0018726716640865',
 'http://journals.sagepub.com/doi/full/10.1177/0018726716641747',
 'http://journals.sagepub.com/doi/full/10.1177/0018726716645660']

In [32]:
len(a)

8

In [36]:
[i for i in response.xpath(article_url).extract()]

['/doi/full/10.1177/0018726716678367',
 '/doi/full/10.1177/0018726716676431',
 '/doi/full/10.1177/0018726716673442',
 '/doi/full/10.1177/0018726716674063',
 '/doi/full/10.1177/0018726716670226',
 '/doi/full/10.1177/0018726716640865',
 '/doi/full/10.1177/0018726716641747',
 '/doi/full/10.1177/0018726716645660']

In [52]:
[doc_prefix_url + i  for i in response.xpath(document_url).extract()]

['http://journals.sagepub.com/doi/full/10.1177/0018726716678367',
 'http://journals.sagepub.com/doi/full/10.1177/0018726716676431',
 'http://journals.sagepub.com/doi/full/10.1177/0018726716673442',
 'http://journals.sagepub.com/doi/full/10.1177/0018726716674063',
 'http://journals.sagepub.com/doi/full/10.1177/0018726716670226',
 'http://journals.sagepub.com/doi/full/10.1177/0018726716640865',
 'http://journals.sagepub.com/doi/full/10.1177/0018726716641747',
 'http://journals.sagepub.com/doi/full/10.1177/0018726716645660']