# 0. Preparation

In [None]:
# Fetch the journal landing page with requests and wrap it in a scrapy
# TextResponse so that it can be queried with XPath in the cells below.
import requests
from scrapy.http import TextResponse

# Browser-like headers, so that the request is not served as an obvious bot.
headers = {
    "Connection": "keep-alive",
    "Cache-Control": "max-age=0",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.131 Safari/537.36",
    "Accept-Encoding": "gzip,deflate,sdch",
    "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.6,en;q=0.4,zh-TW;q=0.2",
}

# Without a timeout requests waits forever on a stalled connection, which
# would hang the kernel; 30 seconds is generous for a single HTML page.
r = requests.get('https://www.jstor.org/journal/amereconrevi',
                 headers=headers, timeout=30)

response = TextResponse(r.url, body=r.text, encoding='utf-8')

# Echo the response object as the cell output; later cells query it with XPath.
response

In [None]:
import re
def cleanhtml(raw_html):
    """Remove every ``<...>`` tag from ``raw_html`` and return what is left.

    Args:
        raw_html: markup as a string.

    Returns:
        The input with all tags deleted. Character entities such as
        ``&amp;`` are left untouched.
    """
    # DOTALL lets "." cross line breaks, so a tag whose attributes wrap onto
    # several lines is removed as well (it was previously left in the text).
    return re.sub(r'<.*?>', '', raw_html, flags=re.DOTALL)

In [None]:
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

from scrapy import Item, Field


class DocumentItem(Item):
    """Item describing one scraped article page.

    In this notebook ``load_document`` fills abstract, publication_date,
    title, coverpage_url, fpage, lpage, pages and submission_path; the other
    fields are declared but not populated by any code visible here.
    """
    # define the fields for your item here like:
    # name = scrapy.Field()
    abstract = Field()  # value selected by the 'abstract' XPath

    publication_date = Field()  # dateutil parse() of the parenthesised part of the metadata line
    submission_date = Field()
    online_date = Field()
    revision_date = Field()
    accepted_date = Field()

    title = Field()  # page title with newlines removed and whitespace trimmed
    coverpage_url = Field()  # URL of the response the item was loaded from
    fpage = Field()  # first page number
    lpage = Field()  # last page number
    pages = Field()  # page count, computed as lpage - fpage + 1
    submission_path = Field()  # metadata text that precedes the first parenthesis

    publication_title = Field()  # presumably the owning journal's title; not set in the visible code


class KeywordItem(Item):
    """Item for a single keyword; no loader for it is defined in this notebook."""
    keyword = Field()

    title = Field()  # presumably the title of the document the keyword belongs to (TODO confirm)


class SourceItem(Item):
    """Item describing a journal ("source").

    In this notebook ``load_source`` fills issn, publication_title,
    description and home_url; the other fields are declared only.
    """
    publication_title = Field()  # value selected by the 'publication_title' XPath
    chief_editor = Field()
    issn = Field()  # value selected by the 'issn' XPath
    description = Field()  # description, subjects and collections text joined with spaces
    home_url = Field()  # URL of the response the item was loaded from
    coverimage = Field()  # not populated: the corresponding loader line is commented out

    title = Field()

class AuthorItem(Item):
    """Item describing one author.

    In this notebook ``load_author`` fills only fname and lname; the other
    fields are declared but not populated by any code visible here.
    """
    institution = Field()
    email = Field()
    avatar = Field()
    vitae = Field()
    fname = Field()  # first whitespace-separated token of the author's name
    lname = Field()  # last whitespace-separated token of the author's name
    address = Field()

    title = Field()  # presumably the title of the document the author wrote (TODO confirm)

# 1. Structure

`UPDATED: 2017.6.25 00:39`

In [None]:
# Collect the data-decade attribute of every <dl class="accordion"> on the journal page.
response.xpath('.//dl[@class="accordion"]/@data-decade').extract()

In [None]:
# Scratch note: example of a per-decade listing URL (the cell only echoes the string).
'https://www.jstor.org/journal/acadmanaj?decade=1990'

In [None]:
# Journal landing page; the per-decade URLs below are built from it.
base_url = 'https://www.jstor.org/journal/amereconrevi'

In [None]:
# Echo the value to check it.
base_url

In [None]:
# One listing URL per decade identifier found on the journal page.
[
    '{}?decade={}'.format(base_url, decade)
    for decade in response.xpath('.//dl[@class="accordion"]/@data-decade').extract()
]

In [None]:
# Scratch note: example of a /stable/ URL (the cell only echoes the string).
'https://www.jstor.org/stable/i302974'

In [None]:
# Site root, prepended to the relative hrefs extracted from pages.
jstor_url = 'https://www.jstor.org'

In [None]:
# Absolute URL for the link inside every <li data-doi=...> element.
[
    ''.join((jstor_url, href))
    for href in response.xpath('.//li[@data-doi]/a/@href').extract()
]

In [None]:
# Fetch a single /stable/ page (presumably an issue's table of contents; TODO confirm).
# The timeout makes the call fail instead of blocking the kernel forever
# when the server stalls.
r = requests.get('https://www.jstor.org/stable/10.2307/i29780254',
                 headers=headers, timeout=30)

response = TextResponse(r.url, body=r.text, encoding='utf-8')

# Echo the response object as the cell output; later cells query it with XPath.
response

In [None]:
# Absolute URL for the link inside every "media-body ... main-section" div.
[
    ''.join((jstor_url, href))
    for href in response.xpath('.//div[@class="media-body media-object-section main-section"]/a/@href').extract()
]

In [None]:
# Fetch the page that the Document, Keyword and Author sections below parse.
# The timeout makes the call fail instead of blocking the kernel forever
# when the server stalls.
r = requests.get('https://www.jstor.org/stable/10.1086/666616',
                 headers=headers, timeout=30)

response = TextResponse(r.url, body=r.text, encoding='utf-8')
# Echo the response object as the cell output; later cells query it with XPath.
response

# 2. Document

In [None]:
# XPath expressions for the pieces read from an article page.
document = {
    'title': './/h1[@class="title"]/text()',
    # the metadata line carries submission_path, publication_date and the page range
    'meta': './/*[@class="src mbl"]/text()',
    'abstract': './/*[@class="abstract1"]/text()',
}

In [None]:
# Try the abstract XPath against the fetched page and show what it extracts.
response.xpath(document['abstract']).extract()

In [None]:
# First title match with newlines removed and outer whitespace trimmed.
response.xpath(document['title']).extract()[0].replace('\n', '').strip()

In [None]:
# First metadata match with newlines removed and outer whitespace trimmed.
a = response.xpath(document['meta']).extract()[0].replace('\n', '').strip()
a

In [None]:
import re
# Split the metadata line at every parenthesis ...
meta = re.split(
    '[()]',
    response.xpath(document['meta']).extract()[0].replace('\n', '').strip(),
)
# ... then read the dash-separated numbers that follow "pp." in the last piece.
pages = list(map(int, meta[-1].split('pp.')[-1].split('-')))
pages

In [None]:
from dateutil.parser import parse
from scrapy.loader import ItemLoader
from scrapy.loader.processors import Join, TakeFirst
import re

def load_document(response, document):
    """Build an ItemLoader holding a DocumentItem for one article page.

    Args:
        response: response for the article page; its ``url`` attribute and
            ``xpath`` method are used.
        document: mapping with the XPath strings stored under the keys
            'title', 'meta' and 'abstract'.

    Returns:
        The populated ItemLoader; call ``load_item()`` on it to get the item.
        A field is simply left unset when its source text is missing from the
        page or cannot be parsed.
    """
    l = ItemLoader(item=DocumentItem(), response=response)
    l.default_output_processor = TakeFirst()

    l.add_value('coverpage_url', response.url)
    l.add_xpath('abstract', document['abstract'])

    # extract_first() returns None when nothing matches, whereas
    # extract()[0] raised IndexError and aborted the whole item.
    title = response.xpath(document['title']).extract_first()
    if title is not None:
        l.add_value('title', title.replace('\n', '').strip())

    raw_meta = response.xpath(document['meta']).extract_first()
    if raw_meta is None:
        # No metadata line: none of the remaining fields can be derived.
        return l

    # Splitting at the parentheses gives: text before "(", text inside the
    # parentheses, text after ")".
    meta = re.split('[()]', raw_meta.replace('\n', '').strip())

    # re.split always returns at least one element, so meta[0] is safe.
    l.add_value('submission_path', meta[0])

    try:
        l.add_value('publication_date', parse(meta[1]))
    except (IndexError, ValueError, OverflowError, TypeError):
        # IndexError: the line has no parenthesised part.
        # The others: dateutil could not turn the text into a date.
        pass

    # handle pages: the numbers after "pp." are the first and last page
    try:
        pages = [int(i) for i in meta[-1].split('pp.')[-1].split('-')]
    except ValueError:
        # No numeric page range in the metadata line.
        pages = []
    if pages:
        fp = pages[0]
        lp = pages[-1]
        l.add_value('fpage', fp)
        l.add_value('lpage', lp)
        l.add_value('pages', lp - fp + 1)

    # NOTE: publication_title is not set here; the original note says the
    # item is to be marked with the source's publication_title.
    return l

In [None]:
# Run the loader on the fetched page and show the resulting item.
l = load_document(response, document)
l.load_item()

# 3. Keyword

In [None]:
# XPath for the text of the links inside the "topics mtl" element.
keyword = './/*[@class="topics mtl"]/a/text()'

# Show what it extracts from the fetched page.
response.xpath(keyword).extract()

# 4. Author

In [None]:
# XPath for the author byline.
author = {'names': './/*[@class="contrib"]/text()'}

# First byline match, newlines removed and outer whitespace trimmed.
string = (
    response.xpath(author['names'])
    .extract()[0]
    .replace('\n', '')
    .strip()
)
string

In [None]:
import re
# Split the byline into individual names on commas and on " and ".
# "A, B, and C" turns into "A, B, , C" after the replace, so the blank
# piece left by the Oxford comma is filtered out.
names = [
    piece.strip()
    for piece in string.replace(' and ', ', ').split(',')
    if piece.strip()
]
names

In [None]:
# Whitespace-separated tokens of the first author's name.
name = names[0].split()
name

In [None]:
def load_author(response, author):
    """Yield one ItemLoader (wrapping an AuthorItem) per author in the byline.

    Args:
        response: response for the article page; its ``xpath`` method is used.
        author: mapping with the byline XPath stored under the key 'names'.

    Yields:
        An ItemLoader with 'fname' set to the first whitespace-separated token
        of the name and 'lname' set to the last one. Nothing is yielded when
        the page has no byline.
    """
    # extract_first(default='') avoids the IndexError that extract()[0]
    # raised on pages without a byline.
    raw = response.xpath(author['names']).extract_first(default='')
    byline = raw.replace('\n', '').strip()
    names = [piece.strip() for piece in byline.replace(' and ', ', ').split(',')]
    for name in names:
        # author's first name and last name
        flname = name.split()
        if not flname:
            # Blank piece, e.g. the one an Oxford comma ("A, B, and C")
            # leaves behind; indexing it used to raise IndexError.
            continue
        l = ItemLoader(item=AuthorItem(), response=response)
        l.default_output_processor = TakeFirst()
        l.add_value('fname', flname[0])
        l.add_value('lname', flname[-1])
        yield l

# Build every author loader first, then print the item each one produces.
for i in list(load_author(response, author)):
    print(i.load_item())

# 5. Source

`UPDATED: 2017.6.23 17:16`

In [None]:
# Fetch a journal landing page to look for its "journal info" link.
# The timeout makes the call fail instead of blocking the kernel forever
# when the server stalls.
r = requests.get('https://www.jstor.org/journal/jconsrese',
                 headers=headers, timeout=30)

response = TextResponse(r.url, body=r.text, encoding='utf-8')
# Echo the response object as the cell output; later cells query it with XPath.
response

In [None]:
# href of the link inside the "journal_info_button" element (first match).
response.xpath('.//*[@class="journal_info_button"]/a/@href').extract()[0]

In [None]:
# Prefix the extracted href with the site root to get a full URL.
jstor_url = 'https://www.jstor.org'
jstor_url + response.xpath('.//*[@class="journal_info_button"]/a/@href').extract()[0]

In [None]:
# Fetch a journal-info page; the Source cells below parse it.
# The timeout makes the call fail instead of blocking the kernel forever
# when the server stalls.
r = requests.get('https://www.jstor.org/journal/acadmanaj?item_view=journal_info',
                 headers=headers, timeout=30)

response = TextResponse(r.url, body=r.text, encoding='utf-8')
# Echo the response object as the cell output; later cells query it with XPath.
response

In [None]:
# XPath expressions for the pieces read from a journal-info page.
source = {
    'issn': '//div[@class="issn mtm"]/text()',
    'publication_title': './/div[@class="journal lookslikeh2 drop-content-title"]/text()',
    'description': './/div[@class="journal_description mtm"]',
    'subjects': './/div[@class="subjects mtm"]',
    'collections': './/div[@class="collections mtm"]',
    'coverimage': './/img[@class="cover"]/@src',
}

In [None]:
def _get_descrip(key, response, source):
    """Return the plain text of the first node matched by ``source[key]``.

    Args:
        key: name of the XPath to use, looked up in ``source``.
        response: response whose ``xpath`` method is queried.
        source: mapping from names to XPath strings.

    Returns:
        The node's text with tags removed, character entities decoded and
        whitespace runs collapsed to single spaces; '' when ``key`` is not in
        ``source`` or the XPath matches nothing.
    """
    from html import unescape  # stdlib; imported here so the cell stands alone

    try:
        markup = response.xpath(source[key])[0].extract()
    except (KeyError, IndexError):
        # Unknown key or no match. Other errors (for example cleanhtml not
        # being defined yet) are no longer hidden behind an empty result.
        return ''
    # Strip tags first, then decode entities: removing the literal "&amp"
    # left a stray ";" behind. split()/join collapses whitespace without
    # gluing neighbouring words together, which deleting "  " and "\n" did.
    return ' '.join(unescape(cleanhtml(markup)).split())
    

In [None]:
# Try the helper on the journal description block.
_get_descrip('description', response, source)

In [None]:
# Join the description, subjects and collections text with single spaces.
description = " ".join([ _get_descrip(j, response, source) for j in ['description', 'subjects', 'collections']])
description

In [None]:
# First ISSN match with surrounding whitespace trimmed.
response.xpath(source['issn']).extract()[0].strip()

In [None]:
from scrapy.loader.processors import Join, TakeFirst
from scrapy.loader import ItemLoader

def _get_descrip(key, response, source):
    """Return the plain text of the first node matched by ``source[key]``.

    This cell repeats the helper next to ``load_source``, which calls it.

    Args:
        key: name of the XPath to use, looked up in ``source``.
        response: response whose ``xpath`` method is queried.
        source: mapping from names to XPath strings.

    Returns:
        The node's text with tags removed, character entities decoded and
        whitespace runs collapsed to single spaces; '' when ``key`` is not in
        ``source`` or the XPath matches nothing.
    """
    from html import unescape  # stdlib; imported here so the cell stands alone

    try:
        markup = response.xpath(source[key])[0].extract()
    except (KeyError, IndexError):
        # Unknown key or no match. Other errors (for example cleanhtml not
        # being defined yet) are no longer hidden behind an empty result.
        return ''
    # Strip tags first, then decode entities: removing the literal "&amp"
    # left a stray ";" behind. split()/join collapses whitespace without
    # gluing neighbouring words together, which deleting "  " and "\n" did.
    return ' '.join(unescape(cleanhtml(markup)).split())

def load_source(response, source):
    """Build an ItemLoader holding a SourceItem for one journal-info page.

    Args:
        response: response for the journal-info page; its ``url`` attribute
            and ``xpath`` method are used.
        source: mapping with the XPath strings stored under the keys 'issn',
            'publication_title', 'description', 'subjects' and 'collections'.

    Returns:
        The populated ItemLoader; call ``load_item()`` on it to get the item.
    """
    l = ItemLoader(item=SourceItem(), response=response)
    l.default_output_processor = TakeFirst()

    # Trim the ISSN text nodes and drop blank ones before loading them:
    # TakeFirst only skips None and '', so an untrimmed or whitespace-only
    # node would otherwise end up as the stored ISSN.
    issn_values = [text.strip() for text in response.xpath(source['issn']).extract()]
    issn_values = [text for text in issn_values if text]
    if issn_values:
        l.add_value('issn', issn_values)

    l.add_xpath('publication_title', source['publication_title'])

    # Skip sections that are missing so the join leaves no stray spaces.
    parts = [_get_descrip(j, response, source) for j in ['description', 'subjects', 'collections']]
    description = " ".join(part for part in parts if part)
    l.add_value('description', description)
    l.add_value('home_url', response.url)
    # Cover image loading is left disabled, as in the original:
    # l.add_xpath('coverimage', response.meta.get('coverimage'))
    return l

In [None]:
# Run the loader on the fetched journal-info page and show the resulting item.
l = load_source(response, source)
l.load_item()

In [None]:
# Scratch check: slicing to 100 characters on a string shorter than that.
s = 'efasdfsdfadsfas'
s[:100]