In [None]:
# Request headers that make the scraper's HTTP requests look like they come
# from a desktop Chrome browser rather than from a script.
headers = dict([
    ("Connection", "keep-alive"),
    ("Cache-Control", "max-age=0"),
    ("Accept",
     "text/html,application/xhtml+xml,application/xml;q=0.9,"
     "image/webp,*/*;q=0.8"),
    ("User-Agent",
     "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 "
     "(KHTML, like Gecko) Chrome/34.0.1847.131 Safari/537.36"),
    ("Accept-Encoding", "gzip,deflate,sdch"),
    ("Accept-Language", "zh-CN,zh;q=0.8,en-US;q=0.6,en;q=0.4,zh-TW;q=0.2"),
])

import requests
from scrapy.http import TextResponse

# Fetch the article abstract page. The timeout keeps a stalled connection from
# hanging the kernel, and raise_for_status() stops the notebook here on an
# HTTP error instead of letting later cells parse an error page.
r = requests.get('http://pubsonline.informs.org/doi/abs/10.1287/mnsc.2015.2304',
                 headers = headers,
                 timeout = 30)
r.raise_for_status()

# Wrap the decoded HTML in a Scrapy TextResponse so the cells below can run
# XPath queries against it.
response = TextResponse(r.url, body = r.text, encoding = 'utf-8')

# Display the response object to confirm the page was fetched.
response

In [None]:
from scrapy import Item, Field

class DocumentItem(Item):
    """Scraped metadata for a single article.

    In this notebook, load_document fills title, abstract, submission_path,
    online_date, accepted_date, revision_date, fpage, lpage and pages. No
    code visible here fills publication_date, submission_date,
    coverpage_url or publication_title.
    """
    # Abstract text of the article.
    abstract = Field()

    # Dates in the article's history. load_document stores the date that
    # follows "Published Online: " in online_date, "Accepted: " in
    # accepted_date and "Received: " in revision_date.
    publication_date = Field()
    submission_date = Field()
    online_date = Field()
    revision_date = Field()
    accepted_date = Field()

    # Article title.
    title = Field()
    # NOTE(review): presumably a URL for the article's cover/landing page;
    # nothing visible in this notebook sets it -- confirm.
    coverpage_url = Field()
    # First and last page numbers (stored as ints by load_document), and the
    # page count computed there as lpage - fpage + 1.
    fpage = Field()
    lpage = Field()
    pages = Field()
    # Text of the last breadcrumb link on the article page.
    submission_path = Field()

    # NOTE(review): presumably the title of the publication (SourceItem) this
    # article belongs to; not set by load_document -- confirm.
    publication_title = Field()

class KeywordItem(Item):
    """A single keyword attached to an article.

    No loader for this item is visible in this notebook.
    """
    # The keyword text.
    keyword = Field()

    # NOTE(review): presumably the title of the article the keyword belongs
    # to (used as a link back to DocumentItem) -- confirm.
    title = Field()

class SourceItem(Item):
    """Metadata for a publication source (journal).

    In this notebook, load_source fills issn, publication_title, coverimage,
    description and home_url. No code visible here fills chief_editor or
    title.
    """
    # Journal name, taken by load_source from the last breadcrumb text node.
    publication_title = Field()
    chief_editor = Field()
    # ISSN string with the "ISSN: " prefix removed.
    issn = Field()
    # Journal description with HTML tags stripped and NFKD-normalised.
    description = Field()
    # URL of the journal page the item was scraped from (response.url).
    home_url = Field()
    # URL of the journal cover image.
    coverimage = Field()

    # NOTE(review): presumably the title of a related article; not set by
    # load_source -- confirm.
    title = Field()

class AuthorItem(Item):
    """Metadata for one author of an article.

    In this notebook, load_author fills fname, lname and institution only.
    No code visible here fills the other fields.
    """
    # Affiliation text scraped for the author.
    institution = Field()
    email = Field()
    avatar = Field()
    vitae = Field()
    # First and last whitespace-separated token of the author's name.
    fname = Field()
    lname = Field()
    address = Field()

    # NOTE(review): presumably the title of the article the author wrote
    # (used as a link back to DocumentItem) -- confirm.
    title = Field()

In [None]:
def getdate(dates):
    """Pick the article-history dates out of a list of date strings.

    Each string is checked against the markers 'Received', 'Revised',
    'Accepted' and 'Available online', in that order; the first marker found
    in the string decides which field it fills, and the text after
    "<marker> " is handed to ``parse``.

    Args:
        dates: iterable of strings such as "Received 3 March 2014".

    Returns:
        dict with the keys submission_date, revision_date, accepted_date and
        online_date; a key stays None when no string carried its marker.
    """
    # (marker text, result key), kept in the order the markers are tested.
    marker_to_field = (
        ('Received', 'submission_date'),
        ('Revised', 'revision_date'),
        ('Accepted', 'accepted_date'),
        ('Available online', 'online_date'),
    )
    found = dict.fromkeys(
        ('submission_date', 'revision_date', 'accepted_date', 'online_date')
    )
    for entry in dates:
        for marker, field in marker_to_field:
            if marker in entry:
                found[field] = parse(entry.split(marker + ' ')[-1])
                # Only the first matching marker applies to a given string.
                break
    return found



In [None]:
# XPath selectors for the article abstract page, keyed by the name under which
# the loading code looks them up.
# NOTE(review): the 'revision_date' key selects the element whose class is
# "publicationContentReceivedDate" -- confirm the key name is the intended one.
document = {
    'title': './/h1[@class="chaptertitle"]/text()',
    'abstract': './/div[@class="abstractSection abstractInFull"]/p/text()',
    'submission_path': './/ul[@class="breadcrumbs"]/li/a/text()',
    'revision_date': './/*[@class="publicationContentReceivedDate dates"]/text()',
    'accepted_date': './/*[@class="publicationContentAcceptedDate dates"]/text()',
    'online_date': './/*[@class="publicationContentEpubDate dates"]/text()',
    'pages': './/div[@class="publicationContentPageRange"]/text()',
}

In [None]:
from dateutil.parser import parse

In [None]:
# Imported in this cell so it runs on a fresh kernel: the only other imports
# of these names sit in the later cell that defines load_document.
from scrapy.loader import ItemLoader
from scrapy.loader.processors import TakeFirst

l = ItemLoader(item = DocumentItem(), response = response)
l.default_output_processor = TakeFirst()

# The page-range text node is split on newlines; its second-to-last line holds
# "<first> - <last>".
pages = l.get_xpath(document['pages'])[0].split('\n')[-2].strip().split(' - ')

# Collect every probed value in one dict so the cell shows all of them
# (a notebook cell only displays its final expression).
preview = {
    'title': l.get_xpath(document['title'])[0].replace('\n', '').strip(),
    'abstract': l.get_xpath(document['abstract'])[0],
    'submission_path': l.get_xpath(document['submission_path'])[-1].replace('\n', '').strip(),
    'online_date': parse(l.get_xpath(document['online_date'])[0].replace('\n', '').strip().split('Published Online: ')[-1]),
    'accepted_date': parse(l.get_xpath(document['accepted_date'])[0].replace('\n', '').strip().split('Accepted: ')[-1]),
    'received_date': parse(l.get_xpath(document['revision_date'])[0].replace('\n', '').strip().split('Received: ')[-1]),
    'first_page': pages[0],
    'last_page': pages[-1],
}
preview

In [None]:
from dateutil.parser import parse
from scrapy.loader import ItemLoader
from scrapy.loader.processors import Join, TakeFirst

def _clean_text(raw):
    """Remove newlines and surrounding whitespace from an extracted text node."""
    return raw.replace('\n', '').strip()


def _add_date(loader, document, field, label):
    """Parse the date that follows `label` and store it in `field`, best effort.

    The XPath is looked up as document[field]; the first matched text node is
    cleaned, everything after `label` is passed to dateutil's parse, and the
    result is added to the loader. A missing selector, missing node or
    unparseable date leaves the field unset.
    """
    try:
        text = _clean_text(loader.get_xpath(document[field])[0])
        loader.add_value(field, parse(text.split(label)[-1]))
    except (KeyError, IndexError, ValueError, OverflowError):
        # Dates are optional: skip the field rather than fail the document.
        pass


def load_document(response, document):
    """Build an ItemLoader for a DocumentItem from an article page.

    Args:
        response: Scrapy response for the article abstract page.
        document: mapping from field name to XPath selector; the keys read
            are title, abstract, submission_path, online_date, accepted_date,
            revision_date and pages.

    Returns:
        The populated ItemLoader (call .load_item() on it to get the item).
        title, abstract and submission_path are always added; the three dates
        and fpage/lpage/pages are added only when they can be extracted.

    Raises:
        KeyError: if `document` lacks the title, abstract or submission_path
            selector.
        IndexError: if the title, abstract or submission_path selector
            matches nothing.
    """
    l = ItemLoader(item = DocumentItem(), response = response)
    l.default_output_processor = TakeFirst()

    l.add_value('title', _clean_text(l.get_xpath(document['title'])[0]))
    l.add_value('abstract', l.get_xpath(document['abstract'])[0])
    # The last breadcrumb link is used as the submission path.
    l.add_value('submission_path', _clean_text(l.get_xpath(document['submission_path'])[-1]))

    # dates
    _add_date(l, document, 'online_date', 'Published Online: ')
    _add_date(l, document, 'accepted_date', 'Accepted: ')
    # NOTE(review): the date labelled "Received: " is stored as revision_date,
    # whereas getdate() in this notebook maps "Received" to submission_date --
    # confirm which field is intended.
    _add_date(l, document, 'revision_date', 'Received: ')

    # handle pages: the second-to-last line of the page-range text node holds
    # "<first> - <last>".
    try:
        page_range = l.get_xpath(document['pages'])[0].split('\n')[-2].strip().split(' - ')
        fp = int(page_range[0])
        lp = int(page_range[-1])
    except (KeyError, IndexError, ValueError):
        # Page information is optional (missing node or non-numeric pages).
        pass
    else:
        l.add_value('fpage', fp)
        l.add_value('lpage', lp)
        l.add_value('pages', lp-fp+1)

    # TODO: publication_title is not filled in here; presumably it is meant to
    # be set from the source's publication_title -- confirm.
    return l

In [None]:
# Build the DocumentItem loader from the current response using the
# `document` XPath selectors.
l = load_document(response, document)

### Keywords

In [None]:
# XPath for the text of every link inside the element classed
# "abstractKeywords".
keyword = './/*[@class="abstractKeywords"]//a/text()'

# Show the strings that selector extracts from the current response.
response.xpath(keyword).extract()

### Authors

In [None]:
# XPath selectors for the author block of the article page: the author-name
# link texts and the affiliation texts.
author = {
    'name': './/div[@class="contribDegrees"]/a[@class="entryAuthor"]/text()',
    'institution': './/*[@class="contribAff"]/text()',
}

In [None]:
# Text nodes matched by the author-name selector on the current response.
names = response.xpath(author['name']).extract()

In [None]:
# Text nodes matched by the affiliation selector on the current response.
institutions = response.xpath(author['institution']).extract()

In [None]:
def load_author(response, author):
    """Yield one populated AuthorItem loader per author found on the page.

    Names and affiliations are extracted as two separate lists and paired by
    position. The first whitespace-separated token of a name becomes fname
    and the last becomes lname (middle names are dropped; a single-token name
    fills both fields with the same value).

    Args:
        response: Scrapy response for the article page.
        author: mapping with the XPath selectors 'name' and 'institution'.

    Yields:
        ItemLoader for an AuthorItem with fname, lname and, when an
        affiliation exists at the same position, institution.
    """
    names = response.xpath(author['name']).extract()
    institutions = response.xpath(author['institution']).extract()
    for position, full_name in enumerate(names):
        parts = full_name.split()
        if not parts:
            # Whitespace-only text node: there is no name to record.
            continue
        l = ItemLoader(item = AuthorItem(), response = response)
        l.default_output_processor = TakeFirst()
        l.add_value('fname', parts[0])
        l.add_value('lname', parts[-1])
        # The page may list fewer affiliations than names; leave institution
        # unset for such authors instead of raising IndexError.
        if position < len(institutions):
            l.add_value('institution', institutions[position])
        yield l
    

In [None]:
# Build and print the AuthorItem for every loader that load_author yields.
# Note: the loop variable rebinds the notebook-level name `l`.
for l in load_author(response, author):
    print(l.load_item())

### Source

In [None]:
# Fetch the journal home page. The timeout keeps a stalled connection from
# hanging the kernel, and raise_for_status() stops the notebook here on an
# HTTP error instead of letting later cells parse an error page.
r = requests.get('http://pubsonline.informs.org/journal/mnsc',
                 headers = headers,
                 timeout = 30)
r.raise_for_status()

# Note: this rebinds `response`; from here on it refers to the journal page,
# not to the article page fetched earlier.
response = TextResponse(r.url, body = r.text, encoding = 'utf-8')

# Display the response object to confirm the page was fetched.
response

In [None]:
# XPath selectors for the journal home page, keyed by the name under which the
# loading code looks them up. The trailing spaces inside some class values are
# part of the selectors.
source = {
    'issn': './/div[@class="wrapped "]/div/div[@class="pb-rich-text"]/p/span/text()',
    'publication_title': './/ul[@class="breadcrumbs"]/li/text()',
    'description': './/div[@class="pb-rich-text"]/p',
    'coverimage': './/div[@class="pb-columns row-fluid "]//div[@class="wrapped "]/div/a/img/@src',
}

In [None]:
# description
# `re` and `unicodedata` are imported here, and the tags are stripped inline,
# because the cell that imports unicodedata and defines cleanhtml comes later
# in the notebook; without this the cell raises NameError on a fresh kernel.
import re
import unicodedata

# The second <p> matched by the selector is taken as the description; its HTML
# tags are removed and the text is NFKD-normalised.
description = unicodedata.normalize(
    "NFKD",
    re.sub('<.*?>', '', response.xpath(source['description']).extract()[1]),
)
description

In [None]:
# ISSN: take the first extracted text node containing "ISSN: ", then drop
# that prefix.
issn = [text for text in response.xpath(source['issn']).extract() if "ISSN: " in text][0]
issn = issn.replace('ISSN: ', '')
issn

In [None]:
# Journal title: the last breadcrumb text node, with newlines and surrounding
# whitespace removed.
publication_title = response.xpath(source['publication_title']).extract()[-1]
publication_title = publication_title.replace('\n', '').strip()
publication_title

In [None]:
# Cover image URL: the last matched <img> src, prefixed with the site root.
informs_url = "http://pubsonline.informs.org"
coverimage = response.xpath(source['coverimage']).extract()[-1]
coverimage = informs_url + coverimage
coverimage

In [None]:
import re
from scrapy.loader.processors import Join, TakeFirst, Join
import unicodedata

# Matches one HTML tag. A negated character class is used instead of '.' so
# that a tag whose attributes wrap onto several lines is matched as well.
# Compiled once rather than on every call.
_TAG_RE = re.compile(r'<[^>]*>')


def cleanhtml(raw_html):
    """Return `raw_html` with every HTML tag removed.

    Only the tags themselves are deleted; the text between them, including
    any HTML entities, is returned unchanged.

    Args:
        raw_html: string containing HTML markup.

    Returns:
        The input string without its tags.
    """
    return _TAG_RE.sub('', raw_html)


def load_source(response, source):
    """Build an ItemLoader for a SourceItem from a journal home page.

    Args:
        response: Scrapy response for the journal page.
        source: mapping with the XPath selectors 'coverimage',
            'publication_title', 'issn' and 'description'.

    Returns:
        The populated ItemLoader (call .load_item() on it to get the item).
        home_url is always set to response.url; each other field is added
        only when its selector yields a usable value, so a missing node
        leaves that field unset instead of raising IndexError.
    """
    def extract(key):
        """Return every string matched by the selector stored under `key`."""
        return response.xpath(source[key]).extract()

    # Resolve the image src against the page URL; unlike prefixing a fixed
    # host, this also handles absolute and non-root-relative src values.
    cover_sources = extract('coverimage')
    coverimage = response.urljoin(cover_sources[-1]) if cover_sources else None

    # The last breadcrumb text node carries the journal title.
    crumbs = extract('publication_title')
    publication_title = crumbs[-1].replace('\n', '').strip() if crumbs else None

    # First text node that carries the "ISSN: " prefix, with the prefix removed.
    issn = next(
        (text.replace('ISSN: ', '') for text in extract('issn') if "ISSN: " in text),
        None,
    )

    # The second matched <p> is taken as the description; strip its tags and
    # NFKD-normalise the remaining text.
    paragraphs = extract('description')
    if len(paragraphs) > 1:
        description = unicodedata.normalize("NFKD", cleanhtml(paragraphs[1]))
    else:
        description = None

    l = ItemLoader(item = SourceItem(), response = response)
    l.default_output_processor = TakeFirst()
    scraped = (
        ("issn", issn),
        ('publication_title', publication_title),
        ('coverimage', coverimage),
        ('description', description),
    )
    for field, value in scraped:
        if value is not None:
            l.add_value(field, value)
    l.add_value('home_url', response.url)
    return l

In [None]:
# Build the SourceItem loader from the current response and display the
# resulting item.
l = load_source(response, source)
l.load_item()