# Scraper commands for publications on springer
## Examples

Here it is only about the steps, to parse are necessary so that it can be included in the end.

There are different website types for publications on springer.

In [88]:
journal_doi = 'https://doi.org/10.1007/s12525-020-00445-0'
journal_full_link = 'https://link.springer.com/article/10.1007/s12525-020-00445-0'

conference_chapter_link = 'https://link.springer.com/chapter/10.1007/978-3-030-49570-1_14'
conference_book_link = 'https://link.springer.com/book/10.1007/978-3-642-22531-4'

volume_contribution_link = 'https://link.springer.com/chapter/10.1007/978-3-030-06234-7_27'
#volume_link = 'https://link.springer.com/book/10.1007/978-3-030-06234-7'

springer_links = [journal_full_link, conference_chapter_link, conference_book_link, volume_contribution_link]

In [89]:
import pandas as pd
import requests
import numpy as np
from bs4 import BeautifulSoup
import re
import json

In [90]:
# identify the type of the link
def get_springer_link_type(url):
    if '/chapter/' in url:
        return 'chapter'
    elif '/book/' in url:
        return 'book'
    elif '/article/' in url:
        return 'article'
    else:
        return 'unknown'


In [91]:
for link in springer_links:
    get_springer_link_type(link)

In [92]:
def get_bs(url):
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.5 Safari/605.1.15'}
        r = requests.get(url, headers=headers)
        print(r.status_code)
        bs = BeautifulSoup(r.text, 'html.parser')
    except:
        print('Error: ', url)
        return None
    return bs

In [93]:
journal_soup = get_bs(journal_full_link)
conference_chapter_soup = get_bs(conference_chapter_link)
conference_book_soup = get_bs(conference_book_link)
volume_contribution_soup = get_bs(volume_contribution_link)
springer_soups = [journal_soup, conference_chapter_soup, conference_book_soup, volume_contribution_soup]

200
200
200
200


In [7]:
# get whole data from json and loads to dict
# articles need to be prefiltered because they are nested differently
def get_json_data(bs):
    json_string = bs.find('script', {'type': 'application/ld+json'}).text
    json_data = json.loads(json_string)

    if '{"mainEntity":' in json_string:
        return json_data['mainEntity']
    else:
        return json_data


Creating json files for the three different types of publications for testing

In [8]:
journal_json = get_json_data(journal_soup)
conference_chapter_json = get_json_data(conference_chapter_soup)
conference_book_json = get_json_data(conference_book_soup)
volume_contribution_json = get_json_data(volume_contribution_soup)
json_data_list = [journal_json, conference_chapter_json, conference_book_json, volume_contribution_json]

In [9]:
# get names of json fields
def get_json_fields(json_data):
    return list(json_data.keys())

In [10]:
get_json_fields(journal_json)

['headline',
 'description',
 'datePublished',
 'dateModified',
 'pageStart',
 'pageEnd',
 'license',
 'sameAs',
 'keywords',
 'image',
 'isPartOf',
 'publisher',
 'author',
 'isAccessibleForFree',
 '@type']

## Main Fields
Title #TODO get title also for book (if instruction or with json)

In [11]:
def get_title(bs):
    try:
        title = bs.find('h1', {'class': 'c-article-title'}).text
        return title
    except:
        return None

In [12]:
for soup in springer_soups:
    print(get_title(soup))

Exploring customers’ likeliness to use e-service touchpoints in brick and mortar retail
A Two-Phase Framework for Detecting Manipulation Campaigns in Social Media
None
Applications of Artificial Intelligence in Supply Chain Management and Logistics: Focusing Onto Recognition for Supply Chain Execution


### Authors

In [13]:
def get_authors(bs):
    """
    Return list of authors in the format:
    [{'name': 'Author Name', 'orcid': orcid}, ...]

    :param bs: Received bs of the publication
    :return: list of dicts
    """
    try:
        json_data = get_json_data(bs)
        authors = []
        for author in json_data.get('author'):
            name = author.get('name')
            # split name at comma and reverse
            name = name.split(', ')
            name = name[1] + ' ' + name[0]
            orcid = author.get('url')
            authors.append({'name': name,
                            'orcid': orcid})
        return authors
    except:
        return None

In [14]:
for soup in springer_soups:
    print(get_authors(soup))

[{'name': 'Benjamin Barann', 'orcid': 'http://orcid.org/0000-0002-1965-2688'}, {'name': 'Jan H. Betzing', 'orcid': None}, {'name': 'Marco Niemann', 'orcid': None}, {'name': 'Benedikt Hoffmeister', 'orcid': None}, {'name': 'Jörg Becker', 'orcid': None}]
[{'name': 'Dennis Assenmacher', 'orcid': None}, {'name': 'Lena Clever', 'orcid': None}, {'name': 'Janina Susanne Pohl', 'orcid': None}, {'name': 'Heike Trautmann', 'orcid': None}, {'name': 'Christian Grimme', 'orcid': None}]
None
[{'name': 'Bernd Hellingrath', 'orcid': None}, {'name': 'Sandra Lechtenberg', 'orcid': None}]


### Keywords #TODO may be replaced with BS scraping because weird data structure

In [152]:
def get_keywords(bs):
    """
    Return list of keywords from json data
    :param bs: Received bs of the publication
    :return:
    """

    """
    Return list of keywords in the format:
    [keyword1, keyword2, ...]
    :param bs: Received bs of the publication
    :return: list: String
    """
    try:
        json_data = get_json_data(bs)
        keywords_string = json_data.get('keywords')
        keywords = keywords_string.split(',')
        return keywords
    except:
        print("Error: no keywords found")
        return None

In [153]:
for soup in springer_soups:
    print(get_keywords(soup))

['IT in Business', 'e-Commerce/e-business']
['Social campaign detection', ' Stream clustering', ' Unsupervised learning']
['Java', ' XQuery', ' abstract interpretation', ' higher-order patterns', ' non-deterministic functions']
['']


### Abstract

In [17]:
def get_abstract(bs, url):
    """
    Returns the abstract of the articles, books/preceedings do not have abstracts.
    :param bs: Received bs of the publication
    :param url: Received url of the publication
    :return: String
    """
    if '/book/' in url:
        return None

    try:
        json_data = get_json_data(bs)
        abstract = json_data.get('description')
        return abstract
    except:
        print("Error: no abstract found")
        return None


In [18]:
for soup, url in zip(springer_soups, springer_links):
    print(get_abstract(soup, url))
    print("-------")

E-commerce has embraced the digital transformation and innovated with e-service touchpoints to improve customers’ experiences. Now some traditional, less-digitalized brick and mortar (BaM) retailers are starting to counteract the increasing competition by adopting digital touchpoints. However, the academic literature offers little in terms of what determines customers’ behavioral intentions toward e-service touchpoints. Therefore, drawing from the dominant design theory, this article first conceptually adapts selected dominant touchpoints of leading e-commerce solutions to BaM retail. Then 250 shoppers are surveyed regarding the likeliness that they will use the selected touchpoints, followed by an exploratory factor analysis to determine the touchpoints’ characteristics that lead to the shoppers’ assessments. The results suggest that customers prefer touchpoints that support product search and selection, provide information, and increase shopping efficiency. The likeliness that survey

### Pdf #todo #TODO: download

In [150]:
def get_pdf(bs, url):
    """
    Returns the pdf link of the publication, if available. Download might require login.
    :param bs: Received bs of the publication
    :param url:
    :return:
    """
    try:
        pdf = None
        #todo automate download
        # differentiate between article, chapter and book
        if '/article/' in url:
            pdf = bs.find('div', class_='c-pdf-container').find('a', {'data-article-pdf': 'true'}).get('href')

        elif '/chapter/' in url:
            pdf_box = bs.find('div', {'class': 'c-article-access-provider'})
            pdf = pdf_box.find('a', {'data-track-action': 'Pdf download'}).get('href')

        elif '/book/' in url:
            pdf = bs.find('div', {'data-test': 'download-article-link-wrapper',
                                  'class': 'js-context-bar-sticky-point-desktop'}).find('a', {
                'data-track-action': 'Book download - pdf'}).get('href')
        if pdf is not None:
            # append base url if necesary
            if 'link.springer.com' in pdf:
                return pdf
            else:
                return f'https://link.springer.com{pdf}'
    except:
        print("Error: no pdf found")
        return None

In [151]:
test_links = []
for soup, url in zip(springer_soups, springer_links):
    test_links.append(get_pdf(soup, url))

test_links

Error: no pdf found
Error: no pdf found


['https://link.springer.com/content/pdf/10.1007/s12525-020-00445-0.pdf',
 'https://link.springer.com/content/pdf/10.1007/978-3-030-49570-1_14.pdf',
 None,
 None]

In [21]:
journal_soup.find('div', class_='c-pdf-container').find('a', {'data-article-pdf': 'true'}).get('href')

'https://link.springer.com/content/pdf/10.1007/s12525-020-00445-0.pdf'

### Publisher

In [22]:
def get_publisher(bs):
    """
    Returns the publisher of the publication
    :param bs: Received bs of the publication
    :return: String
    """
    try:
        json_data = get_json_data(bs)
        publisher = json_data.get('publisher').get('name')
        return publisher
    except:
        print("Error: no publisher found")
        return None

In [23]:
for soup in springer_soups:
    print(get_publisher(soup))

Springer Berlin Heidelberg
Springer International Publishing
Springer Berlin Heidelberg
Springer International Publishing


### Year

In [24]:
def get_year(bs, url):
    """
    Returns the year of the publication
    :param url: URL of the publication
    :param bs: Received bs of the publication
    :return:
    """
    try:
        json_data = get_json_data(bs)
        if ('/chapter/' in url) or ('/article/' in url):
            date = json_data.get('datePublished')
            year = date.split('-')[0]  # get year from date (if date is available)
            return year
        if '/book/' in url:
            year = json_data.get('copyrightYear')
            return year

    except:
        print("Error: no year found")
        print(bs.name)
        return None

In [27]:
for soup, url in zip(springer_soups, springer_links):
    print(get_year(soup, url))


2020
2020
2011
2019


### Publication type

In [49]:
def get_publication_type(bs, url):
    """
    Returns the publication type of the publication
    :param url: URL of the publication
    :param bs: Received bs of the publication
    :return: String
    """
    try:
        if '/book/' in url:
            type = bs.find('li', {'class': 'c-article-identifiers__item'}).text
        else:
            type = bs.find('li', {'class': 'c-article-identifiers__item', 'data-test': 'article-category'}).text
        return type
    except:
        print("Error: no publication type found in bs, deriving by url")
        if '/book/' in url:
            return 'Book'
        elif '/chapter/' in url:
            return 'Chapter'
        elif '/article/' in url:
            return 'Article'
        #return None

In [50]:
for soup, url in zip(springer_soups, springer_links):
    print(get_publication_type(soup, url))

Research Paper
Conference paper
Conference proceedings
Chapter


In [52]:
# html_test = requests.get("https://link.springer.com/article/10.1007/s11129-017-9188-7").text
# bs_test = BeautifulSoup(html_test, 'html.parser')
# get_publication_type(bs_test, 'https://link.springer.com/article/10.1007/s11129-017-9188-7')

## Journal Fields
### Journal name

In [42]:
def get_journal_name(bs, url):
    """
    Returns the journal name of the publication
    Returns None if publication is not a journal article
    :param bs: Received bs of the publication
    :param url: URL of the publication
    :return: Journal name (String)
    """

    json_data = get_json_data(bs)
    if '/article/' in url:
        journal_name = json_data.get('isPartOf').get('name')
        return journal_name
    else:
        return None

In [43]:
for soup, url in zip(springer_soups, springer_links):
    print(get_journal_name(soup, url))

Electronic Markets
None
None
None


### Journal volume

In [44]:
def get_journal_volume(bs, url):
    """
    Returns the journal volume of the publication
    Returns None if publication is not a journal article
    :param bs: Received bs of the publication
    :param url: URL of the publication
    :return: Journal volume (String)
    """
    json_data = get_json_data(bs)
    if '/article/' in url:
        journal_volume = json_data.get('isPartOf').get('volumeNumber')
        return journal_volume
    else:
        return None

In [45]:
for soup, url in zip(springer_soups, springer_links):
    print(get_journal_volume(soup, url))

32
None
None
None


## Conference fields
### Conference name

In [57]:
def get_conference_name(bs, url):
    if get_publication_type(bs, url) == 'Conference paper':
        conference_name = bs.find('p', class_='c-chapter-info-details u-mb-8').find('a', {
            'data-track': 'click', 'data-track-action': 'open conference'
        }).text
        return conference_name
    else:
        return None

In [58]:
for soup, url in zip(springer_soups, springer_links):
    print(get_conference_name(soup, url))

None
International Conference on Human-Computer Interaction
None
None


### Venue -> omitted

### Conference proceeding/ Book title

In [64]:
def get_proceedings(bs, url):
    """
    Returns the title of the conference proceedings or book under which the publication was published.
    :param bs: Received bs of the publication
    :param url: URL of the publication
    :return: Title of the proceedings/book (String)
    """
    if get_publication_type(bs, url) == 'Conference paper':
        try:
            json_data = get_json_data(bs)
            proceedings = json_data.get('isPartOf').get('name')
            return proceedings
        except:
            return None
    else:
        return None

In [65]:
for soup, url in zip(springer_soups, springer_links):
    print(get_proceedings(soup, url))

None
Social Computing and Social Media. Design, Ethics, User Behavior, and Social Network Analysis
None
None


## Book/ volume contributions
### Book title

In [67]:
def get_book_title(bs, url):
    """
    Returns the title of the book under which the publication was published.
    :param bs: Received bs of the publication
    :param url: URL of the publication
    :return: Title of the book (String)
    """
    if get_publication_type(bs, url) == 'Chapter':
        try:
            json_data = get_json_data(bs)
            book_title = json_data.get('isPartOf').get('name')
            return book_title
        except:
            return None
    else:
        return None

In [68]:
for soup, url in zip(springer_soups, springer_links):
    print(get_book_title(soup, url))

None
None
None
The Art of Structuring


In [147]:
def get_editors(bs, url):
    """
    Returns the editors of the volume (under which the publication was published)
    :param bs: Received bs of the publication
    :param url: URL of the publication
    :return: List of Editors : [String]
    """
    if '/chapter/' in url:
        try:
            editor_div = bs.find('div', {'class': 'c-article-section__content', 'id': 'editor-information-content'})
            editors = []
            for editor in editor_div.find_all('p', class_='c-article-author-affiliation__authors-list'):
                editors.append(editor.text)
            # remove titles since we scrape the names
            # remove Everything including the point from strings in list
            editors = [editor.split('.')[1].strip() for editor in editors]
            return editors
        except:
            return None
    # in books the editors are in the json file
    if '/book/' in url:
        try:
            json_data = get_json_data(bs)
            editors = [editor.get('name') for editor in json_data.get('editor')]
            return editors
        except:
            return None
    else:
        return None


In [140]:
for soup, url in zip(springer_soups, springer_links):
    print(get_editors(soup, url))

None
['Gabriele Meiselwitz']
None
['Katrin Bergener', 'Michael Räckers', 'Armin Stein']


## Books (in general: proceedings or editor volumes)
### Book subtitle

In [73]:
def get_book_subtitle(bs, url):
    """
    Returns the subtitle of the book (proceedings or editor volume).
    :param bs: Received bs of the publication
    :param url: URL of the publication
    :return: Subtitle of the book (String)
    """
    if '/book/' in url:
        try:
            json_data = get_json_data(bs)
            book_subtitle = json_data.get('alternateName')
            return book_subtitle
        except:
            return None
    return None

In [74]:
for soup, url in zip(springer_soups, springer_links):
    print(get_book_subtitle(soup, url))

None
None
20th International Workshop, WFLP 2011, Odense, Denmark, July 19, 2011, Proceedings
None


In [75]:
# html_test = requests.get('https://link.springer.com/book/10.1007/978-3-030-06234-7').text
# bs_test = BeautifulSoup(html_test, 'html.parser')
# get_book_subtitle(bs_test, 'https://link.springer.com/book/10.1007/978-3-030-06234-7')

'Bridging the Gap Between Information Systems Research and Practice'