In [1]:
import requests.exceptions

from Research_Scraper_Code.Research_Scraper import ResearchScraper
from Research_Scraper_Code import utils
import pandas as pd
import time
import json
import urllib.parse
import random
from scholarly import scholarly, ProxyGenerator
from scholarly import MaxTriesExceededException
import os

### Init scraper and load the publications from the csv

In [2]:
scraper = ResearchScraper()

In [3]:
def load_publications_from_csv(path='data/publications_without_abstract.csv'):
    """
    Load the semicolon-separated publication export into a DataFrame.

    The path is handed to pandas directly, so the file is decoded with the pandas default
    encoding (UTF-8) instead of the platform locale encoding a bare ``open()`` would pick.
    :param path: location of the csv file; defaults to the export used in this notebook
    :return: DataFrame with the parsed csv content
    """
    return pd.read_csv(path, sep=';')


In [4]:
df_publications = load_publications_from_csv()

## Scraping utils

In [5]:
def get_all_dois(df):
    """
    Collect every DOI that is present in the publication data.
    :param df: DataFrame with a 'doi' column
    :return: pandas Series with the non-missing DOIs (the original index labels are kept)
    """
    # A Series (not a list) is returned on purpose: scrape_sample_of_dois calls .sample() on it.
    return df['doi'].dropna()


publication_dois = get_all_dois(df_publications)

Create a sample of the CRIS data and scrape it

In [6]:
def scrape_sample_of_dois(dois, n):
    """
    Debugging helper: draws n random dois and scrapes each of them with all fields.
    :param dois: pandas Series of dois to sample from
    :param n: size of the sample
    :return: list with one scraping result per sampled doi
    """
    results = []

    # draw the random sample of size n and scrape it entry by entry
    for picked_doi in dois.sample(n):
        print(f'Scraping {picked_doi}')
        started_at = time.time()
        scraped = scraper.scrape_publication_by_doi(picked_doi, params=['full'])
        elapsed = time.time() - started_at
        print(f'Total time : {elapsed}')
        previous_count = len(results)
        results.append(scraped)
        print(f'\n \t  >>>>>> added new result, n went from {previous_count} to n={len(results)}')

    return results


In [None]:
# run_cell = True
# # scrape sample of 3
# if run_cell:
#     res = None
#     res = scrape_sample_of_dois(publication_dois, 3)
#     utils.write_results(res, f'sample_scrapings_{time.strftime("%Y_%m_%d__%H_%M")}')

Scraping a list of DOIs

In [None]:
# def scrape_publication_by_doi_list(doi_list, params=['full']):
#     print(f'Time of scrape start: {time.strftime("%Y_%m_%d__%H_%M")}')
#     results = []
#     for doi in doi_list:
#         print(f'>>> Scraping {doi}')
#         result = scraper.scrape_publication_by_doi(doi, params)
#         print(f'>>>> Scraping {doi} done')
#         results.append(result)
#         print(f'>>>> Scraping {doi} added to results')
#     print(f'>>>> Scraping {len(doi_list)} publications done')
#     utils.write_results(results, f'scrapings_{time.strftime("%Y_%m_%d__%H_%M")}')
#     print(f'Time of scrape end: {time.strftime("%Y_%m_%d__%H_%M")}')
#     return results

## Importing results of scraping

Read the results of scraping from a file

In [None]:
# read json file with scraping results (all cris doi numbers)
with open('exports/scrapings/scrapings_2022_10_21__03_38.json') as f:
    scraping_results_imported = json.load(f)

# old file 582 after clean

Drop nones and errors, extract error rows

In [None]:
scraping_results_imported_cleaned = [x for x in scraping_results_imported if x is not None and x.get('error') is None]

In [None]:
# rows from the dict with the key 'error'
error_rows = [x for x in scraping_results_imported if x is not None and x.get('error') is not None]
error_rows_unknown = [x for x in error_rows if x.get('error') != 'No scraper found for this url']

You can search for a publication in the scraped results by url

In [None]:
[x for x in scraping_results_imported_cleaned if
 x.get('url') == urllib.parse.unquote('https://link.springer.com/article/10.1007%2Fs12599-019-00623-1')]

With the scraper we can fill in missing keywords. For example, to look up the keywords of a single publication:

In [19]:
scraper = ResearchScraper()
link = 'https://link.springer.com/article/10.1007%2Fs12599-019-00623-1'
result = scraper.scrape_publication_by_url(link, params=['keywords']).get('keywords')
result

[6;30;42m[DEBUG - ResearchScraper] - Found scraper for https://link.springer.com/article/10.1007%2Fs12599-019-00623-1 -> ScraperSpringer[0m
[6;30;32m Scraped keys: ['url', 'keywords'][0m


['Reference modeling',
 'Forms',
 'Standardization',
 'E-government',
 'Digital government',
 'Public administration',
 'Meta-modeling',
 'Graphical user interface']

## Cris data exploration
### Find pdf links directly from cris

In [10]:
df_publications = load_publications_from_csv()
# filter url not nan
df_publications_with_url = df_publications[df_publications['url'].notna()]
# df_publications_with_url['url'] must end with .pdf
df_publications_with_url = df_publications_with_url[df_publications_with_url['url'].str.endswith('.pdf')]
# get id and url col (so we can join later)
df_publications_with_url = df_publications_with_url[['id', 'url']]
# convert to list of dicts
#dict_publications_with_url = df_publications_with_url.to_dict('records')
print(f'There are {len(df_publications_with_url)} publications with a pdf url')

# iterate over the rows of dataframe df_publications_with_url and get the url

#for index, row in df_publications_with_url.iterrows():


df_publications_with_url.iloc[466]

There are 543 publications with a pdf url


id                                                152557
url    http://www.ifg-muenster.de/forschen/veroeffent...
Name: 9294, dtype: object

In [8]:
test_row = df_publications_with_url.iloc[8]
filename = f'document_{test_row["id"]}'
pdf_url = test_row['url']
print(pdf_url)
folder = 'exports/pdf_downloads/0_cris_downloads'
utils.download_pdf(url=pdf_url, filename=filename, write_folder_path=folder)


http://udoo.uni-muenster.de/downloads/publications/2072.pdf
[utils.py: download_pdf] Connection Error - could not download: HTTPConnectionPool(host='udoo.uni-muenster.de', port=80): Max retries exceeded with url: /downloads/publications/2072.pdf (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7f8a0a511160>: Failed to establish a new connection: [Errno 8] nodename nor servname provided, or not known'))


## Download all the pdf from cris

In [8]:
def download_pdf_from_cris(df_publications, folder='exports/pdf_downloads/0_cris_downloads', timeout=10):
    """
    Download every publication whose url ends with '.pdf'.

    Each file is fetched with utils.download_pdf and stored under the name ``document_<id>``.
    :param df_publications: DataFrame with at least the columns 'id' and 'url'
    :param folder: target folder handed to utils.download_pdf
    :param timeout: timeout handed to utils.download_pdf for every single download
    :return: None
    """
    # na=False treats rows without a url as "not a pdf", so no separate notna() filter is needed
    is_pdf_url = df_publications['url'].str.endswith('.pdf', na=False)
    df_prepared = df_publications.loc[is_pdf_url, ['id', 'url']].reset_index(drop=True)

    total = len(df_prepared)
    for position, (publication_id, pdf_url) in enumerate(zip(df_prepared['id'], df_prepared['url']), start=1):
        print(f'Downloading {position} of {total} publications')
        utils.download_pdf(url=pdf_url, filename=f'document_{publication_id}', write_folder_path=folder,
                           timeout=timeout)


download_pdf_from_cris(df_publications)



Downloading 1 of 543 publications
[utils.py: download_pdf] Connection Error - could not download: error type: <class 'requests.exceptions.ConnectionError'> -> HTTPConnectionPool(host='udoo.uni-muenster.de', port=80): Max retries exceeded with url: /downloads/publications/2011.pdf (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7fbe29de79d0>: Failed to establish a new connection: [Errno 8] nodename nor servname provided, or not known'))
Downloading 2 of 543 publications
[utils.py: download_pdf] Connection Error - could not download: error type: <class 'requests.exceptions.ConnectTimeout'> -> HTTPConnectionPool(host='web.up.ac.za', port=80): Max retries exceeded with url: /ecis/ECIS2010PR/ECIS2010/Content/Papers/0451.R1.pdf (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x7fbe39bd3fd0>, 'Connection to web.up.ac.za timed out. (connect timeout=10)'))
Downloading 3 of 543 publications
[utils.py: download_pdf] Connection Error - coul

### Example: Finding missing keywords
We search for publications with incomplete metadata; with the scraper we can retrieve the missing keywords.

In [22]:
# Each condition is built against the full frame and combined once at the end, so no filter
# step can accidentally discard an earlier one.
urls = df_publications['url']
# url not nan and doi not nan
has_url_and_doi = urls.notna() & df_publications['doi'].notna()
# rows with nan in keywords
missing_keywords = df_publications['keywords'].isna()
# rows with a supported url (literal substring match; rows without url count as unsupported)
is_supported_url = (
    urls.str.contains('link.springer', regex=False, na=False)
    | urls.str.contains('sciencedirect', regex=False, na=False)
    | urls.str.contains('ieeexplore', regex=False, na=False)
)
df_publications_filtered = df_publications[has_url_and_doi & missing_keywords & is_supported_url]
df_publications_filtered


Unnamed: 0,id,cris_id,cris_c4_id,title,subtitle,title_in_source,language,external,content_type,peer_reviewed,...,title_of_series,place,publisher,isbn,doi,url,authors,strings,sort_order,client_purpose
1703,22913,17292119,22913,Tracking down the business cycle: A dynamic fa...,,,English,,,Ja,...,,,,,10.1016/j.eeh.2009.04.002,http://www.sciencedirect.com/science?_ob=Artic...,"Sarferaz S, Uebele M","[[[entry:ApaHtml]]]Sarferaz, S., & Uebele, M. ...","7990-1060-Sarferaz S, Uebele M",
1797,25524,17456544,25524,The Dynamic of Bicycle Finals: A Theoretical a...,,,English,Ja,,Ja,...,,Münster,,,10.1016/S0313-5926(09)50037-9,http://www.sciencedirect.com/science/article/p...,"Dilger Alexander, Geyer Hannah","[[[entry:ApaHtml]]]Dilger, A., & Geyer, H. (20...","7990-1060-Dilger Alexander, Geyer Hannah",
2258,30720,17807442,30720,The Absent-Minded Prisoner,,,English,Ja,,Ja,...,,,,,10.1007/s10108-006-9011-1,http://link.springer.com/article/10.1007%2Fs10...,Dilger Alexander,"[[[entry:ApaHtml]]]Dilger, A. (2006). The Abse...",7993-1060-Dilger Alexander,
2737,46125,18894525,46125,Rankings von Zeitschriften und Personen in der...,,,German,Ja,,Ja,...,,Münster,,,10.1007/s12354-010-0119-8,http://link.springer.com/article/10.1007%2Fs12...,Dilger Alexander,"[[[entry:ApaHtml]]]Dilger, A. (2010). Rankings...",7989-1060-Dilger Alexander,
2914,48566,19063464,48566,Arbeitsmarktregulierung und nachvertraglicher ...,,,German,Nein,,Ja,...,,,,,10.1007/978-3-663-07711-4_9,http://link.springer.com/chapter/10.1007%2F978...,"Frick Bernd, Dilger Alexander, Prinz Joachim","[[[entry:ApaHtml]]]Frick, B., Dilger, A., & Pr...","7997-1060-Frick Bernd, Dilger Alexander, Prinz...",
4133,80987,21568164,80987,Why Social Currency Becomes a Key Driver of a ...,,,English,Ja,,Ja,...,,,,,10.1016/j.lrp.2012.11.004,http://www.sciencedirect.com/science/article/p...,"Lobschat L, Zinnbauer MA, Pallas F, Joachimsth...","[[[entry:ApaHtml]]]Lobschat, L., Zinnbauer, M....","7986-1060-Lobschat L, Zinnbauer MA, Pallas F, ...",
4872,92164,20829855,92164,Graph Theory and Model Collection Management –...,,,English,Ja,,Ja,...,,,,,10.1007/s10257-014-0243-6,http://link.springer.com/article/10.1007/s1025...,"Breuker Dominic, Delfmann Patrick, Dietrich Ha...","[[[entry:ApaHtml]]]Breuker, D., Delfmann, P., ...","7984-1060-Breuker Dominic, Delfmann Patrick, D...",
5386,99652,20345089,99652,Bitcoin: A First Legal Analysis – with referen...,,,English,Ja,,Ja,...,,,,,10.1007/978-3-662-44774-1_4,http://link.springer.com/chapter/10.1007%2F978...,"Boehm, Pesch","[[[entry:ApaHtml]]]Boehm, P. (2014). Bitcoin: ...","7985-1040-Boehm, Pesch",
5429,101077,20258749,101077,Improving the State of the Art in Inexact TSP ...,,,English,Ja,,Ja,...,,Cham,Springer International Publishing,,10.1007/978-3-319-19084-6_18,http://download.springer.com/static/pdf/93/chp...,"Kotthoff Lars, Kerschke Pascal, Hoos Holger H,...","[[[entry:ApaHtml]]]Kotthoff, L., Kerschke, P.,...","7984-1040-Kotthoff Lars, Kerschke Pascal, Hoos...",
5628,104246,20053043,104246,Extending Battery Management Systems for Makin...,,,English,Ja,,Ja,...,Lecture Notes in Computer Science,,Springer International Publishing,,10.1007/978-3-319-18714-3_37,https://link.springer.com/content/pdf/10.1007%...,"Monhof Markus, Beverungen Daniel, Klör Benjami...","[[[entry:ApaHtml]]]Monhof, M., Beverungen, D.,...","7984-1040-Monhof Markus, Beverungen Daniel, Kl...",


In [None]:
# Scrape only the keywords of the filtered publications
# get {id: id, url: url, doi: doi, title: title} per publication
scrape_dict = df_publications_filtered[['id', 'url', 'doi', 'title']].to_dict('records')
total_rows = len(scrape_dict)
for idx, row in enumerate(scrape_dict):
    print(f'Row {idx + 1} of {total_rows}')

    doi = row.get('doi')
    url = row.get('url')

    # the doi is preferred, the url is only the fallback
    if pd.notna(doi):
        scraped = scraper.scrape_publication_by_doi(doi, params=['keywords'])
    elif pd.notna(url):
        # a direct pdf link cannot be scraped for keywords, so the row is left without 'keywords'
        if url.endswith('.pdf'):
            print(f'Row {idx + 1} of {total_rows} is pdf, skipping')
            continue
        scraped = scraper.scrape_publication_by_url(url, params=['keywords'])
    else:
        scraped = None

    # NOTE(review): the imported scraping results are filtered for None entries, so a scrape
    # presumably can return None - guard against it instead of failing on .get()
    row['keywords'] = scraped.get('keywords') if scraped is not None else None

Our scraper found missing keywords: out of the 48 publications, it found keywords for 37. We could now add these keywords to the CRIS data and repeat this search for other attributes as well.

In [None]:
# found keywords
scrape_dict_found = [x for x in scrape_dict if x.get('keywords') is not None]
scrape_dict_keywords = [x.get('keywords') for x in scrape_dict_found]
print(f'Found {len(scrape_dict_found)} keywords out of {len(scrape_dict)} publications')

1623 entries with DOI

In [None]:
# filter where doi not nan
df_publications_filtered = df_publications[df_publications['doi'].notna()]
df_publications_filtered.shape

675 entries with both DOI and URL

In [None]:
# filter where url not nan
df_publications_filtered = df_publications_filtered[df_publications_filtered['url'].notna()]
df_publications_filtered.shape

948 entries only with doi

In [None]:
df_publications_filtered = df_publications[df_publications['doi'].notna() & df_publications['url'].isna()]
df_publications_filtered.shape
df_publications_filtered

1162 entries only with url

In [None]:
df_publications_filtered = df_publications[df_publications['doi'].isna() & df_publications['url'].notna()]
df_publications_filtered.shape
df_publications_filtered

entries with DOI and url

In [None]:
df_publications_filtered = df_publications[df_publications['doi'].notna() & df_publications['url'].notna()]
print(df_publications_filtered.shape)
df_publications_filtered

2785 with DOI or url

In [None]:
# keep every publication that has at least one identifier (doi or url)
df_publications_filtered = df_publications[
    df_publications['doi'].notna() | df_publications['url'].notna()
]
print(df_publications.shape)
# relative amount of publications with DOI or URL
print((df_publications_filtered.shape[0] / df_publications.shape[0]) * 100, '%')

## Exploring our scraping results

In [None]:
final_scraping_result = scraping_results_imported_cleaned

In [None]:
# extract all the pdf links in the format {doi, pdf_link}, if not null
all_found_pdf_links = [(x.get('doi'), x.get('pdf')) for x in final_scraping_result if
                       x.get('pdf') is not None]

In [14]:
# get all domains from resolved links (using exported csv file)
def domain(url):
    """
    Extract the network location (domain) of a url.
    :param url: url string; None and NaN are accepted
    :return: netloc part of the url, or None when no url was given
    """
    if url is None or pd.isna(url):
        return None
    return urllib.parse.urlparse(url).netloc


In [None]:

# apply domain to all results
domains_of_dois = [{'doi': x.get('doi'), 'domain': domain(x.get('url'))} for x in final_scraping_result if
                   x.get('url') is not None]

In [None]:
# describe domains_of_dois
df_domains_of_dois = pd.DataFrame(domains_of_dois)
# custom col name
df_domains_of_dois.columns = ['doi', 'domain']
# count the domains
counts = df_domains_of_dois['domain'].value_counts()
print(counts / len(df_domains_of_dois))
# plot
df_domains_of_dois['domain'].value_counts().plot(kind='bar')

In [None]:
# get all publications from domain 'link.springer.com' from final_scraping_result
springer_publications = [x for x in final_scraping_result if
                         x.get('url') is not None and domain(x.get('url')) == 'link.springer.com']
# create overview of all publication types
springer_publication_types = [x.get('publication_type') for x in springer_publications]
# count the publication types
counts = pd.Series(springer_publication_types).value_counts()
counts


### Get authors with listed orcid

In [None]:
def get_orcid_authors(scraping_result):
    """
    Collect all distinct authors that carry an ORCID from a list of scraping results.

    Author entries can be plain name strings or dicts; only dict entries can hold an 'orcid',
    every other entry (strings, None, ...) is skipped. Authors are de-duplicated by their ORCID
    (the last occurrence wins) and sorted by name.
    :param scraping_result: list of scraped publications (dicts with an optional 'authors' list)
    :return: list of author dicts with a non-None 'orcid', sorted by 'name'
    """
    unique_by_orcid = {}
    for publication in scraping_result:
        # a publication without authors (missing key or None) contributes nothing
        for author in publication.get('authors') or []:
            if isinstance(author, dict) and author.get('orcid') is not None:
                unique_by_orcid[author['orcid']] = author

    # authors without a name are sorted to the front instead of raising a KeyError
    return sorted(unique_by_orcid.values(), key=lambda author: author.get('name') or '')


authors_with_orcid = get_orcid_authors(final_scraping_result)
authors_with_orcid

## Downloading pdf documents

In [None]:
final_scraping_result = scraping_results_imported_cleaned
# get {doi:doi, pdf_link:pdf} for all entries with pdf_link
all_found_pdf_links = [{'doi': x.get('doi'), 'pdf': x.get('pdf')} for x in final_scraping_result if
                       x.get('pdf') is not None]

In [None]:
scraper = ResearchScraper()
scraping_results = utils.load_and_clean_scraping_results(filename='scrapings_2022_10_21__03_38')
scraper.download_pdf_of_publications_by_scraping_results(scraping_results)

## Find URLs with Google Scholar
We first need to get the data from CRIS before we can apply the Scholar search

In [None]:
def get_publication_target_dict(df_publications):
    """
    Select the publications that have neither a doi nor a url.
    :param df_publications: DataFrame with the columns 'doi', 'url', 'cris_id', 'title' and 'authors'
    :return: list of records (dicts) with the keys 'cris_id', 'title' and 'authors'
    """
    lacks_doi = df_publications['doi'].isna()
    lacks_url = df_publications['url'].isna()
    wanted_columns = ['cris_id', 'title', 'authors']
    # rows without any identifier, reduced to the wanted columns and converted to records
    return df_publications.loc[lacks_doi & lacks_url, wanted_columns].to_dict('records')


def get_publication_target_dict_chunks(dict_publication_target, chunk_size=50):
    """
    Split the list of target publications into consecutive chunks.
    :param dict_publication_target: list of publication records
    :param chunk_size: maximum number of records per chunk (the last chunk may be shorter)
    :return: list of chunks, each chunk being a list of records
    :raises ValueError: if chunk_size is smaller than 1
    """
    if chunk_size < 1:
        raise ValueError(f'chunk_size must be at least 1, got {chunk_size}')
    return [dict_publication_target[start:start + chunk_size]
            for start in range(0, len(dict_publication_target), chunk_size)]


df_publications_target = get_publication_target_dict(df_publications)
# # first 50 rows
# df_publications_sample = df_publications_target.head(50)
# # convert to dict
# dict_publications_sample = df_publications_sample.to_dict('records')




For large-scale application we can split the data into chunks of 50 publications each (worth 1250 API credits per chunk)

In [None]:
# split dict_publication_target into pieces of 50 with list comprehension
#dict_publication_target_split = [dict_publication_target[i:i + 50] for i in range(0, len(dict_publication_target), 50)]

In [None]:
def get_urls_from_scholar_list_of_publications(publications, filename):
    """
    Try to find a url on Google Scholar for every publication in the list.

    Every publication whose lookup ran without an error gets a 'url' entry (which can be None)
    and is added to the result list. The result list is written with utils.write_results after
    each successful lookup, so partial results are kept if the run is interrupted.
    :param publications: list of dicts {cris_id: cris-id, title: title, authors: authors};
                         the dicts are modified in place
    :param filename: name handed to utils.write_results
    :return: list of publications {cris_id: cris-id, title: title, authors: authors, url: url}
    """
    publications_with_url = []
    total = len(publications)

    for idx, publication in enumerate(publications):
        title = publication.get('title')
        authors = publication.get('authors')
        cris_id = publication.get('cris_id')
        print(f'publication {idx + 1} of {total}: {title}')

        # Build the query only from the parts that are present. A missing title or author
        # (None/NaN) is left out, so it can neither break the string concatenation nor leave
        # a dangling separator in the query.
        query_parts = [str(part).strip() for part in (title, authors) if pd.notna(part)]
        query = ' '.join(part for part in query_parts if part)
        if not query:
            print(f'Could not find url for cris: {cris_id}, no title or authors to search for')
            continue

        try:
            # search for url with query
            found_url = get_url_from_publication_with_scholarly(query)
        except Exception as e:
            print(f' Error: {e}')
            print(f'Could not find url for cris: {cris_id}, Google scholar blocked us')
            continue

        publication['url'] = found_url
        print(f' Found url: {publication["url"]}')
        # add complete publication to the results and persist the intermediate state
        publications_with_url.append(publication)
        utils.write_results(publications_with_url, filename)

    return publications_with_url

In [None]:
#pub_res = get_urls_from_scholar_list_of_publications(dict_publications_sample, 'url_scrape_results_0_to_49')

In [None]:
def handle_proxy(proxy=None):
    """
    Configure the proxy that scholarly uses; by default no proxy is used.
    :param proxy: None for no proxy, 'free' for free proxies, or 'scraper_api' for ScraperAPI
                  (the key is read from the environment variable SCRAPER_API_KEY);
                  premium proxies can be added later if needed
    :return: None
    :raises KeyError: if 'scraper_api' is requested but SCRAPER_API_KEY is not set
    """

    if proxy is None:
        scholarly.use_proxy(None)
    elif proxy == 'free':
        pg = ProxyGenerator()
        success = pg.FreeProxies()
        print(f'Free proxy success: {success}')
        scholarly.use_proxy(pg)
    elif proxy == 'scraper_api':
        pg = ProxyGenerator()
        success = pg.ScraperAPI(os.environ['SCRAPER_API_KEY'])
        # report the outcome like the 'free' branch does, otherwise a failed setup goes unnoticed
        print(f'Scraper API proxy success: {success}')
        scholarly.use_proxy(pg)
    else:
        print('No proxy recognized')


def get_url_from_publication_with_scholarly(search_query):
    """
    Search Google Scholar via scholarly and return the url of the first hit.
    :param search_query: free text query, e.g. the title followed by the authors
    :return: 'pub_url' of the first search result; None if the search yields no result
             or the first result has no 'pub_url'
    """
    # next() with a default: an empty result is "no url found", not a StopIteration error
    first_hit = next(scholarly.search_pubs(search_query), None)
    if first_hit is None:
        return None

    return first_hit.get('pub_url')


handle_proxy('scraper_api')

### Extract second chunk of 50

In [None]:
# get all target publications
pub_target = get_publication_target_dict(df_publications)
pub_target_chunks = get_publication_target_dict_chunks(pub_target)


In [None]:
# scraper api is active
pub_res_chunk_0 = None  # see above
pub_res_chunk_1 = get_urls_from_scholar_list_of_publications(pub_target_chunks[1], 'url_scrape_results_50_to_99')

In [None]:
pub_res_chunk_2 = get_urls_from_scholar_list_of_publications(pub_target_chunks[2], 'url_scrape_results_100_to_149')

In [None]:
# Quick Demo for Blogpost
# cris_data
title = 'Linguistic Theories in IS Research - A Review'
authors = 'Becker J, Fielenbach K, Niehaves B'
scholarly_search = scholarly.search_pubs(f'{title} {authors}')
publication = next(scholarly_search)
url = publication.get('pub_url')

url

In [None]:
scraper.s

In [None]:
# get all target publications
pub_target = get_publication_target_dict(df_publications)
pub_target_chunks = get_publication_target_dict_chunks(pub_target)


In [None]:
# scraper api is active
pub_res_chunk_0 = None  # see above
pub_res_chunk_1 = get_urls_from_scholar_list_of_publications(pub_target_chunks[1], 'url_scrape_results_50_to_99')

In [None]:
pub_res_chunk_2 = get_urls_from_scholar_list_of_publications(pub_target_chunks[2], 'url_scrape_results_100_to_149')

In [None]:
# Quick Demo for Blogpost
# cris_data
title = 'Linguistic Theories in IS Research - A Review'
authors = 'Becker J, Fielenbach K, Niehaves B'
scholarly_search = scholarly.search_pubs(f'{title} {authors}')
publication = next(scholarly_search)
url = publication.get('pub_url')

url

In [18]:
# read them in
with open('exports/scrapings/url_scrape_results_0_to_49.json') as f:
    pub_res_chunk_0 = json.load(f)
with open('exports/scrapings/url_scrape_results_50_to_99.json') as f:
    pub_res_chunk_1 = json.load(f)
with open('exports/scrapings/url_scrape_results_100_to_149.json') as f:
    pub_res_chunk_2 = json.load(f)

# combine them into one list
pub_res_0_to_149 = pub_res_chunk_0 + pub_res_chunk_1 + pub_res_chunk_2
# count how many urls were found
amount_urls_found = len([pub for pub in pub_res_0_to_149 if pub.get('url') is not None])
print(f'Amount of urls found: {amount_urls_found}')

# get those urls
urls = [pub.get('url') for pub in pub_res_0_to_149 if pub.get('url') is not None]
domains = [domain(url) for url in urls]

# look what domains were found and how often
domains_grouped = pd.DataFrame(domains, columns=['domain']).groupby('domain').size().reset_index(name='count')
# sort desc
domains_grouped = domains_grouped.sort_values(by='count', ascending=False)
domains_grouped

Amount of urls found: 118


Unnamed: 0,domain,count
6,link.springer.com,38
0,aisel.aisnet.org,37
16,www.researchgate.net,13
11,www.econstor.eu,4
1,books.google.com,4
9,www.academia.edu,4
5,ieeexplore.ieee.org,4
2,cs.emis.de,3
4,eprints.qut.edu.au,2
7,osnascholar.ub.uni-osnabrueck.de,1


In [8]:
x = 3
# the colour sequence has to be closed with 'm' before the text, otherwise it is not applied
print(f'\033[1;30;43m{x}\033[0m')

[1;30;433[0m


In [10]:
msg = 'hel'
print(f'\033[1;30;43m{msg}\033[0m')

[1;30;43mhel[0m


In [28]:
# get all target publications
pub_target = get_publication_target_dict(df_publications)
pub_target_chunks = get_publication_target_dict_chunks(pub_target)


In [33]:
# scraper api is active
pub_res_chunk_0 = None  # see above
pub_res_chunk_1 = get_urls_from_scholar_list_of_publications(pub_target_chunks[1], 'url_scrape_results_50_to_99')

publication 1 of 50: Business Process Reference Models for Reorganizing Public Administrations - A Case Study
 Found url: https://www.academia.edu/download/42050734/Business_Process_Reference_Models_for_Re20160204-7902-77x3te.pdf
[1;30;42m1 results written to url_scrape_results_50_to_99.json[0m
publication 2 of 50: Perspectives on Process Documentation - A Case Study
 Found url: https://link.springer.com/chapter/10.1007/978-1-4020-5347-4_19
[1;30;42m2 results written to url_scrape_results_50_to_99.json[0m
publication 3 of 50: Open Source Software for Virtual Town-Halls
 Found url: https://www.researchgate.net/profile/Patrick-Delfmann/publication/265021197_Open_Source_Software_for_Virtual_Town-Halls/links/56123bbc08ae6b29b49e509f/Open-Source-Software-for-Virtual-Town-Halls.pdf
[1;30;42m3 results written to url_scrape_results_50_to_99.json[0m
publication 4 of 50: Identifikation von Best Practices durch Geschäftsprozessmodellierung in öffentlichen Verwaltungen
 Found url: https://ww

In [34]:
pub_res_chunk_2 = get_urls_from_scholar_list_of_publications(pub_target_chunks[2], 'url_scrape_results_100_to_149')

publication 1 of 50: Process Reference Model for Data Warehouse Development - A Consensus-oriented Approach
 Found url: None
[1;30;42m1 results written to url_scrape_results_100_to_149.json[0m
publication 2 of 50: Epistemological Perspectives on IS-Development - A Consensus-Oriented Approach on Conceptual Modeling
 Found url: https://link.springer.com/chapter/10.1007/11590019_71
[1;30;42m2 results written to url_scrape_results_100_to_149.json[0m
publication 3 of 50: Epistemologische Positionierung in der Wirtschaftsinformatik am Beispiel einer konsensorientierten Informationsmodellierung
 Found url: https://www.econstor.eu/handle/10419/59562
[1;30;42m3 results written to url_scrape_results_100_to_149.json[0m
publication 4 of 50: Bezugsrahmen zur epistemologischen Positionierung der Referenzmodellierung
 Found url: https://link.springer.com/chapter/10.1007/978-3-7908-2698-2_1
[1;30;42m4 results written to url_scrape_results_100_to_149.json[0m
publication 5 of 50: Customer Proces

In [36]:
# Quick Demo for Blogpost
# cris_data
title = 'Linguistic Theories in IS Research - A Review'
authors = 'Becker J, Fielenbach K, Niehaves B'
scholarly_search = scholarly.search_pubs(f'{title} {authors}')
publication = next(scholarly_search)
url = publication.get('pub_url')

url

'https://link.springer.com/chapter/10.1007/978-3-7908-2336-3_5'

In [None]:
scraper.s