# Live Demo
## Bibliotheken

In [24]:
from Research_Scraper_Code.Research_Scraper import ResearchScraper
from Research_Scraper_Code import utils
import pandas as pd
import json
import os
from scholarly import scholarly, ProxyGenerator
from scholarly import MaxTriesExceededException

In [25]:
# Single scraper instance that the demo cells below call into.
scraper = ResearchScraper()

## BS Demo

In [29]:
# DOIs that the scraping demo below runs on.
# Every entry needs its own trailing comma: adjacent string literals without a
# comma are silently concatenated into a single (invalid) DOI.
sample_dois = [
    '10.1007/s12525-020-00445-0',
    '10.1002/joom.1125',
    '10.1007/978-3-030-49570-1_14',
    '10.1016/j.jvcir.2018.09.015',
    '10.1109/MIS.2017.40',
]

Diese Liste wird jetzt gescrapt:

In [30]:
def demo_scrape():
    """
    Scrapes every DOI in ``sample_dois`` with the shared ``scraper`` and prints
    how many results came back.

    :return: the value returned by ``scraper.scrape_publication_by_doi_list``
    """
    results = scraper.scrape_publication_by_doi_list(sample_dois, params=['full'])
    result_count = len(results)
    print(f'Anzahl der Ergebnisse: {result_count}')
    return results


scraping_results = demo_scrape()

Time of scrape start: 2022_10_27__01_55
[6;30;44m>>> Scraping 10.1007/s12525-020-00445-0 #0[0m
URL ('https://doi.org/10.1007/s12525-020-00445-0') is a DOI link, Links is now resolved properly
Resolved DOI link to: https://link.springer.com/article/10.1007/s12525-020-00445-0
[6;30;42m[DEBUG - ResearchScraper] - Found scraper for https://link.springer.com/article/10.1007/s12525-020-00445-0 -> ScraperSpringer[0m
[6;30;32m Scraped keys: ['url', 'title', 'doi', 'authors', 'keywords', 'abstract', 'pdf', 'publisher', 'year', 'start_page', 'end_page', 'publication_type', 'full_text', 'references', 'journal_name', 'journal_volume', 'article_accesses', 'amount_citations'][0m
>>>> Scraping 10.1007/s12525-020-00445-0 done
>>>> Scraping 10.1007/s12525-020-00445-0 added to results
[6;30;44m>>> Scraping 10.1002/joom.1125 #1[0m
URL ('https://doi.org/10.1002/joom.1125') is a DOI link, Links is now resolved properly
Resolved DOI link to: https://onlinelibrary.wiley.com/doi/10.1002/joom.1125
[6;

## PDF Demo
CRIS einlesen

In [31]:
def load_publications_from_csv(path='data/publications_without_abstract.csv'):
    """
    Reads a semicolon-separated publications file into a DataFrame.

    :param path: location of the CSV file; defaults to the file used in this demo
    :return: DataFrame holding the parsed file contents
    """
    with open(path) as csv_file:
        return pd.read_csv(csv_file, sep=';')

Cleaning

In [37]:
df_publications = load_publications_from_csv()

# Keep only publications that have a URL ending in '.pdf', reduce the frame to
# the two columns needed for downloading, and limit the demo to the first 50 rows.
df_publications_with_url = (
    df_publications
    .loc[lambda frame: frame['url'].notna()]
    .loc[lambda frame: frame['url'].str.endswith('.pdf'), ['id', 'url']]
    .head(50)
)
print(f'There are {len(df_publications_with_url)} publications with a pdf url')

There are 50 publications with a pdf url


In [39]:
def download_pdf_from_cris(df_publications, folder='exports/pdf_downloads/live_demo', timeout=2):
    """
    Downloads the PDF behind every publication whose URL ends in ``.pdf``.

    Rows without a URL, or whose URL does not end in ``.pdf``, are skipped. Each
    remaining row is handed to ``utils.download_pdf`` under the file name
    ``document_<id>``.

    :param df_publications: DataFrame with at least the columns ``id`` and ``url``
    :param folder: passed to ``utils.download_pdf`` as ``write_folder_path``
    :param timeout: passed to ``utils.download_pdf`` as ``timeout`` for each download
    :return: void
    """
    with_url = df_publications[df_publications['url'].notna()]
    is_pdf = with_url['url'].str.endswith('.pdf')
    df_prepared = with_url.loc[is_pdf, ['id', 'url']].reset_index(drop=True)
    print(f'shape of df_prepared: {df_prepared.shape}')

    total = len(df_prepared)
    id_url_pairs = zip(df_prepared['id'], df_prepared['url'])
    for position, (publication_id, pdf_url) in enumerate(id_url_pairs, start=1):
        print(f'Downloading {position} of {total} publications')
        utils.download_pdf(url=pdf_url, filename=f'document_{publication_id}',
                           write_folder_path=folder, timeout=timeout)


# Download the PDFs for the publications selected in the cleaning cell above.
download_pdf_from_cris(df_publications_with_url)

shape of df_prepared: (50, 2)
Downloading 1 of 50 publications
[utils.py: download_pdf] Connection Error - could not download: error type: <class 'requests.exceptions.ConnectionError'> -> HTTPConnectionPool(host='udoo.uni-muenster.de', port=80): Max retries exceeded with url: /downloads/publications/2011.pdf (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7fdbf19f6460>: Failed to establish a new connection: [Errno 8] nodename nor servname provided, or not known'))
Downloading 2 of 50 publications
[utils.py: download_pdf] Connection Error - could not download: error type: <class 'requests.exceptions.ConnectTimeout'> -> HTTPConnectionPool(host='web.up.ac.za', port=80): Max retries exceeded with url: /ecis/ECIS2010PR/ECIS2010/Content/Papers/0451.R1.pdf (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x7fdbf19f64f0>, 'Connection to web.up.ac.za timed out. (connect timeout=2)'))
Downloading 3 of 50 publications
[utils.py: download_pd

## Scholarly

In [9]:
def handle_proxy(proxy=None):
    """
    Configures the proxy that scholarly uses; by default no proxy is set.

    :param proxy: ``None`` for no proxy, ``'free'`` for free proxies, or
        ``'scraper_api'`` for ScraperAPI (the key is read from the environment
        variable ``SCRAPER_API_KEY``); any other value only prints a notice
    :return: void
    :raises RuntimeError: if ``'scraper_api'`` is requested but
        ``SCRAPER_API_KEY`` is not set
    """

    if proxy is None:
        scholarly.use_proxy(None)  # no proxy
    elif proxy == 'free':
        pg = ProxyGenerator()
        success = pg.FreeProxies()
        print(f'Free proxy success: {success}')
        scholarly.use_proxy(pg)
    elif proxy == 'scraper_api':
        # Credentials must not live in the notebook: read the key from the
        # environment instead. NOTE(review): the key that used to be hard-coded
        # here has been exposed and should be revoked.
        api_key = os.environ.get('SCRAPER_API_KEY')
        if not api_key:
            raise RuntimeError('SCRAPER_API_KEY is not set - cannot use the scraper_api proxy')
        pg = ProxyGenerator()
        success = pg.ScraperAPI(api_key)
        print(f'ScraperAPI proxy success: {success}')
        scholarly.use_proxy(pg)
    else:
        print('No proxy recognized')

# Default call: scholarly runs without a proxy.
handle_proxy()

In [34]:
# Search for the author by name and location; the result is consumed as an iterator.
search_query = scholarly.search_author('Fabian Gieseke, Münster')
# Show the first match as the cell output (next() raises StopIteration if there is none).
next(search_query)

{'container_type': 'Author',
 'filled': [],
 'source': <AuthorSource.SEARCH_AUTHOR_SNIPPETS: 'SEARCH_AUTHOR_SNIPPETS'>,
 'scholar_id': 'g3ZiieoAAAAJ',
 'url_picture': 'https://scholar.google.com/citations?view_op=medium_photo&user=g3ZiieoAAAAJ',
 'name': 'Fabian Gieseke',
 'affiliation': 'Department of Information Systems, University of Münster',
 'email_domain': '@uni-muenster.de',
 'interests': ['Data Engineering', 'Maschine Learning'],
 'citedby': 1381}

In [35]:
# Title and authors as stored in CRIS; this publication has no DOI/URL there,
# so a link is looked up through a scholarly publication search instead.
title = 'Linguistic Theories in IS Research - A Review'  # cris data
authors = 'Becker J, Fielenbach K, Niehaves B'  # cris data
# Query with title and authors combined and take the first hit.
scholarly_search = scholarly.search_pubs(f'{title} {authors}')
publication = next(scholarly_search)
# .get() is used so a hit without 'pub_url' gives None instead of raising (assumes a dict-like result).
url = publication.get('pub_url')

# NOTE(review): the first hit is not compared against the CRIS title/authors — confirm it is the same publication before relying on `url`.
publication

{'container_type': 'Publication',
 'source': <PublicationSource.PUBLICATION_SEARCH_SNIPPET: 'PUBLICATION_SEARCH_SNIPPET'>,
 'bib': {'title': 'Theories of Language in IS Research–A Review',
  'author': ['K Bergener', 'B Niehaves'],
  'pub_year': '2009',
  'venue': 'Wissenschaftstheorie und gestaltungsorientierte …',
  'abstract': ', are based on a theory of linguistic phenomena.”  language theories in IS research: Speech  Act Theory (Subsection 2.1), Theory of Communicative Action (2.2), Structural Linguistics (2.3'},
 'filled': False,
 'gsrank': 1,
 'pub_url': 'https://link.springer.com/chapter/10.1007/978-3-7908-2336-3_5',
 'author_id': ['Fw3nrmAAAAAJ', 'xH4aTOsAAAAJ'],
 'url_scholarbib': '/scholar?hl=en&q=info:udyY4s0q2PUJ:scholar.google.com/&output=cite&scirp=0&hl=en',
 'url_add_sclib': '/citations?hl=en&xsrf=&continue=/scholar%3Fq%3DLinguistic%2BTheories%2Bin%2BIS%2BResearch%2B-%2BA%2BReview%2BBecker%2BJ,%2BFielenbach%2BK,%2BNiehaves%2BB%26hl%3Den%26as_sdt%3D0,33&citilm=1&update_op