In [1]:
import json
import time

import requests
from scholarly import scholarly, ProxyGenerator
import pprint
import pandas as pd
import numpy as np

# Init proxy

In [8]:
def handle_proxy(proxy=None):
    """
    Configure which proxy scholarly uses for its requests.

    :param proxy: proxy mode to activate:
        ``None`` (default) passes ``None`` to ``scholarly.use_proxy``;
        ``'free'`` sets up ``ProxyGenerator.FreeProxies()``;
        ``'scraper_api'`` sets up ``ProxyGenerator.ScraperAPI`` with the key read
        from the ``SCRAPER_API_KEY`` environment variable.
        Any other value only prints a notice and leaves the proxy setup untouched.
    :return: None
    """
    import os  # local import so this cell does not depend on changes to the import cell

    if proxy is None:
        scholarly.use_proxy(None)
    elif proxy == 'free':
        pg = ProxyGenerator()
        success = pg.FreeProxies()
        print(f'Free proxy success: {success}')
        scholarly.use_proxy(pg)
    elif proxy == 'scraper_api':
        # Never keep the API key in the notebook itself; read it from the environment.
        api_key = os.environ.get('SCRAPER_API_KEY')
        if not api_key:
            print('ScraperAPI key missing: set the SCRAPER_API_KEY environment variable')
            return
        pg = ProxyGenerator()
        success = pg.ScraperAPI(api_key)
        # Report the outcome the same way the 'free' branch does.
        print(f'ScraperAPI proxy success: {success}')
        scholarly.use_proxy(pg)
    else:
        print('No proxy recognized')

# Methods
## Search author (with fill parameter)

In [21]:
# Query Google Scholar for authors matching the free-text name/affiliation string.
search_query = scholarly.search_author('Fabian Gieseke, Münster')
# Keep only the first hit of the result iterator (next() raises StopIteration if it is empty).
author = next(search_query)
# Optional follow-up, left disabled: fill the author record with further details.
# res = scholarly.fill(author, sections=[])

## Method: author details

In [18]:
def search_author_information_from_google_scholar(details, sections=None):
    """
    Look up an author on Google Scholar and return the filled record of the first hit.

    :param details: free-text query passed to ``scholarly.search_author``
    :param sections: sections passed to ``scholarly.fill``; when omitted or ``None``
        it defaults to ``['basics', 'indices', 'counts']``
    :return: the filled author record without the bookkeeping keys
        'container_type', 'filled' and 'source'
    :raises StopIteration: if the search yields no author
    """
    # Build the default per call instead of sharing one mutable list between calls.
    if sections is None:
        sections = ['basics', 'indices', 'counts']

    result = scholarly.search_author(details)
    # get first result
    found_author = next(result)
    filled_author = scholarly.fill(found_author, sections=sections)

    # Remove unwanted bookkeeping values; pop() with a default tolerates keys
    # that are already absent instead of raising KeyError.
    for unwanted_key in ('container_type', 'filled', 'source'):
        filled_author.pop(unwanted_key, None)

    return filled_author


# Proxy setup is left disabled here; uncomment to configure it before the lookup.
#handle_proxy()
# Fetch the filled record of the first author matching the query string.
prof_gieseke = search_author_information_from_google_scholar('Fabian Gieseke, University of Munster')
# Show which fields the returned record contains.
print(prof_gieseke.keys())

dict_keys(['scholar_id', 'url_picture', 'name', 'affiliation', 'email_domain', 'interests', 'citedby', 'citedby5y', 'hindex', 'hindex5y', 'i10index', 'i10index5y', 'cites_per_year', 'organization', 'homepage'])


## Search by author id

In [50]:
# Look the author up directly by Google Scholar id; no explicit scholarly.fill() call is made here.
author = scholarly.search_author_id('g3ZiieoAAAAJ')  # short form, without a separate fill step
# Display the returned record as the cell output.
author

{'container_type': 'Author',
 'filled': ['basics'],
 'scholar_id': 'g3ZiieoAAAAJ',
 'source': <AuthorSource.AUTHOR_PROFILE_PAGE: 'AUTHOR_PROFILE_PAGE'>,
 'name': 'Fabian Gieseke',
 'url_picture': 'https://scholar.googleusercontent.com/citations?view_op=view_photo&user=g3ZiieoAAAAJ&citpid=2',
 'affiliation': 'Department of Information Systems, University of Münster',
 'organization': 12991841939693602029,
 'interests': ['Data Engineering', 'Maschine Learning'],
 'email_domain': '@uni-muenster.de',
 'homepage': 'https://www.wi.uni-muenster.de/de/institut/dasc',
 'citedby': 1361}

## Search orga and authors by orga id

Searching for WWU as orga

In [8]:
# Search organizations by name and keep the first hit.
# NOTE(review): indexing with [0] presumably fails if the search returns nothing - confirm.
orga = scholarly.search_org('Münster')[0]  # taking first result
# The organization id is needed for the author-by-organization search below.
wwu_orga_id = orga.get('id')
print(f'WWU Orga ID: {wwu_orga_id}')

WWU Orga ID: 12991841939693602029


Using WWU orga id to search for authors

In [9]:
# List authors affiliated with the organization id obtained in the previous cell.
search_query_author_by_orga_wwu = scholarly.search_author_by_organization(
    wwu_orga_id)  # an iterator; loop over it (or call next) to get the authors
author = next(search_query_author_by_orga_wwu)  # first result of the iterator
# Display the record as the cell output.
author

{'container_type': 'Author',
 'filled': [],
 'source': <AuthorSource.SEARCH_AUTHOR_SNIPPETS: 'SEARCH_AUTHOR_SNIPPETS'>,
 'scholar_id': 'U5xmC3IAAAAJ',
 'url_picture': 'https://scholar.google.com/citations?view_op=medium_photo&user=U5xmC3IAAAAJ',
 'name': 'Frank Glorius',
 'affiliation': 'WWU Muenster',
 'email_domain': '@uni-muenster.de',
 'interests': ['Organic Chemistry',
  'Photocatalysis',
  'CH Activation',
  'Organometallic Chemistry',
  'Asymmetric Catalysis'],
 'citedby': 58871}

## Search by keyword

In [4]:
# Search by keyword; the records returned are authors (see 'container_type' in the output).
search_query_keywords = scholarly.search_keyword('Data Engineering')
next(search_query_keywords)  # returns the first result of the iterator

{'container_type': 'Author',
 'filled': [],
 'source': <AuthorSource.SEARCH_AUTHOR_SNIPPETS: 'SEARCH_AUTHOR_SNIPPETS'>,
 'scholar_id': 'ui0P8NYAAAAJ',
 'url_picture': 'https://scholar.google.com/citations?view_op=medium_photo&user=ui0P8NYAAAAJ',
 'name': 'David Rio Deiros',
 'affiliation': 'Scientific Software Engineer, Baylor College Of Medicine',
 'email_domain': '@bcm.edu',
 'interests': ['bioinformatics',
  'genomics',
  'nextgen sequencing',
  'visualization',
  'data'],
 'citedby': 10039}

## Search for publications (and their links, dois)
Scholarly is a good way to find the link to a publication when we only have partial information about it. This can be used to complete the incomplete CRIS data. Unfortunately, Google blocks the bot very quickly (probably based on the IP address).

In [7]:
# Search publications by title; the first hit carries the link in its 'pub_url' field.
search_query_publication = scholarly.search_pubs(
    query='How Regular Are Conjugate Exponential Families?')
# Display the first hit of the result iterator as the cell output.
next(search_query_publication)

{'container_type': 'Publication',
 'source': <PublicationSource.PUBLICATION_SEARCH_SNIPPET: 'PUBLICATION_SEARCH_SNIPPET'>,
 'bib': {'title': 'How regular are conjugate exponential families?',
  'author': ['U Müller-Funk', 'F Pukelsheim'],
  'pub_year': '1989',
  'venue': 'Statistics & probability letters',
  'abstract': "Given an exponential family of sampling distributions of order k, one may construct in a natural way an exponential family of conjugate (that is, prior) distributions depending on a k-dimensional parameter c and an additional weight w> 0. We compute the bias term by which the expectation of the sampling mean-value parameter under the conjugate distribution deviates from the conjugate parameter c. This bias term vanishes for regular exponential families, providing an appealing interpretation of the conjugate parameter c as a 'prior"},
 'filled': False,
 'gsrank': 1,
 'pub_url': 'https://www.sciencedirect.com/science/article/pii/016771528990117X',
 'author_id': ['', ''],

## Get the url of publication by search query

In [20]:
from scholarly import MaxTriesExceededException


def get_url_from_publication_with_scholarly(search_query):
    """
    Return the URL of the first publication Google Scholar finds for a query.

    :param search_query: free-text query passed to ``scholarly.search_pubs``
    :return: value of the 'pub_url' field of the first hit, or None when the
        search has no hits or the hit has no 'pub_url' field
    """
    scholarly_search = scholarly.search_pubs(search_query)
    # A default for next() turns "no results" into None instead of a StopIteration,
    # matching the None already returned when 'pub_url' is missing.
    publication = next(scholarly_search, None)
    if publication is None:
        return None

    return publication.get('pub_url')


# Time a single publication lookup. `time` is already imported in the first cell;
# perf_counter() is the clock intended for measuring elapsed intervals.
start = time.perf_counter()
#handle_proxy('scraper_api')
test_url = get_url_from_publication_with_scholarly(
    'Exploring customers’ likeliness to use e-service touchpoints in brick and mortar retail, Benjamin Barann')
end = time.perf_counter()
print(f'Time: {end - start}')

Time: 46.76773524284363


In [19]:
# Call handle_proxy with its default argument (proxy=None), i.e. the no-proxy branch.
handle_proxy()

## Citedby search

In [67]:
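# Disabled on purpose: running the citedby lookup below triggers a ban by Google Scholar (see the error noted at the end of this cell)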
#  publication = next(
#scholarly.search_pubs('Massively-Parallel Change Detection for Satellite Time Series Data with Missing Values'))
#query_cited_by = scholarly.citedby(publication)
# print all citations
#next(query_cited_by)

# Error: MaxTriesExceededException: Cannot Fetch from Google Scholar -> prob IP ban

KeyboardInterrupt: 

## Using proxies to bypass google scholar security
This does not work very well: Google still blocks our requests quite quickly.

In [13]:
pg = ProxyGenerator()  # create the proxy generator that is configured in the next cells

We use a free proxy here; we could switch to a more powerful (paid) proxy if needed during the course of the project.

In [14]:
success = pg.FreeProxies()  # activate a free proxy and check whether it is working
success  # displayed as the cell output; True if the proxy is working

True

Activate proxy

In [15]:
# Hand the configured proxy generator to scholarly.
scholarly.use_proxy(pg)

In [16]:
# Same author query as earlier in the notebook, now run after scholarly.use_proxy(pg).
search_query = scholarly.search_author('Fabian Gieseke, Münster')
# Display the first hit of the result iterator as the cell output.
next(search_query)

{'container_type': 'Author',
 'filled': [],
 'source': <AuthorSource.SEARCH_AUTHOR_SNIPPETS: 'SEARCH_AUTHOR_SNIPPETS'>,
 'scholar_id': 'g3ZiieoAAAAJ',
 'url_picture': 'https://scholar.google.com/citations?view_op=medium_photo&user=g3ZiieoAAAAJ',
 'name': 'Fabian Gieseke',
 'affiliation': 'Department of Information Systems, University of Münster',
 'email_domain': '@uni-muenster.de',
 'interests': ['Data Engineering', 'Maschine Learning'],
 'citedby': 1361}

In [7]:
# Fetch the first publication hit for the query; the recorded output below shows
# this request ending in MaxTriesExceededException.
publication = next(
    scholarly.search_pubs('Big Universe, Big Data: Machine Learning and Image Analysis for Astronomy, Fabian Gieseke'))
# Citation lookup for the publication, left disabled:
#query_cited_by = scholarly.citedby(publication)
# print all citations
#next(query_cited_by)

MaxTriesExceededException: Cannot Fetch from Google Scholar.

In [None]:
# Publication records are plain dict-like objects and have no get_journal_categories();
# the journal-category lookup is a method of the scholarly object itself.
scholarly.get_journal_categories()