In [12]:
import json
import requests
from scholarly import scholarly, ProxyGenerator
import pprint
import pandas as pd
import numpy as np

# Methods
## Search author (with fill parameter)

In [10]:
# search_author yields matching author records; it is consumed with next(),
# so only the first match for the name/affiliation query is used.
search_query = scholarly.search_author('Fabian Gieseke, Münster')
author = next(search_query)
# fill() is called with an empty `sections` list; in the recorded output the
# 'filled' key then lists every section (basics, indices, counts, coauthors,
# publications, public_access).
res = scholarly.fill(author, sections=[])

In [11]:
# Display the filled author record.
res

{'container_type': 'Author',
 'filled': ['basics',
  'indices',
  'counts',
  'coauthors',
  'publications',
  'public_access'],
 'source': <AuthorSource.SEARCH_AUTHOR_SNIPPETS: 'SEARCH_AUTHOR_SNIPPETS'>,
 'scholar_id': 'g3ZiieoAAAAJ',
 'url_picture': 'https://scholar.google.com/citations?view_op=medium_photo&user=g3ZiieoAAAAJ',
 'name': 'Fabian Gieseke',
 'affiliation': 'Department of Information Systems, University of Münster',
 'email_domain': '@uni-muenster.de',
 'interests': ['Data Engineering', 'Maschine Learning'],
 'citedby': 1361,
 'organization': 12991841939693602029,
 'homepage': 'https://www.wi.uni-muenster.de/de/institut/dasc',
 'citedby5y': 1116,
 'hindex': 20,
 'hindex5y': 18,
 'i10index': 31,
 'i10index5y': 25,
 'cites_per_year': {2009: 5,
  2010: 6,
  2011: 8,
  2012: 13,
  2013: 38,
  2014: 41,
  2015: 65,
  2016: 65,
  2017: 85,
  2018: 108,
  2019: 151,
  2020: 156,
  2021: 291,
  2022: 314},
 'coauthors': [{'container_type': 'Author',
   'filled': [],
   'scholar_i

## Search by author id

In [50]:
# Look the profile up directly by its Google Scholar ID. No fill() call is made
# here, so the record is in the short form (the output lists only 'basics'
# under 'filled').
author = scholarly.search_author_id('g3ZiieoAAAAJ')  # now in the short form without fill
author

{'container_type': 'Author',
 'filled': ['basics'],
 'scholar_id': 'g3ZiieoAAAAJ',
 'source': <AuthorSource.AUTHOR_PROFILE_PAGE: 'AUTHOR_PROFILE_PAGE'>,
 'name': 'Fabian Gieseke',
 'url_picture': 'https://scholar.googleusercontent.com/citations?view_op=view_photo&user=g3ZiieoAAAAJ&citpid=2',
 'affiliation': 'Department of Information Systems, University of Münster',
 'organization': 12991841939693602029,
 'interests': ['Data Engineering', 'Maschine Learning'],
 'email_domain': '@uni-muenster.de',
 'homepage': 'https://www.wi.uni-muenster.de/de/institut/dasc',
 'citedby': 1361}

## Search orga and authors by orga id

Searching for WWU as orga

In [8]:
# Search organizations matching 'Münster' and keep the first hit.
# NOTE(review): assumes the search returns at least one result and that the
# first one is the intended university — confirm before relying on the ID.
orga = scholarly.search_org('Münster')[0]  # taking first result
wwu_orga_id = orga.get('id')
print(f'WWU Orga ID: {wwu_orga_id}')

WWU Orga ID: 12991841939693602029


Using WWU orga id to search for authors

In [9]:
# Query the authors that belong to the organization ID stored in wwu_orga_id.
search_query_author_by_orga_wwu = scholarly.search_author_by_organization(
    wwu_orga_id)  # an iterator, you need to loop through
author = next(search_query_author_by_orga_wwu)  # first result of iterator
# Display the (unfilled) author record as the cell output.
author

{'container_type': 'Author',
 'filled': [],
 'source': <AuthorSource.SEARCH_AUTHOR_SNIPPETS: 'SEARCH_AUTHOR_SNIPPETS'>,
 'scholar_id': 'U5xmC3IAAAAJ',
 'url_picture': 'https://scholar.google.com/citations?view_op=medium_photo&user=U5xmC3IAAAAJ',
 'name': 'Frank Glorius',
 'affiliation': 'WWU Muenster',
 'email_domain': '@uni-muenster.de',
 'interests': ['Organic Chemistry',
  'Photocatalysis',
  'CH Activation',
  'Organometallic Chemistry',
  'Asymmetric Catalysis'],
 'citedby': 58871}

## Search by keyword

In [4]:
# Search author profiles by keyword; the result is consumed with next(), so
# only the first matching author is shown.
search_query_keywords = scholarly.search_keyword('Data Engineering')
next(search_query_keywords)  # returns first result

{'container_type': 'Author',
 'filled': [],
 'source': <AuthorSource.SEARCH_AUTHOR_SNIPPETS: 'SEARCH_AUTHOR_SNIPPETS'>,
 'scholar_id': 'ui0P8NYAAAAJ',
 'url_picture': 'https://scholar.google.com/citations?view_op=medium_photo&user=ui0P8NYAAAAJ',
 'name': 'David Rio Deiros',
 'affiliation': 'Scientific Software Engineer, Baylor College Of Medicine',
 'email_domain': '@bcm.edu',
 'interests': ['bioinformatics',
  'genomics',
  'nextgen sequencing',
  'visualization',
  'data'],
 'citedby': 10039}

## Search for publications
`scholarly` is a good way to find the link to a publication when we only have partial information about it. This can be used with the incomplete CRIS data. Unfortunately, Google is very quick to block the bot (probably based on the IP?).

In [10]:
# Search publications by title and request only the first match via next().
# NOTE: in the recorded run this cell did not finish and was interrupted
# manually (see the KeyboardInterrupt output).
search_query_publication = scholarly.search_pubs(
    query='Massively-Parallel Change Detection for Satellite Time Series Data with Missing Values')
next(search_query_publication)

KeyboardInterrupt: 

## Citedby search

In [67]:
# Take the first publication matching the title, then query the works citing it.
publication = next(
    scholarly.search_pubs('Massively-Parallel Change Detection for Satellite Time Series Data with Missing Values'))
query_cited_by = scholarly.citedby(publication)
# Fetch only the first citing publication (next() pulls a single item; it does
# not print all citations).
next(query_cited_by)

# Error: MaxTriesExceededException: Cannot Fetch from Google Scholar -> probably an IP ban

KeyboardInterrupt: 

## Using proxies to bypass google scholar security
This does not work very well: Google blocks our requests pretty fast.

In [13]:
# Create the proxy generator; a concrete proxy source is selected on it in a
# later cell.
pg = ProxyGenerator()  # init proxy generator

We use a free proxy; we could switch to a more powerful (paid) proxy if needed during the course of the project.

In [14]:
success = pg.FreeProxies()  # activate a free proxy and check whether it is working
success  # True if the proxy is working

True

Activate proxy

In [15]:
# Activate the proxy by handing the configured ProxyGenerator to scholarly.
scholarly.use_proxy(pg)

In [16]:
# Same author query as in the first search cell, now issued after the proxy
# has been activated; only the first match is shown.
search_query = scholarly.search_author('Fabian Gieseke, Münster')
next(search_query)

{'container_type': 'Author',
 'filled': [],
 'source': <AuthorSource.SEARCH_AUTHOR_SNIPPETS: 'SEARCH_AUTHOR_SNIPPETS'>,
 'scholar_id': 'g3ZiieoAAAAJ',
 'url_picture': 'https://scholar.google.com/citations?view_op=medium_photo&user=g3ZiieoAAAAJ',
 'name': 'Fabian Gieseke',
 'affiliation': 'Department of Information Systems, University of Münster',
 'email_domain': '@uni-muenster.de',
 'interests': ['Data Engineering', 'Maschine Learning'],
 'citedby': 1361}

In [7]:
# Take the first publication matching the title.
publication = next(scholarly.search_pubs('Big Universe, Big Data: Machine Learning and Image Analysis for Astronomy'))
# The cited-by lookup below is left disabled: in the recorded run the
# publication search above already fails with MaxTriesExceededException.
#query_cited_by = scholarly.citedby(publication)
# would fetch the first citing publication
#next(query_cited_by)

MaxTriesExceededException: Cannot Fetch from Google Scholar.

In [None]:
# Journal categories are queried through the `scholarly` object itself, not
# through a publication record (the records returned by scholarly are
# dict-like and expose no such method). This call also does not depend on
# `publication`, which is never bound when the preceding search fails.
scholarly.get_journal_categories()