In [1]:
from scholarly import scholarly
from scholarly import ProxyGenerator
import os
import csv



In [2]:
# Activates proxy because Google Scholar otherwise might block the IP address
pg = ProxyGenerator()
scholarly.use_proxy(pg, pg)

In [3]:
# This list contains the names of all faculty members who's Google Scholar profiles should be scraped.
# Make sure that the authors have a Google Scholar profile.
# It is important that the names exactly match the name listed on the respective Google Scholar profile.

author_names = ['David Blaauw']

In [5]:
keys = ['author','title','num_citations','number_of_co_authors','pub_year']
for year in range(2000,2023): # Creates a column for the citations in each year respectively. Currently, it's from 2000 until 2022. 
    keys.append(f'{year}') # Define the time span you're interested in by changing the years.

output_file =  open('faculty_pubs.csv', 'a', newline='')
dict_writer = csv.DictWriter(output_file, keys)
if os.path.getsize('faculty_pubs.csv') == 0:
    dict_writer.writeheader()
    output_file.flush()
dict_reader = csv.DictReader(open('faculty_pubs.csv', newline=''), keys)
csv_reader = [row for row in dict_reader]



for name in author_names:
    author = next(scholarly.search_author(name))
    author =  scholarly.fill((author),sections = ['publications'])
    pubs = author['publications']
    author_pubs_in_file = [row for row in csv_reader if row['author'] == name]
    num_author_pubs_in_file = len(author_pubs_in_file) 
    if num_author_pubs_in_file >= len(author['publications']): continue
    i = 0
    for pub in pubs:
        if i < num_author_pubs_in_file: 
            i += 1
            continue
        pub = scholarly.fill(pub)
        if len([x for x in author_pubs_in_file if x['title'] == pub['bib']['title']]) > 0 : continue
        pub_res = {
            "author":name,
            "title":pub['bib']['title'],
            "num_citations":pub['num_citations'],
            }

        if 'author' in pub['bib']:
            pub_res["number_of_co_authors"] = len(pub['bib']['author'].split(' and ')) - 1
        else:
            pub_res["number_of_co_authors"] = ''

        if 'pub_year' in pub['bib']:
            pub_res['pub_year'] = pub['bib']['pub_year']
        else:
            pub_res['pub_year'] = ''
        
        for year in range(2000,2023):
            if year in pub['cites_per_year']:
                pub_res[f'{year}'] = pub['cites_per_year'][year]
            else:
                pub_res[f'{year}'] = ''
        
        dict_writer.writerows([pub_res])
        output_file.flush()

MaxTriesExceededException: Cannot Fetch from Google Scholar.

In [152]:
# Here starts the code that I wrote

In [153]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import pandas as pd
import time

In [154]:
chrome_options = webdriver.ChromeOptions()
chrome_options.add_experimental_option("detach", True)

driver = webdriver.Chrome(options=chrome_options)
driver.implicitly_wait(2)

search_command = "Alex Carsello"
url = "https://scholar.google.com/scholar?hl=en&as_sdt=0%2C23&q=" + search_command + "&btnG="

In [155]:
driver.get(url)

In [156]:
# Go to actual link that contains list

driver.find_element(By.XPATH, '//*[@id="gs_res_ccl_mid"]/div[1]/table/tbody/tr/td[2]/h4/a').click()
time.sleep(1)
driver.find_element(By.XPATH, '//*[@id="gsc_a_ha"]').click()

In [157]:
#Show more button

while(1):
    if(driver.find_element(By.XPATH, '//*[@id="gsc_bpf_more"]').get_attribute('disabled')):
        break
    driver.find_element(By.XPATH, '//*[@id="gsc_bpf_more"]').click()
    time.sleep(1)
print("Show Everything")

Show Everything


In [158]:
# Get all paper links

paper_link_list = []
num = 1
while(1):
    link = '#gsc_a_b > tr:nth-child(' + str(num) + ') > td.gsc_a_t > a'
    try:
        paper_link = driver.find_elements(By.CSS_SELECTOR, link)[0].get_attribute("href")
    except:
        break
    num += 1
    paper_link_list.append(paper_link)

len(paper_link_list)
print(paper_link_list[0])

https://scholar.google.com/citations?view_op=view_citation&hl=en&user=7BlVLHkAAAAJ&sortby=pubdate&citation_for_view=7BlVLHkAAAAJ:d1gkVwhDpl0C


In [159]:
# Save the information

title_list = []
authors_list = []
date_list = []
journal_list = []

stop_year = 2022

for i in paper_link_list:
    driver.get(i)
    title = driver.find_element(By.CLASS_NAME, 'gsc_oci_title_link').text
    authors = driver.find_elements(By.CLASS_NAME, 'gsc_oci_value')[0].text
    date = driver.find_elements(By.CLASS_NAME, 'gsc_oci_value')[1].text
    journal = driver.find_elements(By.CLASS_NAME, 'gsc_oci_value')[2].text
    #print(title)
    #print(authors)
    #print(date)
    #print(journal)
    if(date.split('/')[0] == stop_year):
        break
    title_list.append(title)
    authors_list.append(authors)
    date_list.append(date)
    journal_list.append(journal)
    time.sleep(5)
driver.close()

In [160]:
new_authors_list = []
for i in authors_list:
    tmp = i.split(",")
    tmp_list = []
    for j in tmp:
        tmp_list.append("[[" + j.strip() + "]]")
    new_authors_list.append(tmp_list)
#print(new_authors_list[0])


In [161]:
print("Total Length : ",len(title_list), len(new_authors_list))

Total Length :  5 5


In [162]:
file_name = "./" + search_command + ".txt"
print(file_name)
file = open(file_name,'w')

for i in range(len(title_list)):
    if(journal_list[i] == 'US'):
        continue
    print(date_list[i], file=file)
    print("[[", title_list[i], "]]", sep='', file=file)
    for index, j in enumerate(new_authors_list[i]):
        if(index == (len(new_authors_list[i])-1)):
            print(j,"", sep='', end='\n', file=file)
        else:
            print(j,", ", sep='', end='', file=file)
    print(journal_list[i], file=file)
    print('-------------------------------------------------------------------------------------------------------------------------', file=file)

file.close()

./Alex Carsello.txt


In [123]:
for i in new_authors_list[0]:
    print(i, sep='', end=', ')

[[Tim Dunn]], [[David Blaauw]], [[Reetuparna Das]], [[Satish Narayanasamy]], 

In [None]:
title = driver.find_element(By.CLASS_NAME, 'gsc_oci_title_link').text
authors = driver.find_elements(By.CLASS_NAME, 'gsc_oci_value')[0].text
date = driver.find_elements(By.CLASS_NAME, 'gsc_oci_value')[1].text
journal = driver.find_elements(By.CLASS_NAME, 'gsc_oci_value')[2].text
print(title)
print(authors)
print(date)
print(journal)

In [86]:
driver.get(paper_link)

In [99]:
title = driver.find_element(By.CLASS_NAME, 'gsc_oci_title_link').text
authors = driver.find_elements(By.CLASS_NAME, 'gsc_oci_value')[0].text
date = driver.find_elements(By.CLASS_NAME, 'gsc_oci_value')[1].text
journal = driver.find_elements(By.CLASS_NAME, 'gsc_oci_value')[2].text
print(title)
print(authors)
print(date)
print(journal)

nPoRe: n-polymer realigner for improved pileup-based variant calling
Tim Dunn, David Blaauw, Reetuparna Das, Satish Narayanasamy
2023/12
BMC bioinformatics


In [100]:
driver.find_element(By.XPATH, '//*[@id="gs_hdr_bck"]/span[1]').click()

In [None]:
#gsc_a_b > tr:nth-child#gsc_a_b > tr:nth-child(940)

In [62]:
driver.find_elements(By.CLASS_NAME, 'gsc_a_t')[1].get_attribute('a')

In [68]:
driver.find_elements(By.CSS_SELECTOR, '#gsc_a_b > tr:nth-child(1) > td.gsc_a_t > a')[0].get_attribute("href")

'https://scholar.google.com/citations?view_op=view_citation&hl=en&user=P3JdmqAAAAAJ&sortby=pubdate&citation_for_view=P3JdmqAAAAAJ:w_ORaKkuc5QC'

In [46]:

paper_list = driver.find_elements(By.CLASS_NAME, 'gsc_a_tr')

In [50]:
paper_list[0].click()

In [79]:
#Show more button

while(driver.find_element(By.XPATH, '//*[@id="gsc_bpf_more"]').get_attribute('disabled') == False):
    driver.find_element(By.XPATH, '//*[@id="gsc_bpf_more"]').click()
    time.sleep(1)
print("Show Everything")




Show Everything


In [77]:
driver.find_element(By.XPATH, '//*[@id="gsc_bpf_more"]').get_attribute('disabled')

'true'

In [9]:
search_url = driver.find_elements(By.CLASS_NAME, "gs_rt2")

In [None]:
driver.find_element(By.CSS_SELECTOR, "")

In [31]:
search_url[0].get_attribute('href')

In [44]:
#links = "#gs_res_ccl_mid > div:nth-child(1) > table > tbody > tr > td:nth-child(2) > h4 > a"
#search_url = driver.find_elements(By.CSS_SELECTOR, links)
#search_url[0].get_attribute("href")


#driver.find_element_by_xpath('//*[@id="gs_res_ccl_mid"]/div[1]/table/tbody/tr/td[2]/h4/a').click()
driver.find_element(By.XPATH, '//*[@id="gs_res_ccl_mid"]/div[1]/table/tbody/tr/td[2]/h4/a').click()
time.sleep(1)
driver.find_element(By.XPATH, '//*[@id="gsc_a_ha"]').click()


In [None]:



#time.sleep(10)

#search_url = driver.find_elements(By.CLASS_NAME, "gs_rt2")

links = "#gs_res_ccl_mid > div:nth-child(1) > table > tbody > tr > td:nth-child(2) > h4 > a"

search_url = driver.find_elements(By.CSS_SELECTOR, links)

print("주소: ", search_url.get_attribute("href"))

print("Hi")




#driver.close()

