In this notebook, I randomly select 100 papers and check how many of them have citation counts from Crossref, Scopus, and Web of Science. 

In [1]:
# Load packages
import pandas as pd
import time
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC 
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.firefox.options import Options
from selenium.common.exceptions import NoSuchElementException 
from selenium.common.exceptions import ElementNotInteractableException
import os
import random
import re
import numpy as np
import sys
from os.path import join as pjoin
import csv

In [2]:
# The file has no header row; its first column is read into a plain Python list.
dois = pd.read_csv(
    '../../data/processed/vispd_plus_good_papers.txt', header=None
).iloc[:, 0].tolist()

In [3]:
"""randomly select 100 dois
"""
# Fixed seed: the same 100 DOIs are drawn whenever the input list is unchanged.
random.seed(4213)
random_dois = random.sample(dois, k=100)

In [4]:
"""insert 'http://dx.doi.org/' to the start of each randomly selected doi
"""
# Prepending a constant needs no regex; plain concatenation produces the same
# strings as re.sub(r'^', prefix, doi) and is easier to read.
urls = ['http://dx.doi.org/' + doi for doi in random_dois]

In [5]:
def specify_driver_options():
    """
    Build the Firefox ``Options`` object used to start the browser.

    Returns:
        An ``Options`` instance with three download-related preferences set.
    """
    download_preferences = {
        "browser.download.folderList": 2,
        "browser.download.manager.showWhenStarting": False,
        "browser.helperApps.neverAsk.saveToDisk":
            "text/plain, text/txt, application/plain, application/txt",
    }
    firefox_options = Options()
    # Apply the preferences in the order they are listed above.
    for pref_name, pref_value in download_preferences.items():
        firefox_options.set_preference(pref_name, pref_value)
    return firefox_options

In [6]:
# Build the Firefox options once; they are handed to webdriver.Firefox when the
# browser session is started further down.
options = specify_driver_options()

In [7]:
def check_exists(ByMethod, value):
    """Report whether an element matching the locator shows up on the page.

    Uses the notebook-level ``wait`` object, so the check blocks until the
    element is present or the wait times out.

    Args:
        ByMethod: locator strategy (one of the ``By`` constants).
        value: locator value to look for.

    Returns:
        True if the element became present, False otherwise.
    """
    locator = (ByMethod, value)
    try:
        wait.until(EC.presence_of_element_located(locator))
        return True
    except (NoSuchElementException, TimeoutException):
        return False

In [8]:
def close_ieee_cookie_window():
    """Close the cookie window on the IEEE website, if one is shown.

    This is best-effort: when the banner is absent, or cannot be clicked,
    the function returns without raising so scraping can continue.
    """
    # Imported locally so this cell works without touching the import cell.
    from selenium.common.exceptions import WebDriverException

    if not check_exists(By.CLASS_NAME, "cc-compliance"):
        return

    try:
        cc = wait.until(EC.element_to_be_clickable((
            By.CLASS_NAME, "cc-compliance"
        )))
        cc.click()
    except WebDriverException:
        # Only browser-side failures are ignored; unlike a bare ``except``,
        # KeyboardInterrupt and programming errors still propagate.
        pass

In [9]:
def open_ieee_url(url, doi_index, max_retries=3):
    """Load ``url`` in the shared browser, retrying when the page load times out.

    Args:
        url: address to open.
        doi_index: 1-based position of the DOI in the run; the cookie window
            is only dismissed when this equals 1.
        max_retries: how many extra attempts are made after the first load
            times out.

    Raises:
        TimeoutException: if the page still times out after all retries.
    """
    for attempt in range(max_retries + 1):
        try:
            driver.get(url)
            # only run close_ieee_cookie_window once (for the first doi)
            if doi_index == 1:
                close_ieee_cookie_window()
            time.sleep(1)
            return
        except TimeoutException:
            # A bounded loop replaces the former unbounded recursion. The next
            # driver.get() reloads the page, so no separate refresh is needed.
            if attempt == max_retries:
                raise

In [10]:
def get_html():
    """Wait for the page's ``<html>`` element and return it.

    Returns:
        The element located by tag name "html".

    Usage:
        The returned element receives key presses to scroll to the bottom
        of the page.
    """
    html_locator = (By.TAG_NAME, "html")
    return wait.until(EC.presence_of_element_located(html_locator))

In [11]:
def open_metrics_tab(max_retries=3):
    """Scroll to the bottom of the page and click the metrics tab.

    Locating the tab, scrolling and clicking are all retried together: after a
    failed attempt the page is refreshed and the tab is looked up again.

    Args:
        max_retries: how many extra attempts are made after the first failure.

    Raises:
        TimeoutException, ElementNotInteractableException: if the tab still
            cannot be opened after all retries.
    """
    for attempt in range(max_retries + 1):
        try:
            # The wait sits inside the try so that its TimeoutException is
            # actually covered by the retry handler below.
            metrics_tab = wait.until(EC.element_to_be_clickable((
                By.XPATH, '//div[@id="metrics-header"]/a[@id="metrics"]'
            )))
            html = get_html()
            html.send_keys(Keys.END)
            metrics_tab.click()
            return
        except (TimeoutException, ElementNotInteractableException):
            # Bounded retries replace the former unbounded recursion.
            if attempt == max_retries:
                raise
            time.sleep(1)
            driver.refresh()

In [12]:
def get_citation_info_dict(doi):
    """Collect the citation counts shown on the currently open page.

    Args:
      doi: identifier stored under the 'DOI' key of the result.

    Returns:
      a dict mapping each citation source name to its integer count, plus a
      'DOI' entry; when no citation buttons are found the dict holds only 'DOI'.

    Note:
      ',' is stripped from counts before conversion; '®' is removed from
      source names and line breaks in them become spaces. A button whose count
      text is not a number (for example, still empty) is skipped.
    """
    citation_info_dict = {}
    if check_exists(By.CLASS_NAME, "citations-button.regular"):
        citation_counts = driver.find_elements(
            By.XPATH, '//a[@class="citations-button regular"]/child::div[1]'
            )
        citation_sources = driver.find_elements(
            By.XPATH, '//a[@class="citations-button regular"]/child::div[2]'
            )
        for count_element, source_element in zip(citation_counts, citation_sources):
            try:
                count = int(count_element.text.replace(',', ''))
            except ValueError:
                # One unreadable count must not abort the whole scrape.
                continue
            source = source_element.text.replace('®', '').replace('\n', ' ')
            citation_info_dict[source] = count
    citation_info_dict['DOI'] = doi
    return citation_info_dict

In [13]:
# open_ieee_url(urls[2])

In [14]:
# open_metrics_tab()

In [15]:
# citation_info_dict = get_citation_info_dict(urls[0])

In [16]:
# citation_info_dict

In [17]:
# Start one Firefox session for the whole run. The helper functions read
# `driver` and `wait` as notebook-level globals.
driver = webdriver.Firefox(options=options)
wait = WebDriverWait(driver, timeout=10)

In [18]:
citation_info_dict_list = []
# enumerate gives every DOI its true 1-based position. list.index() rescanned
# the list on each iteration and returned the first match for repeated DOIs.
for doi_index, doi in enumerate(random_dois, start=1):
    url = 'http://dx.doi.org/' + doi
    open_ieee_url(url, doi_index)
    open_metrics_tab()
    # NOTE: the full URL, not the bare DOI, is what ends up under the 'DOI' key.
    citation_info_dict = get_citation_info_dict(url)
    citation_info_dict_list.append(citation_info_dict)
    print(f'{doi_index} is done')
    time.sleep(1)

1 is done
2 is done
3 is done
4 is done
5 is done
6 is done
7 is done
8 is done
9 is done
10 is done
11 is done
12 is done
13 is done
14 is done
15 is done
16 is done
17 is done
18 is done
19 is done
20 is done
21 is done
22 is done
23 is done
24 is done
25 is done
26 is done
27 is done
28 is done
29 is done
30 is done
31 is done
32 is done
33 is done
34 is done
35 is done
36 is done
37 is done
38 is done
39 is done
40 is done
41 is done
42 is done
43 is done
44 is done
45 is done
46 is done
47 is done
48 is done
49 is done
50 is done
51 is done
52 is done
53 is done
54 is done
55 is done
56 is done
57 is done
58 is done
59 is done
60 is done
61 is done
62 is done
63 is done
64 is done
65 is done
66 is done
67 is done
68 is done
69 is done
70 is done
71 is done
72 is done
73 is done
74 is done
75 is done
76 is done
77 is done
78 is done
79 is done
80 is done
81 is done
82 is done
83 is done
84 is done
85 is done
86 is done
87 is done
88 is done
89 is done
90 is done
91 is done
92 is do

In [19]:
# One row per scraped paper; the table is written without the index column.
df = pd.DataFrame(data=citation_info_dict_list)
df.to_csv(
    path_or_buf='../../data/interim/methods_reporting/ieee_citation_metrics.csv',
    index=False,
)

In [20]:
# Papers for which no Crossref count was found.
df.loc[df['Crossref'].isna()]

Unnamed: 0,Crossref,Scopus,Web of Science,DOI
23,,1.0,,http://dx.doi.org/10.1109/tvcg.2021.3114818
51,,1.0,1.0,http://dx.doi.org/10.1109/INFVIS.2002.1173154
69,,,,http://dx.doi.org/10.1109/tvcg.2021.3114679


In [23]:
# (rows, columns) of the papers that have a Scopus count.
df.loc[df['Scopus'].notna()].shape

(71, 4)

In [24]:
# (rows, columns) of the papers that have a Web of Science count.
df.loc[df['Web of Science'].notna()].shape

(68, 4)