## Aim

I aim to re-understand how the script `scrape_ica_paper_dois.py` works.

In [1]:
import pandas as pd
import numpy as np
import time 
import random
import re
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC 
from selenium.common.exceptions import TimeoutException
from selenium.common.exceptions import NoSuchElementException 
import sys

In [2]:
def get_journal_and_urls():
    """Return the journal titles, their issue-page URLs, and a URL -> title map.

    Returns:
        A 3-tuple ``(journals, j_urls, url_j_dic)`` where ``journals`` and
        ``j_urls`` are parallel lists (same order) and ``url_j_dic`` maps each
        URL in ``j_urls`` to the journal title at the same position.
    """
    # Each URL is kept next to its title so the two lists cannot drift apart.
    url_title_pairs = (
        ('https://academic.oup.com/joc/issue', 'Journal of Communication'),
        ('https://academic.oup.com/hcr/issue', 'Human Communication Research'),
        ('https://academic.oup.com/ct/issue', 'Communication Theory'),
        ('https://academic.oup.com/jcmc/issue',
         'Journal of Computer-Mediated Communication'),
        ('https://academic.oup.com/ccc/issue',
         'Communication, Culture and Critique'),
    )
    j_urls = [url for url, _ in url_title_pairs]
    journals = [title for _, title in url_title_pairs]
    url_j_dic = dict(url_title_pairs)
    return journals, j_urls, url_j_dic

In [6]:
# Start a Firefox browser session driven through Selenium.
driver = webdriver.Firefox()
# Explicit-wait helper bound to this driver, created with a timeout of 3 (seconds).
wait = WebDriverWait(driver, 3)

In [7]:
# Unpack the journal titles, their issue-page URLs, and the URL -> title lookup.
journals, j_urls, url_j_dic = get_journal_and_urls()

In [8]:
# Select the first journal and its matching issue-page URL.
j_url = j_urls[0]
journal = journals[0]
# Echo the selected pair.
j_url, journal

('https://academic.oup.com/joc/issue', 'Journal of Communication')

In [9]:
# Load the selected journal's issue page in the browser.
driver.get(j_url)

In [10]:
def click_browse_by_volume():
    """Click the anchor inside ``div.issue-browse-volume-link``.

    Uses the module-level ``wait`` to block until the link is clickable;
    ``wait.until`` raises ``TimeoutException`` if that never happens within
    the configured timeout.
    """
    locator = (By.CSS_SELECTOR, "div.issue-browse-volume-link > a")
    link_is_clickable = EC.element_to_be_clickable(locator)
    wait.until(link_is_clickable).click()

In [11]:
# Click the browse-by-volume link on the page currently loaded.
click_browse_by_volume()

In [12]:
def get_volume_and_issue():
    """Return the volume and issue labels shown in ``div.issue-info-pub``.

    The element's text is split on ``', '``; the first piece is returned as
    the volume label and the second as the issue label (for example
    ``'Volume 72, Issue 3'`` -> ``('Volume 72', 'Issue 3')``).

    Returns:
        A ``(volume_num, issue_num)`` tuple of strings.

    Raises:
        IndexError: if the text does not contain a ``', '`` separator.
    """
    locator = (By.CSS_SELECTOR, "div.issue-info-pub")
    header_text = wait.until(EC.presence_of_element_located(locator)).text
    parts = header_text.split(', ')
    return parts[0], parts[1]

In [13]:
# Read the volume and issue labels from the page currently loaded.
volume_num, issue_num = get_volume_and_issue()

In [14]:
# Show the parsed volume label.
volume_num

'Volume 72'

In [15]:
# Show the parsed issue label.
issue_num

'Issue 3'

In [17]:
# Collect every element matching the selector that get_volume_and_issue() waits on.
v_and_issue = driver.find_elements(
    By.CSS_SELECTOR, "div.issue-info-pub"
)
# Text of the first match.
v_and_issue[0].text

'Volume 72, Issue 3'

In [18]:
# Text of the second match, for comparison with the first.
v_and_issue[1].text

'Volume 72, Issue 3, June 2022'

In [19]:
def get_mo_and_yr():
    """Return the month and year shown in ``div.issue-info-date``.

    The element's text is split on single spaces. The last token is taken as
    the year. The month is the first token, unless there are more than two
    tokens (e.g. ``"1 March 2004"``, as on
    https://academic.oup.com/joc/issue/54/1), in which case it is the second.

    Returns:
        A ``(month, year)`` tuple of strings.
    """
    locator = (By.CSS_SELECTOR, "div.issue-info-date")
    date_text = wait.until(EC.presence_of_element_located(locator)).text
    tokens = date_text.split(' ')
    # A leading day number pushes the month to the second position.
    month = tokens[1] if len(tokens) > 2 else tokens[0]
    return month, tokens[-1]

In [20]:
# Read the issue's month and year from the page currently loaded.
month, year = get_mo_and_yr()

In [21]:
# Show the parsed month and year.
month, year

('June', '2022')