## Aim

In this notebook I re-run the pieces of the script `scrape_ica_paper_dois.py` step by step to re-understand how it works.

In [59]:
import pandas as pd
import numpy as np
import time 
import random
import re
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC 
from selenium.common.exceptions import TimeoutException
from selenium.common.exceptions import NoSuchElementException 
import sys

In [60]:
def get_journal_and_urls():
    """Return the journals to scrape and their issue-listing URLs.

    Returns:
        tuple: ``(journals, j_urls, url_j_dic)`` where ``journals`` is a list
        of journal names, ``j_urls`` is the list of matching
        ``academic.oup.com`` issue URLs in the same order, and ``url_j_dic``
        is a dict mapping each URL to its journal name.
    """
    # One row per journal: (journal name, issue URL).
    journal_pages = (
        ('Journal of Communication', 'https://academic.oup.com/joc/issue'),
        ('Human Communication Research', 'https://academic.oup.com/hcr/issue'),
        ('Communication Theory', 'https://academic.oup.com/ct/issue'),
        ('Journal of Computer-Mediated Communication', 'https://academic.oup.com/jcmc/issue'),
        ('Communication, Culture and Critique', 'https://academic.oup.com/ccc/issue'),
    )
    journals = [name for name, _ in journal_pages]
    j_urls = [page_url for _, page_url in journal_pages]
    url_j_dic = {page_url: name for name, page_url in journal_pages}
    return journals, j_urls, url_j_dic

In [3]:
# Start a Firefox browser session controlled by Selenium.
driver = webdriver.Firefox()
# Shared explicit wait (3-second timeout) bound to `driver`; the helper
# functions defined in later cells read it as a global.
wait = WebDriverWait(driver, 3)

In [4]:
journals, j_urls, url_j_dic = get_journal_and_urls()

In [5]:
j_url = j_urls[0]
journal = journals[0]
j_url, journal

('https://academic.oup.com/joc/issue', 'Journal of Communication')

In [6]:
driver.get(j_url)

In [7]:
def click_browse_by_volume():
    """Click the link matched by ``div.issue-browse-volume-link > a``.

    Uses the notebook-level ``wait`` object to block until the link is
    clickable before clicking it. Returns nothing.
    """
    locator = (By.CSS_SELECTOR, "div.issue-browse-volume-link > a")
    link_is_clickable = EC.element_to_be_clickable(locator)
    wait.until(link_is_clickable).click()

In [8]:
click_browse_by_volume()

In [9]:
def get_volume_and_issue():
    """Read the volume and issue labels from ``div.issue-info-pub``.

    Waits (via the notebook-level ``wait``) for an element matching
    ``div.issue-info-pub``, then splits its text on ``', '``.

    Returns:
        tuple: the first two pieces of the split text, e.g.
        ``('Volume 72', 'Issue 3')`` for ``'Volume 72, Issue 3'``.

    Raises:
        IndexError: if the text has no ``', '`` separator.
    """
    locator = (By.CSS_SELECTOR, "div.issue-info-pub")
    info_element = wait.until(EC.presence_of_element_located(locator))
    parts = info_element.text.split(', ')
    # Any pieces after the second (e.g. a trailing "June 2022") are ignored.
    return parts[0], parts[1]

In [10]:
volume_num, issue_num = get_volume_and_issue()

In [11]:
volume_num

'Volume 72'

In [12]:
issue_num

'Issue 3'

In [13]:
# Fetch *every* element matching div.issue-info-pub (the selector used by
# get_volume_and_issue) to check how many there are and what each one contains.
v_and_issue = driver.find_elements(
    By.CSS_SELECTOR, "div.issue-info-pub"
)
# Text of the first match.
v_and_issue[0].text

'Volume 72, Issue 3'

In [14]:
v_and_issue[1].text

'Volume 72, Issue 3, June 2022'

In [15]:
def get_mo_and_yr():
    """Read the issue's month and year from ``div.issue-info-date``.

    Waits (via the notebook-level ``wait``) for the element, then splits its
    text on single spaces.

    Returns:
        tuple: ``(month, year)`` as strings, e.g. ``('June', '2022')``.
        The year is always the last token of the text.
    """
    date_element = wait.until(EC.presence_of_element_located(
        (By.CSS_SELECTOR, "div.issue-info-date")
    ))
    tokens = date_element.text.split(' ')
    # The date is sometimes written with a leading day, e.g. "1 March 2004"
    # (see https://academic.oup.com/joc/issue/54/1); with more than two tokens
    # the month is the second token, otherwise it is the first.
    month_index = 1 if len(tokens) > 2 else 0
    return tokens[month_index], tokens[-1]

In [16]:
month, year = get_mo_and_yr()

In [17]:
month, year

('June', '2022')

## Abstract

In [30]:
# go to: https://academic.oup.com/joc/issue/46/4?browseBy=volume
# here the paper of 'dynamic social impact' has two paragraphs for abstract
# Wait until the <section> elements directly under div.section-container are
# present, and collect all of them.
sections = wait.until(EC.presence_of_all_elements_located((
        By.CSS_SELECTOR, "div.section-container > section"
    )))
# Work with the second section on the page.
section = sections[1]

In [31]:
# Within the chosen section, each div.al-article-items is treated as one paper.
papers = section.find_elements(
        By.CSS_SELECTOR, "div.al-article-items"
    )
# Use the first paper of the section for the experiments below.
paper = papers[0]

In [32]:
# The title of a paper is the <a class="at-articleLink"> inside it; its href
# is kept as the paper's URL.
title_link = paper.find_element(
    By.CSS_SELECTOR, "a.at-articleLink"
)

url = title_link.get_attribute('href')

try:
    # if it has "get access", for example, https://academic.oup.com/hcr/issue/33/1
    #  the real title is within the first span
    title = title_link.find_element(
        By.CSS_SELECTOR, "span.access-title"
    ).text
# if there is no such "span.access-title", then just get the title as usual.
# Only that specific case is caught: a bare `except:` would also hide
# unrelated failures (stale element, dead browser session, Ctrl-C).
except NoSuchElementException:
    title = title_link.text

print(title)

Dynamic Social Impact Theory and the Study of Human Communication


In [33]:
# go to: https://academic.oup.com/joc/issue/46/4?browseBy=volume
# here the paper of 'dynamic social impact' has two paragraphs for abstract
# NOTE(review): this searches the whole page through `driver` and takes the
# first abstract link, rather than searching inside `paper` — confirm that the
# first link on the page belongs to the paper selected earlier.
abstract_tab = driver.find_elements(
    By.CSS_SELECTOR, "div.abstract-link > a"
)[0]
# Visible label of the link, displayed as a sanity check.
abstract_header = abstract_tab.text
abstract_header

'Abstract'

In [34]:
abstract_tab.click()

In [35]:
# Wait up to 2 seconds for the abstract paragraphs (p.chapter-para) to be
# present. The wait is built on `paper`, not `driver`, so the lookup is
# limited to this paper's element.
abstracts = WebDriverWait(paper, 2).until(
    EC.presence_of_all_elements_located((
        By.CSS_SELECTOR, "p.chapter-para"
)))
# number of paragraphs when you open the abstract tab
# this is to make sure I didn't omit anything
abstract_para_num = len(abstracts)
abstract_para_num

2

In [54]:
all_abstract_text = [a.text for a in abstracts]
all_abstract_text

['No science today can consider the structures with which it has to deal as being more than a haphazard arrangement. That arrangement alone is structured which meets two conditions: that it be a system, ruled by an internal cohesiveness; that this cohesiveness, inaccessible to observation in an isolated system, be revealed in the study of transformations, through which similar properties in apparently different systems are brought to light. (Claude Levi-Strauss, in S. O. Paul & R. A. Paul, translators, The Scope of Anthropology, 1967, p. 27)',
 'A man, viewed as a behaving system, is quite simple. The apparent complexity of his behavior over time is largely a reflection of the complexity of the environment in which he finds himself (Herbert A. Simon, The Sciences of the Artificial, 1969, p. 52)']

In [55]:
line1 = all_abstract_text[0]
line1

'No science today can consider the structures with which it has to deal as being more than a haphazard arrangement. That arrangement alone is structured which meets two conditions: that it be a system, ruled by an internal cohesiveness; that this cohesiveness, inaccessible to observation in an isolated system, be revealed in the study of transformations, through which similar properties in apparently different systems are brought to light. (Claude Levi-Strauss, in S. O. Paul & R. A. Paul, translators, The Scope of Anthropology, 1967, p. 27)'

In [56]:
line2 = all_abstract_text[1]
line2

'A man, viewed as a behaving system, is quite simple. The apparent complexity of his behavior over time is largely a reflection of the complexity of the environment in which he finds himself (Herbert A. Simon, The Sciences of the Artificial, 1969, p. 52)'

In [57]:
print('\n\n'.join(all_abstract_text))

No science today can consider the structures with which it has to deal as being more than a haphazard arrangement. That arrangement alone is structured which meets two conditions: that it be a system, ruled by an internal cohesiveness; that this cohesiveness, inaccessible to observation in an isolated system, be revealed in the study of transformations, through which similar properties in apparently different systems are brought to light. (Claude Levi-Strauss, in S. O. Paul & R. A. Paul, translators, The Scope of Anthropology, 1967, p. 27)

A man, viewed as a behaving system, is quite simple. The apparent complexity of his behavior over time is largely a reflection of the complexity of the environment in which he finds himself (Herbert A. Simon, The Sciences of the Artificial, 1969, p. 52)


In [50]:
# with open('try.txt', 'w') as f:
#     f.write('\n\n'.join(all_abstract_text))

### What if there is only one abstract paragraph?

In [51]:
# Simulate a single-paragraph abstract by keeping only the first paragraph.
# Note: this rebinds `all_abstract_text` from a list of strings to a plain
# string, so '\n\n'.join(all_abstract_text) would now insert blank lines
# between individual characters instead of between paragraphs.
all_abstract_text = abstracts[0].text
all_abstract_text

'No science today can consider the structures with which it has to deal as being more than a haphazard arrangement. That arrangement alone is structured which meets two conditions: that it be a system, ruled by an internal cohesiveness; that this cohesiveness, inaccessible to observation in an isolated system, be revealed in the study of transformations, through which similar properties in apparently different systems are brought to light. (Claude Levi-Strauss, in S. O. Paul & R. A. Paul, translators, The Scope of Anthropology, 1967, p. 27)'

In [58]:
# print('\n\n'.join(all_abstract_text))

## Debugging

In [61]:
journals, j_urls, url_j_dic = get_journal_and_urls()

In [64]:
[j_urls[-1]]

['https://academic.oup.com/ccc/issue']

In [65]:
for j_url in [j_urls[-1]]:
    journal = url_j_dic[j_url]