## Aim

The aim of this notebook is to see whether I can scrape the website using the patterns of the url. This is because my previous methods does not work (an element will block the volume Select). 

In [30]:
import pandas as pd
import numpy as np
import time 
import random
import re
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC 
from selenium.common.exceptions import TimeoutException
from selenium.common.exceptions import NoSuchElementException 
from selenium.webdriver.support.ui import Select
import sys

In [52]:
def get_journal_and_urls():
    journals = [
        'Journal of Communication',
        'Human Communication Research',
        'Communication Theory',
        'Journal of Computer-Mediated Communication',
        'Communication, Culture and Critique',
    ]
    j_urls = [
        'https://academic.oup.com/joc/issue',
        'https://academic.oup.com/hcr/issue',
        'https://academic.oup.com/ct/issue',
        'https://academic.oup.com/jcmc/issue',
        'https://academic.oup.com/ccc/issue',
    ]
    url_j_dic = dict(zip(j_urls, journals))
    return journals, j_urls, url_j_dic 

In [53]:
def get_issue_num_year_and_month(issue_text):
    ''' to extract issue number, year, and month information from an issue text 
      such as  "Issue 1, March 1981, Pages 3–240"
    '''
    issue_info_list = issue_text.split(', ')
    issue_num = issue_info_list[0]
    # year and month:
    yr_n_mo = issue_info_list[1]
    split_text = yr_n_mo.split(' ')
    year = split_text[-1]
    # sometimes the issue_date is in the format of "1 March 2004"
    # for example: https://academic.oup.com/joc/issue/54/1
    if len(split_text) > 2:
        month = split_text[1]
    else:
        month = split_text[0]
    return issue_num, month, year 

In [54]:
def click_browse_by_volume():
    browse_volume_link = wait.until(EC.element_to_be_clickable((
        By.CSS_SELECTOR, "div.issue-browse-volume-link > a"
    )))
    browse_volume_link.click()

In [55]:
def extract_paper_info(tuples, journal, volume_num, issue_num, month, year):
    # There are several sections. For example, 
    # in 'https://academic.oup.com/joc/issue/72/3?browseBy=volume'
    # there are four sections: articles, Corrigendum, correction, and book reviews
    sections = wait.until(EC.presence_of_all_elements_located((
            By.CSS_SELECTOR, "div.section-container > section"
        )))

    for section in sections:
        section_index = sections.index(section) + 1

        # the section name is the category, e.g., Articles
        category = section.find_element(
            By.CSS_SELECTOR, 'h4'
        ).get_attribute('innerHTML')

        # all individual papers in each section
        papers = section.find_elements(
            By.CSS_SELECTOR, "div.al-article-items"
        )

        for paper in papers:

            paper_index = papers.index(paper) + 1

            # title

            title_link = paper.find_element(
                By.CSS_SELECTOR, "a.at-articleLink"
            )

            url = title_link.get_attribute('href')

            try:
                # if it has "get access", for example, https://academic.oup.com/hcr/issue/33/1
                #  the real title is within the first span
                title = title_link.find_element(
                    By.CSS_SELECTOR, "span.access-title"
                ).text 
            # if there is no such "span.access-title", then just get the title as usual
            except:
                title = title_link.text 

            # Sometimes, the title is weird, for example:
            # some of those in https://academic.oup.com/joc/issue/57/1
            if title not in [
                'German Abstract',
                'Chinese Abstract',
                'Korean Abstract',
                'Japanese Abstract',
                'French Abstract',
                'Russian Abstract',
                'Abrabic Abstract',
                'Spanish Abstract',
            ]:

                # publication info (page numbers, doi, url)
                pub_info = paper.find_element(
                    By.CSS_SELECTOR, ".pub-history-row.clearfix"
                ).text

                pub_info_elements = pub_info.split(', ')
                for e in pub_info_elements:
                    # JCMC is special as it does not contain pages information
                    if journal != 'Journal of Computer-Mediated Communication':
                        if "Pages" in e:
                            pages = re.sub('Pages ', '', e)
                        elif "Page" in e:
                            pages = re.sub('Page ', '', e)
                    else:
                        pages = np.nan 
                    if 'https://' in e:
                        doi = re.sub('https://doi.org/', '', e)

                # abstract
                try:
                    abstract_tab = WebDriverWait(paper, 2).until(
                        EC.element_to_be_clickable((
                            By.CSS_SELECTOR, "div.abstract-link > a"
                    )))
                    abstract_header = abstract_tab.text 
                    # Sometimes, the header is 'Extract'
                    if abstract_header == 'Abstract':
                        abstract_tab.click()
                        # sometimes, you can click the tab but there is no content in it
                        try:
                            abstracts = WebDriverWait(paper, 2).until(
                                EC.presence_of_all_elements_located((
                                    By.CSS_SELECTOR, "p.chapter-para"
                            )))
                            # number of paragraphs when you open the abstract tab
                            # this is to make sure I didn't omit anything
                            abstract_para_num = len(abstracts)
                            # if multiple paragraphs, include all of them 
                            if abstract_para_num != 1:
                                all_abstract_text = [i.text for i in abstracts]
                                abstract = '\n\n'.join(all_abstract_text)
                            # otherwise get the first and of course, the only one para
                            else:
                                abstract = abstracts[0].text 
                        # there is tab and it is "Abstract", but abstract is empty:
                        except:
                            abstract = np.nan 
                            abstract_para_num = 0
                    # There is abstract_tab but it is "Extract", not "Abstract"
                    else:
                        abstract = np.nan 
                        abstract_para_num = np.nan 
                # if there is no abstract_tab
                except (NoSuchElementException, TimeoutException):
                    abstract = np.nan 
                    abstract_para_num = np.nan

                tuples.append((
                    journal,
                    volume_num,
                    issue_num,
                    month,
                    year,
                    category,
                    title,
                    url,
                    doi,
                    pages,
                    abstract,
                    abstract_para_num,
                ))
            time.sleep(0.1+random.uniform(0,0.1))
    time.sleep(0.1+random.uniform(0,0.1)) 

In [56]:
journals, j_urls, url_j_dic = get_journal_and_urls()

In [57]:
def get_volume_option_texts():
	'''
	get all volume options
	'''
	volume_options = driver.find_elements(
		By.CSS_SELECTOR, '.issue-browse-year-list.issue-browse-select > option')
	volume_option_texts = [v.text for v in volume_options]
	return volume_option_texts

In [58]:
def get_issue_option_texts():
	issue_options = driver.find_elements(
		By.CSS_SELECTOR, '.issue-browse-issues-list > option'
	)
	issue_option_texts = [i.text for i in issue_options]
	return issue_option_texts

In [60]:
driver = webdriver.Firefox()
wait = WebDriverWait(driver, 3)
driver.get(j_urls[-1])
click_browse_by_volume()
volume_option_texts = get_volume_option_texts()
total_volume = int(volume_option_texts[0])
print(f'Total volume: {total_volume}')

Total volume: 15


In [61]:
start_str = j_urls[-1]
end_str = '?browseBy=volume'

In [62]:
tuples = []
journal = url_j_dic[j_urls[-1]]
for v in reversed(range(1,total_volume+1)):
    volume_num = f'Volume {v}'
    print(f'{volume_num} has started!')
    issues = get_issue_option_texts()
    for issue in reversed(issues):
        issue_idx = issues.index(issue) + 1
        issue_num, month, year = get_issue_num_year_and_month(issue)
        driver.get(f'{start_str}/{v}/{issue_idx}{end_str}')
        extract_paper_info(tuples, journal, volume_num, issue_num, month, year)
        print(f'({volume_num}, {issue}) is done')

Volume 15 has started!
Volume 15, Issue 2, June 2022, Pages 103–298 is done
Volume 15, Issue 1, March 2022, Pages 1–101 is done
Volume 14 has started!
Volume 14, Issue 2, June 2022, Pages 103–298 is done
Volume 14, Issue 1, March 2022, Pages 1–101 is done
Volume 13 has started!
Volume 13, Issue 4, December 2021, Pages 551–695 is done


NoSuchWindowException: Message: Browsing context has been discarded
Stacktrace:
WebDriverError@chrome://remote/content/shared/webdriver/Errors.jsm:186:5
NoSuchWindowError@chrome://remote/content/shared/webdriver/Errors.jsm:440:5
assert.that/<@chrome://remote/content/shared/webdriver/Assert.jsm:445:13
assert.open@chrome://remote/content/shared/webdriver/Assert.jsm:153:4
GeckoDriver.prototype.findElement@chrome://remote/content/marionette/driver.js:1429:10
despatch@chrome://remote/content/marionette/server.js:306:40
execute@chrome://remote/content/marionette/server.js:279:16
onPacket/<@chrome://remote/content/marionette/server.js:252:20
onPacket@chrome://remote/content/marionette/server.js:253:9
_onJSONObjectReady/<@chrome://remote/content/marionette/transport.js:500:20


In [64]:
df = pd.DataFrame(
		list(tuples), 
		columns = [
			'journal', 'volumn', 'issue', 
			'month', 'year', 'category', 'title', 
			'url', 'doi', 'pages',  
			'abstract', 'abstract_para_num',
		])

In [65]:
df

Unnamed: 0,journal,volumn,issue,month,year,category,title,url,doi,pages,abstract,abstract_para_num
0,"Communication, Culture and Critique",Volume 15,Issue 2,June,2022,Editorial,Digital Migration Practices and the Everyday,https://academic.oup.com/ccc/article/15/2/103/...,10.1093/ccc/tcac016,103–121,This special issue explores the role that digi...,1.0
1,"Communication, Culture and Critique",Volume 15,Issue 2,June,2022,Original Articles,"Working with “Wogs”: Aliens, Denizens and the ...",https://academic.oup.com/ccc/article/15/2/122/...,10.1093/ccc/tcac012,122–138,This article uses a discussion of the currency...,1.0
2,"Communication, Culture and Critique",Volume 15,Issue 2,June,2022,Original Articles,"Viral Borders: Migration, Deceleration, and th...",https://academic.oup.com/ccc/article/15/2/139/...,10.1093/ccc/tcac009,139–156,States’ efforts to govern the COVID-19 public ...,1.0
3,"Communication, Culture and Critique",Volume 15,Issue 2,June,2022,Original Articles,"Digitalization, Digitization and Datafication:...",https://academic.oup.com/ccc/article/15/2/157/...,10.1093/ccc/tcac007,157–175,"Digitalization, digitization, and datafication...",1.0
4,"Communication, Culture and Critique",Volume 15,Issue 2,June,2022,Original Articles,Extractive Humanitarianism: Participatory Conf...,https://academic.oup.com/ccc/article/15/2/176/...,10.1093/ccc/tcac018,176–192,This article advances the notion of “extractiv...,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...
74,"Communication, Culture and Critique",Volume 13,Issue 3,September,2021,Original Articles,"Fluorescent Flags: Black Power, Publicity, and...",https://academic.oup.com/ccc/article/13/3/275/...,10.1093/ccc/tcz058,275–294,This semiotic landscape analysis probes urban ...,1.0
75,"Communication, Culture and Critique",Volume 13,Issue 3,September,2021,Original Articles,Virtual Empathy,https://academic.oup.com/ccc/article/13/3/295/...,10.1093/ccc/tcz035,295–310,This article offers a theoretical response to ...,1.0
76,"Communication, Culture and Critique",Volume 13,Issue 3,September,2021,Original Articles,Civil Society Must Be Defended: Misinformation...,https://academic.oup.com/ccc/article/13/3/311/...,10.1093/ccc/tcz041,311–332,"In this article, I propose that we think of th...",1.0
77,"Communication, Culture and Critique",Volume 13,Issue 3,September,2021,Original Articles,To Affinity and Beyond: Clicking as Communicat...,https://academic.oup.com/ccc/article/13/3/333/...,10.1093/ccc/tcaa005,333–348,This article analyzes how users' engagements w...,1.0
