## Aim

This notebook recreates the script that scrapes the author data. 

In [175]:
import sys
import pandas as pd
from bs4 import BeautifulSoup
import numpy as np
import random
import time 
import requests
import re
import json

In [176]:
# Input: paper-level CSV that is loaded with pd.read_csv and supplies the urls to scrape
ICA_PAPER_DF = '../data/interim/ica_paper_df.csv'
# Output: author-level CSV written at the end of the crawl (relative to the working directory)
ICA_AUTHOR_DF = 'ica_author_df.csv'
# NOTE(review): not referenced anywhere in the visible notebook; presumably meant
# as the file that collects urls that failed to scrape -- confirm or remove
ICA_ERROR_URLS = 'ica_error_urls'

In [177]:
# Load the paper-level table; its `url` column lists the article pages to scrape
ica_papers = pd.read_csv(ICA_PAPER_DF)

In [178]:
# (rows, columns) of the loaded paper table
ica_papers.shape

(7708, 14)

In [179]:
# Are urls unique? Count the distinct urls: the answer is yes only if this
# equals the number of rows reported by `ica_papers.shape` above.
# (The previous `len(list(...))` merely counted rows and could not detect duplicates.)
ica_papers.url.nunique()

7708

In [180]:
# Per-paper columns as plain Python lists (all share the row order of ica_papers)
titles = ica_papers.title.tolist()
journals = ica_papers.journal.tolist()
years = ica_papers.year.tolist()
paper_dois = ica_papers.doi.tolist()
paper_urls = ica_papers.url.tolist()

# Lookup tables keyed by article url
url_journal_dic = {link: journal for link, journal in zip(paper_urls, journals)}
url_title_dic = {link: title for link, title in zip(paper_urls, titles)}
url_year_dic = {link: year for link, year in zip(paper_urls, years)}
url_doi_dic = {link: doi for link, doi in zip(paper_urls, paper_dois)}

# Accumulator that update_author_data() appends its rows to
author_data_tuples = []

In [181]:
def get_soup(url, idx, timeout=30, max_retries=None):
    """Download ``url`` and return its HTML parsed with BeautifulSoup.

    The request is repeated, with a one-second pause between attempts, until
    the server answers with status code 200.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    idx : int
        Position of the url in the crawl; only used in progress messages.
    timeout : float, optional
        Seconds to wait for the server before the attempt is abandoned and
        retried. Without a timeout a stalled connection blocks forever.
    max_retries : int or None, optional
        Maximum number of retries after the first attempt. ``None`` (the
        default) keeps retrying indefinitely, which is the historical
        behaviour of this function.

    Returns
    -------
    bs4.BeautifulSoup
        Parsed document of the successfully fetched page.

    Raises
    ------
    RuntimeError
        If ``max_retries`` is given and every attempt failed.
    """
    headers = {
        "user-agent": (
            "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36"
            "(KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36"
        ),
    }
    failed_attempts = 0
    while True:
        try:
            response = requests.get(url=url, headers=headers, timeout=timeout)
        except (requests.exceptions.ConnectionError,
                requests.exceptions.Timeout) as exc:
            # Transient network problems are retried instead of aborting the
            # crawl. Other request errors (e.g. a malformed url) still
            # propagate, because retrying them can never succeed.
            problem = f"request failed ({exc})"
        else:
            if response.status_code == 200:
                return BeautifulSoup(response.text, 'html.parser')
            problem = f"status code is {response.status_code}"

        failed_attempts += 1
        if max_retries is not None and failed_attempts > max_retries:
            raise RuntimeError(
                f"{idx} : {url} {problem}, giving up after {failed_attempts} attempts"
            )
        print(f"{idx} : {url} {problem}, retrying ...")
        time.sleep(1)

In [182]:
def _split_author_name(fullname):
    """Guess ``(firstname, lastname)`` from a whitespace-separated display name.

    The last token is taken as the last name. The first name is the first
    token, unless the name has more than two tokens and starts with an
    initial followed by a longer token (e.g. "M. Jennifer Clarke"), in which
    case the second token is used. A single-token name yields the sentinel
    first name ``'ERROR!'``; an empty name yields ``(nan, nan)``.
    """
    parts = fullname.split()
    if not parts:
        return np.nan, np.nan
    lastname = parts[-1]
    if len(parts) == 1:
        firstname = 'ERROR!'
    elif len(parts) == 2 or len(parts[0]) > 2:
        # e.g., "Myria Georgiou" or "Mike John Clarke"
        firstname = parts[0]
    elif len(parts[1]) <= 2:
        # e.g., "M. J. Clarke"
        firstname = parts[0]
    else:
        # e.g., "M. Jennifer Clarke"
        firstname = parts[1]
    return firstname, lastname


def _extract_author_fields(author):
    """Return ``(fullname, firstname, lastname, aff, cor, gscholar_link)`` for one author tag.

    Every field that cannot be found in the markup is returned as ``np.nan``.
    """
    # author names
    name_tag = author.find(class_="info-card-name")
    fullname = name_tag.text.strip() if name_tag is not None else ''
    if fullname:
        firstname, lastname = _split_author_name(fullname)
    else:
        fullname = firstname = lastname = np.nan

    # author affiliations
    aff_tag = author.find(class_="aff")
    if aff_tag is None:
        aff = np.nan
    else:
        # Drop the leading <span> before reading the text. Sometimes there is
        # no 'span': https://academic.oup.com/ccc/article/15/2/269/6561482
        label = aff_tag.find('span')
        if label is not None:
            label.extract()
        aff = aff_tag.text.strip()

    # author correspondence
    cor_tag = author.find(class_="info-author-correspondence")
    cor = cor_tag.text.strip() if cor_tag is not None else np.nan

    # google scholar link
    search_tag = author.find(class_="info-card-search info-card-search-google")
    link_tag = search_tag.find('a') if search_tag is not None else None
    gscholar_link = link_tag.get('href', np.nan) if link_tag is not None else np.nan

    return fullname, firstname, lastname, aff, cor, gscholar_link


def update_author_data(url, soup, author_data_tuples):
    """Append one 13-field row per author of the paper at ``url``.

    Paper metadata (doi, journal, title, year) is looked up by ``url`` in the
    module-level dictionaries ``url_doi_dic``, ``url_journal_dic``,
    ``url_title_dic`` and ``url_year_dic``; an unknown ``url`` raises
    ``KeyError``.

    Each row is ``(doi, url, journal, title, year, author_num,
    author_position, fullname, firstname, lastname, aff, cor,
    gscholar_link)``. When no author can be extracted -- some papers list
    none, e.g. https://academic.oup.com/joc/article/33/4/20/4282700 -- a
    single placeholder row is appended whose eight author fields are
    ``np.nan``, and the url is printed so it can be inspected later.

    Parameters
    ----------
    url : str
        Article url; must be a key of the module-level lookup dictionaries.
    soup : bs4.BeautifulSoup
        Parsed article page.
    author_data_tuples : list
        List that receives the rows; it is modified in place.

    Returns
    -------
    list
        The same ``author_data_tuples`` list that was passed in.
    """
    doi = url_doi_dic[url]
    journal = url_journal_dic[url]
    title = url_title_dic[url]
    year = url_year_dic[url]
    paper_fields = (doi, url, journal, title, year)

    # Rows are collected locally first, so a failure half-way through the
    # author list cannot leave partial data for this paper in the output.
    rows = []
    try:
        authors = soup.find(class_="al-authors-list").find_all(
            class_="al-author-name-more js-flyout-wrap")
        author_num = len(authors)
        # enumerate gives the true position even when two author tags compare
        # equal, which list.index() would report as the same position.
        for author_position, author in enumerate(authors, start=1):
            rows.append(
                paper_fields
                + (author_num, author_position)
                + _extract_author_fields(author)
            )
    except Exception:
        # Best effort: a page without an authors list (or with unexpected
        # markup) is recorded as a placeholder instead of stopping the crawl.
        rows = []

    if not rows:
        # Pad to the full 13 fields so every row has the same width.
        rows.append(paper_fields + (np.nan,) * 8)
        print(f'{url}')

    author_data_tuples.extend(rows)
    return author_data_tuples

In [183]:
# soup = get_soup('https://academic.oup.com/ccc/article/15/2/269/6561482',1)

In [186]:
# update_author_data(url, soup, author_data_tuples)

[('10.1111/j.1460-2466.1977.tb01850.x',
  'https://academic.oup.com/joc/article/27/2/211/4553980',
  'Journal of Communication',
  'News vs. “a picture of reality”',
  '1977',
  1,
  1,
  'Myria Georgiou',
  'Myria',
  'Georgiou',
  'Department of Media and Communications, London School of Economics and Political Science, Houghton Street, London WC2A 2AE, UK',
  'e-mail: m.a.georgiou@lse.ac.uk.',
  'http://scholar.google.com/scholar?q=author:%22Georgiou Myria%22'),
 ('10.1111/j.1460-2466.1977.tb01850.x',
  'https://academic.oup.com/joc/article/27/2/211/4553980',
  'Journal of Communication',
  'News vs. “a picture of reality”',
  '1977',
  1,
  1,
  'Myria Georgiou',
  'Myria',
  'Georgiou',
  'Department of Media and Communications, London School of Economics and Political Science, Houghton Street, London WC2A 2AE, UK',
  'e-mail: m.a.georgiou@lse.ac.uk.',
  'http://scholar.google.com/scholar?q=author:%22Georgiou Myria%22')]

In [146]:
if __name__ == '__main__':
    # read data
    ica_papers = pd.read_csv(ICA_PAPER_DF)

    # extract useful info
    paper_urls = ica_papers.url.tolist()
    paper_dois = ica_papers.doi.tolist()
    years = ica_papers.year.tolist()
    journals = ica_papers.journal.tolist()
    titles = ica_papers.title.tolist()

    # dics (url -> paper metadata, read by update_author_data)
    url_doi_dic = dict(zip(paper_urls, paper_dois))
    url_year_dic = dict(zip(paper_urls, years))
    url_title_dic = dict(zip(paper_urls, titles))
    url_journal_dic = dict(zip(paper_urls, journals))

    # initiate data tuples
    author_data_tuples = []

    total_urls = len(paper_urls)
    print(f'Total number of URLs to crawl: {total_urls}')

    # Trial run on a small random sample; never ask for more urls than exist,
    # otherwise random.sample raises ValueError.
    # To crawl everything, iterate over `paper_urls` instead.
    sample_size = min(10, total_urls)
    random_paper_urls = random.sample(paper_urls, sample_size)

    # enumerate yields the running position directly; list.index() would
    # rescan the list each time and repeat a position for duplicated urls.
    for idx, url in enumerate(random_paper_urls, start=1):
        soup = get_soup(url, idx)
        update_author_data(url, soup, author_data_tuples)
        print(f'{idx} is done')
        # short randomized pause between requests
        time.sleep(0.5 + random.uniform(0, 0.4))

    # build dataframe
    # NOTE: 'numberOfAuthros' is misspelled, but it is the column name written
    # to the CSV, so it is kept to avoid changing the output schema.
    author_df = pd.DataFrame(author_data_tuples, columns=[
        'doi',
        'url',
        'journal',
        'title',
        'year',
        'numberOfAuthros',
        'authorPosition',
        'fullname',
        'firstname',
        'lastname',
        'affiliation',
        'correspondence',
        'googleScholarLink'
    ])

    author_df.to_csv(ICA_AUTHOR_DF, index=False)

Total number of URLs to crawl: 7708
1 is done
2 is done
3 is done
4 is done
5 is done
6 is done
7 is done
8 is done
9 is done
https://academic.oup.com/joc/article/27/2/211/4553980
10 is done


In [147]:
# Display the author table built by the crawl cell
author_df

Unnamed: 0,doi,url,journal,title,year,numberOfAuthros,authorPosition,fullname,firstname,lastname,affiliation,correspondence,googleScholarLink
0,10.1111/j.1468-2958.1976.tb00485.x,https://academic.oup.com/hcr/article/2/3/262/4...,Human Communication Research,"Medium of Communication, Differential Power, a...",1976,3.0,1.0,Allen A. Turnbull,Allen,Turnbull,1Randolph-Mac on Woman’s College,,http://scholar.google.com/scholar?q=author:%22...
1,10.1111/j.1468-2958.1976.tb00485.x,https://academic.oup.com/hcr/article/2/3/262/4...,Human Communication Research,"Medium of Communication, Differential Power, a...",1976,3.0,2.0,Lloyd Strickland,Lloyd,Strickland,2Carleton University,,http://scholar.google.com/scholar?q=author:%22...
2,10.1111/j.1468-2958.1976.tb00485.x,https://academic.oup.com/hcr/article/2/3/262/4...,Human Communication Research,"Medium of Communication, Differential Power, a...",1976,3.0,3.0,Kelly G. Shaver,Kelly,Shaver,3College of William and Mary,,http://scholar.google.com/scholar?q=author:%22...
3,10.1093/joc/jqx021,https://academic.oup.com/joc/article/68/2/233/...,Journal of Communication,Cultural Voyeurism: A New Framework for Unders...,2018,1.0,1.0,Osei Appiah,Osei,Appiah,"School of Communication, The Ohio State Univer...","Corresponding author: Osei Appiah, Professor a...",http://scholar.google.com/scholar?q=author:%22...
4,10.1111/j.1460-2466.1978.tb01675.x,https://academic.oup.com/joc/article/28/4/225/...,Journal of Communication,A palpable hit,1978,1.0,1.0,Marc U. Porat,Marc,Porat,1Aspen Institute,,http://scholar.google.com/scholar?q=author:%22...
5,10.1111/j.1460-2466.1992.tb00789.x,https://academic.oup.com/joc/article/42/2/163/...,Journal of Communication,The History of the Public Sphere,1992,1.0,1.0,John Nerone,John,Nerone,1University of Illinois at Urbana-Champaign,,http://scholar.google.com/scholar?q=author:%22...
6,10.1111/j.1468-2958.2004.tb00729.x,https://academic.oup.com/hcr/article/30/2/153/...,Human Communication Research,Is Psychopathology the Key to Understanding Wh...,2004,5.0,1.0,Tom Grimes,Tom,Grimes,1Kansas State University,"5Tom Grimes, Journalism & Mass Communication, ...",http://scholar.google.com/scholar?q=author:%22...
7,10.1111/j.1468-2958.2004.tb00729.x,https://academic.oup.com/hcr/article/30/2/153/...,Human Communication Research,Is Psychopathology the Key to Understanding Wh...,2004,5.0,2.0,Lori Bergen,Lori,Bergen,1Kansas State University,,http://scholar.google.com/scholar?q=author:%22...
8,10.1111/j.1468-2958.2004.tb00729.x,https://academic.oup.com/hcr/article/30/2/153/...,Human Communication Research,Is Psychopathology the Key to Understanding Wh...,2004,5.0,3.0,Kathie Nichols,Kathie,Nichols,2Menninger Clinic,,http://scholar.google.com/scholar?q=author:%22...
9,10.1111/j.1468-2958.2004.tb00729.x,https://academic.oup.com/hcr/article/30/2/153/...,Human Communication Research,Is Psychopathology the Key to Understanding Wh...,2004,5.0,4.0,Eric Vernberg,Eric,Vernberg,3The University of Kansas,,http://scholar.google.com/scholar?q=author:%22...
