## Aim

I need to go to each paper individually through DOI and collect the author information.

In [151]:
import sys
import pandas as pd
from bs4 import BeautifulSoup
import numpy as np
import random
import time 
import requests
import re
import json

In [152]:
ica_papers = pd.read_csv('../data/interim/ica_paper_df.csv')

In [153]:
ica_papers.head(1)

Unnamed: 0,journal,volumn,issue,month,year,category,title,url,doi,pages,abstract,abstract_para_num
0,Journal of Communication,Volume 72,Issue 2,April,2022,Articles,Media Systems in the Digital Age: An Empirical...,https://academic.oup.com/joc/article/72/2/145/...,10.1093/joc/jqab054,145–164,,0.0


In [154]:
def get_response(url, idx):
    headers = {
    "user-agent": "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36" \
    "(KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36",
    }
    response = requests.get(url=url, headers=headers)
    while response.status_code != 200:
        time.sleep(1)
        print(f"{idx} : {url} status code is {response.status_code}, retrying ...")
        response = requests.get(url=url, headers=headers)
    return response 

def get_j(response):
    html = response.text 
    soup = BeautifulSoup(html, 'html.parser')
    try:
        string = soup.select_one(
            "script[type='application/ld+json']"
        ).string.strip()

        j = json.loads(string)
    except:
        j = None
    return j 

def update_paper_data_tuples(doi, url, idx, j, paper_data_tuples):
    title = j['name']
    paper_type = j['@type']
    journal = j['isPartOf']['isPartOf']['name'] or np.nan
    date_published = j['datePublished']
    try:
        keywords_list = j['keywords']
        keywords = ", ".join(keywords_list)
    except:
        keywords = np.nan
        # print(f"there are no keywords in {idx} : {url}")
    paper_data_tuples.append((doi, url, title, paper_type, journal, date_published, keywords))
    return paper_data_tuples

def update_author_data_tuples(doi, url, idx, j, author_data_tuples):
    journal = j['isPartOf']['isPartOf']['name'] or np.nan
    date_published = j['datePublished'] or np.nan
    title = j['name'] or np.nan
    
    try:
        authors = j['author']
        author_num = len(authors)
    except:
        # print(f"there is no author data in {idx} : {url}")
        authors = None
        author_num = None
        name = np.nan; aff = np.nan; author_type = np.nan
    # if there is no author data, I don't need to proceed here
    if authors is not None:
        for author in authors:
            position = authors.index(author) + 1
            try:
                """
                authors[0]['name'] returns last, and then first.
                one example: 'Read, Stephen J.'
                so I need to reverse the order
                """
                name_elements = author['name'].split(', ')
                name_elements.reverse()
                name = ' '.join(name_elements)
            except:
                name = np.nan
            try:
                aff = author['affiliation']
            except:
                aff = np.nan
            try:
                author_type = author['@type']
            except:
                author_type = np.nan
            author_data_tuples.append((
                doi, url, title, journal, date_published, name, author_num, position, aff, author_type))
    else:
        author_data_tuples.append((
                doi, url, title, journal, date_published))
    return author_data_tuples

In [170]:
paper_urls = ica_papers.url.tolist()
paper_dois = ica_papers.doi.tolist()
url_doi_dic = dict(zip(paper_urls, paper_dois))

In [156]:
error_urls = []
paper_data_tuples = []
author_data_tuples = []

In [157]:
url = 'https://academic.oup.com/joc/article/28/2/235/4371390'
doi = url_doi_dic[url]
idx = paper_urls.index(url) + 1
response = get_response(url, idx)
j = get_j(response)

In [158]:
update_paper_data_tuples(doi, url, idx, j, paper_data_tuples)

[('10.1111/j.1460-2466.1978.tb01619.x',
  'https://academic.oup.com/joc/article/28/2/235/4371390',
  'Colloquy',
  'ScholarlyArticle',
  'Journal of Communication',
  '2006-02-07',
  'social control, informal, self control')]

In [159]:
update_author_data_tuples(doi, url, idx, j, author_data_tuples)

[('10.1111/j.1460-2466.1978.tb01619.x',
  'https://academic.oup.com/joc/article/28/2/235/4371390',
  'Colloquy',
  'Journal of Communication',
  '2006-02-07')]

In [119]:
authors = j['author']
author_num = len(authors)

KeyError: 'author'

In [98]:
authors

[{'name': 'Wessler, Hartmut',
  'affiliation': '1 Institute for Media and Communication Studies, University of Mannheim, 68159, Mannheim, Germany',
  '@type': 'Person'},
 {'name': 'Rinke, Eike Mark',
  'affiliation': '1 Institute for Media and Communication Studies, University of Mannheim, 68159, Mannheim, Germany',
  '@type': 'Person'},
 {'name': 'Löb, Charlotte',
  'affiliation': '1 Institute for Media and Communication Studies, University of Mannheim, 68159, Mannheim, Germany',
  '@type': 'Person'}]

In [99]:
authors[0]

{'name': 'Wessler, Hartmut',
 'affiliation': '1 Institute for Media and Communication Studies, University of Mannheim, 68159, Mannheim, Germany',
 '@type': 'Person'}

In [100]:
for author in authors:
    position = authors.index(author) + 1
    name_elements = author['name'].split(', ')
    name_elements.reverse()
    name = ' '.join(name_elements)
    print(position, name)

1 Hartmut Wessler
2 Eike Mark Rinke
3 Charlotte Löb


In [171]:
paper_urls = random.sample(paper_urls, 5)
paper_urls.append('https://academic.oup.com/joc/article/28/2/235/4371390')

In [172]:
paper_urls

['https://academic.oup.com/joc/article/14/3/136/4569101',
 'https://academic.oup.com/joc/article/25/4/25/4553704',
 'https://academic.oup.com/hcr/article/23/3/370/4564960',
 'https://academic.oup.com/joc/article/72/2/145/6509144',
 'https://academic.oup.com/joc/article/26/3/154/4553870',
 'https://academic.oup.com/joc/article/28/2/235/4371390']

In [173]:
error_urls = []
paper_data_tuples = []
author_data_tuples = []

In [174]:
for url in paper_urls:
    doi = url_doi_dic[url]
    idx = paper_urls.index(url) + 1
    response = get_response(url, idx)
    j = get_j(response)
    if j is not None:
        update_paper_data_tuples(doi, url, idx, j, paper_data_tuples)
        update_author_data_tuples(doi, url, idx, j, author_data_tuples)
    else:
        print(f'something wrong with {idx} : {url}')
        paper_data_tuples.append((doi, url, np.nan, np.nan, np.nan, np.nan))
        author_data_tuples.append((doi, url, np.nan, np.nan, np.nan, np.nan))
        error_urls.append(url)
    print(f'{idx} is done')
    time.sleep(0.5 + random.uniform(0, 0.4))

1 is done
2 is done
3 is done
4 is done
5 is done
6 is done


In [175]:
len(author_data_tuples)

12

In [176]:
author_data = pd.DataFrame(
        list(author_data_tuples),
        columns = [
            'doi',
            'url',
            'title',
            'journal',
            'pubdate',
            'authorName',
            'numberOfAuthors',
            'authorPosition',
            'affiliation',
            'authorType',
        ]
    )

In [177]:
author_data

Unnamed: 0,doi,url,title,journal,pubdate,authorName,numberOfAuthors,authorPosition,affiliation,authorType
0,10.1111/j.1460-2466.1964.tb02360.x,https://academic.oup.com/joc/article/14/3/136/...,Readability Re-Examined,Journal of Communication,2006-02-07,Niel K. Snortum,1.0,1.0,1 Dr. Snortum is an Associate Professor at San...,Person
1,10.1111/j.1460-2466.1975.tb00635.x,https://academic.oup.com/joc/article/25/4/25/4...,The Evidence So Far,Journal of Communication,2006-02-07,George Comstock,1.0,1.0,1 George Comstock is a senior social psycholog...,Person
2,10.1111/j.1468-2958.1997.tb00401.x,https://academic.oup.com/hcr/article/23/3/370/...,Predicting Employee Turnover from Communicatio...,Human Communication Research,2006-03-17,Thomas H. Feeley,2.0,1.0,"1 Thomas H. Feeley (Ph.D., State University of...",Person
3,10.1111/j.1468-2958.1997.tb00401.x,https://academic.oup.com/hcr/article/23/3/370/...,Predicting Employee Turnover from Communicatio...,Human Communication Research,2006-03-17,George A. Barnett,2.0,2.0,"2 George Barnett (Ph.D., Michigan State Univer...",Person
4,10.1093/joc/jqab054,https://academic.oup.com/joc/article/72/2/145/...,Media Systems in the Digital Age: An Empirical...,Journal of Communication,2022-01-17,Edda Humprecht,5.0,1.0,Department of Communication and Media Research...,Person
5,10.1093/joc/jqab054,https://academic.oup.com/joc/article/72/2/145/...,Media Systems in the Digital Age: An Empirical...,Journal of Communication,2022-01-17,Laia Castro Herrero,5.0,2.0,Department of Communication and Media Research...,Person
6,10.1093/joc/jqab054,https://academic.oup.com/joc/article/72/2/145/...,Media Systems in the Digital Age: An Empirical...,Journal of Communication,2022-01-17,Sina Blassnig,5.0,3.0,Department of Communication and Media Research...,Person
7,10.1093/joc/jqab054,https://academic.oup.com/joc/article/72/2/145/...,Media Systems in the Digital Age: An Empirical...,Journal of Communication,2022-01-17,Michael Brüggemann,5.0,4.0,Department of Journalism and Mass Communicatio...,Person
8,10.1093/joc/jqab054,https://academic.oup.com/joc/article/72/2/145/...,Media Systems in the Digital Age: An Empirical...,Journal of Communication,2022-01-17,Sven Engesser,5.0,5.0,"Institute of Media and Communication, Technica...",Person
9,10.1111/j.1460-2466.1976.tb01919.x,https://academic.oup.com/joc/article/26/3/154/...,Putdown Humor,Journal of Communication,2006-02-07,Dolf Zillmann,2.0,1.0,1 Dolf Zillman is Professor of Communication a...,Person


In [None]:
paper_data = pd.DataFrame(
        list(paper_data_tuples),
        columns = [
            'doi',
            'url',
            'title',
            'paperType',
            'journal',
            'datePublished',
            'keywords'
        ]
    )