## Aim

For each paper, I need to visit its article page (located via its DOI) and collect the author information.

In [1]:
import sys
import pandas as pd
from bs4 import BeautifulSoup
import numpy as np
import random
import time 
import requests
import re
import json

In [2]:
# Load the interim paper table; its `url` and `doi` columns drive the scrape below.
ica_papers = pd.read_csv('../data/interim/ica_paper_df.csv')

In [56]:
# Preview the first row to check which columns are available.
ica_papers.head(1)

Unnamed: 0,journal,volumn,issue,month,year,category,title,url,doi,pages,abstract,abstract_para_num
0,Journal of Communication,Volume 72,Issue 2,April,2022,Articles,Media Systems in the Digital Age: An Empirical...,https://academic.oup.com/joc/article/72/2/145/...,10.1093/joc/jqab054,145–164,,0.0


In [57]:
def get_response(url, idx, timeout=30, max_retries=None):
    """Download ``url``, retrying until the server answers with HTTP 200.

    Parameters
    ----------
    url : str
        Page to download.
    idx : int
        Position of the paper in the run; used only in the retry log message.
    timeout : float, optional
        Seconds to wait on a single attempt. Without a timeout ``requests``
        can block forever on a stalled connection.
    max_retries : int or None, optional
        Number of retries allowed after the first attempt. ``None`` (the
        default) retries indefinitely, which is the historical behaviour.

    Returns
    -------
    requests.Response
        The first response whose status code is 200.

    Raises
    ------
    requests.exceptions.RequestException
        Only when ``max_retries`` is set and exhausted: the last connection
        or timeout error, or an ``HTTPError`` describing the last non-200
        status.
    """
    headers = {
        # The space before "(KHTML" matters: adjacent literals are
        # concatenated verbatim, and the real Chrome UA string has it.
        "user-agent": "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36",
    }
    retries = 0
    while True:
        try:
            response = requests.get(url=url, headers=headers, timeout=timeout)
        except (requests.exceptions.ConnectionError,
                requests.exceptions.Timeout) as exc:
            # Transient network trouble: treat like a bad status and retry.
            failure = exc
            message = f"{idx} : {url} request failed ({type(exc).__name__}), retrying ..."
        else:
            if response.status_code == 200:
                return response
            failure = requests.exceptions.HTTPError(
                f"{url} returned status code {response.status_code}",
                response=response,
            )
            message = f"{idx} : {url} status code is {response.status_code}, retrying ..."

        if max_retries is not None and retries >= max_retries:
            raise failure
        retries += 1
        time.sleep(1)
        print(message)

def get_j(response):
    """Extract the first JSON-LD ``<script>`` payload of a page.

    Parameters
    ----------
    response : object with a ``text`` attribute
        The HTTP response whose HTML body is parsed.

    Returns
    -------
    object or None
        The decoded JSON value, or ``None`` when the page has no
        ``application/ld+json`` script, the script has no single text child,
        or its content is not valid JSON.
    """
    soup = BeautifulSoup(response.text, 'html.parser')
    tag = soup.select_one("script[type='application/ld+json']")
    # select_one returns None when nothing matches; .string is None when the
    # tag does not contain exactly one text node.
    if tag is None or tag.string is None:
        return None
    try:
        return json.loads(tag.string.strip())
    except ValueError:
        # json.JSONDecodeError is a ValueError subclass: malformed JSON-LD.
        # Deliberately narrow so Ctrl-C / real bugs are not swallowed.
        return None

def update_paper_data_tuples(doi, url, idx, j, paper_data_tuples):
    """Append one paper-level record built from the JSON-LD object ``j``.

    The appended tuple is
    ``(doi, url, title, paper_type, journal, date_published, keywords)``.
    Any field that is missing or empty in ``j`` is stored as ``np.nan``
    instead of raising, so one odd page cannot abort the whole scrape.

    Parameters
    ----------
    doi, url : str
        Identifiers copied verbatim into the record.
    idx : int
        Unused; kept so existing callers keep working.
    j : dict
        Decoded JSON-LD. Reads ``name``, ``@type``, ``datePublished``,
        ``keywords`` and ``isPartOf -> isPartOf -> name``.
    paper_data_tuples : list
        Accumulator; mutated in place.

    Returns
    -------
    list
        The same ``paper_data_tuples`` list, for convenience.
    """
    def field(*keys):
        """Walk ``j`` along ``keys``; missing or empty values become NaN."""
        value = j
        try:
            for key in keys:
                value = value[key]
        except (KeyError, TypeError, IndexError):
            return np.nan
        return value or np.nan

    title = field('name')
    paper_type = field('@type')
    journal = field('isPartOf', 'isPartOf', 'name')
    date_published = field('datePublished')

    keywords = field('keywords')
    if isinstance(keywords, (list, tuple)):
        try:
            keywords = ", ".join(keywords)
        except TypeError:
            # Non-string entries in the keyword list.
            keywords = np.nan
    elif not isinstance(keywords, str):
        # A plain string is kept as is (joining it would split it into
        # single characters); anything else counts as missing.
        keywords = np.nan

    paper_data_tuples.append(
        (doi, url, title, paper_type, journal, date_published, keywords))
    return paper_data_tuples

def update_author_data_tuples(doi, url, idx, j, author_data_tuples):
    """Append one record per author found in the JSON-LD object ``j``.

    Each appended tuple has ten fields:
    ``(doi, url, title, journal, date_published, name, author_num,
    position, affiliation, author_type)``.
    When ``j`` carries no usable author data a single placeholder row is
    appended with the five author-specific fields set to ``np.nan``, so
    every row has the same width.

    Parameters
    ----------
    doi, url : str
        Identifiers copied verbatim into every record.
    idx : int
        Unused; kept so existing callers keep working.
    j : dict
        Decoded JSON-LD. Reads ``name``, ``datePublished``, ``author`` and
        ``isPartOf -> isPartOf -> name``. ``author`` may be a list of
        objects or a single object; anything else is treated as missing.
    author_data_tuples : list
        Accumulator; mutated in place.

    Returns
    -------
    list
        The same ``author_data_tuples`` list, for convenience.
    """
    def field(*keys):
        """Walk ``j`` along ``keys``; missing or empty values become NaN."""
        value = j
        try:
            for key in keys:
                value = value[key]
        except (KeyError, TypeError, IndexError):
            return np.nan
        return value or np.nan

    journal = field('isPartOf', 'isPartOf', 'name')
    date_published = field('datePublished')
    title = field('name')

    authors = field('author')
    if isinstance(authors, dict):
        # A lone author given as a single object rather than a list.
        authors = [authors]

    if not isinstance(authors, (list, tuple)):
        author_data_tuples.append(
            (doi, url, title, journal, date_published) + (np.nan,) * 5)
        return author_data_tuples

    author_num = len(authors)
    # enumerate gives the true position even when two author entries are
    # identical, which list.index() would both report as the first match.
    for position, author in enumerate(authors, start=1):
        try:
            # Names arrive as "Last, First" (e.g. 'Read, Stephen J.');
            # flip them to "First Last".
            name = ' '.join(reversed(author['name'].split(', ')))
        except (KeyError, TypeError, AttributeError):
            name = np.nan
        try:
            aff = author['affiliation']
        except (KeyError, TypeError):
            aff = np.nan
        try:
            author_type = author['@type']
        except (KeyError, TypeError):
            author_type = np.nan
        author_data_tuples.append((
            doi, url, title, journal, date_published,
            name, author_num, position, aff, author_type))
    return author_data_tuples

In [58]:
# Parallel lists of article URLs and DOIs, plus a URL -> DOI lookup used by
# the scraping loop.
paper_urls = ica_papers['url'].tolist()
paper_dois = ica_papers['doi'].tolist()
url_doi_dic = {
    paper_url: paper_doi
    for paper_url, paper_doi in zip(paper_urls, paper_dois)
}

In [59]:
# Accumulators for the scrape: URLs that could not be parsed, and the
# paper-level / author-level records.
error_urls, paper_data_tuples, author_data_tuples = [], [], []

In [60]:
# url = 'https://academic.oup.com/joc/article/28/2/235/4371390'
# doi = url_doi_dic[url]
# idx = paper_urls.index(url) + 1
# response = get_response(url, idx)
# j = get_j(response)

In [61]:
# update_paper_data_tuples(doi, url, idx, j, paper_data_tuples)

In [62]:
# update_author_data_tuples(doi, url, idx, j, author_data_tuples)

In [63]:
# authors = j['author']
# author_num = len(authors)

In [64]:
# authors

In [65]:
# authors[0]

In [66]:
# for author in authors:
#     position = authors.index(author) + 1
#     name_elements = author['name'].split(', ')
#     name_elements.reverse()
#     name = ' '.join(name_elements)
#     print(position, name)

In [67]:
# paper_urls = random.sample(paper_urls, 5)
# paper_urls.append('https://academic.oup.com/joc/article/28/2/235/4371390')

In [68]:
# Start the full run from empty accumulators so that records left over from
# earlier trial runs are not mixed in.
error_urls, paper_data_tuples, author_data_tuples = [], [], []

In [69]:
# paper_urls = random.sample(paper_urls, 5)

In [70]:
# Visit every article page, parse its JSON-LD and collect paper/author rows.
# enumerate supplies the 1-based counter directly; list.index() rescans the
# list on every iteration and reports the first match for duplicated URLs.
for idx, url in enumerate(paper_urls, start=1):
    doi = url_doi_dic[url]
    response = get_response(url, idx)
    j = get_j(response)
    if j is not None:
        update_paper_data_tuples(doi, url, idx, j, paper_data_tuples)
        update_author_data_tuples(doi, url, idx, j, author_data_tuples)
    else:
        print(f'something wrong with {idx} : {url}')
        # Placeholder rows are padded to the full width of each table
        # (7 paper columns, 10 author columns) so the DataFrames can be
        # built even when every page fails.
        paper_data_tuples.append((doi, url) + (np.nan,) * 5)
        author_data_tuples.append((doi, url) + (np.nan,) * 8)
        error_urls.append(url)
    print(f'{idx} is done')
    # Randomised pause between requests to stay polite to the server.
    time.sleep(0.5 + random.uniform(0, 0.4))

1 is done
2 is done
3 is done
4 is done
5 is done
6 is done
7 is done
8 is done
9 is done
10 is done
11 is done
12 is done
13 is done
14 is done
15 is done
16 is done
17 is done
18 is done
19 is done
20 is done
21 is done
22 is done
23 is done
24 is done
25 is done
26 is done
27 is done
28 is done
29 is done
30 is done
31 is done
32 is done
33 is done
34 is done
35 is done
36 is done
37 is done
38 is done
39 is done
40 is done
41 is done
42 is done
43 is done
44 is done
45 is done
46 is done
47 is done
48 is done
49 is done
50 is done
51 is done
52 is done
53 is done
54 is done
55 is done
56 is done
57 is done
58 is done
59 is done
60 is done
61 is done
62 is done
63 is done
64 is done
65 is done
66 is done
67 is done
68 is done
69 is done
70 is done
71 is done
72 is done
73 is done
74 is done
75 is done
76 is done
77 is done
78 is done
79 is done
80 is done
81 is done
82 is done
83 is done
84 is done
85 is done
86 is done
87 is done
88 is done
89 is done
90 is done
91 is done
92 is do

KeyboardInterrupt: 

In [None]:
# Author-level table: one row per collected author tuple, with columns in
# the same order as the tuple fields.
author_data = pd.DataFrame(
    data=list(author_data_tuples),
    columns=[
        'doi', 'url', 'title', 'journal', 'datePublished',
        'authorName', 'numberOfAuthors', 'authorPosition',
        'affiliation', 'authorType',
    ],
)

In [None]:
# Paper-level table: one row per collected paper tuple, with columns in the
# same order as the tuple fields.
paper_data = pd.DataFrame(
    data=list(paper_data_tuples),
    columns=[
        'doi', 'url', 'title', 'paperType',
        'journal', 'datePublished', 'keywords',
    ],
)

In [None]:
# Destinations for the two scraped tables and the list of failed URLs.
ICA_PAPER_DATA, ICA_AUTHOR_DATA, ICA_ERROR_URLS = (
    '../data/interim/ica_paper_data.csv',
    '../data/interim/ica_author_data.csv',
    '../data/interim/ica_error_urls.txt',
)

In [None]:
# Persist both tables as CSV without the DataFrame index column.
paper_data.to_csv(ICA_PAPER_DATA, index=False)
author_data.to_csv(ICA_AUTHOR_DATA, index=False)
# Record the URLs whose pages could not be parsed, one per line.
with open(ICA_ERROR_URLS, 'w') as f:
    for url in error_urls:
        f.write("%s\n" % url)