In [None]:
import pandas as pd
import requests
from itertools import chain

In [None]:
# API keys (left empty in this file; fill them in locally and avoid committing real values)
api_springer = "" # Springer Nature: full text for open-access articles in XML format (api.springernature.com/openaccess/jats/doi/[DOI]?api_key=[API])
api_elsevier = "" # Elsevier: full text for any article I have access to, in XML format (https://api.elsevier.com/content/article/doi/[DOI]?APIKey=[API]&view=FULL)
api_wiley = "" # Wiley: full text, but PDF download only.
api_gpt = "" # NOTE(review): not described in this file; presumably a GPT/OpenAI API key -- confirm
# BioC API: offers PMC OA and Author Manuscript Collection in XML format via PubMed ID or PMC ID (https://www.ncbi.nlm.nih.gov/research/bionlp/RESTful/pmcoa.cgi/BioC_[format]/[ID]/[encoding])
api_semanticscholar = "" # NOTE(review): presumably a Semantic Scholar API key -- confirm

In [None]:
# Several author IDs can be combined in a single filter with the '|' (OR) operator (up to 50 values per filter).

def build_author_works_url(id_list):
    """Build the OpenAlex ``works`` query URL for a group of authors.

    Parameters
    ----------
    id_list : iterable of str
        Author IDs; they are joined with '|' so a work matches when any of
        them appears in the ``author.id`` filter.

    Returns
    -------
    str
        The URL of the ``works`` endpoint with the ``filter`` parameter set.
    """
    base_url = 'https://api.openalex.org/works'

    # One clause matching any of the supplied authors
    author_clause = 'author.id:' + '|'.join(id_list)

    # 'type:article' excludes book-chapter, dissertation, book, dataset, paratext,
    # other, reference-entry, report, peer-review, standard, editorial, erratum,
    # grant, letter; 'is_paratext:false' excludes paratext.
    clauses = [author_clause, 'type:article', 'is_paratext:false']

    # Filter clauses are comma-separated inside the single 'filter' parameter
    return base_url + '?filter=' + ','.join(clauses)


In [None]:
def get_publications(id_longlist):
    """Download the works of the given authors from OpenAlex.

    The IDs are queried in batches of 50 (the maximum number of '|'-joined
    values used per filter in this notebook). Each batch is paged through
    with the ``cursor`` parameter until the API returns no ``next_cursor``.

    Parameters
    ----------
    id_longlist : sequence of str
        Author IDs; must support ``len()`` and slicing.

    Returns
    -------
    pandas.DataFrame
        The rows of every result page, concatenated in download order
        (the per-page index is kept as is). Empty when nothing was returned.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status.
    requests.Timeout
        If a request gets no response within the timeout.
    """
    batch_size = 50
    request_timeout = 60  # seconds; without it a stalled connection would hang forever

    id_chunks = [id_longlist[i:i + batch_size] for i in range(0, len(id_longlist), batch_size)]

    # Collect the pages and concatenate once at the end: calling pd.concat
    # inside the loop re-copies all previously downloaded rows on every page.
    pages = []

    for id_list in id_chunks:
        url_with_cursor = build_author_works_url(id_list) + "&cursor={}"

        # '*' requests the first page; afterwards the API supplies the next cursor
        cursor = '*'
        while cursor:
            reply = requests.get(url_with_cursor.format(cursor), timeout=request_timeout)
            # Fail with the real HTTP error instead of a KeyError on 'results'
            reply.raise_for_status()
            payload = reply.json()
            if payload['results']:
                pages.append(pd.DataFrame(payload['results']))
            cursor = payload['meta']['next_cursor']

    if not pages:
        return pd.DataFrame()
    return pd.concat(pages)

In [None]:
# Search OpenAlex sources whose display name matches "nature" or "science"
# and print each hit's ID next to its name, so the IDs of the two journals
# can be picked out of the list.
url = "https://api.openalex.org/sources?filter=display_name.search:nature|science"
response = requests.get(url).json()
for result in response['results']:
    print(result['id'], result['display_name'])

In [None]:
# OpenAlex source IDs for Nature and Science, written out explicitly.
# They could also be read by position from the search response
# (response['results'][0]['id'] and response['results'][1]['id']),
# but the literal values are used here.
Nature_ID = "https://openalex.org/S137773608"
Science_ID = "https://openalex.org/S3880285"

In [None]:
# Get works published in Nature and Science
def NS_works(year):
    """Download the articles published in Nature or Science in a given year.

    Queries the OpenAlex ``works`` endpoint, filtering on the module-level
    ``Nature_ID`` / ``Science_ID`` source IDs, and follows the ``cursor``
    parameter until the API returns no ``next_cursor``.

    Parameters
    ----------
    year : int or str
        Value placed in the ``publication_year`` filter.

    Returns
    -------
    pandas.DataFrame
        The rows of every result page, concatenated in download order
        (the per-page index is kept as is). Empty when nothing was returned.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status.
    requests.Timeout
        If a request gets no response within the timeout.
    """
    request_timeout = 60  # seconds; without it a stalled connection would hang forever

    filters = (
        'type:article', #excludes book-chapter, dissertation, book, dataset, paratext, other, reference-entry, report, peer-review, standard, editorial, erratum, grant, letter
        'is_paratext:false', #excludes paratext
        f'primary_location.source.id:{Nature_ID}|{Science_ID}',
        f'publication_year:{year}'
    )

    url_with_cursor = f'https://api.openalex.org/works?filter={",".join(filters)}' + '&cursor={}'

    # Collect the pages and concatenate once at the end: calling pd.concat
    # inside the loop re-copies all previously downloaded rows on every page.
    pages = []

    # '*' requests the first page; afterwards the API supplies the next cursor
    cursor = '*'
    while cursor:
        reply = requests.get(url_with_cursor.format(cursor), timeout=request_timeout)
        # Fail with the real HTTP error instead of a KeyError on 'results'
        reply.raise_for_status()
        payload = reply.json()
        if payload['results']:
            pages.append(pd.DataFrame(payload['results']))
        cursor = payload['meta']['next_cursor']

    if not pages:
        return pd.DataFrame()
    return pd.concat(pages)

In [None]:
# Download the 2018 Nature/Science articles (one API request per result page)
ns_2018 = NS_works(2018)

In [None]:
# Download the 2020 Nature/Science articles (one API request per result page)
ns_2020 = NS_works(2020)

In [None]:
# Save the 2020 download to a parquet file in the working directory
ns_2020.to_parquet('ns_2020.parquet')

In [None]:
# Save the 2018 download to a parquet file in the working directory
ns_2018.to_parquet('ns_2018.parquet')

In [None]:
# Reload the saved 2018 file into a separate frame, leaving ns_2018 untouched
ns_2018_parquet = pd.read_parquet('ns_2018.parquet')

In [None]:
# Display the frame reloaded from ns_2018.parquet
ns_2018_parquet

In [None]:
# Display the 'authorships' column of the in-memory 2018 frame
ns_2018['authorships']

In [None]:
# Distinct author IDs across all 2018 works, taken from the in-memory frame.
# Each 'authorships' entry carries the ID under ['author']['id'].
author_ids_2 = list({
    entry['author']['id']
    for authorships in ns_2018['authorships']
    for entry in authorships
})

In [None]:
# Distinct author IDs across all 2018 works, taken from the frame reloaded
# from parquet. Each 'authorships' entry carries the ID under ['author']['id'].
author_ids = list({
    entry['author']['id']
    for authorships in ns_2018_parquet['authorships']
    for entry in authorships
})

In [None]:
# Preview the first ten IDs collected from the in-memory frame
author_ids_2[:10]

In [None]:
# Preview the first ten IDs collected from the parquet-loaded frame
author_ids[:10]

In [None]:
# Number of distinct author IDs
len(author_ids)

In [None]:
# Write author_ids to a text file, one ID per line
with open('author_ids.txt', 'w') as f:
    f.writelines("%s\n" % item for item in author_ids)

In [None]:
# Load author_ids back from the text file; splitlines() drops the trailing
# newline of each line, giving one list element per ID.
with open('author_ids.txt', 'r') as f:
    author_ids = f.read().splitlines()

In [None]:
# Preview the first ten IDs after reloading them from author_ids.txt
author_ids[:10]

In [None]:
# Distinct IDs of authors who are flagged as corresponding or listed in the
# last author position, taken from the in-memory 2018 frame.
corr_author_ids = list({
    entry['author']['id']
    for authorships in ns_2018['authorships']
    for entry in authorships
    if entry['is_corresponding'] or entry['author_position'] == 'last'
})

In [None]:
# Distinct IDs of authors who are flagged as corresponding or listed in the
# last author position, taken from the frame reloaded from parquet.
corr_author_ids = list({
    entry['author']['id']
    for authorships in ns_2018_parquet['authorships']
    for entry in authorships
    if entry['is_corresponding'] or entry['author_position'] == 'last'
})

In [None]:
# Number of distinct corresponding/last author IDs
len(corr_author_ids)

In [None]:
def chunk_list(input_list, N):
    """Split ``input_list`` into consecutive slices of at most ``N`` items.

    The slices keep the original order; the last one is shorter when the
    length is not a multiple of ``N``. An empty input gives an empty list.
    """
    chunks = []
    for start in range(0, len(input_list), N):
        chunks.append(input_list[start:start + N])
    return chunks

In [None]:
# Split the author IDs into batches of 50
author_chunks = chunk_list(author_ids,50)

In [None]:
# Inspect the batch at index 511.
# NOTE(review): hard-coded index; raises IndexError when there are fewer than 512 batches.
author_chunks[511]

In [None]:
# Write author_chunks to a text file, one chunk per line; each chunk is
# written in its "%s" string form.
with open('author_chunks.txt', 'w') as f:
    f.writelines("%s\n" % item for item in author_chunks)

In [None]:
# Load author_chunks back from the text file; splitlines() drops the trailing
# newline of each line. Every element is still the text of one line at this
# point, not a list.
with open('author_chunks.txt', 'r') as f:
    author_chunks = f.read().splitlines()

In [None]:
# Turn each chunk read from the text file (a string) back into a Python object.
# Elements that are not strings are passed through unchanged, so re-running
# this cell is a no-op instead of failing because eval() only accepts strings.
# NOTE(review): eval() executes whatever the file contains; only use it on a
# file you wrote yourself (ast.literal_eval or JSON would avoid code execution).
author_chunks = [eval(x) if isinstance(x, str) else x for x in author_chunks]

In [None]:
def extract_author_ids(authorships):
    """Return the author ID of every entry in ``authorships``, in order.

    Each entry is expected to hold the ID under ``entry['author']['id']``.
    """
    return [entry['author']['id'] for entry in authorships]

In [None]:
def add_author_id_list(chunk):
    """Add an 'author_ids' column derived from the 'authorships' column.

    The frame is modified in place and also returned. Each row's value is
    the result of ``extract_author_ids`` for that row's 'authorships'.
    """
    ids_per_row = chunk['authorships'].apply(extract_author_ids)
    chunk['author_ids'] = ids_per_row
    return chunk


In [None]:
# Download the publications for the listed chunk indices and save each chunk
# to its own parquet file. Failures are logged and the loop moves on.
for i in [50]:
    try:
        df = get_publications(author_chunks[i])
        df = add_author_id_list(df)
        df.to_parquet(f'author_publications_{i}.parquet', engine='fastparquet')
        # write progress in txt
        with open('ns2018_progress.txt', 'a') as f:
            f.write(f"chunk {i} done\n")
    except Exception as exc:
        # Catch Exception rather than everything, so Ctrl-C / kernel interrupt
        # still stops the run, and record why the chunk failed.
        with open('ns2018_errorlist.txt', 'a') as f:
            f.write(f"chunk {i} failed: {exc!r}\n")

In [None]:
# Print the query URL built for the batch at index 7 (for manual inspection)
print(build_author_works_url(author_chunks[7]))

In [None]:
# Load two files for chunk 7; the first file name ends in " 2".
# NOTE(review): presumably a duplicate copy kept for comparison -- confirm.
# NOTE(review): absolute paths on an external volume; this cell only runs where that volume is mounted.
df1 = pd.read_parquet('/Volumes/Samsung_T5/OpenAlex_Works/author_publications_7.parquet 2')
df2 = pd.read_parquet('/Volumes/Samsung_T5/OpenAlex_Works/author_publications_7.parquet')

In [None]:
# Split every per-chunk parquet file into one parquet file per author.
# NOTE(review): eval() executes whatever text the 'author_ids' column holds;
# only use it on files you produced yourself.
# NOTE(review): absolute paths on an external volume; this cell only runs where that volume is mounted.
for i, chunk in enumerate(author_chunks):
    df = pd.read_parquet(f'/Volumes/Samsung_T5/OpenAlex_Works/author_publications_{i}.parquet')
    # The stored 'author_ids' values are strings; turn them back into Python objects
    df.author_ids = df.author_ids.apply(lambda x: eval(x))
    for author in chunk:
        # Last path segment of the author ID is used in the output file name
        author_name = author.rsplit('/', 1)[-1]
        # Keep only the rows whose author_ids contain this author
        author_df = df[df.author_ids.apply(lambda ids: author in ids)]
        author_df.to_parquet(f'/Volumes/Samsung_T5/OpenAlex_Works/author_publications_{author_name}.parquet')
    

In [None]:
# Read only the 'corpusid' and 'externalids' columns of metadata.csv
s2orc_meta = pd.read_csv('metadata.csv', usecols=['corpusid','externalids'])

In [None]:
# Parse the 'externalids' strings into Python objects. Only strings are parsed:
# missing values pass through unchanged, and already-parsed values are left
# alone, so re-running this cell is a no-op instead of failing because eval()
# only accepts strings.
# NOTE(review): eval() executes whatever text the CSV contains; only use it on
# a trusted file (ast.literal_eval or JSON would avoid code execution).
s2orc_meta['externalids'] = s2orc_meta.externalids.apply(lambda x: eval(x) if isinstance(x, str) else x)

In [None]:
# Display the metadata frame
s2orc_meta

In [None]:
# FIXME(review): the lambda below has no body, so this cell raises SyntaxError.
# Presumably it was meant to pull one identifier out of each 'externalids'
# value -- confirm the intended key and finish the expression.
s2orc_mag = s2orc_meta.externalids.apply(lambda x:)