In [1]:
def get_attributes(object):
    """List the names reported by ``dir(object)`` that do not start with ``'__'``.

    Args:
        object: Any Python object to inspect.

    Returns:
        list[str]: The surviving names, in the order ``dir`` produced them.
    """
    public_names = []
    for name in dir(object):
        # Skip double-underscore names such as __class__ or __init__.
        if name.startswith('__'):
            continue
        public_names.append(name)
    return public_names


def get_methods(object):
    """List the non-dunder names of ``object`` whose attribute value is callable.

    Args:
        object: Any Python object to inspect.

    Returns:
        list[str]: Names from ``dir(object)`` that do not start with ``'__'``
        and for which ``callable(getattr(object, name))`` is true.
    """
    def _is_public_callable(name):
        # The prefix check comes first so getattr is never evaluated for
        # double-underscore names.
        return not name.startswith('__') and callable(getattr(object, name))

    return list(filter(_is_public_callable, dir(object)))


def get_properties(object):
    """List the non-dunder names of ``object`` whose attribute value is not callable.

    Note that "properties" here means any non-callable attribute; each value
    is obtained with ``getattr`` and tested with ``callable``.

    Args:
        object: Any Python object to inspect.

    Returns:
        list[str]: Names from ``dir(object)`` that do not start with ``'__'``
        and whose value is not callable.
    """
    data_names = []
    for name in dir(object):
        if name.startswith('__'):
            continue
        if callable(getattr(object, name)):
            continue
        data_names.append(name)
    return data_names

In [2]:
import requests
import json
import pandas as pd
from pprint import pprint
import pickle

# Root URL of the OpenAlex REST API; endpoint names such as "authors" are
# appended to it when building requests.
base_url = "https://api.openalex.org/"
# Contact e-mail address.
# NOTE(review): no code visible in this part of the notebook reads `email` —
# presumably meant to identify the caller to the API; confirm or remove.
email = "katz.562@osu.edu"


def pickle_dump(data, filename):
    """Serialize ``data`` with pickle into the file at ``filename``.

    The file is opened in binary write mode, so an existing file at that
    path is overwritten.

    Args:
        data: Any picklable object.
        filename: Path of the file to write.
    """
    with open(filename, mode='wb') as handle:
        pickle.dump(data, handle)


# Build `author_list` from ten random samples of OpenAlex authors.
# Each sample is cached in data/author_list_<j>.pkl: a cached sample is loaded
# from disk, a missing one is downloaded page by page, saved, and then added
# to `author_list` as well, so the list is complete after a single run.
sample = 10000   # number of authors requested per random sample
per_page = 200   # number of results requested per page
# Number of pages needed to cover one sample (ceiling division).
n_pages = (sample + per_page - 1) // per_page

author_list = []
for j in range(10):
    cache_path = f"data/author_list_{j}.pkl"
    try:
        # NOTE: pickle.load can execute arbitrary code; only load cache files
        # that were written by this notebook.
        with open(cache_path, 'rb') as cache_file:
            author_list.extend(pickle.load(cache_file))
        print(f"Author list {j} loaded from file.")
    except FileNotFoundError:
        new_author_list = []
        print(f"{j}th sample")
        # A fixed seed per sample keeps the random sample stable across pages.
        seed = 3142+j
        for i in range(1, n_pages + 1):
            print(f"Page {i}")
            response = requests.get(base_url + "authors",
                                    params={'sample': sample, 'seed': seed, 'per-page': per_page, 'page': i})
            # Fail with the HTTP error instead of a KeyError on 'results'.
            response.raise_for_status()
            data = response.json()
            if len(data['results']) == 0:
                break
            new_author_list.extend(data['results'])
        print(f'Saving {len(new_author_list)} authors to {cache_path}')
        pickle_dump(new_author_list, cache_path)
        # Keep the freshly downloaded sample in memory too.
        author_list.extend(new_author_list)

Author list 0 loaded from file.
Author list 1 loaded from file.
Author list 2 loaded from file.
Author list 3 loaded from file.
Author list 4 loaded from file.
Author list 5 loaded from file.
Author list 6 loaded from file.
Author list 7 loaded from file.
Author list 8 loaded from file.
Author list 9 loaded from file.


In [9]:
# One row per author record in `author_list`; `df` is the same table without
# the columns listed in `unnecessary_columns`.
authors_df = pd.DataFrame(author_list)
unnecessary_columns = [
    'orcid',
    'ids',
    'affiliations',
    'last_known_institutions',
    'summary_stats',
    'created_date',
    'updated_date',
    'topics',
    'topic_share',
    'relevance_score',
]
df = authors_df.drop(unnecessary_columns, axis=1)

In [11]:
def fetch_works(api_url):
    """Return the IDs of the works listed at ``api_url``.

    A single request is made for one page of up to 200 results, so a listing
    with more entries than that is truncated to its first page.

    Args:
        api_url: URL of a works listing (the notebook passes each author's
            ``works_api_url`` value).

    Returns:
        list | None: The ``id`` of every entry in the response's ``results``,
        or ``None`` when the HTTP status code is not 200.
    """
    # 'per-page' is the spelling used by the author download cell; use the
    # same name here so both requests agree.
    params = {'per-page': 200}
    response = requests.get(api_url, params=params)
    if response.status_code != 200:
        return None
    return [work['id'] for work in response.json()['results']]


# Fetch work IDs only for the rows labelled 0 .. n-1 (label-based .loc slice),
# leaving the 'works' column unset for every other row.
n = 10  # Number of rows to process
works_for_first_rows = df.loc[:n-1, 'works_api_url'].apply(fetch_works)
df.loc[:n-1, 'works'] = works_for_first_rows

In [17]:
# Show the 'works' value of the first row as a quick check of the fetch above.
df['works'].head(1)

0    [https://openalex.org/W4382515348, https://ope...
Name: works, dtype: object