In [21]:
import requests
import networkx as nx
import numpy as np
from tqdm.notebook import tqdm
from datetime import datetime
import time
import json
import logging

In [33]:
# OpenAlex institution id used throughout the notebook; it is read as a global
# by fetch_authors_by_institution.
niu_institution_id = "i102502594"
# Authors endpoint filtered to authors whose last known institution is the id above.
authors_url = f"https://api.openalex.org/authors?filter=last_known_institutions.id:{niu_institution_id}"

In [95]:
# Functions to fetch data from OpenAlex

def fetch_from_open_alex(url: str, sleep_time=0.5, params=None, max_retries=3):
    """Fetch every result page from an OpenAlex list endpoint using cursor paging.

    Args:
        url: Endpoint URL; it may already contain a query string (e.g. a filter).
        sleep_time: Seconds to pause after each request, including failed ones.
        params: Optional extra query parameters. The mapping is copied, so the
            caller's dict is never modified.
        max_retries: Number of consecutive failed attempts on the same page that
            are retried before giving up.

    Returns:
        A list holding the ``results`` entries of every page fetched. If a page
        keeps failing, whatever was gathered so far is returned (best effort).
    """
    base_params = dict(params) if params else {}
    cursor = '*'
    final_results = []
    consecutive_failures = 0

    while cursor:
        # Build a fresh dict per request so the page size is the same on every
        # page and the caller's mapping is left untouched.
        page_params = {**base_params, 'per_page': 100, 'cursor': cursor}
        try:
            response = requests.get(url, params=page_params, timeout=30)
            response.raise_for_status()
            data = response.json()
            # Read both fields before committing either, so a malformed page
            # cannot advance the cursor while dropping its results.
            next_cursor = data['meta']['next_cursor']
            page_results = data['results']
        except (requests.RequestException, ValueError, KeyError, TypeError) as e:
            print(e)
            consecutive_failures += 1
            if consecutive_failures > max_retries:
                # Without this bound a persistent error would loop forever on
                # the same cursor.
                print(f"Giving up on {url} after {consecutive_failures} failed attempts")
                break
            time.sleep(sleep_time)
            continue

        consecutive_failures = 0
        final_results.extend(page_results)
        cursor = next_cursor
        time.sleep(sleep_time)
    return final_results


def fetch_authors_by_institution(institution_id: str="i102502594"):
    """Fetch all OpenAlex author records whose last known institution matches.

    Args:
        institution_id: OpenAlex institution id placed in the
            ``last_known_institutions.id`` filter.

    Returns:
        The list of author records returned by ``fetch_from_open_alex``.
    """
    # Use the argument rather than the notebook-level global so that callers
    # can actually query a different institution.
    return fetch_from_open_alex(f"https://api.openalex.org/authors?filter=last_known_institutions.id:{institution_id}")

def fetch_author_works_since_last_year(works_url):
    """Fetch an author's works published in the current and the previous year.

    ``works_url`` is extended with a ``,publication_year:<year>`` clause once
    per year; the current year's works come first in the returned list.
    """
    year_filters = [
        "publication_year:{}".format(datetime.now().year),
        "publication_year:{}".format(datetime.now().year - 1),
    ]
    yearly_urls = ["{},{}".format(works_url, year_filter) for year_filter in year_filters]

    # The generator is consumed in order: current year first, then previous.
    current_works, previous_works = (fetch_from_open_alex(yearly_url) for yearly_url in yearly_urls)
    return current_works + previous_works

In [100]:
# Functions to process abstracts
def get_abstract_from_inverted_index(inverted_index):
    """Rebuild abstract text from an inverted index mapping word -> positions.

    Args:
        inverted_index: Mapping of each word to the list of integer positions
            at which it occurs, or None.

    Returns:
        The words joined by single spaces in position order (positions with no
        word become empty strings), or "Unknown" when the index is None, empty,
        or cannot be decoded.
    """
    if inverted_index is None:
        return "Unknown"

    try:
        max_index = max(max(positions) for positions in inverted_index.values())
        abstract_words = [None] * (max_index + 1)
        for word, positions in inverted_index.items():
            for position in positions:
                abstract_words[position] = word

        return ' '.join(word if word is not None else '' for word in abstract_words)
    except Exception:
        # Best effort on malformed data, but unlike a bare ``except`` this lets
        # KeyboardInterrupt / SystemExit propagate so a long run can be stopped.
        return "Unknown"
    
    
def get_abstracts(works):
    """Return one decoded abstract per work; an empty list when ``works`` is None."""
    if works is None:
        return []

    abstracts = []
    for work in works:
        inverted_index = work.get("abstract_inverted_index")
        abstracts.append(get_abstract_from_inverted_index(inverted_index))
    return abstracts

In [105]:
# Functions to process authors data


def _collect_co_author_ids(works, author_id, known_author_ids):
    """Return ids of other known authors found in ``works``, one entry per shared authorship."""
    co_author_ids = []
    for work in works:
        # ``or []`` / ``or {}`` tolerate keys that are missing or explicitly null.
        for authorship in work.get("authorships") or []:
            candidate_id = (authorship.get("author") or {}).get("id")
            if (
                candidate_id is not None
                and candidate_id != author_id
                and candidate_id in known_author_ids
            ):
                co_author_ids.append(candidate_id)
    return co_author_ids


def process_authors_data(authors):
    """Flatten raw author records and attach recent abstracts and co-authors.

    For each distinct author id (first occurrence wins) this fetches the
    author's works from the current and previous year, decodes their
    abstracts, and lists co-authors that are also present in ``authors``.

    Args:
        authors: List of author dicts as returned by the OpenAlex authors
            endpoint.

    Returns:
        A list of dicts with keys ``author_id``, ``author_name``, ``orcid``,
        ``works_count``, ``cited_by_count``, ``institution``, ``topics``,
        ``abstracts`` and ``co_authors``.
    """
    # Sets give constant-time membership tests; lists made both the duplicate
    # check and the co-author filter quadratic on large institutions.
    all_author_ids = {author.get("id") for author in authors}
    processed = []
    seen_ids = set()

    for author in authors:
        # Skip records whose id was already processed.
        author_id = author.get("id")
        if author_id in seen_ids:
            continue
        seen_ids.add(author_id)

        # Basic author attributes.
        author_name = author.get("display_name", "Unknown")
        orcid = author.get("orcid", None)
        works_count = author.get("works_count", 0)
        cited_by_count = author.get("cited_by_count", 0)

        # Institution name: first entry of last_known_institutions, if any.
        # ``or []`` also covers a key that is present but null.
        institution_data = author.get("last_known_institutions") or []
        institution_name = "Unknown"
        if institution_data:
            institution_name = institution_data[0].get("display_name", "")

        # Topics the author is associated with.
        topics_data = author.get("topics") or []
        topics = [topic.get("display_name", "") for topic in topics_data]

        # Abstracts of works authored in the current and the previous year.
        works_url = author.get("works_api_url")
        works = []
        if works_url is not None:
            works = fetch_author_works_since_last_year(works_url)
        abstracts = get_abstracts(works)

        # Co-authors restricted to authors that are part of this data set.
        co_authors = _collect_co_author_ids(works, author_id, all_author_ids)

        processed.append({
            "author_id": author_id,
            "author_name": author_name,
            "orcid": orcid,
            "works_count": works_count,
            "cited_by_count": cited_by_count,
            "institution": institution_name,
            "topics": topics,
            "abstracts": abstracts,
            "co_authors": co_authors
        })
    return processed

def save_authors_to_json(processed_authors, filename="niu_authors_cleaned.json"):
    """Write the author records to ``filename`` as indented UTF-8 JSON and report the count."""
    dump_options = {"indent": 4, "ensure_ascii": False}
    with open(filename, "w", encoding="utf-8") as out_file:
        json.dump(processed_authors, out_file, **dump_options)

    record_count = len(processed_authors)
    print(f"Saved {record_count} author records to {filename}")
    
def read_authors_from_json(file_path="niu_authors_cleaned.json"):
    """Load author records from a JSON file.

    Returns the parsed content, or None (after printing an error message) when
    the file does not exist or does not contain valid JSON.
    """
    loaded = None
    try:
        with open(file_path, 'r', encoding="utf-8") as source:
            loaded = json.load(source)
    except FileNotFoundError:
        print(f"Error: File not found: {file_path}")
    except json.JSONDecodeError:
        print(f"Error: Invalid JSON format in: {file_path}")
    return loaded

In [102]:
def fetch_institution_data_from_openalex(institution_id="i102502594"):
    """Fetch, process and save the author records of one institution.

    Args:
        institution_id: OpenAlex institution id whose authors are fetched.

    Returns:
        None. The processed records are written by ``save_authors_to_json``
        to its default file.
    """
    # Forward the id; previously it was dropped and the default was always used.
    authors = fetch_authors_by_institution(institution_id)
    processed_authors = process_authors_data(authors)
    save_authors_to_json(processed_authors)

In [103]:
# Run the full pipeline with the default institution id: fetch the authors,
# process them (one works lookup per author over the network) and write the
# result to the default JSON file.
fetch_institution_data_from_openalex()

Saved 6908 author records to niu_authors_cleaned.json


In [106]:
# Reload the saved author records from the default JSON file
# (read_authors_from_json returns None if the file is missing or malformed).
# The bare name on the last line makes the notebook display the whole structure.
processed_authors = read_authors_from_json()
processed_authors

[{'author_id': 'https://openalex.org/A5078660500',
  'author_name': 'J. Adelman',
  'orcid': 'https://orcid.org/0000-0002-1041-3496',
  'works_count': 2129,
  'cited_by_count': 114993,
  'institution': 'Northern Illinois University',
  'topics': ['Particle physics theoretical and experimental studies',
   'High-Energy Particle Collisions Research',
   'Particle Detector Development and Performance',
   'Quantum Chromodynamics and Particle Interactions',
   'Dark Matter and Cosmic Phenomena',
   'Computational Physics and Python Applications',
   'Cosmology and Gravitation Theories',
   'Neutrino Physics Research',
   'Distributed and Parallel Computing Systems',
   'Black Holes and Theoretical Physics',
   'Astrophysics and Cosmic Phenomena',
   'Medical Imaging Techniques and Applications',
   'Radiation Detection and Scintillator Technologies',
   'Particle Accelerators and Free-Electron Lasers',
   'Advanced Data Storage Technologies',
   'Atomic and Subatomic Physics Research',
   