In [38]:
import requests

BASE_URL = "https://data.htrc.illinois.edu/ef-api"

# FETCH DATA
def get_ef_data_by_volume_id(volume_id):
    """"Fetches all the data"""
    url = f"{BASE_URL}/volumes/{volume_id}"
    response = requests.get(url)
    return response.json()

def check_if_volume_exists(volume_id):
    url = f"{BASE_URL}/volumes/{volume_id}"
    response = requests.head(url)
    return response.status_code == 200

def get_volume_metadata_by_volume_id(volume_id):
    """"Only metadata"""
    url = f"{BASE_URL}/volumes/{volume_id}/metadata"
    response = requests.get(url)
    return response.json()

def get_volume_pages_by_volume_id(volume_id):
    url = f"{BASE_URL}/volumes/{volume_id}/pages"
    response = requests.get(url)
    return response.json()

In [35]:
def extract_relevant_info_for_volume(volume_id):
    if check_if_volume_exists(volume_id):
        ef_data = get_ef_data_by_volume_id(volume_id)
        metadata = get_volume_metadata_by_volume_id(volume_id)
        
        author_name = metadata.get('data', {}).get('metadata', {}).get('contributor', {}).get('name', 'Unknown')
        year_of_publication = metadata.get('data', {}).get('metadata', {}).get('pubDate', 'Unknown')
        language = metadata.get('data', {}).get('metadata', {}).get('language', 'Unknown')
        
        token_counts = {}
        for page in ef_data.get('data', {}).get('pages', []):
            body = page.get('body', {})
            tokens_count = body.get('tokensCount', {})
            for token, count in tokens_count.items():
                if token in token_counts:
                    token_counts[token] += count
                else:
                    token_counts[token] = count

        return {
            "volume_id": volume_id,
            "author_name": author_name,
            "year_of_publication": year_of_publication,
            "language": language,
            "token_counts": token_counts
        }
    else:
        print(f"Volume {volume_id} does not exist.")
        return None

In [39]:
# WORKSET MANAGEMENT
""""
id: The workset identifier. Eg. 63e2a4e467720167725e440c
htids: The list of volume IDs in the workset. 
created: The workset creation date/time Eg. 2023-02-07T19:22:13.221Z
"""
        
def create_workset(data):
    url = f"{BASE_URL}/worksets"
    response = requests.post(url, json=data)
    return response.json()

def delete_workset_by_id(workset_id):
    url = f"{BASE_URL}/worksets/{workset_id}"
    response = requests.delete(url)
    return response.status_code == 204

def get_workset_by_id(workset_id):
    url = f"{BASE_URL}/worksets/{workset_id}"
    response = requests.get(url)
    return response.json()

def get_workset_volumes_by_id(workset_id):
    url = f"{BASE_URL}/worksets/{workset_id}/volumes"
    response = requests.get(url)
    return response.json()

def get_workset_volumes_aggregated_by_id(workset_id):
    url = f"{BASE_URL}/worksets/{workset_id}/volumes/aggregated"
    response = requests.get(url)
    return response.json()

def get_workset_volumes_metadata_by_id(workset_id):
    url = f"{BASE_URL}/worksets/{workset_id}/metadata"
    response = requests.get(url)
    return response.json()

In [40]:
def get_volume_body_tokens(volume_id):
    if check_if_volume_exists(volume_id):
        ef_data = get_ef_data_by_volume_id(volume_id)
        
        token_counts = {}
        for page in ef_data.get('data', {}).get('pages', []):
            body = page.get('body', {})
            tokens_count = body.get('tokensCount', {})
            for token, count in tokens_count.items():
                if token in token_counts:
                    token_counts[token] += count
                else:
                    token_counts[token] = count

        return {
            "volume_id": volume_id,
            "token_counts": token_counts
        }
    else:
        return {"error": f"Volume {volume_id} does not exist."}

In [41]:
volume_ids = "uc2.ark+=13960=t17m0815m"
volume_info = get_volume_body_tokens(volume_ids)

In [42]:
volume_info

{'volume_id': 'uc2.ark+=13960=t17m0815m', 'token_counts': {}}