In [4]:
import os
import requests
import json
from pathlib import Path
from dotenv import load_dotenv
from rich import print

load_dotenv()

# Endpoint and authentication settings for the CMS page listing.
url = "https://itell-strapi-um5h.onrender.com/api/pages"

# Only build an Authorization header when CMS_KEY is actually set: formatting
# a missing key into the f-string would produce the bogus token "Bearer None".
cms_key = os.getenv("CMS_KEY")
headers = {"Authorization": f"Bearer {cms_key}"} if cms_key else {}

In [16]:
# Volumes to export, identified by their slug in the CMS.
volume_slugs = [
    "research-methods-in-psychology",
    "communication-for-business",
    "natural-language-processing",
    "introduction-to-computing",
    "one-nation-one-people-the-uscis-civics-test-textbook",
]

# Query-string parameters for the pages endpoint, grouped by purpose and then
# merged into `params` in the order they are sent.
page_fields = {"fields[0]": "Title", "fields[1]": "Slug"}

# Placeholder entry; the actual volume slug is supplied when each request is made.
volume_filter = {"filters[Volume][Slug][$eq]": None}

# For each content component, request only its Header and MDX fields.
chunk_populate = {
    "populate[Content][on][page.chunk][fields][0]": "Header",
    "populate[Content][on][page.chunk][fields][1]": "MDX",
    "populate[Content][on][page.plain-chunk][fields][2]": "Header",
    "populate[Content][on][page.plain-chunk][fields][3]": "MDX",
}

listing_options = {"sort": "Order", "pagination[pageSize]": "100"}

params = {**page_fields, **volume_filter, **chunk_populate, **listing_options}


# Directory that receives one JSON file per exported page.
output_dir = Path("../../data/reference-json-mdx")
# parents=True so that a missing intermediate directory (e.g. a fresh checkout
# without data/) is created instead of raising FileNotFoundError.
output_dir.mkdir(parents=True, exist_ok=True)

# Limits for the export loop.
MAX_PAGES_PER_VOLUME = 2  # only the first pages of each volume are written out
REQUEST_TIMEOUT_SECONDS = 120  # generous, but stops a hung connection from blocking the notebook forever

# For every volume: fetch its page listing, strip identifiers, and save the
# first MAX_PAGES_PER_VOLUME pages as individual JSON files in `output_dir`.
for volume_slug in volume_slugs:
    print(f"Processing {volume_slug}")

    # Build per-request parameters instead of mutating the shared `params`
    # dict, so re-running this cell never depends on leftover state.
    query = {**params, "filters[Volume][Slug][$eq]": volume_slug}

    # Pass the Authorization header defined next to `url`; without it the
    # request is sent unauthenticated.
    response = requests.get(
        url,
        params=query,
        headers=headers,
        timeout=REQUEST_TIMEOUT_SECONDS,
    )
    print(f"Status: {response.status_code}")

    # Report a failed request and move on to the next volume rather than
    # tripping over an error payload further down.
    if not response.ok:
        print(f"Skipping {volume_slug}: request failed")
        continue

    data = response.json()
    # 'data' may be absent or null in the payload; treat both as "no pages".
    pages = data.get('data') or []
    print(f"Number of pages: {len(pages)}")

    # Save each page to a separate file
    for page_num, page in enumerate(pages[:MAX_PAGES_PER_VOLUME]):
        # Remove unneeded fields that will not be generated by the LLM
        page_slug = page.pop('Slug', None)
        for key in ('Order', 'id', 'documentId'):
            page.pop(key, None)

        # A page without content is still saved; it just has no chunks to clean.
        for chunk in page.get('Content') or []:
            chunk.pop('id', None)

        # File name combines volume slug, page slug and position in the listing.
        filename = f"{volume_slug}_{page_slug}_{page_num}.json"

        # Explicit encoding keeps the output independent of the platform default.
        with open(output_dir / filename, 'w', encoding='utf-8') as f:
            json.dump(page, f, indent=2)

        print(f"Saved: {filename}")