In [None]:
import requests
import time
from datetime import datetime, timedelta
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy import sparse
from concurrent.futures import ThreadPoolExecutor, as_completed
from functools import lru_cache

BASE_DOMAIN = "https://api.penguinrandomhouse.com/resources/v2/title/domains/PRH.US"

BASE_WORKS = f"{BASE_DOMAIN}/works"
BASE_TITLES = f"{BASE_DOMAIN}/titles"

session = requests.Session()

In [2]:
API_KEY = "fh5hj47dynk4nvx4s9ewufj4"

In [3]:
def fast_get(url, params=None, timeout=15):
    try:
        r = session.get(url, params=params, timeout=timeout)
        r.raise_for_status()
        return r.json()
    except:
        return None

In [4]:
def get_works_page(start=0, rows=40):
    params = {
        "api_key": API_KEY,
        "rows": rows,
        "start": start,
        "suppressLinks": "true",
        "suppressRecordCount": "false",
        "preferLanguage": "E",
        "showNewReleases": "true",
        "showComingSoon": "false",
        "showPublishedBooks": "true",
        "ageRangeMin": 7,
        "ageRangeMax": 18,
        "onSaleFrom": "10/01/2025"
    }

    data = fast_get(BASE_WORKS, params=params)
    if not data:
        return []

    return data.get("data", {}).get("works", [])


In [5]:
@lru_cache(maxsize=50000)
def get_work_titles(workId):
    url = f"{BASE_TITLES}/works/{workId}/titles"
    params = {"api_key": API_KEY}

    data = fast_get(url, params=params)
    if not data:
        return []

    titles = data.get("titles", [])
    return [t for t in titles if isinstance(t, dict)]


In [6]:
@lru_cache(maxsize=50000)
def get_title_content(isbn):
    url = f"{BASE_TITLES}/{isbn}/content"
    params = {"api_key": API_KEY}

    data = fast_get(url, params=params)
    if not data:
        return ""

    d = data.get("data", {})

    text = (
        d.get("flapCopy")
        or d.get("longDescription")
        or d.get("shortDescription")
        or ""
    )

    return text.strip()


In [7]:
def fetch_text_for_isbns(isbn_list):
    texts = {}
    with ThreadPoolExecutor(max_workers=10) as exe:
        futures = {exe.submit(get_title_content, isbn): isbn for isbn in isbn_list}
        for fut in as_completed(futures):
            isbn = futures[fut]
            text = fut.result()
            if text:
                texts[isbn] = text
    return texts


In [11]:
isbns, texts = collect_corpus(max_titles=200)
print("Final count:", len(texts))
print(texts[0][:500])


NameError: name 'collect_corpus' is not defined

In [None]:
vec = TfidfVectorizer(stop_words="english", max_features=5000)
X = vec.fit_transform(texts)

print("Corpus size:", len(texts))
print("TF-IDF shape:", X.shape)

ValueError: empty vocabulary; perhaps the documents only contain stop words

In [None]:
vec = TfidfVectorizer(stop_words="english", max_features=5000)
X = vec.fit_transform(texts)

print("Corpus size:", len(texts))
print("TF-IDF shape:", X.shape)

NameError: name 'texts' is not defined

In [None]:
sample = get_works_page(start=0, rows=5, on_sale_from="10/01/2025")
print(type(sample))
print(sample.keys())
print(sample["data"][:200] if isinstance(sample["data"], str) else sample["data"])


In [None]:
vec = TfidfVectorizer(stop_words="english", max_features=5000)
X = vec.fit_transform(texts)


NameError: name 'texts' is not defined

In [None]:
pd.DataFrame({"isbn": isbns, "text": texts}).to_csv("prh_corpus.csv", index=False)
sparse.save_npz("prh_tfidf.npz", X)

print("Corpus size:", len(isbns))
print("TF-IDF shape:", X.shape)

NameError: name 'isbns' is not defined