In [15]:
import os
import time
import warnings
from datetime import datetime, timedelta

import pandas as pd
import requests
from scipy import sparse
from sklearn.feature_extraction.text import TfidfVectorizer

In [16]:
# The API key is read from the PRH_API_KEY environment variable so that the
# credential is never stored in the notebook (or leaked through its saved
# outputs). Export it before starting the kernel, e.g.
#     export PRH_API_KEY=...
# Any key that was previously committed in this cell should be treated as
# compromised and rotated.
API_KEY = os.environ.get("PRH_API_KEY", "")
if not API_KEY:
    # Warn instead of raising so the configuration cell still runs; requests
    # made without a key are expected to be refused by the API.
    warnings.warn(
        "PRH_API_KEY is not set; API requests will be sent without a key.",
        stacklevel=2,
    )

In [17]:
# Domain segment inserted into every endpoint URL below.
DOMAIN = "PRH.US"

# Endpoint for listing works within DOMAIN.
BASE_WORKS = (
    "https://api.penguinrandomhouse.com/resources/v2/domains/{}/works"
    .format(DOMAIN)
)

# Endpoint root for title resources within DOMAIN.
BASE_TITLE = (
    "https://api.penguinrandomhouse.com/resources/v2/domains/{}/titles"
    .format(DOMAIN)
)

In [None]:
def get_works_page(start, rows, on_sale_from, age_min, age_max,
                   on_sale_to=None, division_codes=("12", "13", "14"),
                   max_retries=3, backoff=1.0):
    """Fetch one page of works from the ``BASE_WORKS`` endpoint.

    Paging options and the API key travel in the query string; the filters
    are sent as a JSON body of a POST request. This single definition
    replaces two earlier cells that defined the same function (the second
    silently shadowed the first); its defaults reproduce the later cell.

    Args:
        start: Offset of the first record to return.
        rows: Maximum number of records to return.
        on_sale_from: Value sent as ``onSaleFrom``.
        age_min: Value sent as ``ageRangeMin``.
        age_max: Value sent as ``ageRangeMax``.
        on_sale_to: Value sent as ``onSaleTo``. Defaults to today's date
            formatted as ``YYYY-MM-DD``.
        division_codes: Values sent as ``divisionCode``. Pass ``None`` (or an
            empty sequence) to omit the filter.
        max_retries: Extra attempts made after a 502/503/504 response.
        backoff: Seconds to wait before the first retry; doubled after each
            further attempt.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: If the final response has an error status. The
            API key is masked in the error message so it does not end up in
            saved notebook output.
    """
    params = {
        "api_key": API_KEY,
        "rows": rows,
        "start": start,
        "suppressLinks": True,
    }

    if on_sale_to is None:
        on_sale_to = datetime.now().strftime("%Y-%m-%d")

    # NOTE(review): confirm that this endpoint honours filters supplied in a
    # JSON POST body rather than as query parameters.
    body = {
        "onSaleFrom": on_sale_from,
        "onSaleTo": on_sale_to,
        "ageRangeMin": age_min,
        "ageRangeMax": age_max,
        "showActive": True,
    }
    if division_codes:
        # The original author marked this filter as critical to avoid 504s.
        body["divisionCode"] = list(division_codes)

    # Gateway-style failures are usually transient, so retry them a bounded
    # number of times with exponential backoff before giving up.
    transient_statuses = (502, 503, 504)
    attempts = max(1, max_retries + 1)
    for attempt in range(attempts):
        r = requests.post(BASE_WORKS, params=params, json=body, timeout=30)
        is_last_attempt = attempt == attempts - 1
        if r.status_code not in transient_statuses or is_last_attempt:
            break
        time.sleep(backoff * (2 ** attempt))

    try:
        r.raise_for_status()
    except requests.HTTPError as exc:
        message = str(exc)
        if API_KEY and API_KEY in message:
            # The default message embeds the full URL, including api_key.
            raise requests.HTTPError(
                message.replace(API_KEY, "<redacted>"), response=r
            ) from None
        raise
    return r.json()

In [None]:
def _iter_isbns(works):
    """Yield every non-empty ``isbn`` found under each work's ``titles`` list."""
    for work in works:
        for title in work.get("titles", []):
            isbn = title.get("isbn")
            if isbn:
                yield isbn


def collect_corpus(max_titles=1000, batch=100, age_min=7, age_max=18,
                   lookback_days=365, pause=0.2):
    """Collect ISBNs of recent titles together with their description text.

    Pages through ``get_works_page`` until ``max_titles`` ISBNs have been
    gathered or a page comes back empty, then looks up each ISBN and keeps
    only those for which a description could be extracted.

    Args:
        max_titles: Upper bound on the number of ISBNs to gather.
        batch: Number of works requested per page; must be positive.
        age_min: Minimum age passed to ``get_works_page``.
        age_max: Maximum age passed to ``get_works_page``.
        lookback_days: Only works on sale within this many days before now
            are requested.
        pause: Seconds to wait between page requests (polite pacing to avoid
            rate limits).

    Returns:
        A ``(kept_isbns, corpus)`` tuple of parallel lists: the ISBNs that
        produced a description, and the description texts themselves.

    Raises:
        ValueError: If ``batch`` is not positive (the page offset would
            never advance).
    """
    if batch <= 0:
        raise ValueError("batch must be a positive integer")

    on_sale_from = (
        datetime.now() - timedelta(days=lookback_days)
    ).strftime("%Y-%m-%d")

    isbns = []
    start = 0

    while len(isbns) < max_titles:
        page = get_works_page(
            start=start,
            rows=batch,
            on_sale_from=on_sale_from,
            age_min=age_min,
            age_max=age_max,
        )

        works = page.get("data", [])
        if not works:
            break  # ran out of results

        for isbn in _iter_isbns(works):
            isbns.append(isbn)
            if len(isbns) >= max_titles:
                break

        start += batch
        # Only pause when another page is actually going to be requested.
        if pause > 0 and len(isbns) < max_titles:
            time.sleep(pause)

    corpus = []
    kept_isbns = []

    # fetch_title / extract_description are defined elsewhere in the notebook;
    # ISBNs with no title record or no description text are dropped.
    for isbn in isbns:
        title = fetch_title(isbn)
        if not title:
            continue
        text = extract_description(title)
        if text:
            corpus.append(text)
            kept_isbns.append(isbn)

    return kept_isbns, corpus

In [None]:
# Collect up to 1000 ISBNs and their description texts (issues live API requests).
isbns, corpus = collect_corpus(max_titles=1000)

# build_tfidf is not defined in this part of the notebook; presumably it fits a
# TfidfVectorizer on the corpus and returns (vectorizer, matrix) — TODO confirm.
vec, X = build_tfidf(corpus)

# Report how many titles were kept and the dimensions of the resulting matrix.
print("Titles collected:", len(isbns))
print("TF-IDF shape:", X.shape)

HTTPError: 504 Server Error: Gateway Timeout for url: https://api.penguinrandomhouse.com/resources/v2/domains/PRH.US/works?api_key=<REDACTED>&rows=100&start=0&suppressLinks=True

In [None]:
# Write the ISBN/description pairs to a CSV in the current working directory.
df = pd.DataFrame(dict(isbn=isbns, text=corpus))
df.to_csv("corpus.csv", index=False)

# Store the sparse TF-IDF matrix alongside it in SciPy's .npz format.
sparse.save_npz("tfidf_matrix.npz", X)