In [13]:
import requests
import time
from datetime import datetime, timedelta
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy import sparse
import json


#### API

In [None]:
BASE_WORKS = "https://api.penguinrandomhouse.com/resources/v2/title/domains/PRH.US/works"
BASE_WORK_TITLES = "https://api.penguinrandomhouse.com/resources/v2/title/domains/PRH.US/works"
BASE_TITLES = "https://api.penguinrandomhouse.com/resources/v2/title/domains/PRH.US/titles"

In [2]:
API_KEY = "fh5hj47dynk4nvx4s9ewufj4"

In [3]:
DOMAIN = "PRH.US"

BASE_WORKS = "https://api.penguinrandomhouse.com/resources/v2/title/domains/PRH.US/works"
BASE_TITLE = "https://api.penguinrandomhouse.com/resources/v2/title/domains/PRH.US/titles"

In [4]:
def get_works_page(start, rows, on_sale_from):
    params = {
        "api_key": API_KEY,
        "rows": rows,
        "start": start,
        "preferLanguage": "E",
        "ageRangeMax": 18,
        "ageRangeMin": 7,
        "showPublishedBooks": "true",
        "showComingSoon": "false",
        "showNewReleases": "true",
        "showFlapCopy": "true",
        "suppressLinks": "true",
        "suppressRecordCount": "true",
        "onSaleFrom": on_sale_from,  # mm/dd/yyyy
    }

    r = requests.get(BASE_WORKS, params=params, timeout=15)
    r.raise_for_status()
    data = r.json()

    # IMPORTANT: works list is here
    return data["data"]["works"]

In [5]:
def get_work_titles(workId):
    url = f"{BASE_WORK_TITLES}/{workId}/titles"
    params = {"api_key": API_KEY}

    r = requests.get(url, params=params, timeout=15)
    r.raise_for_status()
    data = r.json()

    # The correct structure for your API:
    # { "titles": [ {...}, {...}, ... ] }
    titles = data.get("titles", [])

    # Guarantee titles is a list of dicts
    if isinstance(titles, list):
        titles = [t for t in titles if isinstance(t, dict)]
    else:
        titles = []

    return titles

In [6]:
def fetch_title(isbn):
    url = f"{BASE_TITLES}/{isbn}"
    params = {"api_key": API_KEY}

    r = requests.get(url, params=params, timeout=15)
    if r.status_code == 404:
        return None
    r.raise_for_status()
    return r.json()

def extract_description(title_data):
    for field in ["flapCopy", "description", "longDescription", "shortDescription"]:
        if field in title_data and title_data[field]:
            return title_data[field]
    return None

In [7]:
def collect_corpus(max_titles=1000):
    one_year_ago = (datetime.now() - timedelta(days=365)).strftime("%m/%d/%Y")

    isbns = []
    texts = []

    start = 0
    rows = 20

    while len(isbns) < max_titles:

        works = get_works_page(start=start, rows=rows, on_sale_from=one_year_ago)
        if not works:
            break

        for w in works:
            workId = w.get("workId")
            if not workId:
                continue

            # Fetch titles for this work
            title_records = get_work_titles(workId)

            title_records = get_work_titles(workId)

            for t in title_records:
                isbn = t.get("isbn")
                if not isbn:
                    continue

                # Fetch description
                title_data = fetch_title(isbn)
                if not title_data:
                    continue

                desc = extract_description(title_data)
                if not desc:
                    continue

                isbns.append(isbn)
                texts.append(desc)

                if len(isbns) >= max_titles:
                    break

            if len(isbns) >= max_titles:
                break

        start += rows
        time.sleep(0.2)

    return isbns, texts


In [8]:
sample_titles = get_work_titles(262934)  # using the first workId you showed earlier
print("Type:", type(sample_titles))
print(sample_titles)

Type: <class 'list'>
[]


In [9]:
isbns, texts = collect_corpus(max_titles=50)

ReadTimeout: HTTPSConnectionPool(host='api.penguinrandomhouse.com', port=443): Read timed out. (read timeout=15)

In [None]:
vec = TfidfVectorizer(stop_words="english", max_features=5000)
X = vec.fit_transform(texts)

print("Corpus size:", len(texts))
print("TF-IDF shape:", X.shape)

In [None]:
sample = get_works_page(start=0, rows=5, on_sale_from="10/01/2025")
print(type(sample))
print(sample.keys())
print(sample["data"][:200] if isinstance(sample["data"], str) else sample["data"])


In [None]:
vec = TfidfVectorizer(stop_words="english", max_features=5000)
X = vec.fit_transform(texts)


In [None]:
pd.DataFrame({"isbn": isbns, "text": texts}).to_csv("prh_corpus.csv", index=False)
sparse.save_npz("prh_tfidf.npz", X)

print("Corpus size:", len(isbns))
print("TF-IDF shape:", X.shape)

#### TF-IDF

In [18]:
# Step 1: Load the JSON file normally
with open("100_prh_title_sample.json", "r", encoding="utf-8") as f:
    data = json.load(f)
 
# Step 2: Normalize only the "data" list into a DataFrame
df = pd.json_normalize(data["data"])
 
df.head()

Unnamed: 0,isbn,isbnHyphenated,workId,title,author,coverUrl,subformat,binding,trim,edition,...,series,language,seq,titleBlock,description,authors,format.code,format.description,editionTarget.code,editionTarget.description
0,9780028633879,978-0-02-863387-9,359570,The Complete Idiot's Guide to Learning Yiddish,Rabbi Benjamin Blech,https://images.penguinrandomhouse.com/cover/97...,,,7-3/8 x 9-1/8,0,...,,E,,,"You're not idiot, of course. You can serve up ...","[{'code': '309012', 'description': 'Rabbi Benj...",TR,Trade Paperback,,
1,9780130575715,978-0-13-057571-5,350672,Heinerman's Encyclopedia of Healing Juices,John Heinerman,https://images.penguinrandomhouse.com/cover/97...,,,6 x 9,0,...,,E,,,"This publication shows how raw, natural juices...","[{'code': '12511', 'description': 'John Heiner...",TR,Trade Paperback,,
2,9780131088382,978-0-13-108838-2,350688,Super Healing Foods,Frances Sheridan Goulart,https://images.penguinrandomhouse.com/cover/97...,,,6 x 9,0,...,,E,,,From apples (sunburn relief) and avocados (car...,"[{'code': '233283', 'description': 'Frances Sh...",TR,Trade Paperback,,
3,9780131872783,978-0-13-187278-3,299263,A Brief Tour of Human Consciousness,V.S. Ramachandran,https://images.penguinrandomhouse.com/cover/97...,,,5-3/8 x 8-1/4,0,...,,E,,,How can some people come to believe that their...,"[{'code': '2148674', 'description': 'V. S. Ram...",TR,Trade Paperback,,
4,9780132092302,978-0-13-209230-2,353186,Heinerman's New Encyclopedia of Fruits & Veget...,John Heinerman,https://images.penguinrandomhouse.com/cover/97...,,,6 x 9,0,...,,E,,,This book is your total guide to using the inc...,"[{'code': '12511', 'description': 'John Heiner...",TR,Trade Paperback,,


In [19]:
# Focus on descriptions only
corpus = df['description']
corpus

0     You're not idiot, of course. You can serve up ...
1     This publication shows how raw, natural juices...
2     From apples (sunburn relief) and avocados (car...
3     How can some people come to believe that their...
4     This book is your total guide to using the inc...
                            ...                        
95    This memoir tells the remarkable story of how ...
96    This volume contains twenty tales featuring be...
97    "An absolutely unique voice...It would be an i...
98    When your best friend makes you angry, why can...
99    <b>"Essays in direct line from Stanislavsky, C...
Name: description, Length: 100, dtype: object

In [20]:
# TF-IDF
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(corpus)

# Get the feature names (words in the vocabulary)
feature_names = vectorizer.get_feature_names_out()

# Convert the sparse matrix to a dense array for easier viewing (for small datasets)
dense_matrix = tfidf_matrix.toarray()

# You can then create a DataFrame for better readability
tf_idf_df = pd.DataFrame(dense_matrix, columns=feature_names)
tf_idf_df

Unnamed: 0,000,10,100,1066,151,1577,160,16th,1776,1777,...,yourself,yucca,zarathustra,zeal,zealand,zen,zero,zestful,zone,zones
0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000
1,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000
2,0.000000,0.076977,0.0,0.0,0.0,0.0,0.088087,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.076977,0.0,0.0,0.076977
3,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000
4,0.101603,0.000000,0.0,0.0,0.0,0.0,0.077512,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000
96,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000
97,0.000000,0.000000,0.0,0.0,0.0,0.0,0.033887,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000
98,0.000000,0.000000,0.0,0.0,0.0,0.0,0.235314,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000


In [25]:
# Set isbn as index of tf_idf_df
tf_idf_df.set_index(df['isbn'], inplace=True)

# Extract isbn, title, and author from the original dataframe
isbn_title_author_df = df[['isbn', 'title', 'author']]
isbn_title_author_df.set_index('isbn', inplace=True)

# Add the title and author information to the TF-IDF DataFrame 
doc_importance = pd.merge(isbn_title_author_df, tf_idf_df, on='isbn', how='left')
doc_importance

Unnamed: 0_level_0,title_x,author_x,000,10,100,1066,151,1577,160,16th,...,yourself,yucca,zarathustra,zeal,zealand,zen,zero,zestful,zone,zones
isbn,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
9780028633879,The Complete Idiot's Guide to Learning Yiddish,Rabbi Benjamin Blech,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000
9780130575715,Heinerman's Encyclopedia of Healing Juices,John Heinerman,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000
9780131088382,Super Healing Foods,Frances Sheridan Goulart,0.000000,0.076977,0.0,0.0,0.0,0.0,0.088087,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.076977,0.0,0.0,0.076977
9780131872783,A Brief Tour of Human Consciousness,V.S. Ramachandran,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000
9780132092302,Heinerman's New Encyclopedia of Fruits & Veget...,John Heinerman,0.101603,0.000000,0.0,0.0,0.0,0.0,0.077512,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9780140089363,Q's Legacy,Helene Hanff,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000
9780140089585,The Second Rumpole Omnibus,John Mortimer,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000
9780140089738,Saints and Strangers,Angela Carter,0.000000,0.000000,0.0,0.0,0.0,0.0,0.033887,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000
9780140089806,Between Women,Luise Eichenbaum,0.000000,0.000000,0.0,0.0,0.0,0.0,0.235314,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000


In [48]:
# Create a query function

query = 'idiot guide chewbacca'
 
query_words = query.lower().split()

# Check if all words in the query are in the dataframe
to_search = []
for word in query_words:
    if word in doc_importance.columns:
        to_search.append(word)
 
to_search.insert(0, 'title_x')
 
sample = doc_importance.loc[:, to_search] # 9780028633879
sample



Unnamed: 0_level_0,title_x,idiot,guide
isbn,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
9780028633879,The Complete Idiot's Guide to Learning Yiddish,0.231111,0.155540
9780130575715,Heinerman's Encyclopedia of Healing Juices,0.000000,0.000000
9780131088382,Super Healing Foods,0.000000,0.103613
9780131872783,A Brief Tour of Human Consciousness,0.000000,0.000000
9780132092302,Heinerman's New Encyclopedia of Fruits & Veget...,0.000000,0.068380
...,...,...,...
9780140089363,Q's Legacy,0.000000,0.000000
9780140089585,The Second Rumpole Omnibus,0.000000,0.000000
9780140089738,Saints and Strangers,0.000000,0.000000
9780140089806,Between Women,0.000000,0.000000


In [46]:
query = 'idiot the guide chewbacca'

query_words = query.lower().split()
#query_columns.insert(0, 'title_x')

to_search = []
for word in query_words:
    if word in doc_importance.columns:
        to_search.append(word)

to_search



['idiot', 'the', 'guide']

Index(['title_x', 'author_x', '000', '10', '100', '1066', '151', '1577', '160',
       '16th',
       ...
       'yourself', 'yucca', 'zarathustra', 'zeal', 'zealand', 'zen', 'zero',
       'zestful', 'zone', 'zones'],
      dtype='object', length=3991)