In [2]:
import requests
import time
from datetime import datetime, timedelta
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy import sparse
from concurrent.futures import ThreadPoolExecutor, as_completed
from functools import lru_cache
import json
import bs4
import re
from sklearn.decomposition import NMF

API_KEY = "fh5hj47dynk4nvx4s9ewufj4"
BASE = "https://api.penguinrandomhouse.com/resources/v2/title/domains/PRH.US/titles/views/istca"
CAT = "https://api.penguinrandomhouse.com/resources/v2/domains/PRH.US/categories/"
session = requests.Session()

In [3]:
BISAC = pd.read_json("bisac_prefixes.json", typ="series").reset_index()
BISAC.columns = ["prefix", "category"]
BISAC

Unnamed: 0,prefix,category
0,ANT,Antiques & Collectibles
1,ARC,Architecture
2,BIB,Bibles
3,BIO,Biography & Autobiography
4,BOD,"Body, Mind & Spirit"
5,BUS,Business & Economics
6,CGN,Comics & Graphic Novels
7,COM,Computers
8,CKB,Cooking
9,CRA,Crafts & Hobbies


In [4]:
with open('fiction_BISAC.json', 'r') as f:
    fiction_bisac_codes = json.load(f)
fiction_bisac_codes = fiction_bisac_codes.get('data')
fiction_bisac_codes
f_cat_map = {
    c["catId"]: c["menuText"]
    # c["catId"]: {
    #     "BISAC": c["catUri"],
    #     "Description": c["menuText"]
        for c in fiction_bisac_codes["categories"]
}
f_cat_map

{3000001525: 'Fiction',
 3000001526: 'Absurdist',
 3000001527: 'Action & Adventure',
 3000001528: 'Adaptations & Pastiche',
 3000001529: 'African American & Black',
 3000001530: 'Christian',
 3000001531: 'Erotica',
 3000001532: 'Historical',
 3000001533: 'Mystery & Detective',
 3000001534: 'Urban & Street Lit',
 3000001535: 'Women',
 3000001536: 'Alternative History',
 3000001537: 'Amish & Mennonite',
 3000001538: 'Animals',
 3000001539: 'Anthologies (multiple authors)',
 3000001540: 'Asian American & Pacific Islander',
 3000001541: 'Biographical & Autofiction',
 3000001542: 'Books, Bookstores & Libraries',
 3000001543: 'Buddhist',
 3000001544: 'Christian',
 3000001545: 'Biblical',
 3000001546: 'Classic & Allegory',
 3000001547: 'Collections & Anthologies',
 3000001548: 'Contemporary',
 3000001549: 'Fantasy',
 3000001550: 'Futuristic',
 3000001551: 'Historical',
 3000001552: 'Romance',
 3000001553: 'Historical',
 3000001555: 'Suspense',
 3000001556: 'Western',
 3000001558: 'City Life',

In [5]:
def fetch_istca(catId):

    params = {
            "formatFamily": "Paperback",
            "catId": catId,
            "showFlapCopy": "true",
            "showPublishedBooks": "true",
            "api_key": API_KEY,
        }

    r = session.get(BASE, params=params, timeout=20)

    print("STATUS:", r.status_code)
    print("URL:", r.url)
    print("RAW:", r.text[:250])

    r.raise_for_status()
    return r.json()

In [6]:
data = fetch_istca(catId=3000001550)
print(data)

STATUS: 200
URL: https://api.penguinrandomhouse.com/resources/v2/title/domains/PRH.US/titles/views/istca?formatFamily=Paperback&catId=3000001550&showFlapCopy=true&showPublishedBooks=true&api_key=fh5hj47dynk4nvx4s9ewufj4
RAW: {"status":"ok","recordCount":6,"startTimestamp":"2025-11-20T01:30:41Z","endTimestamp":"2025-11-20T01:30:41Z","timeTaken":189,"data":[{"isbn":9780307457196,"isbnHyphenated":"978-0-307-45719-6","workId":19220,"title":"Broken Angel","author":"Sigmund Br


In [7]:
def fetch_istca_multi(catIds, rows=500):
    results = []

    for catId in catIds:

        params = {
            "formatFamily": "Paperback",
            "catId": catId,
            "showFlapCopy": "true",
            "showPublishedBooks": "true",
            "start": 0,
            "rows": rows,
            "api_key": API_KEY,
        }

        r = session.get(BASE, params=params, timeout=20)

        print("\n---")
        print("CATID:", catId)
        print("STATUS:", r.status_code)
        print("URL:", r.url)
        print("RAW:", r.text[:200])

        r.raise_for_status()
        results.append(r.json())

    return results

In [8]:
fetch_istca_multi([3000001525, 3000001526, 3000001527], rows=10)


---
CATID: 3000001525
STATUS: 200
URL: https://api.penguinrandomhouse.com/resources/v2/title/domains/PRH.US/titles/views/istca?formatFamily=Paperback&catId=3000001525&showFlapCopy=true&showPublishedBooks=true&start=0&rows=10&api_key=fh5hj47dynk4nvx4s9ewufj4
RAW: {"status":"ok","recordCount":33902,"startTimestamp":"2025-11-20T01:32:07Z","endTimestamp":"2025-11-20T01:32:30Z","timeTaken":23206,"data":[{"isbn":9780140014457,"isbnHyphenated":"978-0-14-001445-7","w

---
CATID: 3000001526
STATUS: 200
URL: https://api.penguinrandomhouse.com/resources/v2/title/domains/PRH.US/titles/views/istca?formatFamily=Paperback&catId=3000001526&showFlapCopy=true&showPublishedBooks=true&start=0&rows=10&api_key=fh5hj47dynk4nvx4s9ewufj4
RAW: {"status":"ok","recordCount":94,"startTimestamp":"2025-11-20T01:34:35Z","endTimestamp":"2025-11-20T01:34:35Z","timeTaken":82,"data":[{"isbn":9780140154078,"isbnHyphenated":"978-0-14-015407-8","workId"

---
CATID: 3000001527
STATUS: 200
URL: https://api.penguinrandomhouse

[{'status': 'ok',
  'recordCount': 33902,
  'startTimestamp': '2025-11-20T01:32:07Z',
  'endTimestamp': '2025-11-20T01:32:30Z',
  'timeTaken': 23206,
  'data': [{'isbn': 9780140014457,
    'isbnHyphenated': '978-0-14-001445-7',
    'workId': 323693,
    'title': 'Under the Net',
    'author': 'Iris Murdoch',
    'coverUrl': 'https://images.penguinrandomhouse.com/cover/9780140014457',
    'format': {'code': 'TR', 'description': 'Trade Paperback'},
    'subformat': None,
    'binding': None,
    'editionTarget': {'code': None, 'description': None},
    'trim': '5-1/16 x 7-3/4',
    'edition': '0',
    'onSaleDate': '1977-10-27',
    'exportOnSaleDate': None,
    'price': 16.0,
    'exportPrice': None,
    'globalDivision': None,
    'publishingDivision': 'Penguin Adult HC/TR',
    'imprint': 'Penguin Books',
    'publishingStatus': 'IP',
    'series': None,
    'language': 'E',
    'seq': None,
    'titleBlock': None,
    'description': "<b>Iris Murdoch's debut&mdash;a comic novel about 

In [9]:
fetch = fetch_istca_multi([3000001525, 3000001526, 3000001527], rows=10)


---
CATID: 3000001525
STATUS: 200
URL: https://api.penguinrandomhouse.com/resources/v2/title/domains/PRH.US/titles/views/istca?formatFamily=Paperback&catId=3000001525&showFlapCopy=true&showPublishedBooks=true&start=0&rows=10&api_key=fh5hj47dynk4nvx4s9ewufj4
RAW: {"status":"ok","recordCount":33902,"startTimestamp":"2025-11-20T01:32:07Z","endTimestamp":"2025-11-20T01:32:30Z","timeTaken":23206,"data":[{"isbn":9780140014457,"isbnHyphenated":"978-0-14-001445-7","w

---
CATID: 3000001526
STATUS: 200
URL: https://api.penguinrandomhouse.com/resources/v2/title/domains/PRH.US/titles/views/istca?formatFamily=Paperback&catId=3000001526&showFlapCopy=true&showPublishedBooks=true&start=0&rows=10&api_key=fh5hj47dynk4nvx4s9ewufj4
RAW: {"status":"ok","recordCount":94,"startTimestamp":"2025-11-20T01:34:35Z","endTimestamp":"2025-11-20T01:34:35Z","timeTaken":82,"data":[{"isbn":9780140154078,"isbnHyphenated":"978-0-14-015407-8","workId"

---
CATID: 3000001527
STATUS: 200
URL: https://api.penguinrandomhouse

In [10]:
corpus = pd.DataFrame(fetch[0]['data'])
corpus = corpus.drop(columns=['isbnHyphenated', 'workId', 'coverUrl',
       'format', 'subformat', 'binding', 'editionTarget', 'trim', 'edition',
       'onSaleDate', 'exportOnSaleDate', 'price', 'exportPrice',
       'globalDivision', 'publishingDivision', 'imprint', 'publishingStatus',
       'series', 'language', 'seq', 'titleBlock', 'authors'])
corpus.set_index('isbn', inplace=True)
corpus

Unnamed: 0_level_0,title,author,description
isbn,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
9780140014457,Under the Net,Iris Murdoch,<b>Iris Murdoch's debut&mdash;a comic novel ab...
9780140014747,The Sandcastle,Iris Murdoch,<b>A sparklingly profound novel about the conf...
9780140020038,A Severed Head,Iris Murdoch,<b>A novel about the frightfulness and ruthles...
9780140024760,The Unicorn,Iris Murdoch,<b>A brilliant mythical drama about well-meani...
9780140030341,The Nice and the Good,Iris Murdoch,From the Booker Prize-winning author of <i>The...
9780140036114,An Accidental Man,Iris Murdoch,"<b>A scintillating novel of fate, accidents, a..."
9780140041118,The Sacred and Profane Love Machine,Iris Murdoch,Swinging between his wife and his mistress in ...
9780140042528,The Dharma Bums,Jack Kerouac,<b>Jack Kerouac&rsquo;s classic novel about fr...
9780140042597,On the Road,Jack Kerouac,<b>Jack Kerouac&rsquo;s classic American novel...
9780140043129,One Flew Over the Cuckoo's Nest,Ken Kesey,"<p><b>Ken Kesey's bracing, inslightful novel a..."


In [11]:
corpus['description'] = (
    corpus['description']
        .str.replace(r"<.*?>", "", regex=True)             # remove HTML tags
        .str.replace(r"&[A-Za-z0-9#]+;", "", regex=True)   # remove HTML entities
        .str.replace(r"\d+", "", regex=True)               # remove numbers
        .str.strip()
)
corpus

Unnamed: 0_level_0,title,author,description
isbn,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
9780140014457,Under the Net,Iris Murdoch,Iris Murdoch's debuta comic novel about work a...
9780140014747,The Sandcastle,Iris Murdoch,A sparklingly profound novel about the conflic...
9780140020038,A Severed Head,Iris Murdoch,A novel about the frightfulness and ruthlessne...
9780140024760,The Unicorn,Iris Murdoch,A brilliant mythical drama about well-meaning ...
9780140030341,The Nice and the Good,Iris Murdoch,From the Booker Prize-winning author of The Se...
9780140036114,An Accidental Man,Iris Murdoch,"A scintillating novel of fate, accidents, and ..."
9780140041118,The Sacred and Profane Love Machine,Iris Murdoch,Swinging between his wife and his mistress in ...
9780140042528,The Dharma Bums,Jack Kerouac,"Jack Kerouacs classic novel about friendship, ..."
9780140042597,On the Road,Jack Kerouac,Jack Kerouacs classic American novel of freedo...
9780140043129,One Flew Over the Cuckoo's Nest,Ken Kesey,"Ken Kesey's bracing, inslightful novel about t..."


#### Step 1. Establish corpus

In [12]:
# Focus on descriptions only
working_corpus = corpus['description']
working_corpus

isbn
9780140014457    Iris Murdoch's debuta comic novel about work a...
9780140014747    A sparklingly profound novel about the conflic...
9780140020038    A novel about the frightfulness and ruthlessne...
9780140024760    A brilliant mythical drama about well-meaning ...
9780140030341    From the Booker Prize-winning author of The Se...
9780140036114    A scintillating novel of fate, accidents, and ...
9780140041118    Swinging between his wife and his mistress in ...
9780140042528    Jack Kerouacs classic novel about friendship, ...
9780140042597    Jack Kerouacs classic American novel of freedo...
9780140043129    Ken Kesey's bracing, inslightful novel about t...
Name: description, dtype: object

#### Step 2. Perform TF-IDF on corpus

In [13]:
# TF-IDF
vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = vectorizer.fit_transform(working_corpus)

# Get the feature names (words in the vocabulary)
feature_names = vectorizer.get_feature_names_out()

# Convert the sparse matrix to a dense array for easier viewing (for small datasets)
dense_matrix = tfidf_matrix.toarray()

# Create a DataFrame for better readability
tf_idf_df = pd.DataFrame(dense_matrix, columns=feature_names)
tf_idf_df

Unnamed: 0,abandon,accident,accidental,accidents,acclaim,acquaintance,actress,adapted,admonished,adolescent,...,work,world,writer,writerwhose,written,year,york,young,zen,zestful
0,0.0,0.0,0.0,0.0,0.0,0.101486,0.101486,0.0,0.101486,0.0,...,0.075478,0.0,0.202972,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.091969,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.129263,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.116954,...,0.0,0.116954,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.120138,0.240276,0.240276,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.08935,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.117715,0.0,0.117715,0.0,0.087548,0.117715,0.117715
8,0.120187,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.089386,0.0,0.0,0.0,0.120187,0.0,0.120187,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.090977,0.0,0.0,0.0,0.0,0.0,...,0.067662,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### Step 3. Perform NMF

In [14]:
# Instantiate the NMF model & specify the number of topics
# Set random_state for reproducibility
n_topics = 10
nmf_model = NMF(n_components=n_topics, random_state=1).set_output(transform="pandas")

# Fit the NMF model to the TF-IDF matrix
# Note that the 'H' matrix (topic-term distribution) is in nmf_model.components_
# Note that the 'W' matrix (document-topic distribution) can be obtained with model.transform(tfidf_matrix)
nmf_model.fit(tf_idf_df)

# Function to print the top words for each topic (from search results)
def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(nmf_model.components_):
        print(f"Topic #{topic_idx}:")
        print(" ".join([feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]]))
    print()

In [15]:
# Print the top words per topic from search results
print_top_words(nmf_model, feature_names, 10)

Topic #0:
road kerouacs american freedom changed jack trips hopea timesinspired vision
Topic #1:
later wife splendor intensive attempts attenborough medusa repels beautifully behave
Topic #2:
ducane begins sea respected poison blackmail black swim revenge trapping
Topic #3:
fate moral accidental accidents prone war drafted bring grey happily
Topic #4:
jake hugo hugos secret writer film learn presumptuously formidable hunter
Topic #5:
woman people spiritual remote come ordinary castle guilty obsessions brilliant
Topic #6:
mcmurphy turns kesey flew nurse nest ward cuckoo mental madness
Topic #7:
morality sacred profane love machine play mistress instead blaise swinging
Topic #8:
mor rain love carter decision develops sparklingly run arrives loyaltythe
Topic #9:
kerouacs truth seekers jack road zestful sessions dignity dharma ryder



In [20]:
# Get the document-topic distribution
document_topic = nmf_model.transform(tf_idf_df)
document_topic

Unnamed: 0,nmf0,nmf1,nmf2,nmf3,nmf4,nmf5,nmf6,nmf7,nmf8,nmf9
0,6.438336e-11,6.07324e-11,0.0,3.629252e-12,0.5774486,0.0,1.757497e-11,2.120402e-10,4.755916e-10,2.178192e-11
1,0.0,0.0,0.0,8.831624e-09,0.0,0.0,0.0,3.911151e-08,1.115224,0.0
2,0.0,0.4597323,4.306935e-09,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.9437884,0.0,2.256352e-13,0.0,0.0
4,0.0,2.396274e-09,0.6082885,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,1.257843e-13,0.443503,0.0,0.0,2.438579e-14,0.0,4.059208e-14,7.505884e-14
6,1.596994e-09,3.036735e-09,0.0,0.0,1.202903e-09,4.50279e-10,1.988417e-10,0.8152244,1.842474e-08,0.0
7,0.0001122237,2.779934e-07,1.249103e-06,0.0,0.0,1.509818e-06,0.0,0.0,0.0,1.087377
8,0.6633273,1.840537e-09,0.0,2.81566e-10,4.053457e-09,0.0,8.17983e-09,1.044859e-08,2.915505e-08,0.0
9,0.0,0.0,0.0,0.0,0.0,6.603667e-12,0.9108629,0.0,6.918624e-12,4.120045e-11


#### Step 4. Create a query function

In [21]:
# Get top words per topic from search results
n_top_words = 10
topic_dict = {}
for topic_idx, topic in enumerate(nmf_model.components_):
        topic_dict[topic_idx] = [feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]]
topic_dict

{0: ['road',
  'kerouacs',
  'american',
  'freedom',
  'changed',
  'jack',
  'trips',
  'hopea',
  'timesinspired',
  'vision'],
 1: ['later',
  'wife',
  'splendor',
  'intensive',
  'attempts',
  'attenborough',
  'medusa',
  'repels',
  'beautifully',
  'behave'],
 2: ['ducane',
  'begins',
  'sea',
  'respected',
  'poison',
  'blackmail',
  'black',
  'swim',
  'revenge',
  'trapping'],
 3: ['fate',
  'moral',
  'accidental',
  'accidents',
  'prone',
  'war',
  'drafted',
  'bring',
  'grey',
  'happily'],
 4: ['jake',
  'hugo',
  'hugos',
  'secret',
  'writer',
  'film',
  'learn',
  'presumptuously',
  'formidable',
  'hunter'],
 5: ['woman',
  'people',
  'spiritual',
  'remote',
  'come',
  'ordinary',
  'castle',
  'guilty',
  'obsessions',
  'brilliant'],
 6: ['mcmurphy',
  'turns',
  'kesey',
  'flew',
  'nurse',
  'nest',
  'ward',
  'cuckoo',
  'mental',
  'madness'],
 7: ['morality',
  'sacred',
  'profane',
  'love',
  'machine',
  'play',
  'mistress',
  'instead',

In [22]:
# Identify topics that contain the words in the query
query = 'later marathon abandon'

search_topic_list = []
for word in query.lower().split():
    counter = 0
    for topic in topic_dict.values():
        if word in topic:
            search_topic_list.append(counter)
        counter += 1

search_topic_list

[1]

In [23]:
# Create a dataframe that contains all relevant topics
working_df = pd.DataFrame(document_topic.iloc[:, search_topic_list])
# working_df.set_index(corpus['isbn'], inplace=True)

# Get the total row score for each isbn
working_df['score'] = working_df.iloc[:, 0:].sum(axis=1)
working_df

result = working_df.sort_values(axis=0, by='score', ascending=False).head(10)
result = result[result['score'] > 0]
result

Unnamed: 0,nmf1,score
2,0.4597323,0.4597323
7,2.779934e-07,2.779934e-07
6,3.036735e-09,3.036735e-09
4,2.396274e-09,2.396274e-09
8,1.840537e-09,1.840537e-09
0,6.07324e-11,6.07324e-11
