In [1]:
import requests
import time
from datetime import datetime, timedelta
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy import sparse
from concurrent.futures import ThreadPoolExecutor, as_completed
from functools import lru_cache
import json
import bs4
import re
from sklearn.decomposition import NMF
from sklearn.metrics.pairwise import cosine_similarity
API_KEY = "fh5hj47dynk4nvx4s9ewufj4"
BASE = "https://api.penguinrandomhouse.com/resources/v2/title/domains/PRH.US/titles/views/istca"
CAT = "https://api.penguinrandomhouse.com/resources/v2/domains/PRH.US/categories/"
session = requests.Session()

In [2]:
BISAC = pd.read_json("bisac_prefixes.json", typ="series").reset_index()
BISAC.columns = ["prefix", "category"]
BISAC

Unnamed: 0,prefix,category
0,ANT,Antiques & Collectibles
1,ARC,Architecture
2,BIB,Bibles
3,BIO,Biography & Autobiography
4,BOD,"Body, Mind & Spirit"
5,BUS,Business & Economics
6,CGN,Comics & Graphic Novels
7,COM,Computers
8,CKB,Cooking
9,CRA,Crafts & Hobbies


In [3]:
with open('fiction_BISAC.json', 'r') as f:
    fiction_bisac_codes = json.load(f)
fiction_bisac_codes = fiction_bisac_codes.get('data')
fiction_bisac_codes
f_cat_map = {
    c["catId"]: c["menuText"]
    # c["catId"]: {
    #     "BISAC": c["catUri"],
    #     "Description": c["menuText"]
        for c in fiction_bisac_codes["categories"]
}
f_cat_map

{3000001525: 'Fiction',
 3000001526: 'Absurdist',
 3000001527: 'Action & Adventure',
 3000001528: 'Adaptations & Pastiche',
 3000001529: 'African American & Black',
 3000001530: 'Christian',
 3000001531: 'Erotica',
 3000001532: 'Historical',
 3000001533: 'Mystery & Detective',
 3000001534: 'Urban & Street Lit',
 3000001535: 'Women',
 3000001536: 'Alternative History',
 3000001537: 'Amish & Mennonite',
 3000001538: 'Animals',
 3000001539: 'Anthologies (multiple authors)',
 3000001540: 'Asian American & Pacific Islander',
 3000001541: 'Biographical & Autofiction',
 3000001542: 'Books, Bookstores & Libraries',
 3000001543: 'Buddhist',
 3000001544: 'Christian',
 3000001545: 'Biblical',
 3000001546: 'Classic & Allegory',
 3000001547: 'Collections & Anthologies',
 3000001548: 'Contemporary',
 3000001549: 'Fantasy',
 3000001550: 'Futuristic',
 3000001551: 'Historical',
 3000001552: 'Romance',
 3000001553: 'Historical',
 3000001555: 'Suspense',
 3000001556: 'Western',
 3000001558: 'City Life',

In [4]:
def fetch_istca(catId):

    params = {
            "formatFamily": "Paperback",
            "catId": catId,
            "showFlapCopy": "true",
            "showPublishedBooks": "true",
            "api_key": API_KEY,
        }

    r = session.get(BASE, params=params, timeout=20)

    print("STATUS:", r.status_code)
    print("URL:", r.url)
    print("RAW:", r.text[:250])

    r.raise_for_status()
    return r.json()

In [5]:
data = fetch_istca(catId=3000001550)
print(data)

STATUS: 200
URL: https://api.penguinrandomhouse.com/resources/v2/title/domains/PRH.US/titles/views/istca?formatFamily=Paperback&catId=3000001550&showFlapCopy=true&showPublishedBooks=true&api_key=fh5hj47dynk4nvx4s9ewufj4
RAW: {"status":"ok","recordCount":6,"startTimestamp":"2025-11-20T01:30:41Z","endTimestamp":"2025-11-20T01:30:41Z","timeTaken":189,"data":[{"isbn":9780307457196,"isbnHyphenated":"978-0-307-45719-6","workId":19220,"title":"Broken Angel","author":"Sigmund Br


In [6]:
def fetch_istca_multi(catIds, rows=500):
    results = []

    for catId in catIds:

        params = {
            "formatFamily": "Paperback",
            "catId": catId,
            "showFlapCopy": "true",
            "showPublishedBooks": "true",
            "start": 0,
            "rows": rows,
            "api_key": API_KEY,
        }

        r = session.get(BASE, params=params, timeout=20)

        print("\n---")
        print("CATID:", catId)
        print("STATUS:", r.status_code)
        print("URL:", r.url)
        print("RAW:", r.text[:200])

        r.raise_for_status()
        results.append(r.json())

    return results

In [7]:
fetch_istca_multi([3000001525, 3000001526, 3000001527], rows=10)


---
CATID: 3000001525
STATUS: 200
URL: https://api.penguinrandomhouse.com/resources/v2/title/domains/PRH.US/titles/views/istca?formatFamily=Paperback&catId=3000001525&showFlapCopy=true&showPublishedBooks=true&start=0&rows=10&api_key=fh5hj47dynk4nvx4s9ewufj4
RAW: {"status":"ok","recordCount":33902,"startTimestamp":"2025-11-20T01:32:07Z","endTimestamp":"2025-11-20T01:32:30Z","timeTaken":23206,"data":[{"isbn":9780140014457,"isbnHyphenated":"978-0-14-001445-7","w

---
CATID: 3000001526
STATUS: 200
URL: https://api.penguinrandomhouse.com/resources/v2/title/domains/PRH.US/titles/views/istca?formatFamily=Paperback&catId=3000001526&showFlapCopy=true&showPublishedBooks=true&start=0&rows=10&api_key=fh5hj47dynk4nvx4s9ewufj4
RAW: {"status":"ok","recordCount":94,"startTimestamp":"2025-11-20T01:34:35Z","endTimestamp":"2025-11-20T01:34:35Z","timeTaken":82,"data":[{"isbn":9780140154078,"isbnHyphenated":"978-0-14-015407-8","workId"

---
CATID: 3000001527
STATUS: 200
URL: https://api.penguinrandomhouse

[{'status': 'ok',
  'recordCount': 33902,
  'startTimestamp': '2025-11-20T01:32:07Z',
  'endTimestamp': '2025-11-20T01:32:30Z',
  'timeTaken': 23206,
  'data': [{'isbn': 9780140014457,
    'isbnHyphenated': '978-0-14-001445-7',
    'workId': 323693,
    'title': 'Under the Net',
    'author': 'Iris Murdoch',
    'coverUrl': 'https://images.penguinrandomhouse.com/cover/9780140014457',
    'format': {'code': 'TR', 'description': 'Trade Paperback'},
    'subformat': None,
    'binding': None,
    'editionTarget': {'code': None, 'description': None},
    'trim': '5-1/16 x 7-3/4',
    'edition': '0',
    'onSaleDate': '1977-10-27',
    'exportOnSaleDate': None,
    'price': 16.0,
    'exportPrice': None,
    'globalDivision': None,
    'publishingDivision': 'Penguin Adult HC/TR',
    'imprint': 'Penguin Books',
    'publishingStatus': 'IP',
    'series': None,
    'language': 'E',
    'seq': None,
    'titleBlock': None,
    'description': "<b>Iris Murdoch's debut&mdash;a comic novel about 

In [8]:
fetch = fetch_istca_multi([3000001525, 3000001526, 3000001527], rows=100)


---
CATID: 3000001525
STATUS: 200
URL: https://api.penguinrandomhouse.com/resources/v2/title/domains/PRH.US/titles/views/istca?formatFamily=Paperback&catId=3000001525&showFlapCopy=true&showPublishedBooks=true&start=0&rows=100&api_key=fh5hj47dynk4nvx4s9ewufj4
RAW: {"status":"ok","recordCount":33901,"startTimestamp":"2025-11-20T02:24:29Z","endTimestamp":"2025-11-20T02:24:46Z","timeTaken":17414,"data":[{"isbn":9780140014457,"isbnHyphenated":"978-0-14-001445-7","w

---
CATID: 3000001526
STATUS: 200
URL: https://api.penguinrandomhouse.com/resources/v2/title/domains/PRH.US/titles/views/istca?formatFamily=Paperback&catId=3000001526&showFlapCopy=true&showPublishedBooks=true&start=0&rows=100&api_key=fh5hj47dynk4nvx4s9ewufj4
RAW: {"status":"ok","recordCount":94,"startTimestamp":"2025-11-20T02:24:48Z","endTimestamp":"2025-11-20T02:24:48Z","timeTaken":214,"data":[{"isbn":9780140154078,"isbnHyphenated":"978-0-14-015407-8","workId

---
CATID: 3000001527
STATUS: 200
URL: https://api.penguinrandomhou

In [9]:
# corpus = pd.DataFrame(fetch[0]['data'])
for catid in fetch:
    df = pd.DataFrame(catid['data'])
    try:
        corpus = pd.concat([corpus, df], ignore_index=True)
    except NameError:
        corpus = df

corpus = corpus.drop(columns=['isbnHyphenated', 'workId', 'coverUrl',
       'format', 'subformat', 'binding', 'editionTarget', 'trim', 'edition',
       'onSaleDate', 'exportOnSaleDate', 'price', 'exportPrice',
       'globalDivision', 'publishingDivision', 'imprint', 'publishingStatus',
       'series', 'language', 'seq', 'titleBlock', 'authors'])
# corpus.set_index('isbn', inplace=True)
corpus

Unnamed: 0,isbn,title,author,description
0,9780140014457,Under the Net,Iris Murdoch,<b>Iris Murdoch's debut&mdash;a comic novel ab...
1,9780140014747,The Sandcastle,Iris Murdoch,<b>A sparklingly profound novel about the conf...
2,9780140020038,A Severed Head,Iris Murdoch,<b>A novel about the frightfulness and ruthles...
3,9780140024760,The Unicorn,Iris Murdoch,<b>A brilliant mythical drama about well-meani...
4,9780140030341,The Nice and the Good,Iris Murdoch,From the Booker Prize-winning author of <i>The...
...,...,...,...,...
289,9780307475558,The Adventures of Tom Sawyer,Mark Twain,Mark Twain was one of the nineteenth century's...
290,9780307475565,The Adventures of Huckleberry Finn,Mark Twain,<b>Long cherished by readers of all ages: the ...
291,9780307717108,The Informationist,Taylor Stevens,<b>Governments pay her.</b><br><b>Criminals fe...
292,9780307743176,Jamrach's Menagerie,Carol Birch,<p>Nineteenth-century London comes vividly ali...


In [10]:
corpus['description'] = (
    corpus['description']
        .str.replace(r"<.*?>", "", regex=True)             # remove HTML tags
        .str.replace(r"&[A-Za-z0-9#]+;", "", regex=True)   # remove HTML entities
        .str.replace(r"\d+", "", regex=True)               # remove numbers
        .str.strip()
)
corpus

Unnamed: 0,isbn,title,author,description
0,9780140014457,Under the Net,Iris Murdoch,Iris Murdoch's debuta comic novel about work a...
1,9780140014747,The Sandcastle,Iris Murdoch,A sparklingly profound novel about the conflic...
2,9780140020038,A Severed Head,Iris Murdoch,A novel about the frightfulness and ruthlessne...
3,9780140024760,The Unicorn,Iris Murdoch,A brilliant mythical drama about well-meaning ...
4,9780140030341,The Nice and the Good,Iris Murdoch,From the Booker Prize-winning author of The Se...
...,...,...,...,...
289,9780307475558,The Adventures of Tom Sawyer,Mark Twain,Mark Twain was one of the nineteenth century's...
290,9780307475565,The Adventures of Huckleberry Finn,Mark Twain,Long cherished by readers of all ages: the hil...
291,9780307717108,The Informationist,Taylor Stevens,Governments pay her.Criminals fear her.Nobody ...
292,9780307743176,Jamrach's Menagerie,Carol Birch,Nineteenth-century London comes vividly alive ...


In [11]:
corpus.shape

(294, 4)

#### Step 1. Establish corpus

In [12]:
# Focus on descriptions only
working_corpus = corpus['description']
working_corpus

0      Iris Murdoch's debuta comic novel about work a...
1      A sparklingly profound novel about the conflic...
2      A novel about the frightfulness and ruthlessne...
3      A brilliant mythical drama about well-meaning ...
4      From the Booker Prize-winning author of The Se...
                             ...                        
289    Mark Twain was one of the nineteenth century's...
290    Long cherished by readers of all ages: the hil...
291    Governments pay her.Criminals fear her.Nobody ...
292    Nineteenth-century London comes vividly alive ...
293    A rollicking romp of a spy thriller from the a...
Name: description, Length: 294, dtype: object

#### Step 2. Perform TF-IDF on corpus

In [13]:
# TF-IDF
vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = vectorizer.fit_transform(working_corpus)

# Get the feature names (words in the vocabulary)
feature_names = vectorizer.get_feature_names_out()

# Convert the sparse matrix to a dense array for easier viewing (for small datasets)
dense_matrix = tfidf_matrix.toarray()

# Create a DataFrame for better readability
tf_idf_df = pd.DataFrame(dense_matrix, columns=feature_names)
tf_idf_df

Unnamed: 0,abandon,abandoned,abandonment,abbess,abc,abducida,abe,abernathy,abeyance,abiding,...,zoasand,zodiac,zone,zones,zoo,zoos,zoran,zorro,zulu,zuzzo
0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
289,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
290,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
291,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
292,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### Step 3. Perform NMF

In [14]:
# Instantiate the NMF model & specify the number of topics
# Set random_state for reproducibility
n_topics = 10
nmf_model = NMF(n_components=n_topics, random_state=1).set_output(transform="pandas")

# Fit the NMF model to the TF-IDF matrix
# Note that the 'H' matrix (topic-term distribution) is in nmf_model.components_
# Note that the 'W' matrix (document-topic distribution) can be obtained with model.transform(tfidf_matrix)
nmf_model.fit(tf_idf_df)

# Function to print the top words for each topic (from search results)
def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(nmf_model.components_):
        print(f"Topic #{topic_idx}:")
        print(" ".join([feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]]))
    print()

#### Find Optimal Number of Topics

In [15]:
# topic_no =

In [16]:
# Print the top words per topic from search results
# print_top_words(nmf_model, feature_names, topic_no) # uncomment when using topic_no 
print_top_words(nmf_model, feature_names, 10)

Topic #0:
penguin classic classics english notes readers authoritative publisher best bookshelf
Topic #1:
new book novel life york times world author stories love
Topic #2:
la en el que una su por es los mills
Topic #3:
albany cycle ironweed mid billy legs irish novels quinns moves
Topic #4:
sharpe napoleon battle richard enemy french wellington honor napoleonic british
Topic #5:
coetzee cruso available barton essays schooldays january jesus viking barbarians
Topic #6:
men women war able womento rhona facing unimaginable relatively baldwin
Topic #7:
dahomey manoel trading slave atlantic exceed ghastly ouidah silva amass
Topic #8:
town novel kesey vibrant havoc climax swirls refugees remnant turf
Topic #9:
bastian empress boy tale fantastica bats snails ende wisps sorcerers



In [17]:
# Get the document-topic distribution
document_topic = nmf_model.transform(tf_idf_df)
document_topic

Unnamed: 0,nmf0,nmf1,nmf2,nmf3,nmf4,nmf5,nmf6,nmf7,nmf8,nmf9
0,0.000174,0.048391,0.001460,0.008521,0.001111,0.001594,0.000000,0.007692,0.031801,0.002404
1,0.000000,0.084239,0.000000,0.000000,0.024229,0.000000,0.000000,0.000000,0.001144,0.000000
2,0.000000,0.055958,0.000592,0.000000,0.017851,0.008706,0.018782,0.000630,0.024248,0.000000
3,0.000000,0.060968,0.000730,0.000000,0.001454,0.017052,0.006040,0.000000,0.022504,0.000000
4,0.007556,0.053303,0.000000,0.000328,0.008738,0.000000,0.003282,0.000000,0.007590,0.008598
...,...,...,...,...,...,...,...,...,...,...
289,0.023551,0.030923,0.001325,0.025837,0.000596,0.000557,0.000000,0.007391,0.004711,0.036815
290,0.030367,0.102125,0.000000,0.009139,0.000000,0.000000,0.000000,0.028704,0.006853,0.015535
291,0.000000,0.081254,0.000000,0.000000,0.008392,0.008132,0.052130,0.007802,0.000000,0.020866
292,0.010602,0.051251,0.000000,0.003339,0.002639,0.000498,0.000000,0.000000,0.002351,0.042110


#### Step 4. Create a query function

In [18]:
# Get top words per topic from search results
n_top_words = 10
topic_dict = {}
for topic_idx, topic in enumerate(nmf_model.components_):
        topic_dict[topic_idx] = [feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]]
topic_dict

{0: ['penguin',
  'classic',
  'classics',
  'english',
  'notes',
  'readers',
  'authoritative',
  'publisher',
  'best',
  'bookshelf'],
 1: ['new',
  'book',
  'novel',
  'life',
  'york',
  'times',
  'world',
  'author',
  'stories',
  'love'],
 2: ['la', 'en', 'el', 'que', 'una', 'su', 'por', 'es', 'los', 'mills'],
 3: ['albany',
  'cycle',
  'ironweed',
  'mid',
  'billy',
  'legs',
  'irish',
  'novels',
  'quinns',
  'moves'],
 4: ['sharpe',
  'napoleon',
  'battle',
  'richard',
  'enemy',
  'french',
  'wellington',
  'honor',
  'napoleonic',
  'british'],
 5: ['coetzee',
  'cruso',
  'available',
  'barton',
  'essays',
  'schooldays',
  'january',
  'jesus',
  'viking',
  'barbarians'],
 6: ['men',
  'women',
  'war',
  'able',
  'womento',
  'rhona',
  'facing',
  'unimaginable',
  'relatively',
  'baldwin'],
 7: ['dahomey',
  'manoel',
  'trading',
  'slave',
  'atlantic',
  'exceed',
  'ghastly',
  'ouidah',
  'silva',
  'amass'],
 8: ['town',
  'novel',
  'kesey',
  '

In [19]:
# Identify topics that contain the words in the query
query = 'New york times'

search_topic_list = []
for word in query.lower().split():
    counter = 0
    for topic in topic_dict.values():
        if word in topic:
            search_topic_list.append(counter)
        counter += 1

search_topic_list

[1, 1, 1]

In [20]:
# Create a dataframe that contains all relevant topics
working_df = pd.DataFrame(document_topic.iloc[:, search_topic_list])
# working_df.set_index(corpus['isbn'], inplace=True)

# Get the total row score for each isbn
working_df['score'] = working_df.iloc[:, 0:].sum(axis=1)
working_df

result = working_df.sort_values(axis=0, by='score', ascending=False).head(10)
result = result[result['score'] > 0]
result

Unnamed: 0,nmf1,nmf1.1,nmf1.2,score
56,0.187273,0.187273,0.187273,0.561818
162,0.158964,0.158964,0.158964,0.476891
144,0.155365,0.155365,0.155365,0.466095
160,0.151645,0.151645,0.151645,0.454936
171,0.143005,0.143005,0.143005,0.429014
75,0.141394,0.141394,0.141394,0.424183
113,0.13283,0.13283,0.13283,0.398489
121,0.130553,0.130553,0.130553,0.39166
183,0.13026,0.13026,0.13026,0.390779
163,0.129734,0.129734,0.129734,0.389201


#### Cosine Similarity Query

In [25]:
from sklearn.metrics.pairwise import cosine_similarity

def nmf_query_search(query, tfidf, nmf, W, top_k=10):

    # 1. Transform query using the SAME TF-IDF model
    q_tfidf = tfidf.transform([query])

    # 2. Project query into SAME NMF topic space
    q_vec = nmf.transform(q_tfidf)

    # 3. Cosine similarity
    sims = cosine_similarity(q_vec, W).ravel()

    # 4. Sort
    top_idx = sims.argsort()[::-1][:top_k]

    return top_idx, sims[top_idx]


In [41]:
idx, scores =  nmf_query_search(query='military', tfidf=vectorizer, nmf=nmf_model, W=document_topic, top_k=10)
idx, scores



(array([210, 206, 211, 207, 209, 202, 204, 208, 205, 201]),
 array([0.99999677, 0.99999581, 0.99998541, 0.99996428, 0.99985813,
        0.9994396 , 0.99881602, 0.9939709 , 0.99308095, 0.99240625]))

In [42]:
corpus.iloc[idx][['title', 'description']]

Unnamed: 0,title,description
210,Sharpe's Revenge (#10),"When his honor and reputation are at stake, Sh..."
206,Sharpe's Enemy (#6),A band of renegades led by Sharpe's vicious en...
211,Waterloo (#11),"June : The Duke of Wellington, the Prince of O..."
207,Sharpe's Honor (#7),"An unfinished duel, a midnight murder, and the..."
209,Sharpe's Siege (#9),Sharpe's mission has seemed simple: capture a ...
202,Sharpe's Eagle (#2),The first book in Bernard Cornwell's epicSharp...
204,Sharpe's Company (#4),"To stem the Napoleonic tide, Sharpe must captu..."
208,Sharpe's Regiment (#8),"Apart from the rousing battle scenes, the auth..."
205,Sharpe's Sword (#5),"A gripping tale of espionage, survival, and th..."
201,Sharpe's Rifles (#1),In the heart of war-torn Spain during the Napo...
