In [33]:
import pandas as pd
import json
from sklearn.feature_extraction.text import TfidfVectorizer

In [34]:
# Step 1: Load the JSON file normally
with open("100_prh_title_sample.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# Step 2: Normalize only the "data" list into a DataFrame
df = pd.json_normalize(data["data"])

df.head()

Unnamed: 0,isbn,isbnHyphenated,workId,title,author,coverUrl,subformat,binding,trim,edition,...,series,language,seq,titleBlock,description,authors,format.code,format.description,editionTarget.code,editionTarget.description
0,9780028633879,978-0-02-863387-9,359570,The Complete Idiot's Guide to Learning Yiddish,Rabbi Benjamin Blech,https://images.penguinrandomhouse.com/cover/97...,,,7-3/8 x 9-1/8,0,...,,E,,,"You're not idiot, of course. You can serve up ...","[{'code': '309012', 'description': 'Rabbi Benj...",TR,Trade Paperback,,
1,9780130575715,978-0-13-057571-5,350672,Heinerman's Encyclopedia of Healing Juices,John Heinerman,https://images.penguinrandomhouse.com/cover/97...,,,6 x 9,0,...,,E,,,"This publication shows how raw, natural juices...","[{'code': '12511', 'description': 'John Heiner...",TR,Trade Paperback,,
2,9780131088382,978-0-13-108838-2,350688,Super Healing Foods,Frances Sheridan Goulart,https://images.penguinrandomhouse.com/cover/97...,,,6 x 9,0,...,,E,,,From apples (sunburn relief) and avocados (car...,"[{'code': '233283', 'description': 'Frances Sh...",TR,Trade Paperback,,
3,9780131872783,978-0-13-187278-3,299263,A Brief Tour of Human Consciousness,V.S. Ramachandran,https://images.penguinrandomhouse.com/cover/97...,,,5-3/8 x 8-1/4,0,...,,E,,,How can some people come to believe that their...,"[{'code': '2148674', 'description': 'V. S. Ram...",TR,Trade Paperback,,
4,9780132092302,978-0-13-209230-2,353186,Heinerman's New Encyclopedia of Fruits & Veget...,John Heinerman,https://images.penguinrandomhouse.com/cover/97...,,,6 x 9,0,...,,E,,,This book is your total guide to using the inc...,"[{'code': '12511', 'description': 'John Heiner...",TR,Trade Paperback,,


In [35]:
df.columns.tolist()

['isbn',
 'isbnHyphenated',
 'workId',
 'title',
 'author',
 'coverUrl',
 'subformat',
 'binding',
 'trim',
 'edition',
 'onSaleDate',
 'exportOnSaleDate',
 'price',
 'exportPrice',
 'globalDivision',
 'publishingDivision',
 'imprint',
 'publishingStatus',
 'series',
 'language',
 'seq',
 'titleBlock',
 'description',
 'authors',
 'format.code',
 'format.description',
 'editionTarget.code',
 'editionTarget.description']

In [51]:
corpus = df['description']
corpus.head()

0    You're not idiot, of course. You can serve up ...
1    This publication shows how raw, natural juices...
2    From apples (sunburn relief) and avocados (car...
3    How can some people come to believe that their...
4    This book is your total guide to using the inc...
Name: description, dtype: object

In [52]:
# TF-IDF
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(corpus)
 
# Get the feature names (words in the vocabulary)
feature_names = vectorizer.get_feature_names_out()
 
# Convert the sparse matrix to a dense array for easier viewing (for small datasets)
dense_matrix = tfidf_matrix.toarray()
 
# You can then create a DataFrame for better readability
tf_idf_df = pd.DataFrame(dense_matrix, columns=feature_names)
tf_idf_df

Unnamed: 0,000,10,100,1066,151,1577,160,16th,1776,1777,...,yourself,yucca,zarathustra,zeal,zealand,zen,zero,zestful,zone,zones
0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000
1,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000
2,0.000000,0.076977,0.0,0.0,0.0,0.0,0.088087,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.076977,0.0,0.0,0.076977
3,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000
4,0.101603,0.000000,0.0,0.0,0.0,0.0,0.077512,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000
96,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000
97,0.000000,0.000000,0.0,0.0,0.0,0.0,0.033887,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000
98,0.000000,0.000000,0.0,0.0,0.0,0.0,0.235314,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000


In [63]:
# Set isbn as index of tf_idf_df
tf_idf_df.set_index(df['isbn'], inplace=True)
 
# Extract isbn, title, and author from the original dataframe
isbn_title_author_df = df[['isbn', 'title', 'author']]
isbn_title_author_df.set_index('isbn', inplace=True)
 
# Add the title and author information to the TF-IDF DataFrame
doc_importance = pd.merge(isbn_title_author_df, tf_idf_df, on='isbn', how='left')
doc_importance


Unnamed: 0_level_0,title_x,author_x,000,10,100,1066,151,1577,160,16th,...,yourself,yucca,zarathustra,zeal,zealand,zen,zero,zestful,zone,zones
isbn,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
9780028633879,The Complete Idiot's Guide to Learning Yiddish,Rabbi Benjamin Blech,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000
9780130575715,Heinerman's Encyclopedia of Healing Juices,John Heinerman,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000
9780131088382,Super Healing Foods,Frances Sheridan Goulart,0.000000,0.076977,0.0,0.0,0.0,0.0,0.088087,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.076977,0.0,0.0,0.076977
9780131872783,A Brief Tour of Human Consciousness,V.S. Ramachandran,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000
9780132092302,Heinerman's New Encyclopedia of Fruits & Veget...,John Heinerman,0.101603,0.000000,0.0,0.0,0.0,0.0,0.077512,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9780140089363,Q's Legacy,Helene Hanff,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000
9780140089585,The Second Rumpole Omnibus,John Mortimer,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000
9780140089738,Saints and Strangers,Angela Carter,0.000000,0.000000,0.0,0.0,0.0,0.0,0.033887,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000
9780140089806,Between Women,Luise Eichenbaum,0.000000,0.000000,0.0,0.0,0.0,0.0,0.235314,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000


In [84]:
# Create a query function
query = 'christmas'
 
query_words = query.lower().split()
 
to_search = []
for word in query_words:
    if word in doc_importance.columns:
        to_search.append(word)
 
to_search.insert(0, 'title_x')
 
# Test first if all words in the query are in the dataframe
 
sample = doc_importance.loc[:, to_search] # 9780028633879
sample.head()

Unnamed: 0_level_0,title_x,christmas
isbn,Unnamed: 1_level_1,Unnamed: 2_level_1
9780028633879,The Complete Idiot's Guide to Learning Yiddish,0.0
9780130575715,Heinerman's Encyclopedia of Healing Juices,0.0
9780131088382,Super Healing Foods,0.0
9780131872783,A Brief Tour of Human Consciousness,0.0
9780132092302,Heinerman's New Encyclopedia of Fruits & Veget...,0.0


In [85]:
#sample['score'] = sample['idiot'] + sample['guide']
sample['score'] = sample.iloc[:, 1:].sum(axis=1)
result = sample.sort_values(axis=0, by='score', ascending=False).head(10)
result = result[result['score'] > 0]
result

Unnamed: 0_level_0,title_x,christmas,score
isbn,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
9780140075571,The Little Disturbances of Man,0.139427,0.139427


In [92]:
pd.set_option('display.max_colwidth', None)
df[df['title'] == 'The Little Disturbances of Man']['description']

72    Whether writing about relationships, sexy little girls, loving and bickering couples, angry suburbanites, frustrated job-seekers, or Jewish children performing a Christmas play, Grace Paley captures the loneliness, poignancy, and humor of the human experience with matchless style in this book of short stories. <br><br> "Fresh and vigorous...Mrs. Paley&rsquo;s view of life is her own."--<i>The New Yorker<br></i><br> "The glad tidings from this reviewer&rsquo;s corner are of the appearance of a [writer] possessed of an all-too-infrequent literary virtue--the comic vision."--<i>The New York Times</i>
Name: description, dtype: object