In [1]:
# Necessary Libraries for Vector Search
from langchain_community.document_loaders import TextLoader         # Extract the text from dataset, then convert it into a format that Lang Chain can work with
from langchain_text_splitters import CharacterTextSplitter          # Split the whole document which containing all text into meaningful chunks
# from langchain_google_genai import GoogleGenerativeAIEmbeddings     # Convert the chunks being splitted into document embeddings
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_chroma import Chroma                                 # Store the document embeddings in a vector database

from dotenv import load_dotenv
import pandas as pd



In [2]:
# Constant Variable
BOOKS_DATASET_PATH = "./dataset/books_cleaned.csv"

In [3]:
# Dataset Setup for Data Exploration
books_dataset = pd.read_csv(BOOKS_DATASET_PATH)

In [4]:
# Dataset Overview
books_dataset.head(10)

Unnamed: 0,isbn13,isbn10,title,authors,categories,thumbnail,description,published_year,average_rating,num_pages,ratings_count,title_and_subtitle,tagged_description
0,9780002005883,0002005883,Gilead,Marilynne Robinson,Fiction,http://books.google.com/books/content?id=KQZCP...,A NOVEL THAT READERS and critics have been eag...,2004.0,3.85,247.0,361.0,Gilead,9780002005883: A NOVEL THAT READERS and critic...
1,9780002261982,0002261987,Spider's Web,Charles Osborne;Agatha Christie,Detective and mystery stories,http://books.google.com/books/content?id=gA5GP...,A new 'Christie for Christmas' -- a full-lengt...,2000.0,3.83,241.0,5164.0,Spider's Web: A Novel,9780002261982: A new 'Christie for Christmas' ...
2,9780006178736,0006178731,Rage of angels,Sidney Sheldon,Fiction,http://books.google.com/books/content?id=FKo2T...,"A memorable, mesmerizing heroine Jennifer -- b...",1993.0,3.93,512.0,29532.0,Rage of angels,"9780006178736: A memorable, mesmerizing heroin..."
3,9780006280897,0006280897,The Four Loves,Clive Staples Lewis,Christian life,http://books.google.com/books/content?id=XhQ5X...,Lewis' work on the nature of love divides love...,2002.0,4.15,170.0,33684.0,The Four Loves,9780006280897: Lewis' work on the nature of lo...
4,9780006280934,0006280935,The Problem of Pain,Clive Staples Lewis,Christian life,http://books.google.com/books/content?id=Kk-uV...,"""In The Problem of Pain, C.S. Lewis, one of th...",2002.0,4.09,176.0,37569.0,The Problem of Pain,"9780006280934: ""In The Problem of Pain, C.S. L..."
5,9780006380832,0006380832,Empires of the Monsoon,Richard Hall,"Africa, East",http://books.google.com/books/content?id=MuPEQ...,Until Vasco da Gama discovered the sea-route t...,1998.0,4.41,608.0,65.0,Empires of the Monsoon: A History of the India...,9780006380832: Until Vasco da Gama discovered ...
6,9780006470229,000647022X,The Gap Into Madness,Stephen R. Donaldson,"Hyland, Morn (Fictitious character)",http://books.google.com/books/content?id=4oXav...,A new-cover reissue of the fourth book in the ...,1994.0,4.15,743.0,103.0,The Gap Into Madness: Chaos and Order,9780006470229: A new-cover reissue of the four...
7,9780006472612,0006472613,Master of the Game,Sidney Sheldon,Adventure stories,http://books.google.com/books/content?id=TkTYp...,Kate Blackwell is an enigma and one of the mos...,1982.0,4.11,489.0,43540.0,Master of the Game,9780006472612: Kate Blackwell is an enigma and...
8,9780006482079,0006482074,Warhost of Vastmark,Janny Wurts,Fiction,http://books.google.com/books/content?id=uOL0f...,"Tricked once more by his wily half-brother, Ly...",1995.0,4.03,522.0,2966.0,Warhost of Vastmark,9780006482079: Tricked once more by his wily h...
9,9780006483014,0006483011,The Once and Future King,Terence Hanbury White,Arthurian romances,http://books.google.com/books/content?id=Jx6Bv...,An omnibus volume of the author's complete sto...,1996.0,4.04,823.0,2805.0,The Once and Future King,9780006483014: An omnibus volume of the author...


In [5]:
# Save the tagged and description of the book in text file, cause Lang Chain can't work with the Pandas Data Frame
books_dataset["tagged_description"].to_csv("/Users/jkhang/Documents/GitHub/Semantic-Book-Recommender/dataset/tagged_description.txt", sep = "\n", index = False, header = False)

In [6]:
# Load the text file that saved tagged and description of the book
raw_documents = TextLoader("./dataset/tagged_description.txt").load()

# Instantiate the text splitter
text_splitter = CharacterTextSplitter(chunk_size = 1, chunk_overlap = 0, separator = "\n")

documents = text_splitter.split_documents(raw_documents)
documents[0]

Created a chunk of size 1169, which is longer than the specified 1
Created a chunk of size 1215, which is longer than the specified 1
Created a chunk of size 374, which is longer than the specified 1
Created a chunk of size 310, which is longer than the specified 1
Created a chunk of size 484, which is longer than the specified 1
Created a chunk of size 483, which is longer than the specified 1
Created a chunk of size 961, which is longer than the specified 1
Created a chunk of size 189, which is longer than the specified 1
Created a chunk of size 844, which is longer than the specified 1
Created a chunk of size 297, which is longer than the specified 1
Created a chunk of size 198, which is longer than the specified 1
Created a chunk of size 882, which is longer than the specified 1
Created a chunk of size 1089, which is longer than the specified 1
Created a chunk of size 1190, which is longer than the specified 1
Created a chunk of size 305, which is longer than the specified 1
Create

Document(metadata={'source': './dataset/tagged_description.txt'}, page_content='9780002005883: A NOVEL THAT READERS and critics have been eagerly anticipating for over a decade, Gilead is an astonishingly imagined story of remarkable lives. John Ames is a preacher, the son of a preacher and the grandson (both maternal and paternal) of preachers. It’s 1956 in Gilead, Iowa, towards the end of the Reverend Ames’s life, and he is absorbed in recording his family’s story, a legacy for the young son he will never see grow up. Haunted by his grandfather’s presence, John tells of the rift between his grandfather and his father: the elder, an angry visionary who fought for the abolitionist cause, and his son, an ardent pacifist. He is troubled, too, by his prodigal namesake, Jack (John Ames) Boughton, his best friend’s lost son who returns to Gilead searching for forgiveness and redemption. Told in John Ames’s joyous, rambling voice that finds beauty, humour and truth in the smallest of life’s 

In [8]:
# Load API Key from .env file
load_dotenv()

# Build a Chroma vector store
db_books = Chroma.from_documents(documents, embedding = HuggingFaceEmbeddings(model_name = "all-MiniLM-L6-v2"))

  from .autonotebook import tqdm as notebook_tqdm


In [9]:
# Search the similarity results from the dataset based on the query defined
docs = db_books.similarity_search("A book to teach children about nature", k = 10)
docs

[Document(id='c8859889-b890-4a8c-94a1-4d562611419b', metadata={'source': './dataset/tagged_description.txt'}, page_content='9780786808069: Children will discover the exciting world of their own backyard in this introduction to familiar animals from cats and dogs to bugs and frogs. The combination of photographs, illustrations, and fun facts make this an accessible and delightful learning experience.'),
 Document(id='e5df892f-3f21-49d4-ab11-7867ce3368e6', metadata={'source': './dataset/tagged_description.txt'}, page_content="9780786808380: Introduce your babies to birds, cats, dogs, and babies through fine art, illustration, and photographs. These books are a rare opportunity to expose little ones to a range of images on a single subject, from simple child's drawings and abstract art to playful photos. A brief text accompanies each image, introducing the baby to some basic -- and sometimes playful -- information about the subjects."),
 Document(id='48ac8921-e99e-4b87-be25-30b04f04c18a',

In [10]:
# Extract the details of the books based on the ISBN13 of each book
books_dataset[books_dataset["isbn13"] == int(docs[0].page_content.split(": ")[0].strip())]

Unnamed: 0,isbn13,isbn10,title,authors,categories,thumbnail,description,published_year,average_rating,num_pages,ratings_count,title_and_subtitle,tagged_description
3747,9780786808069,786808063,Baby Einstein: Neighborhood Animals,Marilyn Singer;Julie Aigner-Clark,Juvenile Fiction,http://books.google.com/books/content?id=X9a4P...,Children will discover the exciting world of t...,2001.0,3.89,16.0,180.0,Baby Einstein: Neighborhood Animals,9780786808069: Children will discover the exci...


In [11]:
def retrieve_semantic_rcommendations(query: str, top_k: int = 10) -> pd.DataFrame:
    # Search the query that I defined above, and return search results
    recs = db_books.similarity_search(query, k = top_k)
    
    books = []
    
    # Extract and store the details of the books
    for idx in range(0, len(recs)):
        books += [int(recs[idx].page_content.strip('"').split(": ")[0])]
        
    return books_dataset[books_dataset["isbn13"].isin(books)].head(top_k)

In [12]:
retrieve_semantic_rcommendations("A book to teach children about nature")    

Unnamed: 0,isbn13,isbn10,title,authors,categories,thumbnail,description,published_year,average_rating,num_pages,ratings_count,title_and_subtitle,tagged_description
324,9780060959036,0060959037,Prodigal Summer,Barbara Kingsolver,Fiction,http://books.google.com/books/content?id=06IwG...,Barbara Kingsolver's fifth novel is a hymn to ...,2001.0,4.0,444.0,85440.0,Prodigal Summer: A Novel,9780060959036: Barbara Kingsolver's fifth nove...
404,9780064402453,0064402452,Racso and the Rats of NIMH,Jane Leslie Conly,Juvenile Fiction,http://books.google.com/books/content?id=MgoNv...,"‘Racso, a brash and boastful little rodent, is...",1988.0,3.76,288.0,3231.0,Racso and the Rats of NIMH,"9780064402453: ‘Racso, a brash and boastful li..."
406,9780064403870,0064403874,"R-T, Margaret, and the Rats of NIMH",Jane Leslie Conly,Juvenile Fiction,http://books.google.com/books/content?id=WTHHH...,"When Margaret and her younger brother, Artie, ...",1991.0,3.52,272.0,631.0,"R-T, Margaret, and the Rats of NIMH",9780064403870: When Margaret and her younger b...
1642,9780374522599,0374522596,The Control of Nature,John McPhee,Nature,http://books.google.com/books/content?id=p1qKQ...,The Control of Nature is John McPhee's bestsel...,1990.0,4.24,288.0,3365.0,The Control of Nature,9780374522599: The Control of Nature is John M...
3747,9780786808069,0786808063,Baby Einstein: Neighborhood Animals,Marilyn Singer;Julie Aigner-Clark,Juvenile Fiction,http://books.google.com/books/content?id=X9a4P...,Children will discover the exciting world of t...,2001.0,3.89,16.0,180.0,Baby Einstein: Neighborhood Animals,9780786808069: Children will discover the exci...
3748,9780786808373,0786808373,Baby Einstein: Birds,Julie Aigner-Clark,Juvenile Fiction,http://books.google.com/books/content?id=0jxHP...,"Introducing your baby to birds, cats, dogs, an...",2002.0,3.78,20.0,9.0,Baby Einstein: Birds,"9780786808373: Introducing your baby to birds,..."
3749,9780786808380,0786808381,Baby Einstein: Babies,Julie Aigner-Clark,Juvenile Fiction,http://books.google.com/books/content?id=jv4NA...,"Introduce your babies to birds, cats, dogs, an...",2002.0,4.03,20.0,29.0,Baby Einstein: Babies,"9780786808380: Introduce your babies to birds,..."
3750,9780786808397,078680839X,Baby Einstein: Dogs,Julie Aigner-Clark,Juvenile Fiction,http://books.google.com/books/content?id=qut8t...,"Introduce your baby to birds, cats, dogs, and ...",2002.0,3.81,20.0,26.0,Baby Einstein: Dogs,"9780786808397: Introduce your baby to birds, c..."
3765,9780786819119,0786819111,"Baby Einstein: Water, Water Everywhere","Disney Book Group,",Juvenile Fiction,http://books.google.com/books/content?id=tuAdA...,Charming illustrations and playful rhythmic ve...,2003.0,3.7,10.0,77.0,"Baby Einstein: Water, Water Everywhere",9780786819119: Charming illustrations and play...
4898,9781593851170,1593851170,The Nature of Play,Anthony D. Pellegrini;Peter K. Smith,Psychology,http://books.google.com/books/content?id=Nukz6...,"""Comprehensive and up to date, this tightly ed...",2005.0,4.25,308.0,4.0,The Nature of Play: Great Apes and Humans,"9781593851170: ""Comprehensive and up to date, ..."
