In [21]:
import pandas as pd
import numpy as np

In [22]:
df = pd.read_csv("data/book_genre_prediction.csv")
df = df.dropna(subset=["title", "summary", "genre"])
df = df.drop_duplicates(subset=["title"])
df.head()

Unnamed: 0,index,title,genre,summary
0,0,Drowned Wednesday,fantasy,Drowned Wednesday is the first Trustee among ...
1,1,The Lost Hero,fantasy,"As the book opens, Jason awakens on a school ..."
2,2,The Eyes of the Overworld,fantasy,Cugel is easily persuaded by the merchant Fia...
3,3,Magic's Promise,fantasy,The book opens with Herald-Mage Vanyel return...
4,4,Taran Wanderer,fantasy,Taran and Gurgi have returned to Caer Dallben...


In [28]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import os

# --- One-Time Setup: Pre-computing Embeddings ---
# run once, or whenever dataset changes.

def setup():
    """
    Pre-computes and saves book embeddings to a file.
    """
    df = pd.read_csv("data/book_genre_prediction.csv")
    df = df[["title", "summary", "genre"]].dropna()
    df.columns = ["title", "summary", "genre"]
    df = df.drop_duplicates(subset=["title"]).reset_index(drop=True)

    df['combined_text'] = df['title'] + ' - ' + df['summary']

    model = SentenceTransformer('all-MiniLM-L6-v2')
    print("Generating embeddings for all books. This may take a while...")
    
    # Split long texts into chunks
    def split_and_encode(text, max_length=256):
        tokens = text.split()
        if len(tokens) <= max_length:
            return model.encode(text, show_progress_bar=True)
        chunks = [' '.join(tokens[i:i+max_length]) for i in range(0, len(tokens), max_length)]
        embeddings = model.encode(chunks, show_progress_bar=True)
        return np.mean(embeddings, axis=0)

    book_embeddings = np.array([split_and_encode(text) for text in df['combined_text']])
    
    np.save("models/book_embeddings.npy", book_embeddings)
    df.to_csv("data/books_data.csv", index=False)
    print("Completed!!!")

In [29]:
# Check if the setup has been run, if not, run it.
if not os.path.exists('models/book_embeddings.npy'):
    print("First-time setup: generating book embeddings.")
    setup()

First-time setup: generating book embeddings.
Generating embeddings for all books. This may take a while...


Batches: 100%|██████████| 1/1 [00:00<00:00,  4.63it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  8.86it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 12.33it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  6.59it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  5.49it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 40.30it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 25.74it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  6.75it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 13.53it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 27.74it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  9.27it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  4.53it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 79.82it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  8.46it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  5.49it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 15.63it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  4.07it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 20.15it/s]
Batches: 1

Completed!!!


In [36]:
# Load the pre-computed data and embeddings
df = pd.read_csv("data/books_data.csv")
book_embeddings = np.load('models/book_embeddings.npy')

In [None]:
def recommend_books(book_title, num_recommendations=15):
    try:
        df = pd.read_csv('data/books_data.csv')
        book_embeddings = np.load('models/book_embeddings.npy')
        book_index = df[df['title'].str.lower() == book_title.lower()].index[0]
        
    except IndexError:
        print(f"Book with title '{book_title}' not found in the dataset.")
        return pd.DataFrame(columns=["title", "summary"])
    
    except FileNotFoundError:
        print("Data files not found. Please run setup() first.")
        return pd.DataFrame(columns=["title", "summary"])

    input_embedding = book_embeddings[book_index].reshape(1, -1)
    similarities = cosine_similarity(input_embedding, book_embeddings).flatten()
    similar_indices = similarities.argsort()[::-1][1:num_recommendations + 1]

    return df.iloc[similar_indices][["title", "summary"]]

In [38]:
book_title = "Animal Farm"
recommendations = recommend_books(book_title)

recommendations

Unnamed: 0,title,summary
521,The Fatal Eggs,The Fatal Eggs can be described as a science ...
3056,Bloodlands: Europe Between Hitler and Stalin,Americans call the Second World War “The Good ...
3404,1984,"Among the seminal texts of the 20th century, N..."
3163,The Blank Slate: The Modern Denial of Human Na...,"In The Blank Slate, Steven Pinker explores the..."
3878,Zoo,"Once in a lifetime, a writer puts it all toget..."
3000,Homo Deus: A History of Tomorrow,"Yuval Noah Harari, author of the critically-ac..."
3388,The Omnivore's Dilemma: A Natural History of F...,What should we have for dinner? The question h...
677,The Shape of Things to Come,"As a frame story, Wells claims that the book ..."
2001,The Cleansing,The book tells the story of an American India...
3191,Freakonomics: A Rogue Economist Explores the H...,"Which is more dangerous, a gun or a swimming p..."
