In [26]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics.pairwise import cosine_similarity

# Read the four CSV tables from the current working directory.
#   books     -> per-book metadata (goodreads_book_id, title, authors are used below)
#   tags      -> tag_id / tag_name lookup
#   book_tags -> (goodreads_book_id, tag_id, count) rows
#   ratings   -> loaded here; not referenced by the cells visible in this section
books, tags, book_tags, ratings = map(
    pd.read_csv,
    ('books.csv', 'tags.csv', 'book_tags.csv', 'ratings.csv'),
)


In [27]:
# Build the book x tag feature matrix: one row per goodreads_book_id, one column
# per tag_id, each cell holding the 'count' value. (book, tag) pairs that do not
# occur in book_tags are filled with 0. Repeated (book, tag) pairs, if any, are
# combined with pivot_table's default aggregation (mean).
tag_matrix_spec = dict(
    index='goodreads_book_id',
    columns='tag_id',
    values='count',
    fill_value=0,
)
pivot = pd.pivot_table(book_tags, **tag_matrix_spec)

print(f"Tag feature matrix shape: {pivot.shape}")


Tag feature matrix shape: (10000, 34252)


In [28]:
# Look up the tag_id(s) whose tag_name equals the selected genre, ignoring case.
# Change `selected_genre` to explore a different genre.
selected_genre = 'drama'
genre_name_matches = tags['tag_name'].str.lower() == selected_genre.lower()
genre_tag_ids = tags.loc[genre_name_matches, 'tag_id'].tolist()

# Fail fast: without a matching tag the later cells would be working on an
# empty set of books and fail with a far less helpful error.
if not genre_tag_ids:
    raise ValueError(f"No tag named '{selected_genre}' found in the tags table")

print(f"Tag IDs for genre '{selected_genre}': {genre_tag_ids}")


Tag IDs for genre 'drama': [9886]


In [29]:
# Collect the distinct goodreads_book_ids that carry any of the selected genre's tag ids.
has_genre_tag = book_tags['tag_id'].isin(genre_tag_ids)
books_in_genre = book_tags.loc[has_genre_tag, 'goodreads_book_id'].unique()
print(f"Total books in genre '{selected_genre}': {len(books_in_genre)}")

# Keep only those books' rows of the tag matrix. Intersecting with the pivot
# index first means an id that is absent from the matrix cannot raise a KeyError.
genre_row_labels = pivot.index.intersection(books_in_genre)
pivot_genre = pivot.loc[genre_row_labels]
print(f"Filtered tag matrix shape for genre '{selected_genre}': {pivot_genre.shape}")


Total books in genre 'drama': 3046
Filtered tag matrix shape for genre 'drama': (3046, 34252)


In [30]:
# Apply TF-IDF weighting to the genre's book x tag count matrix.
#
# The clustering cell further down stores its labels in a 'cluster' column on
# pivot_genre. Exclude that column when it is present so that re-running this
# cell after clustering still fits on tag features only (on a fresh run the
# column does not exist and pivot_genre is used as-is, without copying).
tag_features = (
    pivot_genre.drop(columns='cluster')
    if 'cluster' in pivot_genre.columns
    else pivot_genre
)

tfidf = TfidfTransformer()
pivot_genre_tfidf = tfidf.fit_transform(tag_features)

print("TF-IDF transformation applied on tag matrix for genre books.")


TF-IDF transformation applied on tag matrix for genre books.


In [31]:
# Partition the genre's books into K groups with k-means on the TF-IDF rows.
# random_state is fixed; n_init is not passed, so the installed scikit-learn
# default applies.
K = 5
kmeans = KMeans(n_clusters=K, random_state=42)
kmeans.fit(pivot_genre_tfidf)
clusters = kmeans.labels_

# Attach each book's label as a 'cluster' column so it can be looked up by
# goodreads_book_id; assign() returns a new DataFrame rather than modifying
# the previous pivot_genre object.
pivot_genre = pivot_genre.assign(cluster=clusters)
print(f"Books clustered into {K} clusters.")


Books clustered into 5 clusters.


In [32]:
# Index the book metadata by goodreads_book_id so titles/authors can be looked up by id.
books_subset = books.set_index('goodreads_book_id')

print("\nSample books by cluster:")
for c in range(K):
    # Ids of the books assigned to cluster c, in pivot_genre row order.
    in_cluster = (pivot_genre['cluster'] == c).to_numpy()
    cluster_books = pivot_genre.index[in_cluster]
    sample_books = cluster_books[:5]  # show at most 5 books per cluster
    print(f"\nCluster {c}:")
    for gbid in sample_books:
        # Books without a metadata row are skipped.
        if gbid not in books_subset.index:
            continue
        book_row = books_subset.loc[gbid]
        print(f" - {book_row['title']} by {book_row['authors']}")



Sample books by cluster:

Cluster 0:
 - Heart of the Sea (Gallaghers of Ardmore / Irish Trilogy, #3) by Nora Roberts
 - Angels Fall by Nora Roberts
 - Valley of Silence (Circle Trilogy, #3) by Nora Roberts
 - Red Lily (In the Garden, #3) by Nora Roberts
 - Inner Harbor (Chesapeake Bay Saga, #3) by Nora Roberts

Cluster 1:
 - The Power of One (The Power of One, #1) by Bryce Courtenay
 - I am Charlotte Simmons by Tom Wolfe
 - Tropic of Capricorn by Henry Miller
 - The Lover by Marguerite Duras, Barbara Bray, Maxine Hong Kingston
 - Perfume: The Story of a Murderer by Patrick Süskind, John E. Woods

Cluster 2:
 - The Broken Wings by Kahlil Gibran, Anthony R. Ferris
 - Moon Palace by Paul Auster
 - The Da Vinci Code (Robert Langdon, #2) by Dan Brown
 - The Good Earth (House of Earth, #1) by Pearl S. Buck
 - The Broker by John Grisham

Cluster 3:
 - Kiss an Angel by Susan Elizabeth Phillips
 - Love Story (Love Story, #1) by Erich Segal
 - Perfect (Second Opportunities, #2) by Judith McNaug

In [33]:
# Example favorites for the demo, picked by fixed row position in pivot_genre
# (no cluster lookup or randomness is involved). Replace the resulting list with
# goodreads_book_ids that exist in pivot_genre to model a real user.
favorite_positions = [0, 10]

# Ignore positions that fall outside the matrix so a genre with only a few
# books does not raise an IndexError here.
valid_positions = [pos for pos in favorite_positions if pos < len(pivot_genre)]

# Index.tolist() yields plain Python values, which keeps the printed list readable.
user_favorites = pivot_genre.index[valid_positions].tolist()
print("\nUser favorite books selected (goodreads_book_id):", user_favorites)



User favorite books selected (goodreads_book_id): [np.int64(122), np.int64(449)]


In [34]:
# Build a taste profile from the user's favorite books and rank the remaining
# books of the genre by cosine similarity to that profile.

# Validate the favorites up front so problems surface with a clear message.
if not user_favorites:
    raise ValueError("user_favorites is empty; choose at least one book")
unknown_favorites = [gbid for gbid in user_favorites if gbid not in pivot_genre.index]
if unknown_favorites:
    raise KeyError(f"Favorites not found among the genre's books: {unknown_favorites}")

# Row positions of the favorites. pivot_genre_tfidf was fitted on pivot_genre,
# so positions in pivot_genre's index address the same rows of the TF-IDF matrix.
favorite_indices = [pivot_genre.index.get_loc(gbid) for gbid in user_favorites]

# Profile = mean TF-IDF vector of the favorite books.
user_profile = pivot_genre_tfidf[favorite_indices].mean(axis=0)

# cosine_similarity is given a plain 2-D ndarray of shape (1, n_features);
# the reshape keeps this working whether .mean() returned a matrix or a 1-D array.
user_profile_array = np.asarray(user_profile).reshape(1, -1)

# Similarity of every book in the genre to the profile (one value per row).
similarities = cosine_similarity(pivot_genre_tfidf, user_profile_array).flatten()

# Rank books from most to least similar and drop the favorites themselves.
recommended_indices = np.argsort(similarities)[::-1]  # descending order
favorite_ids = set(user_favorites)
ranked_ids = pivot_genre.index[recommended_indices]
recommended_books_gbid = [gbid for gbid in ranked_ids if gbid not in favorite_ids]

# Keep the top_n recommendations that have metadata to display. Filtering
# before truncating means a book without metadata does not shorten the list.
top_n = 10
known_books = books_subset.index
recommended_top = [gbid for gbid in recommended_books_gbid if gbid in known_books][:top_n]

print("\nTop recommended books for user:")
for gbid in recommended_top:
    book_row = books_subset.loc[gbid]
    print(f" - {book_row['title']} by {book_row['authors']}")



Top recommended books for user:
 - Invisible by Paul Auster
 - The Human Stain (The American Trilogy, #3) by Philip Roth
 - Enduring Love by Ian McEwan
 - White Teeth by Zadie Smith
 - The Remains of the Day by Kazuo Ishiguro
 - Glamorama by Bret Easton Ellis
 - The Accidental Tourist by Anne Tyler, Jennifer Bassett
 - The Rules of Attraction by Bret Easton Ellis
 - Middlesex by Jeffrey Eugenides
 - Revolutionary Road by Richard Yates
