<a href="https://colab.research.google.com/github/kakashi3lite/GoodBooksRecommender/blob/main/GoodBooksRecommender.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [124]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [125]:
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [126]:
books_path = '/content/drive/My Drive/Goodbooks/books.csv'
ratings_path = '/content/drive/My Drive/Goodbooks/ratings.csv'
tags_path = '/content/drive/My Drive/Goodbooks/tags.csv'
book_tags_path = '/content/drive/My Drive/Goodbooks/book_tags.csv'


Step 1: Load and Merge Data

In [127]:
def load_and_merge_data(books_path, ratings_path, tags_path, book_tags_path):
    # Load datasets
    books = pd.read_csv(books_path)
    ratings = pd.read_csv(ratings_path)
    tags = pd.read_csv(tags_path)
    book_tags = pd.read_csv(book_tags_path)

    # Merge books with tags using book_id
    book_tags = book_tags.merge(tags, on='tag_id', how='left')
    books = books.merge(book_tags, left_on='goodreads_book_id', right_on='goodreads_book_id', how='left')

    return books, ratings

Step 2: Preprocess the Data

In [128]:
def preprocess_data(books):
    # Combine all tag names for each book into a single string
    books['all_tags'] = books.groupby('book_id')['tag_name'].transform(lambda x: ' '.join(x.dropna()))
    books = books.drop_duplicates(subset='book_id')  # Remove duplicates
    books.loc[:, 'all_tags'] = books['all_tags'].fillna('')  # Use .loc to avoid warnings
  # Fill missing values with an empty string
    return books

Step 3: Building the TF-IDF Matrix

In [129]:
def build_tfidf_matrix(books):
    tfidf = TfidfVectorizer(stop_words='english')
    tfidf_matrix = tfidf.fit_transform(books['all_tags'])
    return tfidf_matrix

Step 4: Computing Cosine Similarity

In [130]:
def compute_similarity(tfidf_matrix):
    return cosine_similarity(tfidf_matrix, tfidf_matrix)

Step 5: Building Recommendation Function

In [131]:
def recommend_books(book_title, books, cosine_sim):
    try:
        # Debug: Print all available titles
        print("Available titles for matching:")
        print(books['title'].head(20))  # Print a subset of titles for debugging

        # Use case-insensitive, stripped matching
        matched_books = books[books['title'].str.strip().str.contains(book_title.strip(), case=False, na=False)]

        # Debug: Print matched books
        print(f"Matched books for '{book_title}':")
        print(matched_books[['title', 'authors']].head())

        if matched_books.empty:
            return f"Book titled '{book_title}' not found! Please try another title."

        # Use the first match's index
        idx = matched_books.index[0]
        sim_scores = list(enumerate(cosine_sim[idx]))
        sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
        sim_scores = sim_scores[1:4]  # Top 3 recommendations
        book_indices = [i[0] for i in sim_scores]

        # Debug: Print recommended indices
        print("Recommended indices and scores:")
        print(sim_scores)

        return books.iloc[book_indices][['title', 'authors', 'average_rating']]
    except Exception as e:
        return f"An error occurred: {str(e)}"

In [132]:
def main():
    # Specify dataset paths
    books_path = "/content/drive/My Drive/Goodbooks/books.csv"
    ratings_path = "/content/drive/My Drive/Goodbooks/ratings.csv"
    tags_path = "/content/drive/My Drive/Goodbooks/tags.csv"
    book_tags_path = "/content/drive/My Drive/Goodbooks/book_tags.csv"

    # Load and preprocess the data
    books, ratings = load_and_merge_data(books_path, ratings_path, tags_path, book_tags_path)
    books = preprocess_data(books)

    # Debug: Print number of unique titles
    print(f"Number of unique titles: {books['title'].nunique()}")

    # Build the TF-IDF matrix and compute similarity
    tfidf_matrix = build_tfidf_matrix(books)
    cosine_sim = compute_similarity(tfidf_matrix)

    # Test the recommendation system
    test_book = 'Atomic Habits'  # Replace with a book title for testing
    recommendations = recommend_books(test_book, books, cosine_sim)

    print(f"Recommendations for '{test_book}':")
    if isinstance(recommendations, str):
        print(recommendations)
    else:
        print(recommendations.to_string(index=False))

if __name__ == "__main__":
    main()


Number of unique titles: 9964
Available titles for matching:
0                 The Hunger Games (The Hunger Games, #1)
100     Harry Potter and the Sorcerer's Stone (Harry P...
200                               Twilight (Twilight, #1)
300                                 To Kill a Mockingbird
400                                      The Great Gatsby
500                                The Fault in Our Stars
600                                            The Hobbit
700                                The Catcher in the Rye
800                 Angels & Demons  (Robert Langdon, #1)
900                                   Pride and Prejudice
1000                                      The Kite Runner
1100                            Divergent (Divergent, #1)
1200                                                 1984
1300                                          Animal Farm
1400                            The Diary of a Young Girl
1500     The Girl with the Dragon Tattoo (Millennium, #1)
1600       