## Content based Collaborative Filtering

## Importing Libaries & loading data & preprocessing

In [36]:
# importing libraries
import numpy as np
import pandas as pd
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import TfidfVectorizer, ENGLISH_STOP_WORDS
from datetime import datetime
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.neighbors import NearestNeighbors
import ast


## Preprocessing for Content based filtering & Loading datasets

In [37]:
def preprocess_contentbased_filtering(book_df):
    """
    Preprocess books for content-based filtering by selecting relevant columns,
    dropping rows with missing categories or descriptions, and creating a 
    combined text field with boosted genres.

    Args:
        book_df (pd.DataFrame): DataFrame with columns 'title', 'categories',
            and 'description'.

    Returns:
        pd.DataFrame: Preprocessed DataFrame with 'combined_text' column added.
    """
       
    book_df = book_df[['title', 'categories','description']]
    book_df.dropna(subset=["categories","description"], inplace=True)
    book_df['combined_text'] = book_df.apply(boost_genres,axis=1)
    return book_df

# boost genres for TF-IDF
def boost_genres(row, boost=3):
    """
    Boost genre terms within a book's text representation for TF-IDF.

    This function strengthens the influence of genre/category keywords
    by repeating them multiple times before concatenating them with the
    book description. This helps ensure that genre signals remain
    prominent during TF-IDF vectorization, especially when genres are
    rare or overshadowed by longer descriptions.

    Args:
        row (pd.Series): A row from the dataframe containing at least
            'description' and 'categories' fields.
        boost (int, optional): The number of times to repeat (boost) the
            genre text. Defaults to 5.

    Returns:
        str: A single lowercase text string composed of boosted genre
        keywords followed by the book description.
    """
    
    description = str(row['description']).lower()
    genres = str(row['categories']).lower()
    boosted_genres = (' ' + genres) * boost  # repeat genres
    return boosted_genres + description

In [38]:
# load processed books info
book_df = pd.read_csv('preprocessed_data/books.csv')

# preprocess for content based similarity
book_df = preprocess_contentbased_filtering(book_df)
 
# load processed reviews
review_df = pd.read_csv('preprocessed_data/reviews.csv')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  book_df.dropna(subset=["categories","description"], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  book_df['combined_text'] = book_df.apply(boost_genres,axis=1)


In [39]:
def create_tfidf_matrix(df, text_column='combined_text', 
                        min_df=50, max_df=0.7, max_features=300):
    """
    Create a TF-IDF matrix from a DataFrame's text column.

    Args:
        df (pd.DataFrame): DataFrame containing the text data.
        text_column (str, optional): Name of the column with text. Default is 'combined_text'.
        min_df (int, optional): Minimum number of documents a term must appear in. Default is 50.
        max_df (float, optional): Maximum proportion of documents a term can appear in. Default is 0.7.
        max_features (int, optional): Maximum number of features to keep. Default is 300.
        extra_stop_words (set or list, optional): Additional words to ignore. Default is None.

    Returns:
        tuple: (vectorizer, tfidf_matrix)
            - vectorizer: fitted TfidfVectorizer object
            - tfidf_matrix: TF-IDF matrix for the text column
    """
    # Combine default stop words with any extra stop words
    stop_words = list(ENGLISH_STOP_WORDS.union({'&','book', 'novel', 'story','[',']'}))

    # Initialize vectorizer
    vectorizer = TfidfVectorizer(
        min_df=min_df,
        max_df=max_df,
        stop_words=stop_words,
        token_pattern=r"(?u)\b[a-zA-Z]+\b",
        max_features=max_features
    )

    # Fit and transform
    tfidf_matrix = vectorizer.fit_transform(df[text_column])

    return vectorizer, tfidf_matrix

def bayesian_scoring(reviews_df,C,m):
    """
    Calculate Bayesian scoring for books based on reviews.

    Args:
        reviews_df (pd.DataFrame): DataFrame with at least ['title', 'score'] columns.
        C (float): Confidence factor (e.g., average number of ratings).
        m (float): Global mean rating.

    Returns:
        pd.DataFrame: Books ranked by Bayesian score (highest first).
    """

    #step 1: create array with title, average rating of each book, and # of ratings
    books_avg_ratings = reviews_df.groupby(['title']).agg(
        avg_rating=('score', 'mean'),  
        num_ratings=('score', 'count') 
    ).reset_index()

    #step 2: bayesian score
    books_avg_ratings['bayesian_score'] = (
        (C * m + books_avg_ratings['num_ratings'] * books_avg_ratings['avg_rating']) /
        (C + books_avg_ratings['num_ratings'])
    )
    #step 3: rank the books based on their bayesian score
    books_ranked = books_avg_ratings.sort_values(
        by=['bayesian_score'], 
        ascending=False)
    
    return books_ranked

# find nearest neighbors (similar items) in a dataset instead of computing all pairwise distances
def fit_nearest_neighbors(tfidf_matrix, n_neighbors=100000, metric='cosine', algorithm='brute'):
    """
    Fit a NearestNeighbors model on a TF-IDF matrix.

    Args:
        tfidf_matrix (sparse matrix): TF-IDF representation of the items (books).
        n_neighbors (int, optional): Number of neighbors to return. Default is 1000.
        metric (str, optional): Distance metric to use. Default is 'cosine'.
        algorithm (str, optional): Algorithm for computing nearest neighbors. Default is 'brute'.

    Returns:
        NearestNeighbors: Fitted NearestNeighbors model.
    """
    nn = NearestNeighbors(
        n_neighbors=n_neighbors,
        metric=metric,
        algorithm=algorithm
    )
    
    nn.fit(tfidf_matrix)
    return nn


In [40]:
# computing the similarity
vectorizer, tfidf_matrix = create_tfidf_matrix(book_df)
C = 30 # Based on hyperparameter tuning
m = review_df['score'].mean() # Global mean
cold_start_books = bayesian_scoring(review_df, C, m)
nn = fit_nearest_neighbors(tfidf_matrix)


In [41]:
# user enters from a given list of genres
genres = [
    "Fiction","History","Religion","Biography & Autobiography",
    "Business & Economics","Computers","Social Science","Nonfiction",
    "Science","Education","Cooking","Sports & Recreation","Family & Relationships",
    "Literary Criticism","Music","Medical",
    "Body, Mind & Spirit","Health & Fitness","Language Arts & Disciplines"
    "Love","Fiction"
]

In [None]:
#Recommend books for a new user by selected genres
def recommend_for_new_user(selected_genres, vectorizer=vectorizer, nn=nn, cold_start_books=cold_start_books, df=book_df, top_n=10000, final_k = 10,boost=3):
    # Combine and boost genres
    user_text = (' '.join(selected_genres).lower() + ' ') * boost
    
    # Transform to TF-IDF
    user_vector = vectorizer.transform([user_text])
    
    # Find nearest neighbors
    distances, indices = nn.kneighbors(user_vector, n_neighbors=top_n)
    rec = df.iloc[indices[0]][['title', 'categories']]
    rec_df = rec.merge(cold_start_books, on='title',how='left')
    rec_df = rec_df.sort_values(by='bayesian_score',ascending=False)

    return rec_df[['title','categories','bayesian_score']].head(final_k)

Solving cold start

In [47]:
new_user_genres = ['science']
recommended_books = recommend_for_new_user(new_user_genres, final_k=5)
recommended_books


Unnamed: 0,title,categories,bayesian_score
8847,Prescription for Nutritional Healing: A Practi...,['Health & Fitness'],4.683841
1863,Black Like Me,['Social Science'],4.658758
8452,The Rediscovery of Man: The Complete Short Sci...,['Fiction'],4.648061
2831,Lakota Way: Stories & Lessons for Living,['Social Science'],4.638601
5620,The Girls Who Went Away: The Hidden History of...,['Social Science'],4.637971
