In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity, linear_kernel
from sklearn.preprocessing import MinMaxScaler
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
import difflib
from collections import Counter
import string
import pickle
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

In [None]:
# nltk.data.path.append('C:\\Users\\likhi\\anaconda3\\nltk_data')
# try:
#     nltk.download('stopwords', quiet=True)
#     nltk.download('wordnet', quiet=True)
#     nltk.download('punkt_tab', quiet=True,download_dir='C:\\Users\\likhi\\anaconda3\\nltk_data')
#     print("NLTK resources downloaded.")
# except:
#     print("NLTK resources couldn't be downloaded. Some text processing features might be limited.")
# print(nltk.data.find('tokenizers/punkt'))

C:\Users\likhi\anaconda3\nltk_data\tokenizers\punkt


In [None]:
books_df = pd.read_csv("./Dataset/books.csv")
print(f"Books dataset loaded with shape: {books_df.shape}")

Books dataset loaded with shape: (20935, 7)


In [59]:
books_df.head()

Unnamed: 0,isbn,book_title,book_author,year_of_publication,Summary,Language,Category
0,399135782,The Kitchen God's Wife,Amy Tan,1991.0,A Chinese immigrant who is convinced she is dy...,en,Fiction
1,425176428,What If?: The World's Foremost Military Histor...,Robert Cowley,2000.0,"Essays by respected military historians, inclu...",en,History
2,771074670,Nights Below Station Street,David Adams Richards,1988.0,Another story based in the fictional rural tow...,en,Fiction
3,440234743,The Testament,John Grisham,1999.0,"A suicidal billionaire, a burnt-out Washington...",en,Fiction
4,452264464,Beloved (Plume Contemporary Fiction),Toni Morrison,1994.0,Staring unflinchingly into the abyss of slaver...,en,Fiction


In [61]:
books_df = books_df.drop_duplicates(subset='book_title', keep='first')
books_df.shape

(19460, 7)

In [63]:
def advanced_text_preprocessing(text):
    """
    Apply advanced text preprocessing including:
    - Remove special characters and numbers
    - Convert to lowercase
    - Tokenize
    - Remove stopwords
    - Lemmatize
    """
    if not isinstance(text, str):
        return ""
    
    # Remove special characters and numbers
    text = re.sub('[^a-zA-Z\s]', '', text)
    
    # Convert to lowercase
    text = text.lower()
    
    # Tokenize
    tokens = nltk.word_tokenize(text)
    
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    
    # Lemmatize
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    
    # Join back to string
    return ' '.join(tokens)

# Apply advanced preprocessing to summary
print("Applying advanced text preprocessing...")
books_df['processed_summary'] = books_df['Summary'].apply(advanced_text_preprocessing)


Applying advanced text preprocessing...


In [64]:
books_df.head()

Unnamed: 0,isbn,book_title,book_author,year_of_publication,Summary,Language,Category,processed_summary
0,399135782,The Kitchen God's Wife,Amy Tan,1991.0,A Chinese immigrant who is convinced she is dy...,en,Fiction,chinese immigrant convinced dying threatens ce...
1,425176428,What If?: The World's Foremost Military Histor...,Robert Cowley,2000.0,"Essays by respected military historians, inclu...",en,History,essay respected military historian including s...
2,771074670,Nights Below Station Street,David Adams Richards,1988.0,Another story based in the fictional rural tow...,en,Fiction,another story based fictional rural town miram...
3,440234743,The Testament,John Grisham,1999.0,"A suicidal billionaire, a burnt-out Washington...",en,Fiction,suicidal billionaire burntout washington litig...
4,452264464,Beloved (Plume Contemporary Fiction),Toni Morrison,1994.0,Staring unflinchingly into the abyss of slaver...,en,Fiction,staring unflinchingly abyss slavery novel tran...


In [8]:
# 4.2 Create weighted content features
print("Creating weighted content features...")

# Give more weight to title and author by repeating them
books_df['weighted_content'] = (
    books_df['book_title'] + ' ' + books_df['book_title'] + ' ' + 
    books_df['book_author'] + ' ' + books_df['book_author'] + ' ' + 
    books_df['Category'] + ' ' +
    books_df['processed_summary']
)
books_df['weighted_content'][0]

Creating weighted content features...


"The Kitchen God's Wife The Kitchen God's Wife Amy Tan Amy Tan Fiction chinese immigrant convinced dying threatens celebrate chinese new year unburdening everybodys hidden truth thus prompting series comic misunderstanding"

In [9]:
# 5. Advanced Content-Based Filtering
print("\n# Building Advanced Content-Based Recommendation System")

# 5.1 Create TF-IDF matrix with n-grams
print("Creating TF-IDF matrix with n-grams...")
tfidf = TfidfVectorizer(stop_words='english', max_features=5000, 
                        ngram_range=(1, 2))  # Include bigrams
tfidf_matrix = tfidf.fit_transform(books_df['weighted_content'])
print(f"TF-IDF matrix shape: {tfidf_matrix.shape}")

# 5.2 Calculate cosine similarity
print("Calculating cosine similarity...")
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)
print(f"Cosine similarity matrix shape: {cosine_sim.shape}")


# Building Advanced Content-Based Recommendation System
Creating TF-IDF matrix with n-grams...
TF-IDF matrix shape: (19460, 5000)
Calculating cosine similarity...
Cosine similarity matrix shape: (19460, 19460)


In [10]:
# 5.3 Create a mapping of book titles to indices
indices = pd.Series(books_df.index, index=books_df['book_title']).drop_duplicates()

In [11]:
def get_recommendations_by_title(title, cosine_sim=cosine_sim, df=books_df, indices=indices, top_n=10):
    """
    DataFrame containing content-based recommendations
    """
    # Get the index of the book that matches the title
    try:
        idx = indices[title]
    except:
        print(f"Book '{title}' not found in the database.")
        similar_titles = difflib.get_close_matches(title, indices.index, n=3)
        if similar_titles:
            print(f"Similar titles found: {similar_titles}")
            return None
        return None
    
    # Get the pairwise similarity scores
    sim_scores = list(enumerate(list(cosine_sim[idx])))
    # print(sim_scores)
    
    # Get the category/genre of the target book
    target_category = df.iloc[idx]['Category']
    
    # Sort the books based on similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    
    # Get a larger pool of candidates
    candidates = sim_scores[1:top_n*3]
    
    # Get the book indices and scores
    candidate_indices = [i[0] for i in candidates]
    candidate_scores = [i[1] for i in candidates]
    
    # Create a candidate dataframe

    candidates_df = df.iloc[candidate_indices].copy()
    candidates_df['similarity_score'] = candidate_scores
    
    # Sort by combined score
    recommendations = candidates_df.head(top_n)
    
    # Return recommendations
    return recommendations[['book_title', 'book_author', 'Category', 'year_of_publication', 
                          'similarity_score']]

In [67]:
def get_recommendations_by_author(author, df=books_df, exclude_categories=None, year_range=None):
    """
    Generate recommendations by author with filtering options
    DataFrame containing filtered author recommendations
    """
    # Case insensitive search for author
    author_pattern = re.compile(author, re.IGNORECASE)
    matching_books = df[df['book_author'].str.contains(author_pattern, na=False)]
    
    if matching_books.empty:
        print(f"No books found for author '{author}'.")
        similar_authors = df[df['book_author'].str.lower().str.contains(author.lower(), na=False)]
        if not similar_authors.empty:
            print(f"Did you mean one of these authors?")
            display(similar_authors['book_author'].unique()[:5])
        return None
        
    # Sort by year (most recent first)
    matching_books = matching_books.sort_values('year_of_publication', ascending=False)
    return matching_books[['book_title', 'book_author', 'Category', 'year_of_publication']]

In [69]:
# 6.3 Content-based search function - find books by keywords in content
def search_books_by_content(keywords, df=books_df, top_n=20):
    """
    Search for books containing specific keywords in their content
    DataFrame containing search results
    """
    # Clean and prepare the keywords
    clean_keywords = advanced_text_preprocessing(keywords)
    
    # Create a TF-IDF vectorizer for the query
    query_vectorizer = TfidfVectorizer(vocabulary=tfidf.vocabulary_)
    query_vector = query_vectorizer.fit_transform([clean_keywords])
    
    # Calculate similarity between query and all books
    similarities = cosine_similarity(query_vector, tfidf_matrix).flatten()
    
    # Get indices of top results
    top_indices = similarities.argsort()[-top_n:][::-1]
    
    # Create results dataframe
    results = df.iloc[top_indices].copy()
    results['relevance_score'] = similarities[top_indices]
    
    # Filter out very low relevance
    results = results[results['relevance_score'] > 0.05]
    
    return results[['book_title', 'book_author', 'Category', 'year_of_publication', 'relevance_score']]


In [14]:
def recommend_books(query=None, query_type='title', top_n=10,exclude_categories=None, 
                    year_range=None, include_keywords=None):
    """
    Enhanced book recommendation function with multiple options
    
    Parameters:
    ----------
    query : str
        Book title, author name, or keywords
    query_type : str, default='title'
        Type of query: 'title', 'author', or 'keywords'
    top_n : int, default=10
        Number of recommendations to return
    exclude_categories : list, optional
        Categories to exclude from recommendations
    year_range : tuple, optional
        (min_year, max_year) to filter by publication year
    include_keywords : str, optional
        Keywords that must be included in recommendations
        
    Returns:
    -------
    DataFrame containing book recommendations
    """
    if query is None:
        print("Please provide a book title, author name, or keywords.")
        return None
    
    # Get base recommendations
    if query_type.lower() == 'title':
        recommendations = get_recommendations_by_title(query,top_n=top_n*2)
    
    elif query_type.lower() == 'author':
        recommendations = get_recommendations_by_author(query, exclude_categories=exclude_categories, year_range=year_range)
    
    elif query_type.lower() == 'keywords': 
        recommendations = search_books_by_content(query, top_n=top_n*2)
    
    else:
        print("Invalid query type. Choose 'title', 'author', or 'keywords'.")
        return None
    
    if recommendations is None or len(recommendations) == 0:
        return None
    
    # Apply category filter if specified
    if exclude_categories:
        if not isinstance(exclude_categories, list):
            exclude_categories = [exclude_categories]
        
        for category in exclude_categories:
            recommendations = recommendations[~recommendations['Category'].str.contains(category, case=False, na=False)]

    # Apply year range filter if specified
    if year_range and len(year_range) == 2:  # Already applied for author
        min_year, max_year = year_range
        recommendations = recommendations[
            (recommendations['year_of_publication'] >= min_year) & 
            (recommendations['year_of_publication'] <= max_year)
        ]
        
    if query_type.lower() == 'author' :
        return recommendations
    
    # Filter by keywords if specified
    if include_keywords:
        # Get books containing the keywords
        keyword_results = search_books_by_content(include_keywords, top_n=len(books_df))
        keyword_books = set(keyword_results['book_title'])
        
        # Only keep recommendations that are in the keyword results
        recommendations = recommendations[recommendations['book_title'].isin(keyword_books)]
    
    # Return top N results
    return recommendations.head(top_n)


In [15]:
# 7. Demonstration: Advanced Recommendation Examples
print("\n# Demonstration: Advanced Recommendation Examples")

# 7.1 Basic title-based recommendations
print("\nBasic recommendations based on a book title:")
title = books_df['book_title'].iloc[10]  # Choose a random book
print(f"Getting recommendations for book: {title}")
basic_recs = recommend_books(query=title, query_type='title')
display(basic_recs)

# 7.2 Recommendations by author with category filtering
print("\nRecommendations by author with filtering:")
author = books_df['book_author'].iloc[11239]  # Choose a random author
exclude_cat = books_df['Category'].iloc[12]  # Exclude the author's most common category]
print(f"Getting recommendations for author: {author}, excluding category: {exclude_cat}")
filtered_author_recs = recommend_books(query=author, query_type='author', exclude_categories=[exclude_cat])
display(filtered_author_recs)

# 7.3 Keyword-based search
print("\nKeyword-based book search:")
keywords = "mystery detective crime Agatha"
print(f"Searching for books matching keywords: {keywords}")
keyword_recs = recommend_books(query=keywords, query_type='keywords')
display(keyword_recs)



# Demonstration: Advanced Recommendation Examples

Basic recommendations based on a book title:
Getting recommendations for book: Timeline


Unnamed: 0,book_title,book_author,Category,year_of_publication,similarity_score
16454,Michael Crichton: A New Collection of Three Co...,Michael Crichton,Fiction,1994.0,0.594139
3744,The Terminal Man,Michael Crichton,Fiction,1988.0,0.587225
1127,Sphere,MICHAEL CRICHTON,Fiction,1988.0,0.586799
9830,Eaters of the Dead: The Manuscript of Ibn Fadl...,Michael Crichton,Fiction,1993.0,0.586776
6168,Disclosure,Michael Crichton,Fiction,1994.0,0.572655
885,Prey,Michael Crichton,Fiction,2003.0,0.547482
9,Airframe,Michael Crichton,Fiction,1997.0,0.538501
2749,Prey: A Novel,Michael Crichton,Fiction,2002.0,0.537795
1128,The Andromeda Strain,MICHAEL CRICHTON,Fiction,1992.0,0.527161
4635,Five Patients,MICHAEL CRICHTON,Social science,1989.0,0.526412



Recommendations by author with filtering:
Getting recommendations for author: Philip Morrison, excluding category: Fiction


Unnamed: 0,book_title,book_author,Category,year_of_publication
12027,The Ring of Truth: An Inquiry into How We Know...,Philip Morrison,Science,1987.0



Keyword-based book search:
Searching for books matching keywords: mystery detective crime Agatha


Unnamed: 0,book_title,book_author,Category,year_of_publication,relevance_score
11239,Abc Murders,Agatha Christie,Detective and mystery stories,1994.0,0.427542
6201,The Regatta Mystery and Other Stories,Agatha Christie,Fiction,1996.0,0.420276
648,Masterpieces of murder,Agatha Christie,Detective and mystery stories,1977.0,0.365369
1578,Agatha,Kathleen Tynan,Fiction,1979.0,0.364597
14839,Death on the Nile,Agatha Christie,Detective and mystery stories,1978.0,0.347172
10777,4:50 From Paddington,Agatha Christie,Detective and mystery stories,1992.0,0.344896
11861,Dead Man's Mirror,Agatha Christie,Detective and mystery stories,1986.0,0.337308
8018,MIRROR CRACKD,Agatha Christie,Detective and mystery stories,1981.0,0.330131
6205,The Clocks,Agatha Christie,Fiction,2004.0,0.329433
15577,Murder on the Orient Express,Agatha Christie,Detective and mystery stories,1984.0,0.328786


In [16]:
print("\n# Saving the enhanced model components")

with open('enhanced_content_based_book_recommendation_model.pkl', 'wb') as f:
    pickle.dump({
        'tfidf_vectorizer': tfidf,
        'tfidf_matrix': tfidf_matrix,
        'cosine_sim': cosine_sim,
        'indices': indices,
        'books_df': books_df
    }, f)

print("Enhanced model components saved successfully!")


# Saving the enhanced model components
Enhanced model components saved successfully!


In [71]:
def load_model(model_path='enhanced_content_based_book_recommendation_model.pkl'):
    """
    Load the saved model components and make them available for recommendation functions
    
    Parameters:
    ----------
    model_path : str
        Path to the saved model file
        
    Returns:
    -------
    Dictionary containing all model components
    """
    # Load the model
    with open(model_path, 'rb') as f:
        model_data = pickle.load(f)
    
    # Return the model components
    return model_data

def recommend_books_from_saved_model(model_path='enhanced_content_based_book_recommendation_model.pkl',
                                    query=None, query_type='title', top_n=10,
                                    exclude_categories=None, year_range=None, include_keywords=None):
    """
    Load model and make recommendations using the existing recommend_books function
    
    Parameters:
    ----------
    model_path : str
        Path to the saved model file
    Other parameters: Same as recommend_books()
        
    Returns:
    -------
    DataFrame containing book recommendations
    """
    # Load the model components
    model_data = load_model(model_path)
    
    # Make the model components available to the recommendation functions
    global tfidf, tfidf_matrix, cosine_sim, indices, books_df
    tfidf = model_data['tfidf_vectorizer']
    tfidf_matrix = model_data['tfidf_matrix']
    cosine_sim = model_data['cosine_sim']
    indices = model_data['indices']
    books_df = model_data['books_df']
    
    # Use the existing recommend_books function
    return recommend_books(query=query, query_type=query_type, top_n=top_n,
                          exclude_categories=exclude_categories, 
                          year_range=year_range, include_keywords=include_keywords)

In [73]:
def explain_recommendations(recommendations, original_title=None):
    """
    Generate explanations for why each book was recommended
    
    Parameters:
    ----------
    recommendations : DataFrame
        Dataframe of recommended books
    original_title : str, optional
        Title of the original book recommendations are based on
        
    Returns:
    -------
    DataFrame with explanation column added
    """
    if recommendations is None or len(recommendations) == 0:
        return None
    
    explained_recs = recommendations.copy()
    
    # Add explanation column
    explanations = []
    
    for _, row in recommendations.iterrows():
        explanation = []
        
        # If we have the original title
        if original_title:
            # Check if same author
            original_book = books_df[books_df['book_title'] == original_title].iloc[0]
            if row['book_author'] == original_book['book_author']:
                explanation.append(f"Same author as '{original_title}'")
            
            # Check if same category
            if row['Category'] == original_book['Category']:
                explanation.append(f"Same genre/category")
            else:
                explanation.append(f"Different genre that you might enjoy")
        
        # Add similarity explanation
        if 'similarity_score' in row:
            if row['similarity_score'] > 0.8:
                explanation.append("Very similar content")
            elif row['similarity_score'] > 0.5:
                explanation.append("Moderately similar themes")
            else:
                explanation.append("Some thematic elements in common")
        
        explanations.append(" - ".join(explanation))
    
    explained_recs['explanation'] = explanations
    
    return explained_recs

In [75]:
print("\n# Explanations for recommendations:")
explained_recommendations = explain_recommendations(basic_recs, title)
display(explained_recommendations)


# Explanations for recommendations:


Unnamed: 0,book_title,book_author,Category,year_of_publication,similarity_score,explanation
16454,Michael Crichton: A New Collection of Three Co...,Michael Crichton,Fiction,1994.0,0.594139,Same genre/category - Moderately similar themes
3744,The Terminal Man,Michael Crichton,Fiction,1988.0,0.587225,Same genre/category - Moderately similar themes
1127,Sphere,MICHAEL CRICHTON,Fiction,1988.0,0.586799,Same author as 'Timeline' - Same genre/categor...
9830,Eaters of the Dead: The Manuscript of Ibn Fadl...,Michael Crichton,Fiction,1993.0,0.586776,Same genre/category - Moderately similar themes
6168,Disclosure,Michael Crichton,Fiction,1994.0,0.572655,Same genre/category - Moderately similar themes
885,Prey,Michael Crichton,Fiction,2003.0,0.547482,Same genre/category - Moderately similar themes
9,Airframe,Michael Crichton,Fiction,1997.0,0.538501,Same genre/category - Moderately similar themes
2749,Prey: A Novel,Michael Crichton,Fiction,2002.0,0.537795,Same genre/category - Moderately similar themes
1128,The Andromeda Strain,MICHAEL CRICHTON,Fiction,1992.0,0.527161,Same author as 'Timeline' - Same genre/categor...
4635,Five Patients,MICHAEL CRICHTON,Social science,1989.0,0.526412,Same author as 'Timeline' - Different genre th...


In [77]:
print("\n# Conclusion")
print("""
This notebook has implemented an enhanced content-based book recommendation system with:
1. Advanced text preprocessing and feature engineering
2. N-gram based TF-IDF vectorization
3. Multiple querying options (title, author, keywords)
4. Filtering capabilities (categories, publication years)
5. Recommendation explanations

The system provides a rich set of options for getting personalized and diverse book recommendations.
""")


# Conclusion

This notebook has implemented an enhanced content-based book recommendation system with:
1. Advanced text preprocessing and feature engineering
2. N-gram based TF-IDF vectorization
3. Multiple querying options (title, author, keywords)
4. Filtering capabilities (categories, publication years)
5. Recommendation explanations

The system provides a rich set of options for getting personalized and diverse book recommendations.

