In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
import string
import pickle
import warnings
warnings.filterwarnings('ignore')

In [2]:
# NLTK data packages fetched for the text-preprocessing step of this notebook
# (stop-word list, WordNet data for the lemmatizer, tokenizer models).
NLTK_RESOURCES = ('stopwords', 'wordnet', 'punkt_tab')

# nltk.download() signals most failures by returning False instead of raising,
# so the return value is checked for every package. `except Exception` is used
# rather than a bare `except:` so that KeyboardInterrupt is not swallowed.
failed_resources = []
for resource in NLTK_RESOURCES:
    try:
        downloaded = nltk.download(resource, quiet=True)
    except Exception as exc:
        print(f"Error while downloading NLTK resource '{resource}': {exc}")
        downloaded = False
    if not downloaded:
        failed_resources.append(resource)

if failed_resources:
    print("NLTK resources couldn't be downloaded. Some text processing features might be limited.")
    print(f"Missing NLTK resources: {', '.join(failed_resources)}")
else:
    print("NLTK resources downloaded.")

NLTK resources downloaded.


In [3]:
# Load the book catalogue from the CSV that ships next to this notebook and
# show its (rows, columns) shape as the cell output.
books_csv_path = "./Dataset/books.csv"
books_df = pd.read_csv(books_csv_path)
books_df.shape

(20935, 9)

In [4]:
# Keep only the first row for every title, then renumber the index to 0..n-1.
# Without the reset, the surviving rows keep their original labels (with gaps),
# so labels stop matching row positions. The TF-IDF and cosine-similarity
# matrices built later are addressed by position, and the title lookup built
# from `books_df.index` would point at the wrong (or a non-existent) row.
books_df = (
    books_df
    .drop_duplicates(subset='book_title', keep='first')
    .reset_index(drop=True)
)
books_df.shape

(19460, 9)

In [5]:
# Compiled once: matches every character that is neither an ASCII letter nor
# whitespace. Written as a raw string so that `\s` is passed to the regex
# engine as-is instead of being an invalid string escape.
_NON_LETTER_RE = re.compile(r'[^a-zA-Z\s]')

# Lazily filled cache for the NLTK helpers, so they are built once instead of
# once per processed text.
_TEXT_TOOLS = {}


def _get_text_tools():
    """Return the (stop-word set, lemmatizer) pair, building it on first use."""
    if not _TEXT_TOOLS:
        _TEXT_TOOLS['stop_words'] = frozenset(stopwords.words('english'))
        _TEXT_TOOLS['lemmatizer'] = WordNetLemmatizer()
    return _TEXT_TOOLS['stop_words'], _TEXT_TOOLS['lemmatizer']


def advanced_text_preprocessing(text):
    """
    Normalise a free-text field for vectorisation.

    Steps, in order:
    - Remove every character that is not an ASCII letter or whitespace
      (the character is deleted, not replaced by a space, so a hyphenated
      word such as "burnt-out" becomes "burntout")
    - Convert to lowercase
    - Tokenize with ``nltk.word_tokenize``
    - Remove English stopwords
    - Lemmatize each remaining token with ``WordNetLemmatizer``

    Parameters
    ----------
    text : object
        The text to clean. Anything that is not a ``str`` (for example a NaN
        from a missing cell) is treated as empty.

    Returns
    -------
    str
        The processed tokens joined by single spaces, or ``""`` when ``text``
        is not a string.
    """
    if not isinstance(text, str):
        return ""

    stop_words, lemmatizer = _get_text_tools()
    cleaned = _NON_LETTER_RE.sub('', text).lower()
    tokens = nltk.word_tokenize(cleaned)
    return ' '.join(
        lemmatizer.lemmatize(token)
        for token in tokens
        if token not in stop_words
    )

# Derive the cleaned summary column; the preprocessing function maps any
# non-string summary to an empty string.
raw_summaries = books_df['Summary']
books_df['processed_summary'] = raw_summaries.apply(advanced_text_preprocessing)

In [6]:
# Preview the first five rows, including the new processed_summary column.
books_df.head(n=5)

Unnamed: 0,isbn,book_title,book_author,year_of_publication,Summary,Language,Category,img_l,average_rating,processed_summary
0,399135782,The Kitchen God's Wife,Amy Tan,1991.0,A Chinese immigrant who is convinced she is dy...,en,Fiction,http://images.amazon.com/images/P/0399135782.0...,3.5,chinese immigrant convinced dying threatens ce...
1,425176428,What If?: The World's Foremost Military Histor...,Robert Cowley,2000.0,"Essays by respected military historians, inclu...",en,History,http://images.amazon.com/images/P/0425176428.0...,3.0,essay respected military historian including s...
2,771074670,Nights Below Station Street,David Adams Richards,1988.0,Another story based in the fictional rural tow...,en,Fiction,http://images.amazon.com/images/P/0771074670.0...,3.0,another story based fictional rural town miram...
3,440234743,The Testament,John Grisham,1999.0,"A suicidal billionaire, a burnt-out Washington...",en,Fiction,http://images.amazon.com/images/P/0440234743.0...,3.3,suicidal billionaire burntout washington litig...
4,452264464,Beloved (Plume Contemporary Fiction),Toni Morrison,1994.0,Staring unflinchingly into the abyss of slaver...,en,Fiction,http://images.amazon.com/images/P/0452264464.0...,3.3,staring unflinchingly abyss slavery novel tran...


In [7]:
def _text_column(name):
    """Return books_df[name] as strings, with missing values replaced by ''."""
    return books_df[name].fillna('').astype(str)


# Build the single text document that is vectorised for each book. Title and
# author appear twice so their terms count double relative to the category and
# the summary. Missing values are replaced with '' first: concatenating a
# string with NaN yields NaN, which would wipe out the whole document for any
# book that lacks a single field.
title_text = _text_column('book_title')
author_text = _text_column('book_author')
category_text = _text_column('Category')
summary_text = _text_column('processed_summary')

books_df['weighted_content'] = (
    title_text + ' ' + title_text + ' ' +
    author_text + ' ' + author_text + ' ' +
    category_text + ' ' +
    summary_text
)
# Positional access: show the first row regardless of how the index is labelled.
books_df['weighted_content'].iloc[0]

"The Kitchen God's Wife The Kitchen God's Wife Amy Tan Amy Tan Fiction chinese immigrant convinced dying threatens celebrate chinese new year unburdening everybodys hidden truth thus prompting series comic misunderstanding"

In [8]:
# Vectorise the weighted documents: unigrams and bigrams, English stop words
# removed, vocabulary capped at 5000 features.
tfidf = TfidfVectorizer(
    ngram_range=(1, 2),
    max_features=5000,
    stop_words='english',
)
weighted_documents = books_df['weighted_content']
tfidf_matrix = tfidf.fit_transform(weighted_documents)

In [9]:
# Pairwise cosine similarity between every pair of books. With a single
# argument the matrix is compared against itself.
# NOTE(review): the result is a dense n x n array; with ~19k books that is
# presumably several GB of RAM -- confirm the target machine can hold it.
cosine_sim = cosine_similarity(tfidf_matrix)

In [10]:
# Title -> row *position* lookup. cosine_sim and tfidf_matrix are addressed by
# position, so the values must be 0..n-1 rather than the DataFrame's index
# labels (the two differ whenever rows were dropped without an index reset).
# The previous `.drop_duplicates()` compared the Series *values*, which are
# always unique, so it never removed anything; duplicates are filtered on the
# title index here instead, keeping the first occurrence.
indices = pd.Series(np.arange(len(books_df)), index=books_df['book_title'])
indices = indices[~indices.index.duplicated(keep='first')]

In [11]:
# Bundle the fitted vectorizer, both matrices, the title lookup and the book
# table into one dictionary and pickle it to model.pkl in the working directory.
model_artifacts = {
    'tfidf_vectorizer': tfidf,
    'tfidf_matrix': tfidf_matrix,
    'cosine_sim': cosine_sim,
    'indices': indices,
    'books_df': books_df,
}
with open('model.pkl', 'wb') as model_file:
    pickle.dump(model_artifacts, model_file)