In [14]:
import numpy as np
import pandas as pd
import ast

# Load the raw book-details CSV. The path is relative, so it resolves against
# the working directory the notebook kernel was started in.
books = pd.read_csv('../dataset/Book_Details.csv')


# Quick sanity check: report (rows, columns), then preview the first 3 rows.
# `books.head(3)` is intentionally the last expression so the notebook renders it.
print("Shape:", books.shape)
books.head(3)

Shape: (16225, 15)


Unnamed: 0.1,Unnamed: 0,book_id,cover_image_uri,book_title,book_details,format,publication_info,authorlink,author,num_pages,genres,num_ratings,num_reviews,average_rating,rating_distribution
0,0,1,https://images-na.ssl-images-amazon.com/images...,Harry Potter and the Half-Blood Prince,"It is the middle of the summer, but there is a...","['652 pages, Paperback']","['First published July 16, 2005']",https://www.goodreads.com/author/show/1077326....,J.K. Rowling,['652'],"['Fantasy', 'Young Adult', 'Fiction', 'Magic',...",3292516,58398,4.58,"{'5': '2,244,154', '4': '775,028', '3': '219,8..."
1,1,2,https://images-na.ssl-images-amazon.com/images...,Harry Potter and the Order of the Phoenix,Harry Potter is about to start his fifth year ...,"['912 pages, Paperback']","['First published June 21, 2003']",https://www.goodreads.com/author/show/1077326....,J.K. Rowling,['912'],"['Young Adult', 'Fiction', 'Magic', 'Childrens...",3401709,64300,4.5,"{'5': '2,178,760', '4': '856,178', '3': '293,2..."
2,2,3,https://images-na.ssl-images-amazon.com/images...,Harry Potter and the Sorcerer's Stone,Harry Potter has no idea how famous he is. Tha...,"['309 pages, Hardcover']","['First published June 26, 1997']",https://www.goodreads.com/author/show/1077326....,J.K. Rowling,['309'],"['Fantasy', 'Fiction', 'Young Adult', 'Magic',...",10116247,163493,4.47,"{'5': '6,544,542', '4': '2,348,390', '3': '856..."


In [15]:
# Columns kept for the rest of the notebook; everything else in the CSV is dropped.
relevant_columns = [
    'book_title',
    'book_details',
    'author',
    'genres',
    'average_rating',
    'num_ratings',
    'publication_info',
    'cover_image_uri',
]

# One method chain instead of a series of in-place mutations:
#   1. keep only the relevant columns,
#   2. give three of them shorter names,
#   3. drop rows with any missing value,
#   4. keep the first row for each distinct title.
books = (
    books[relevant_columns]
    .rename(columns={
        'book_title': 'title',
        'book_details': 'description',
        'cover_image_uri': 'image',
    })
    .dropna()
    .drop_duplicates(subset='title')
)

print("Data loaded successfully. Shape:", books.shape)
books.head(3)

Data loaded successfully. Shape: (15443, 8)


Unnamed: 0,title,description,author,genres,average_rating,num_ratings,publication_info,image
0,Harry Potter and the Half-Blood Prince,"It is the middle of the summer, but there is a...",J.K. Rowling,"['Fantasy', 'Young Adult', 'Fiction', 'Magic',...",4.58,3292516,"['First published July 16, 2005']",https://images-na.ssl-images-amazon.com/images...
1,Harry Potter and the Order of the Phoenix,Harry Potter is about to start his fifth year ...,J.K. Rowling,"['Young Adult', 'Fiction', 'Magic', 'Childrens...",4.5,3401709,"['First published June 21, 2003']",https://images-na.ssl-images-amazon.com/images...
2,Harry Potter and the Sorcerer's Stone,Harry Potter has no idea how famous he is. Tha...,J.K. Rowling,"['Fantasy', 'Fiction', 'Young Adult', 'Magic',...",4.47,10116247,"['First published June 26, 1997']",https://images-na.ssl-images-amazon.com/images...


In [16]:
import re  
import ast


def extract_year(text):
    """Return the first standalone four-digit number found in ``text``.

    Parameters
    ----------
    text : Any
        Value to search; it is converted with ``str()`` first, so non-string
        inputs (including NaN) are accepted.

    Returns
    -------
    str
        The matched four digits (e.g. ``"2005"``), or ``"Unknown"`` when no
        standalone four-digit run exists.
    """
    # The lookarounds require that the four digits are not part of a longer
    # digit run; a bare r'\d{4}' would return "9780" for an ISBN-like number.
    match = re.search(r'(?<!\d)\d{4}(?!\d)', str(text))
    return match.group(0) if match else "Unknown"

# Derive a 'year' column by running extract_year over every publication_info value.
print("Fixing Year column...")
books['year'] = books['publication_info'].map(extract_year)



def clean_genres(text, max_genres=3):
    """Turn a genres value into a space-separated string of its leading genres.

    Parameters
    ----------
    text : Any
        Usually the string form of a Python list, e.g. ``"['Fantasy', 'Fiction']"``.
        An already-parsed list/tuple, a missing value, or free text are also
        accepted.
    max_genres : int or None, default 3
        How many leading genres to keep; ``None`` keeps all of them.

    Returns
    -------
    str
        * ``""`` for missing or empty input.
        * The first ``max_genres`` entries joined by single spaces when the
          value is (or parses to) a list. Multi-word genres are not kept
          together: ``'Young Adult'`` contributes two space-separated words.
        * The original text when it parses to a non-list literal.
        * The text with ``[``, ``]``, ``'`` and ``,`` removed when it cannot be
          parsed as a Python literal.
    """
    # Already-parsed sequences are handled before pd.isna(), because
    # pd.isna(list) returns an array whose truth value is ambiguous.
    if isinstance(text, (list, tuple)):
        return " ".join(str(genre) for genre in text[:max_genres])

    if pd.isna(text) or text == "":
        return ""

    text = str(text)

    try:
        genres_list = ast.literal_eval(text)
    except (ValueError, SyntaxError, TypeError):
        # literal_eval can also raise TypeError (e.g. an unhashable dict key);
        # treat every parse failure the same way and strip list punctuation.
        return text.replace("[", "").replace("]", "").replace("'", "").replace(",", "")

    if isinstance(genres_list, list):
        # str() guards the join against non-string list elements.
        return " ".join(str(genre) for genre in genres_list[:max_genres])

    return text


def _compact_author(name):
    """Return the author name with all spaces removed; "" for missing values."""
    if pd.isna(name):
        return ""
    return str(name).replace(" ", "")


def _leading_words(text):
    """Return the first 50 whitespace-separated words of ``text``, single-spaced."""
    words = str(text).split()
    return " ".join(words[:50])


# Text features derived from the raw columns.
books['genres_clean'] = books['genres'].apply(clean_genres)
books['author_clean'] = books['author'].apply(_compact_author)
books['description_short'] = books['description'].apply(_leading_words)



Fixing Year column...


In [17]:


# Combine the shortened description, cleaned genres and compacted author name
# into one lowercase text field per book.
books['tags'] = (
    books['description_short'] + " " + books['genres_clean'] + " " + books['author_clean']
).str.lower()

# Rows were removed earlier by dropna/drop_duplicates, so the inherited index
# has gaps. Renumber it 0..n-1 so that index labels equal row positions; the
# similarity matrix computed from `tags` is addressed by row position, and a
# label-based lookup on a gapped index would point at the wrong row.
final_df = books[
    ['title', 'description', 'author', 'year', 'average_rating', 'image', 'tags', 'genres_clean']
].reset_index(drop=True)

print("Final Data Ready!")
final_df.head()

Final Data Ready!


Unnamed: 0,title,description,author,year,average_rating,image,tags,genres_clean
0,Harry Potter and the Half-Blood Prince,"It is the middle of the summer, but there is a...",J.K. Rowling,2005,4.58,https://images-na.ssl-images-amazon.com/images...,"it is the middle of the summer, but there is a...",Fantasy Young Adult Fiction
1,Harry Potter and the Order of the Phoenix,Harry Potter is about to start his fifth year ...,J.K. Rowling,2003,4.5,https://images-na.ssl-images-amazon.com/images...,harry potter is about to start his fifth year ...,Young Adult Fiction Magic
2,Harry Potter and the Sorcerer's Stone,Harry Potter has no idea how famous he is. Tha...,J.K. Rowling,1997,4.47,https://images-na.ssl-images-amazon.com/images...,harry potter has no idea how famous he is. tha...,Fantasy Fiction Young Adult
3,Harry Potter and the Prisoner of Azkaban,"Harry Potter, along with his best friends, Ron...",J.K. Rowling,1999,4.58,https://images-na.ssl-images-amazon.com/images...,"harry potter, along with his best friends, ron...",Fantasy Fiction Young Adult
4,Harry Potter and the Goblet of Fire,It is the summer holidays and soon Harry Potte...,J.K. Rowling,2000,4.57,https://images-na.ssl-images-amazon.com/images...,it is the summer holidays and soon harry potte...,Fantasy Young Adult Fiction


In [18]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Bag-of-words counts over the tags text: English stop words are removed and
# the vocabulary is capped at 5000 features.
cv = CountVectorizer(stop_words='english', max_features=5000)
tag_counts = cv.fit_transform(final_df['tags'])

# Dense copy of the count matrix, one row per book in final_df row order.
vectors = tag_counts.toarray()

# Pairwise cosine similarity between every pair of book vectors.
similarity = cosine_similarity(vectors)


In [19]:
import pickle

# Persist the cleaned frame and the similarity matrix. The `with` blocks close
# (and therefore flush) each file as soon as its dump finishes; passing a bare
# open(...) to pickle.dump leaves the handle open until garbage collection.
with open('books.pkl', 'wb') as books_file:
    pickle.dump(final_df, books_file)

with open('similarity.pkl', 'wb') as similarity_file:
    pickle.dump(similarity, similarity_file)
