In [274]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

df = pd.read_csv("data.csv")

In [275]:
df['categories'] = df['categories'].str.replace(',', ' ')

In [276]:
features = ['authors','categories', 'description']
average_rating = ['average_rating']

In [277]:
def combine_features(row):
    return row['authors']+" "+row['categories']

In [278]:
df['average_rating'].replace('', float('nan'), inplace=True)

# Drop rows with NaN values in 'average_rating' or handle as needed
df.dropna(subset=['average_rating'], inplace=True)

# Convert average_rating to float
df['average_rating'] = (df['average_rating']).astype(float)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['average_rating'].replace('', float('nan'), inplace=True)


In [279]:
for feature in features:
    df[feature] = df[feature].fillna('') #filling all NaNs with blank string
for rating in features:
    df[average_rating] = df[average_rating].fillna('') #filling all NaNs with blank string
df["combined_features"] = df.apply(combine_features,axis=1) 
df["average_rating"] = df.apply(average_rating,axis=1)

In [280]:
cv1 = CountVectorizer() #creating new CountVectorizer() object
count_matrix1 = cv1.fit_transform(df["combined_features"])
cv2 = StandardScaler() #creating new CountVectorizer() object
df['average_rating_scaled'] = cv2.fit_transform(df[['average_rating']])

In [281]:
df.head()

Unnamed: 0,isbn13,isbn10,title,subtitle,authors,categories,thumbnail,description,published_year,average_rating,num_pages,ratings_count,combined_features,average_rating_scaled
0,9780002005883,2005883,Gilead,,Marilynne Robinson,Fiction,http://books.google.com/books/content?id=KQZCP...,A NOVEL THAT READERS and critics have been eag...,2004.0,3.85,247.0,361.0,Marilynne Robinson Fiction,-0.251364
1,9780002261982,2261987,Spider's Web,A Novel,Charles Osborne;Agatha Christie,Detective and mystery stories,http://books.google.com/books/content?id=gA5GP...,A new 'Christie for Christmas' -- a full-lengt...,2000.0,3.83,241.0,5164.0,Charles Osborne;Agatha Christie Detective and ...,-0.311727
2,9780006163831,6163831,The One Tree,,Stephen R. Donaldson,American fiction,http://books.google.com/books/content?id=OmQaw...,Volume Two of Stephen Donaldson's acclaimed se...,1982.0,3.97,479.0,172.0,Stephen R. Donaldson American fiction,0.110816
3,9780006178736,6178731,Rage of angels,,Sidney Sheldon,Fiction,http://books.google.com/books/content?id=FKo2T...,"A memorable, mesmerizing heroine Jennifer -- b...",1993.0,3.93,512.0,29532.0,Sidney Sheldon Fiction,-0.00991
4,9780006280897,6280897,The Four Loves,,Clive Staples Lewis,Christian life,http://books.google.com/books/content?id=XhQ5X...,Lewis' work on the nature of love divides love...,2002.0,4.15,170.0,33684.0,Clive Staples Lewis Christian life,0.654086


In [282]:
cosine_sim1 = cosine_similarity(count_matrix1)
total_rating = abs(cosine_sim1[book_index]+df['average_rating_scaled'])/2
#cosine_sim2 = cosine_similarity(count_matrix2)

In [283]:
def get_title_from_index(index):
    return df[df.index == index]["title"].values[0]
def get_index_from_title(title):
    return df[df.title == title]["index"].values[0]
def get_rating_from_index(index):
    return df[df.index == index]["average_rating"].values[0]

In [284]:
# Function to get the index of the book title
def get_index_from_title(title):
    return df[df.title == title].index[0]

# Prompt the user to enter a book they like
book_user_likes = input("Enter a book you like: ")
#book_user_likes = "The One Tree"

# Get the index of the book in the dataset
book_index = get_index_from_title(book_user_likes)

# Calculate the cosine similarity scores for the input book
similar_books = list(enumerate(abs(total_rating)))
#df['average_rating'] = total_rating

Enter a book you like:  Murder in LaMut


In [285]:
sorted_similar_books = sorted(similar_books,key=lambda x:x[1],reverse=True)[1:11]


In [290]:
print("Unsorted similar_books:", similar_books[2])
print("Sorted similar_books:", sorted_similar_books[2])


Unsorted similar_books: (2, 0.05540810792901152)
Sorted similar_books: (4453, 5.935649870948058)


In [297]:
i = 0
print(f"Top 10 similar books to '{book_user_likes}' are:\n")
print("Title" + " " * 26 + "Rating")
print("-" * 60)

for element in sorted_similar_books:
    # element[0] is the index of the book, element[1] is the total_rating
    title = get_title_from_index(element[0])
    rating = get_rating_from_index(element[0])  # Rating is already in `element[1]`
    print(f"{title:<30} {rating:.2f}")  # Adjust formatting as needed
    i += 1
    if i >= 10:
        break

Top 10 similar books to 'Murder in LaMut' are:

Title                          Rating
------------------------------------------------------------
Fire on the Mountain (Hb)      4.27
Fear and Loathing in Las Vegas 4.08
Romeo and Juliet               3.74
The Fourth Book of Lost Swords 3.83
The Gangs of New York          3.59
Out of Place                   3.96
The Scandal of Ulysses         3.79
Timothy McSweeney's            3.73
Under The Influence            3.64
Cry, the Beloved Country       3.89
