In [1]:
import pandas as pd

In [2]:
ratings_df = pd.read_csv('data/Ratings.csv')
ratings_df.head()

Unnamed: 0,User-ID,ISBN,Book-Rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276729,052165615X,3
4,276729,0521795028,6


In [8]:
# Look at the ratings column of the ratings table

ratings_df['Book-Rating'].value_counts()

Book-Rating
0     716109
8     103736
10     78610
7      76457
9      67541
5      50974
6      36924
4       8904
3       5996
2       2759
1       1770
Name: count, dtype: int64

In [11]:
ratings_df["User-ID"].nunique()

105283

We can see that the ratings have been made 1-10

In [4]:
books_df = pd.read_csv('data/Books.csv')
books_df.head()

  books_df = pd.read_csv('data/Books.csv')


Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...
4,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...


In [None]:
#TODO

# Book title has been duplicated since the same book has been published multiple times

# IMPORTANT: 
# Remove duplicates and replace the title with the most recent publication, 
# replace the ISBN of the ratings table with the ISBN of the most recent publication

# Remove books that have been rated less than 10 times <- Check on this (not sure if its necessary just yet)
# Same with this -> Remove users that have rated less than 10 books
#       By simply looking at the data we can see that there are users that have not rated any books 
#       (close to 50% of the users have not rated any books)

In [5]:
# Combine the books and the ratings dataframes using ISBN
# Keep only the columns Book-Title, Book-Author, Year-Of-Publication, Publisher, User-ID, Book-Rating

combined_df = pd.merge(ratings_df, books_df, on='ISBN')[['Book-Title', 'Book-Author', 'Year-Of-Publication', 'Publisher', 'User-ID', 'Book-Rating']]

combined_df.head()

Unnamed: 0,Book-Title,Book-Author,Year-Of-Publication,Publisher,User-ID,Book-Rating
0,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,276725,0
1,Rites of Passage,Judith Rae,2001,Heinle,276726,5
2,The Notebook,Nicholas Sparks,1996,Warner Books,276727,0
3,Help!: Level 1,Philip Prowse,1999,Cambridge University Press,276729,3
4,The Amsterdam Connection : Level 4 (Cambridge ...,Sue Leather,2001,Cambridge University Press,276729,6


In [14]:
# The books with the highest 10/10 ratings
combined_df[combined_df['Book-Rating'] >= 10].groupby('Book-Title').size().sort_values(ascending=False).head(10)

Book-Title
The Da Vinci Code                                                   160
Harry Potter and the Sorcerer's Stone (Harry Potter (Paperback))    152
Harry Potter and the Prisoner of Azkaban (Book 3)                   150
The Lovely Bones: A Novel                                           148
Harry Potter and the Chamber of Secrets (Book 2)                    139
Harry Potter and the Goblet of Fire (Book 4)                        136
To Kill a Mockingbird                                               133
Harry Potter and the Order of the Phoenix (Book 5)                  115
The Secret Life of Bees                                             112
The Fellowship of the Ring (The Lord of the Rings, Part 1)          103
dtype: int64

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(ngram_range=(1,2))

tfidf = vectorizer.fit_transform(books_df['Book-Title'])

In [6]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def search(title, result_count=5):
    query_vec = vectorizer.transform([title])
    similarity = cosine_similarity(query_vec, tfidf).flatten()
    indices = np.argpartition(similarity, -1 * result_count)[-1 * result_count:]
    results = books_df.iloc[indices].iloc[::-1]["Book-Title"]
    
    return results

In [17]:
# Test the search function
search('harry potter and the', result_count=10)

140443    Harry Potter and the Prisoner of Azkaban (Harr...
163282               Harry Potter and the  Sorcerer's Stone
72186     Harry Potter and the Chamber of Secrets (Harry...
209283    Harry Potter and the Sorcerer's Stone (Harry P...
35086     Harry Potter and the Prisoner of Azkaban (Harr...
77384     Harry Potter and the Sorcerer's Stone (Harry P...
257263    Harry Potter and the Goblet of Fire (Harry Pot...
259611                Harry Potter and the Sorcerer's Stone
2143      Harry Potter and the Sorcerer's Stone (Harry P...
234605                Harry Potter and the Sorcerer's Stone
Name: Book-Title, dtype: object

In [109]:
def get_similar_users(title, rating_threshold=8):
    similar_users = combined_df[combined_df['Book-Title'] == title].where(combined_df['Book-Rating'] >= rating_threshold)["User-ID"].unique()
    # Change the user_id to int
    similar_users = similar_users.astype(int)
    return similar_users

In [153]:
def get_recommendations_from_similar_users(similar_users, rating_lower_bound=8, minimum_similarity=0.03, normalize=False, count=None):
    """
    Get the recommendations for a user based on the similar users

    Inputs:
    similar_users: list of similar users
    minimum_similarity: minimum similarity to consider a movie (0 < minimum_similarity < 1)

    Returns:
    similar_user_recs: list of recommended movies
    """

    similar_users_recommendations = combined_df[combined_df['User-ID'].isin(similar_users) & (combined_df['Book-Rating'] >= rating_lower_bound
                                                                                            )]["Book-Title"]

    # Get the percentage of similar users who have liked the book
    similar_user_recs = similar_users_recommendations.value_counts() / len(similar_users)

    # Filter only the books which have been liked by more than minimum_similarity of the users
    similar_user_recs = similar_user_recs[similar_user_recs > minimum_similarity]

    if not normalize:
        if count is None:
            return similar_user_recs
        else:
            return similar_user_recs.head(count)


    # Ratings from all users
    all_user_recs = combined_df["Book-Title"].value_counts() / len(combined_df["User-ID"].unique())

    # Combine the similar user recommendations and all user recommendations
    rec_percentages = pd.concat([similar_user_recs, all_user_recs], axis=1)
    rec_percentages.columns = ["similar", "all"]

    # Take only the rows where the similar users have liked the book
    rec_percentages.dropna(inplace=True)


    # Calculate the recommendation score by dividing the percentage of similar users who liked the movie 
    # by the percentage of all users who liked the movie
    rec_percentages["score"] = rec_percentages["similar"] / rec_percentages["all"]

    if count is None:
        return rec_percentages.sort_values("score", ascending=False)
    else:
        return rec_percentages.sort_values("score", ascending=False).head(count)
        


    

In [155]:
################################################
# Use the search feature to find the movie title

title_input = input("Enter title: ")
search_results = search(title_input, 10)
print(search_results)


165320                                    The Da Vinci Code
748                                       The Da Vinci Code
252066                                    The Da Vinci Code
2586                                      The Da Vinci Code
71604                                     The Da Vinci Code
258666                                    The Da Vinci Code
4064                 El Codigo Da Vinci / The Da Vinci Code
234149    The Da Vinci Hoax: Exposing the Errors in The ...
27158          The Da Vinci Code (Random House Large Print)
218654                                    Leonardo Da Vinci
Name: Book-Title, dtype: object


In [156]:
title = "The Da Vinci Code"
rating_lower_bound = 8
similar_users = get_similar_users(title, rating_threshold=rating_lower_bound)
print("Number of similar users: ", len(similar_users))

recommendations = get_recommendations_from_similar_users(similar_users, minimum_similarity=0.01, normalize=False)
recommendations

Number of similar users:  381


  similar_users = similar_users.astype(int)


Book-Title
The Da Vinci Code                                                    0.997375
Angels &amp; Demons                                                  0.112861
The Lovely Bones: A Novel                                            0.091864
The Secret Life of Bees                                              0.073491
To Kill a Mockingbird                                                0.047244
                                                                       ...   
Carrie                                                               0.010499
Basket Case                                                          0.010499
Love You Forever                                                     0.010499
Left Behind: A Novel of the Earth's Last Days (Left Behind No. 1)    0.010499
Nicolae: The Rise of Antichrist (Left Behind No. 3)                  0.010499
Name: count, Length: 393, dtype: float64