In [1]:
import pandas as pd

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
ratings_df = pd.read_csv('../data/minified/cleaned_ratings_reduced.csv')
ratings_df.head()

Unnamed: 0,User-ID,Book-ID,Book-Rating
0,87555,39877,0
1,55490,184467,10
2,139819,184467,0
3,150124,184467,0
4,152016,184467,0


In [3]:
# Look at the ratings column of the ratings table

ratings_df['Book-Rating'].value_counts()

Book-Rating
0     357518
8      54946
10     39754
7      38023
9      36922
5      22298
6      17583
4       4140
3       2793
2       1262
1        767
Name: count, dtype: int64

In [4]:
ratings_df["User-ID"].nunique()

68787

We can see that the ratings have been made 1-10

In [5]:
books_df = pd.read_csv('../data/minified/cleaned_books_reduced.csv')
books_df.head()

Unnamed: 0,Book-ID,Book-Title,Book-Author
0,39877,Count of Monte Cristo,A Dumas
1,184467,The Dragon and the Unicorn,A. A. Attanasio
2,231910,When We Were Very Young,A. A. Milne
3,234898,Winnie the Pooh,A. A. Milne
4,191108,The House at Pooh Corner,A. A. Milne


In [6]:
#TODO

# Book title has been duplicated since the same book has been published multiple times

# IMPORTANT: 
# Remove duplicates and replace the title with the most recent publication, 
# replace the ISBN of the ratings table with the ISBN of the most recent publication

# Remove books that have been rated less than 10 times <- Check on this (not sure if its necessary just yet)
# Same with this -> Remove users that have rated less than 10 books
#       By simply looking at the data we can see that there are users that have not rated any books 
#       (close to 50% of the users have not rated any books)

In [7]:
# Combine the books and the ratings dataframes using ISBN
# Keep only the columns Book-Title, Book-Author, Year-Of-Publication, Publisher, User-ID, Book-Rating

combined_df = pd.merge(ratings_df, books_df, on='Book-ID')[['Book-Title', 'Book-Author', 'User-ID', 'Book-Rating']]

combined_df.head()

Unnamed: 0,Book-Title,Book-Author,User-ID,Book-Rating
0,Count of Monte Cristo,A Dumas,87555,0
1,The Dragon and the Unicorn,A. A. Attanasio,55490,10
2,The Dragon and the Unicorn,A. A. Attanasio,139819,0
3,The Dragon and the Unicorn,A. A. Attanasio,150124,0
4,The Dragon and the Unicorn,A. A. Attanasio,152016,0


In [8]:
# The books with the highest 10/10 ratings
combined_df[combined_df['Book-Rating'] >= 10].groupby('Book-Title').size().sort_values(ascending=False).head(10)

Book-Title
The Da Vinci Code                                                   160
Harry Potter and the Sorcerer's Stone (Harry Potter (Paperback))    152
Harry Potter and the Prisoner of Azkaban (Book 3)                   150
The Lovely Bones: A Novel                                           148
Harry Potter and the Chamber of Secrets (Book 2)                    139
Harry Potter and the Goblet of Fire (Book 4)                        136
To Kill a Mockingbird                                               133
Harry Potter and the Order of the Phoenix (Book 5)                  115
The Secret Life of Bees                                             112
The Fellowship of the Ring (The Lord of the Rings, Part 1)          103
dtype: int64

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(ngram_range=(1,2))

tfidf = vectorizer.fit_transform(books_df['Book-Title'])

In [10]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def search(title, result_count=5):
    query_vec = vectorizer.transform([title])
    similarity = cosine_similarity(query_vec, tfidf).flatten()
    indices = np.argpartition(similarity, -1 * result_count)[-1 * result_count:]
    results = books_df.iloc[indices].iloc[::-1]["Book-Title"]
    
    return results

In [11]:
# Test the search function
search('harry potter and the', result_count=10)

7130       Harry Potter and the Sorcerer's Stone (Book 1)
7133         Harry Potter and the Goblet of Fire (Book 4)
7137    Harry Potter and the Sorcerer's Stone (Harry P...
7233    Harry Potter and the Prisoner of Azkaban (Harr...
7139     Harry Potter and the Chamber of Secrets (Book 2)
7141    Harry Potter and the Order of the Phoenix (Boo...
7146    Harry Potter and the Prisoner of Azkaban (Book 3)
7142    Harry Potter and the Sorcerer's Stone (Book 1 ...
7140    Harry Potter and the Chamber of Secrets Postca...
8576                      Harry Potter Und Der Feuerkelch
Name: Book-Title, dtype: object

In [12]:
def get_similar_users(title, rating_threshold=8):
    similar_users = combined_df[combined_df['Book-Title'] == title].where(combined_df['Book-Rating'] >= rating_threshold)["User-ID"].unique()
    # Change the user_id to int
    similar_users = similar_users.astype(int)
    return similar_users

In [13]:
def get_recommendations_from_similar_users(similar_users, rating_lower_bound=8, minimum_similarity=0.03, normalize=False, count=None):
    """
    Get the recommendations for a user based on the similar users

    Inputs:
    similar_users: list of similar users
    minimum_similarity: minimum similarity to consider a movie (0 < minimum_similarity < 1)

    Returnds:
    similar_user_recs: list of recommended movies
    """

    similar_users_recommendations = combined_df[combined_df['User-ID'].isin(similar_users) & (combined_df['Book-Rating'] >= rating_lower_bound
                                                                                            )]["Book-Title"]

    # Get the percentage of similar users who have liked the book
    similar_user_recs = similar_users_recommendations.value_counts() / len(similar_users)

    # Filter only the books which have been liked by more than minimum_similarity of the users
    similar_user_recs = similar_user_recs[similar_user_recs > minimum_similarity]

    if not normalize:
        if count is None:
            return similar_user_recs
        else:
            return similar_user_recs.head(count)


    # Ratings from all users
    all_user_recs = combined_df["Book-Title"].value_counts() / len(combined_df["User-ID"].unique())

    # Combine the similar user recommendations and all user recommendations
    rec_percentages = pd.concat([similar_user_recs, all_user_recs], axis=1)
    rec_percentages.columns = ["similar", "all"]

    # Take only the rows where the similar users have liked the book
    rec_percentages.dropna(inplace=True)


    # Calculate the recommendation score by dividing the percentage of similar users who liked the movie 
    # by the percentage of all users who liked the movie
    rec_percentages["score"] = rec_percentages["similar"] / rec_percentages["all"]

    if count is None:
        return rec_percentages.sort_values("score", ascending=False)
    else:
        return rec_percentages.sort_values("score", ascending=False).head(count)

In [14]:
################################################
# Use the search feature to find the movie title

# title_input = input("Enter title: ")
# search_results = search(title_input, 10)
# print(search_results)


In [15]:
# title = "The Da Vinci Code"
title = "The Great Gatsby"
rating_lower_bound = 8
similar_users = get_similar_users(title, rating_threshold=rating_lower_bound)
print("Number of similar users: ", len(similar_users))

recommendations = get_recommendations_from_similar_users(similar_users, minimum_similarity=0.01, normalize=False)
recommendations

Number of similar users:  79


Book-Title
The Great Gatsby                         0.987342
The Catcher in the Rye                   0.088608
The Joy Luck Club                        0.075949
Bridget Jones: The Edge of Reason        0.075949
The Lovely Bones: A Novel                0.063291
                                           ...   
Dead Aim                                 0.012658
Complicity                               0.012658
The Wasp Factory                         0.012658
Excession                                0.012658
Their Eyes Were Watching God: A Novel    0.012658
Name: count, Length: 1120, dtype: float64

In [16]:
# Calculate the precision@K and recall@K

def calculate_precision_recall_at_k(title, rating_lower_bound=8, k=10):
    similar_users = get_similar_users(title, rating_threshold=rating_lower_bound)
    recommendations = get_recommendations_from_similar_users(similar_users, minimum_similarity=0.01, normalize=False, count=k)

    # Get the books that the user has liked
    liked_books = combined_df[combined_df['User-ID'].isin(similar_users) & (combined_df['Book-Rating'] >= rating_lower_bound)]["Book-Title"].unique()

    # Calculate the precision
    if len(recommendations) == 0:
        precision = 0
    else:
        precision = len(set(recommendations.index).intersection(set(liked_books))) / len(recommendations)

    # Calculate the recall
    if len(liked_books) == 0:
        recall = 0
    else:
        recall = len(set(recommendations.index).intersection(set(liked_books))) / len(liked_books)

    return precision, recall

In [17]:
calculate_precision_recall_at_k(title, rating_lower_bound=8, k=10)

(1.0, 0.008928571428571428)

In [18]:
# Calculate the average precision and recall for a list of titles

def calculate_average_precision_recall_at_k(titles, rating_lower_bound=8, k=10):
    precisions = []
    recalls = []

    for title in titles:
        precision, recall = calculate_precision_recall_at_k(title, rating_lower_bound, k)
        precisions.append(precision)
        recalls.append(recall)

    return sum(precisions) / len(precisions), sum(recalls) / len(recalls)

In [20]:
calculate_average_precision_recall_at_k(books_df["Book-Title"].unique(), rating_lower_bound=8, k=10)

KeyboardInterrupt: 