In [1]:
# This is a basic recommendation system. The user will give a single book and the system returns books the user is likely to enjoy based on the similiarity (to the content) of the book that the user provided.

# Method: Content filtering

In [6]:
import pandas as pd
import numpy as np

# NLP stuff.
import string
from rake_nltk import Rake
from nltk.tokenize import wordpunct_tokenize
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [7]:
book_data = pd.read_csv('pages-1-100.csv', sep=',')

In [8]:
# Remove duplicates
# Remove duplicates by common title
book_data.drop_duplicates(subset='title', inplace=True)

# Reset the index
# Rely on index later. 
# If we remove values here, the index will no longer be correct
book_Data = book_data.reset_index()


In [None]:
# Weighted rating and top books
# need an algorithm to weight the rating values
# IMDB's FAW has a weighted rating

# R = average rating (mean)
# v = number of votes/ratings
# m = minimum number of votes to be listed
# C = mean vote across the whole report

In [17]:
C = book_data['avg_rating'].mean()
C

4.0527289955780095

In [10]:
# Starting with 10th percentile
m = book_data['num_ratings'].quantile(0.1)
m

2421.9000000000005

In [12]:
def weighted_rating(book, m, C):
    # Average rating for the book
    R = book['avg_rating']
    # Total number of votes for the book
    v = book['num_ratings']
    # IMDB formula
    return (v / (v+m) * R) + (m / (m+v) * C)

# Calculate the weighted rating for books that are within our threshold
book_data.loc[book_data.num_ratings > m, 'weighted_rating'] = book_data.loc[book_data.num_ratings > m].apply(lambda x: weighted_rating(x, m, C), axis = 1)

# Fill the NaN values (ie. books lower than our threshold) with a zero score
book_data['weighted_rating'].fillna(0, inplace=True)

In [13]:
book_data.sort_values('weighted_rating', ascending = False).head(5)

Unnamed: 0,title,original_title,series,language,authors,avg_rating,num_ratings,num_reviews,genres,description,url,weighted_rating
1539,The Complete Calvin and Hobbes,The Complete Calvin and Hobbes,Calvin and Hobbes,English,Bill Watterson,4.82,33322,961,"Sequential Art,Comics,Humor,Sequential Art,Gra...",[ Box Set | Book One | Book Two | Book Three...,https://www.goodreads.com/book/show/24812.The_...,4.768012
988,Words of Radiance,Words of Radiance,The Stormlight Archive,English,Brandon Sanderson,4.76,172432,10541,"Fantasy,Fiction,Fantasy,Epic Fantasy,Fantasy,H...",From #1 New York Times bestselling author Bran...,https://www.goodreads.com/book/show/17332218-w...,4.750204
6538,"Harry Potter Boxed Set, Books 1-5 (Harry Potte...",,,English,"J.K. Rowling,Mary GrandPré (Illustrator)",4.78,39132,162,"Fantasy,Young Adult,Fiction,Fantasy,Magic",Box Set containing Harry Potter and the Sorcer...,https://www.goodreads.com/book/show/8.Harry_Po...,4.737612
1481,Harry Potter Series Box Set,,Harry Potter,English,J.K. Rowling,4.74,234260,7065,"Fantasy,Young Adult,Fiction","Over 4000 pages of Harry Potter and his world,...",https://www.goodreads.com/book/show/862041.Har...,4.732967
5455,It's a Magical World,It's a Magical World,Calvin and Hobbes,English,Bill Watterson,4.76,25119,334,"Sequential Art,Comics,Humor,Fiction,Sequential...",When cartoonist Bill Watterson announced that ...,https://www.goodreads.com/book/show/24814.It_s...,4.697804


In [14]:
book_data.sort_values('weighted_rating', ascending = False).tail(5)

Unnamed: 0,title,original_title,series,language,authors,avg_rating,num_ratings,num_reviews,genres,description,url,weighted_rating
6540,Awakening Inner Guru,,,English,"Banani Ray,Amit Ray",4.78,104,24,"Spirituality,Inspirational,Self Help",Awakening Inner Guru is a clear and straightfo...,https://www.goodreads.com/book/show/8596181-aw...,0.0
6534,30 Pieces of Gold: Self Growth - How to use In...,,,English,"Ron Millicent,Millie Parker (Editor)",4.31,128,1,"Novels,Inspirational,Contemporary,Adult,Self H...",Inspirational Quotes – Hah - Do They Really Wo...,https://www.goodreads.com/book/show/27467291-3...,0.0
6520,The Pace,The Pace,The Pace,English,Shelena Shorts,3.7,1409,258,"Young Adult,Fantasy,Romance,Fantasy,Paranormal...",Weston Wilson is not immortal and he is of thi...,https://www.goodreads.com/book/show/6599113-th...,0.0
6511,A Midnight Clear,A Midnight Clear,,English,William Wharton,4.18,1391,66,"Fiction,Historical,Historical Fiction,War,War,...",Set in the Ardennes Forest on Christmas Eve 19...,https://www.goodreads.com/book/show/720234.A_M...,0.0
4890,Death of the Body,,Crossing Death,English,Rick Chiantaretto,3.82,217,74,"Fantasy,Fantasy,Paranormal,Fantasy,Urban Fanta...",I grew up in a world of magic. By the time I w...,https://www.goodreads.com/book/show/18624197-d...,0.0


In [20]:
# A little cleanup
del C
del m

NameError: name 'C' is not defined

In [21]:
# Content -Based Reccommender System
# Crete an amalgam of features per book that will be used to calculate the similarity story between books
# Values: title, series, language, author(s), genres, keywords from book's descriptions

# Add weight by:
#   - mentioning the words multiple times in the vector that we use to calculate similarity

# Problems:
#   - genres and languages can overlap (English vs. English) 
#   - Processing is a little trivial without much testing yet
#   - all authors are included blindly; could be filtered based on role