In [85]:
from surprise import SVD, NMF, Dataset, Reader, accuracy, KNNBaseline
from surprise.model_selection import cross_validate, train_test_split
import pandas as pd
import numpy as np
from main import get_top_n

%matplotlib inline

In [2]:
df = pd.read_csv('ratings.csv')
df.head()

Unnamed: 0,user_id,book_id,rating
0,1,258,5
1,2,4081,4
2,2,260,5
3,2,9296,5
4,2,2318,3


In [3]:
df.shape

(5976479, 3)

In [4]:
df.rating.describe()

count    5.976479e+06
mean     3.919866e+00
std      9.910868e-01
min      1.000000e+00
25%      3.000000e+00
50%      4.000000e+00
75%      5.000000e+00
max      5.000000e+00
Name: rating, dtype: float64

In [5]:
reader = Reader(rating_scale=(1, 5.0))

In [7]:
data = Dataset.load_from_df(df[['user_id', 'book_id', 'rating']], reader)

In [8]:
trainset, testset = train_test_split(data, test_size=0.25)

In [9]:
nmf = NMF().fit(trainset)

In [10]:
predictions = nmf.test(testset)

In [11]:
accuracy.rmse(predictions)

RMSE: 0.8650


0.8650144415784866

In [12]:
predictions[:5]

[Prediction(uid=52653, iid=2, r_ui=5.0, est=4.508873925882307, details={'was_impossible': False}),
 Prediction(uid=38641, iid=658, r_ui=2.0, est=3.482351345738407, details={'was_impossible': False}),
 Prediction(uid=29412, iid=389, r_ui=5.0, est=4.8325899867482045, details={'was_impossible': False}),
 Prediction(uid=17038, iid=9, r_ui=5.0, est=4.124137226916257, details={'was_impossible': False}),
 Prediction(uid=27362, iid=6230, r_ui=4.0, est=3.1133372460228355, details={'was_impossible': False})]

In [27]:
least_rat = df.groupby('book_id').count().sort_values(by='rating',ascending=True).head(1)
most_rat = df.groupby('book_id').count().sort_values(by='rating',ascending=False).head(1)

In [47]:
df_book = pd.read_csv('books.csv').set_index('book_id')

## Least Rated Book

In [48]:
least_rat
least_rat.join(df_book, on='book_id')['title']

book_id
7803    Kindle User's Guide
Name: title, dtype: object

## Most Rated Book

In [49]:
most_rat.join(df_book)['title']

book_id
1    The Hunger Games (The Hunger Games, #1)
Name: title, dtype: object

## Average number of Books Read 

In [50]:
df.groupby('user_id').count().mean()[0]

111.86880428271938

## How many books published between 2000 and 2010

In [57]:
df_book[df_book.original_publication_year.between(2000, 2010, inclusive=True)].shape[0]

3594

## Most Similar books to Great Gatsby

In [58]:
knn = KNNBaseline(sim_options={'name': 'cosine', 'user_based': False})

In [59]:
knn.fit(trainset)

Estimating biases using als...
Computing the cosine similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBaseline at 0x21b315dd278>

In [75]:
id = df_book[df_book.title == 'The Great Gatsby'].index[0]
id

5

In [78]:
sim = knn.get_neighbors(id, 10)
sim

[1, 13, 15, 16, 22, 24, 26, 27, 30, 31]

In [84]:
df_book.iloc[sim]['title']

book_id
2     Harry Potter and the Sorcerer's Stone (Harry P...
14                                          Animal Farm
16     The Girl with the Dragon Tattoo (Millennium, #1)
17                 Catching Fire (The Hunger Games, #2)
23    Harry Potter and the Chamber of Secrets (Harry...
25    Harry Potter and the Deathly Hallows (Harry Po...
27    Harry Potter and the Half-Blood Prince (Harry ...
28                                    Lord of the Flies
31                                             The Help
32                                      Of Mice and Men
Name: title, dtype: object

## Recommended Books for user# 37

In [95]:
top_n = get_top_n(predictions, n=5)

In [106]:
for i, (uid, ur) in enumerate(top_n.items()):
    if i == 37:
        books = [iid for iid, _ in ur]
books

[1203, 608, 8681, 1781, 1402]

In [108]:
df_book.iloc[books]['title']

book_id
1204                          The Invisible Man
609                                        Cell
8682                             Thérèse Raquin
1782    The Dream Thieves (The Raven Cycle, #2)
1403                                      Horns
Name: title, dtype: object