In [3]:
!pip install scikit-surprise
!pip install matplotlib

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [5]:
## import libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline

In [9]:
## Load the two source tables from CSV: the user ratings and the book metadata.
## NOTE(review): the previews below show an 'Unnamed: 0' column, which suggests the CSVs were
## saved together with their index; index_col=0 would likely drop it — TODO confirm.
rate_data = pd.read_csv('/content/Books data/ratings.csv')
meta_data = pd.read_csv('/content/Books data/books.csv')

In [10]:
## Preview the first rows of the ratings table
rate_data.head()

Unnamed: 0,book_id,user_id,rating
0,1,314,5
1,1,439,3
2,1,588,5
3,1,1169,4
4,1,1185,4


In [11]:
## Preview the first rows of the book metadata table
meta_data.head()

Unnamed: 0,id,book_id,best_book_id,work_id,books_count,isbn,isbn13,authors,original_publication_year,original_title,...,ratings_count,work_ratings_count,work_text_reviews_count,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5,image_url,small_image_url
0,1,2767052,2767052,2792775,272,439023483,9780439000000.0,Suzanne Collins,2008.0,The Hunger Games,...,4780653,4942365,155254,66715,127936,560092,1481305,2706317,https://images.gr-assets.com/books/1447303603m...,https://images.gr-assets.com/books/1447303603s...
1,2,3,3,4640799,491,439554934,9780440000000.0,"J.K. Rowling, Mary GrandPré",1997.0,Harry Potter and the Philosopher's Stone,...,4602479,4800065,75867,75504,101676,455024,1156318,3011543,https://images.gr-assets.com/books/1474154022m...,https://images.gr-assets.com/books/1474154022s...
2,3,41865,41865,3212258,226,316015849,9780316000000.0,Stephenie Meyer,2005.0,Twilight,...,3866839,3916824,95009,456191,436802,793319,875073,1355439,https://images.gr-assets.com/books/1361039443m...,https://images.gr-assets.com/books/1361039443s...
3,4,2657,2657,3275794,487,61120081,9780061000000.0,Harper Lee,1960.0,To Kill a Mockingbird,...,3198671,3340896,72586,60427,117415,446835,1001952,1714267,https://images.gr-assets.com/books/1361975680m...,https://images.gr-assets.com/books/1361975680s...
4,5,4671,4671,245494,1356,743273567,9780743000000.0,F. Scott Fitzgerald,1925.0,The Great Gatsby,...,2683664,2773745,51992,86236,197621,606158,936012,947718,https://images.gr-assets.com/books/1490528560m...,https://images.gr-assets.com/books/1490528560s...


In [12]:
## Wrap the ratings in the Dataset structure that surprise trains on.
from surprise import Dataset,Reader

reader = Reader(rating_scale=(1,5)) ## Ratings are read on a scale from 1 to 5
## The columns are handed over in user, item, rating order
data_set = Dataset.load_from_df(rate_data[['user_id','book_id','rating']],reader)

In [13]:
## Train an SVD model and estimate its error with 3-fold cross validation.
## Both the fold split and the SVD initialisation are seeded so that a re-run
## reproduces the same scores.
## NOTE(review): afterwards svd seems to keep the fit of the last fold only (the predict
## cell further down works without the full-data fit having been run) — confirm.
from surprise import SVD
from surprise.model_selection import KFold,cross_validate
svd = SVD(verbose=True,n_epochs=10,random_state=1)
cross_validate(svd,data_set,measures=['RMSE','MAE'],cv=KFold(n_splits=3,random_state=1),verbose=True)

Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5
Processing epoch 6
Processing epoch 7
Processing epoch 8
Processing epoch 9
Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5
Processing epoch 6
Processing epoch 7
Processing epoch 8
Processing epoch 9
Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5
Processing epoch 6
Processing epoch 7
Processing epoch 8
Processing epoch 9
Evaluating RMSE, MAE of algorithm SVD on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    0.8550  0.8576  0.8563  0.8563  0.0011  
MAE (testset)     0.6749  0.6768  0.6747  0.6755  0.0009  
Fit time          25.15   34.33   28.71   29.40   3.78    
Test time         4.76    6.90    4.69    5.45    1.02    


{'test_rmse': array([0.854976  , 0.85757275, 0.85630091]),
 'test_mae': array([0.6748777 , 0.67681023, 0.67473304]),
 'fit_time': (25.150593996047974, 34.33075499534607, 28.71449875831604),
 'test_time': (4.763572692871094, 6.896812677383423, 4.693897485733032)}

In [None]:
## Optional: fit the model on the entire data set instead of a cross-validation split.
## This consumes a lot of computing power and takes long, so only run it if you have
## enough of both to spare.
train_set = data_set.build_full_trainset()
svd.fit(trainset=train_set)

In [14]:
## Generate a rating prediction for one user/book pair (user 10, book 100)
svd.predict(uid=10,iid=100)

Prediction(uid=10, iid=100, r_ui=None, est=3.9205292039443203, details={'was_impossible': False})

In [17]:
## Now lets get to the fun part, Lets implement our utility
import difflib
import random
def fetch_book_id(book_title,meta_data):
  """Return the ``id`` of the book whose title best matches ``book_title``.

  An exact title match is used when one exists; otherwise the closest title
  according to ``difflib.get_close_matches`` is used. When several rows share
  the chosen title, the id of the first such row is returned.

  Args:
    book_title: Title (or approximate title) to look up.
    meta_data: DataFrame with at least ``title`` and ``id`` columns.

  Returns:
    The value of the ``id`` column for the matched row.

  Raises:
    IndexError: If no title is close enough to ``book_title``.
  """
  titles = meta_data['title']
  exact_ids = meta_data.loc[titles == book_title,'id']
  if not exact_ids.empty:
    ## An identical title is always the best match, so skip the costly fuzzy scan
    return exact_ids.values[0]
  ## Missing titles cannot be compared by difflib, so they are left out
  close_titles = difflib.get_close_matches(book_title,titles.dropna().tolist(),n=1)
  if not close_titles:
    ## Same exception type as the former bare close_titles[0] lookup, but with a usable message
    raise IndexError('No book title close to {!r} was found'.format(book_title))
  return meta_data.loc[titles == close_titles[0],'id'].values[0]

def fetch_book_info(book_id,meta_data):
  """Look up the basic details of a book.

  Selects the rows of ``meta_data`` whose ``id`` equals ``book_id`` and returns
  their ``id``, ``isbn``, ``authors``, ``title`` and ``original_title`` values
  as a list of dicts, one per matching row (empty when nothing matches).
  """
  info_columns = ['id','isbn','authors','title','original_title']
  is_requested_book = meta_data['id'] == book_id
  return meta_data.loc[is_requested_book,info_columns].to_dict(orient='records')


def predict_review(user_id,book_title,model,meta_data):
  """Estimate the rating (1-5 in this notebook) a user would give to a book.

  The title is resolved to a book id with ``fetch_book_id`` and the ``est``
  attribute of ``model.predict`` for that user/book pair is returned.
  """
  matched_id = fetch_book_id(book_title,meta_data)
  return model.predict(uid=user_id,iid=matched_id).est

def gen_recommendation(user_id,model,meta_data,thresh=4):
  """Recommend one randomly chosen book that the user is predicted to rate highly.

  Books are visited in random order and the first one whose predicted rating
  is at least ``thresh`` is returned.

  Args:
    user_id: Id of the user to recommend for.
    model: Model exposing ``predict(uid=..., iid=...)`` whose result has an
      ``est`` attribute.
    meta_data: DataFrame of book metadata with an ``id`` column, plus the
      columns read by ``fetch_book_info``.
    thresh: Minimum predicted rating for a book to be recommended.

  Returns:
    The ``fetch_book_info`` records of the recommended book, or ``None`` when
    no book reaches ``thresh``.
  """
  ## Work on the ids directly. Resolving every candidate's own title back to an id
  ## through fetch_book_id fuzzy-matched it against all titles twice per candidate,
  ## and made books that share a title always resolve to the first of them.
  book_ids = list(meta_data['id'].values)
  random.shuffle(book_ids)
  for book_id in book_ids:
    rating = model.predict(uid=user_id,iid=book_id).est
    if rating >= thresh:
      return fetch_book_info(book_id,meta_data)
  return None

In [19]:
## Let's test our recommender
gen_recommendation(1200,svd,meta_data) ## The first argument is the user id; change it to get different results

[{'id': 740,
  'isbn': '60529962',
  'authors': 'Laura Ingalls Wilder, Garth Williams',
  'title': 'The Little House Collection (Little House, #1-9)',
  'original_title': 'The Little House Collection'}]

In [21]:
## Project the learned book factors to 2-D with t-SNE so that similarities between books can be seen
from sklearn.manifold import TSNE
## NOTE(review): newer scikit-learn releases rename n_iter to max_iter — confirm against the installed version
tsne = TSNE(n_components=2,n_iter=500,verbose=3,random_state=1)
books_embd = tsne.fit_transform(svd.qi)
proj = pd.DataFrame(columns=['x-axis','y-axis'],data=books_embd)
## Rows of svd.qi follow surprise's inner item ids, not the row order of meta_data,
## so translate every inner id back to its raw book id before looking up the title.
title_by_id = dict(zip(meta_data['id'],meta_data['original_title']))
fitted_trainset = svd.trainset
proj['title'] = [title_by_id.get(fitted_trainset.to_raw_iid(inner_id)) for inner_id in range(fitted_trainset.n_items)]



[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 10000 samples in 0.003s...
[t-SNE] Computed neighbors for 10000 samples in 2.130s...
[t-SNE] Computed conditional probabilities for sample 1000 / 10000
[t-SNE] Computed conditional probabilities for sample 2000 / 10000
[t-SNE] Computed conditional probabilities for sample 3000 / 10000
[t-SNE] Computed conditional probabilities for sample 4000 / 10000
[t-SNE] Computed conditional probabilities for sample 5000 / 10000
[t-SNE] Computed conditional probabilities for sample 6000 / 10000
[t-SNE] Computed conditional probabilities for sample 7000 / 10000
[t-SNE] Computed conditional probabilities for sample 8000 / 10000
[t-SNE] Computed conditional probabilities for sample 9000 / 10000
[t-SNE] Computed conditional probabilities for sample 10000 / 10000
[t-SNE] Mean sigma: 0.233225
[t-SNE] Computed conditional probabilities in 0.512s
[t-SNE] Iteration 50: error = 95.3972168, gradient norm = 0.0711324 (50 iterations in 9.087s)
[t-SNE] It

In [22]:
## Interactive scatter of the 2-D book embedding; hovering over a point shows the book's title
import plotly.express as px
figure = px.scatter(proj,x='x-axis',y='y-axis',hover_name='title',title='t-SNE projection of the SVD book factors')
figure.show()