## Book recommendation system, specifically for Mystery, Crime and Thrillers

In [2]:
# import necessary packages to interpret data
import gzip
import json
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


## Loading the Dataset

The datasets have been downloaded from; https://mengtingwan.github.io/data/goodreads. There are 219,235 books, 1,849,236 detailed reviews and 24,799,896 interactions, but for the purpose of this system we will sample 100k.

In [2]:
# looks at the format of the data, so we can load necessary columns
with gzip.open('./data/goodreads_books_mystery_thriller_crime.json.gz') as f:
  line = f.readline()
json.loads(line)  

{'isbn': '184737297X',
 'text_reviews_count': '15',
 'series': ['169353'],
 'country_code': 'US',
 'language_code': '',
 'popular_shelves': [{'count': '159', 'name': 'to-read'},
  {'count': '12', 'name': 'historical-fiction'},
  {'count': '11', 'name': 'mystery'},
  {'count': '10', 'name': 'historical-mystery'},
  {'count': '7', 'name': 'medieval'},
  {'count': '6', 'name': 'historical'},
  {'count': '5', 'name': 'crime'},
  {'count': '3', 'name': 'series'},
  {'count': '3', 'name': 'ebook'},
  {'count': '3', 'name': 'fiction'},
  {'count': '3', 'name': 'crowner-john'},
  {'count': '2', 'name': 'default'},
  {'count': '2', 'name': 'books'},
  {'count': '2', 'name': 'owned'},
  {'count': '2', 'name': 'mystery-thrillers'},
  {'count': '2', 'name': 'c'},
  {'count': '2', 'name': 'library'},
  {'count': '2', 'name': 'wish-list'},
  {'count': '2', 'name': 'england'},
  {'count': '2', 'name': 'medieval-england'},
  {'count': '2', 'name': 'mystery-historical'},
  {'count': '2', 'name': 'audio

In [3]:
with gzip.open('./data/goodreads_reviews_mystery_thriller_crime.json.gz') as f:
  line = f.readline()
json.loads(line)  

{'user_id': '8842281e1d1347389f2ab93d60773d4d',
 'book_id': '6392944',
 'review_id': '5e212a62bced17b4dbe41150e5bb9037',
 'rating': 3,
 'review_text': "I haven't read a fun mystery book in a while and not sure I've ever read Poirot. Was looking for a fun read set in France while I was on holiday there and this didn't disappoint! Fast paced and good mystery. \n One that struck me was how similar Poirot is to Sherlock. They are both detectives, have a ex-military sidekick who is telling the story, and solve mysteries using their superior wit. Poirot seems like a French Sherlock. I'm curious if he was inspired by Sherlock.",
 'date_added': 'Mon Jul 24 02:48:17 -0700 2017',
 'date_updated': 'Sun Jul 30 09:28:03 -0700 2017',
 'read_at': 'Tue Jul 25 00:00:00 -0700 2017',
 'started_at': 'Mon Jul 24 00:00:00 -0700 2017',
 'n_votes': 6,
 'n_comments': 0}

In [4]:
with gzip.open('./data/goodreads_interactions_mystery_thriller_crime.json.gz') as f:
  line = f.readline()
json.loads(line)  

{'user_id': '8842281e1d1347389f2ab93d60773d4d',
 'book_id': '6392944',
 'review_id': '5e212a62bced17b4dbe41150e5bb9037',
 'is_read': True,
 'rating': 3,
 'review_text_incomplete': "I haven't read a fun mystery book in a while and not sure I've ever read Poirot. Was looking for a fun read set in France while I was on holiday there and this didn't disappoint! Fast paced and good mystery.<br /><br />One that struck me was how similar Poirot is to Sherlock. They are both detectives, have a...",
 'date_added': 'Mon Jul 24 02:48:17 -0700 2017',
 'date_updated': 'Sun Jul 30 09:28:03 -0700 2017',
 'read_at': 'Tue Jul 25 00:00:00 -0700 2017',
 'started_at': 'Mon Jul 24 00:00:00 -0700 2017'}

## Parsers

The code for the parsers and loading the data is based on this github repo: https://github.com/amitmldlai/Book-Recommendation-System/blob/main/Notebook/Book-Recommendation.ipynb, as I found it was the quickest way to do this due to the size of the various files

In [118]:
def user_parser(line):
  data = {
      'book_id': line['book_id'],
      'user_id': line['user_id'],
      'review_text': line['review_text'],
      'review_id' : line['review_id'],
      'n_votes': line['n_votes'],
      'user_rating': line['rating']
  } 
  return data

def book_parser(line):
  data = {
      'book_id': line['book_id'],
      'title_without_series': line['title_without_series'],
      'book_description': line['description'],
      'publication_year': line['publication_year'],
      'publisher': line['publisher'],
      'ratings_count': line['ratings_count'],
      'book_average_rating': line['average_rating'],
      'cover_page': line['image_url'],
      'book_url': line['url'],
      'is_ebook': line['is_ebook'],
      'num_pages': line['num_pages'],
      'country_code': line['country_code'],
      'language_code': line['language_code'],
  }
  return data


### Books

In [119]:
# 35s to convert
books = list()
with gzip.open('./data/goodreads_books_mystery_thriller_crime.json.gz', 'r') as f:
  while True:
    line = f.readline()
    if not line:
      break
    book = book_parser(json.loads(line))
    try:
      # makes sure the book has sufficient ratings, so will discount a lot of the low rating books
      # as this could create unessecary bias within the system
      if int(book['ratings_count'])>25 and int(book['ratings_count'])<1000:
        books.append(book)
    except Exception:
      continue    

df_books = pd.DataFrame.from_dict(books)
df_books.to_csv('df_books.csv.gz', index=False, compression='gzip') 

In [4]:
books = pd.read_csv('df_books.csv.gz')
books.head()

Unnamed: 0,book_id,title_without_series,book_description,publication_year,publisher,ratings_count,book_average_rating,cover_page,book_url,is_ebook,num_pages,country_code,language_code
0,6066814,"Crowner Royal (Crowner John Mystery, #13)","London, 1196. At the command of Richard the Li...",2009.0,Simon & Schuster UK,186,3.93,https://images.gr-assets.com/books/1328724803m...,https://www.goodreads.com/book/show/6066814-cr...,False,400.0,US,
1,33394837,The House of Memory (Pluto's Snitch #2),,,,269,4.33,https://images.gr-assets.com/books/1493114742m...,https://www.goodreads.com/book/show/33394837-t...,True,318.0,US,eng
2,29074697,The Slaughtered Virgin of Zenopolis (Inspector...,"BATHS, BANKS AND ROMAN INSURRECTION\nDetective...",,Amazon Digital Services,192,3.49,https://s.gr-assets.com/assets/nophoto/book/11...,https://www.goodreads.com/book/show/29074697-t...,True,,US,eng
3,1902202,"Dead in the Morning (Patrick Grant, #1)","Gerald breezily introduced his wife, Helen, to...",1975.0,Ulverscroft,52,3.3,https://s.gr-assets.com/assets/nophoto/book/11...,https://www.goodreads.com/book/show/1902202.De...,False,,US,
4,2805495,Wycliffe and the Cycle of Death,A respectable bookseller is found bludgeoned a...,2001.0,Orion,58,3.61,https://images.gr-assets.com/books/1328819096m...,https://www.goodreads.com/book/show/2805495-wy...,False,320.0,US,


In [5]:
books.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 111654 entries, 0 to 111653
Data columns (total 13 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   book_id               111654 non-null  int64  
 1   title_without_series  111654 non-null  object 
 2   book_description      100901 non-null  object 
 3   publication_year      81189 non-null   float64
 4   publisher             79129 non-null   object 
 5   ratings_count         111654 non-null  int64  
 6   book_average_rating   111654 non-null  float64
 7   cover_page            111654 non-null  object 
 8   book_url              111654 non-null  object 
 9   is_ebook              111654 non-null  bool   
 10  num_pages             78226 non-null   float64
 11  country_code          111654 non-null  object 
 12  language_code         71690 non-null   object 
dtypes: bool(1), float64(3), int64(2), object(7)
memory usage: 10.3+ MB


### Users

In [9]:
# 3 mins to convert

users = list()
with gzip.open('./data/goodreads_reviews_mystery_thriller_crime.json.gz', 'r') as f:
  while True:
    line = f.readline()
    if not line:
      break
    user = user_parser(json.loads(line))
    users.append(user)

df_users = pd.DataFrame.from_dict(users)
df_users.to_csv('df_users.csv.gz', index=False, compression='gzip')

In [6]:
users = pd.read_csv('df_users.csv.gz')
users.head()

Unnamed: 0,book_id,user_id,review_text,review_id,n_votes,user_rating
0,6392944,8842281e1d1347389f2ab93d60773d4d,I haven't read a fun mystery book in a while a...,5e212a62bced17b4dbe41150e5bb9037,6,3
1,28684704,8842281e1d1347389f2ab93d60773d4d,"A fun, fast paced science fiction thriller. I ...",2ede853b14dc4583f96cf5d120af636f,22,3
2,32283133,8842281e1d1347389f2ab93d60773d4d,http://www.telegraph.co.uk/culture/10...,8e4d61801907e591018bdc3442a9cf2b,9,0
3,17860739,8842281e1d1347389f2ab93d60773d4d,An amazing and unique creation: JJ Abrams and ...,022bb6daffa49adc27f6b20b6ebeb37d,7,4
4,8694005,8842281e1d1347389f2ab93d60773d4d,The Name of the Rose is a thrilling Dan Brown-...,0e317947e1fd341f573192111bb2921d,17,3


In [7]:
users.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1849236 entries, 0 to 1849235
Data columns (total 6 columns):
 #   Column       Dtype 
---  ------       ----- 
 0   book_id      int64 
 1   user_id      object
 2   review_text  object
 3   review_id    object
 4   n_votes      int64 
 5   user_rating  int64 
dtypes: int64(3), object(3)
memory usage: 84.7+ MB


## Data Preparation 

In [8]:
# make copies of both datasets;
users_copy = users.copy()
books_copy = books.copy()

Check for any occurences of books that aren't english

In [9]:
books_copy.describe()

Unnamed: 0,book_id,publication_year,ratings_count,book_average_rating,num_pages
count,111654.0,81189.0,111654.0,111654.0,78226.0
mean,14158790.0,2008.296851,167.516891,3.838677,322.160727
std,10541810.0,130.550807,192.067462,0.310052,164.473045
min,164.0,2.0,26.0,1.45,0.0
25%,3257460.0,2005.0,46.0,3.66,240.0
50%,13330320.0,2011.0,87.0,3.86,318.0
75%,22916000.0,2014.0,203.0,4.04,392.0
max,36496900.0,20113.0,999.0,4.95,16925.0


In [10]:
# lists all unique vales 
for col in list(books_copy):
    # looks at unique values of these 2 to make sure we only have english descriptions
    if col == 'language_code' or col == 'country_code':
        print(col,books_copy[col].unique())

country_code ['US']
language_code [nan 'eng' 'en-CA' 'gre' 'fin' 'en-GB' 'en-US' 'ita' 'spa' 'ger' 'swe'
 'nl' 'en' 'ind' 'rum' 'fre' 'por' 'afr' 'dan' 'srp' 'ben' 'pol' 'vie'
 'nor' 'tha' 'bul' 'nob' 'cze' 'ara' 'tur' 'kan' 'rus' 'msa' 'est' 'isl'
 'jpn' 'slo' 'scr' 'lav' 'heb' 'lit' 'tam' 'zho' 'cat' 'ukr' 'hin' 'hun'
 'vls' 'per' 'din' 'kat' 'glg' 'nno' 'frs' 'urd' '--' 'nld' 'fil' 'mlt'
 'kor' 'es-MX' 'egy' 'mul' 'mal' 'sin' 'tel' 'ady' 'mus' 'aus']


can keep 'eng', 'en-CA', 'en-GB', 'en-US', 'en', 'aus', as these are all english and nan values

In [11]:
# keeps only the valid language codes, allows for proper content based filtering
# used this to work out valid codes https://www.loc.gov/standards/iso639-2/php/English_list.php 
valid_language_codes = ['eng', 'en-CA', 'en-GB', 'en-US', 'en', 'aus', np.nan]

books_copy = books_copy[books_copy['language_code'].isin(valid_language_codes)]

In [12]:
books_copy.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 96357 entries, 0 to 111652
Data columns (total 13 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   book_id               96357 non-null  int64  
 1   title_without_series  96357 non-null  object 
 2   book_description      86230 non-null  object 
 3   publication_year      67302 non-null  float64
 4   publisher             65045 non-null  object 
 5   ratings_count         96357 non-null  int64  
 6   book_average_rating   96357 non-null  float64
 7   cover_page            96357 non-null  object 
 8   book_url              96357 non-null  object 
 9   is_ebook              96357 non-null  bool   
 10  num_pages             64402 non-null  float64
 11  country_code          96357 non-null  object 
 12  language_code         56393 non-null  object 
dtypes: bool(1), float64(3), int64(2), object(7)
memory usage: 9.6+ MB


### Checks for any duplicate entries, and removes if they occur

In [13]:
users[['user_id','book_id','user_rating']].duplicated().sum()

0

In [14]:
books_copy[['title_without_series','book_average_rating']].duplicated().sum()

23707

In [15]:
# gets rid of any duplicate ratings within the books
books_copy.drop_duplicates(subset=['title_without_series','book_average_rating'], keep='first', inplace=True)

books_copy.shape

(72650, 13)

In [16]:
books_copy.isnull().any(axis=0)

book_id                 False
title_without_series    False
book_description         True
publication_year         True
publisher                True
ratings_count           False
book_average_rating     False
cover_page              False
book_url                False
is_ebook                False
num_pages                True
country_code            False
language_code            True
dtype: bool

In [17]:
books_copy[books_copy.isnull().any(axis=1)][['num_pages', 'book_average_rating','publication_year', 'publisher']].describe()

Unnamed: 0,num_pages,book_average_rating,publication_year
count,24439.0,49505.0,26747.0
mean,304.813986,3.842399,2005.762702
std,144.261588,0.331202,33.003229
min,0.0,1.92,2.0
25%,231.0,3.64,2002.0
50%,304.0,3.85,2009.0
75%,368.0,4.06,2013.0
max,5104.0,4.95,2911.0


In [18]:
books_copy.shape

(72650, 13)

In [19]:
# num_pages has 8304 null values and publication_year has 10663 null values, so we take the median value of each column and place this in

books_copy['publication_year'] = books_copy['publication_year'].fillna(books_copy['publication_year'].median())
books_copy['num_pages'] = books_copy['num_pages'].fillna(books_copy['num_pages'].median())
books_copy['language_code'] = books_copy['language_code'].fillna('en-US')

In [20]:
books_copy.isnull().any(axis=0)

book_id                 False
title_without_series    False
book_description         True
publication_year        False
publisher                True
ratings_count           False
book_average_rating     False
cover_page              False
book_url                False
is_ebook                False
num_pages               False
country_code            False
language_code           False
dtype: bool

In [21]:
# not sure how to process the text columns to fill the null/missing values? -> just drop these as these have no real importance to the ratings_count?
books_copy.dropna(subset=['publisher', 'book_description'],axis=0,inplace=True)

# subset allows for quicker operations on mutliple columns

# can drop due to the fact we cannot extrapolate to get any values due to it not being numeric data

In [22]:
books_copy.isnull().any(axis=0)

book_id                 False
title_without_series    False
book_description        False
publication_year        False
publisher               False
ratings_count           False
book_average_rating     False
cover_page              False
book_url                False
is_ebook                False
num_pages               False
country_code            False
language_code           False
dtype: bool

### Check the users/ratings now

In [23]:
users_copy.isnull().any(axis=0)

book_id        False
user_id        False
review_text     True
review_id      False
n_votes        False
user_rating    False
dtype: bool

In [24]:
users_copy.dropna(subset=['review_text'],axis=0,inplace=True)

In [25]:
users_copy.isnull().any(axis=0)

book_id        False
user_id        False
review_text    False
review_id      False
n_votes        False
user_rating    False
dtype: bool

In [26]:
users_copy.shape

(1848810, 6)

In [27]:
books_copy.shape

(45256, 13)

Have now finished the preprocesing and have no null values within the dataset, only have 45k books with 1.84 million interactions

## Merge the datasets

In [28]:
ratings = pd.merge(books_copy, users_copy, on='book_id')

In [29]:
ratings.shape

# have 312k entries for the ratings, so take a sample of the first 100k that is the same each time

(312110, 18)

In [30]:
ratings.columns

Index(['book_id', 'title_without_series', 'book_description',
       'publication_year', 'publisher', 'ratings_count', 'book_average_rating',
       'cover_page', 'book_url', 'is_ebook', 'num_pages', 'country_code',
       'language_code', 'user_id', 'review_text', 'review_id', 'n_votes',
       'user_rating'],
      dtype='object')

In [31]:
ratings.head()

Unnamed: 0,book_id,title_without_series,book_description,publication_year,publisher,ratings_count,book_average_rating,cover_page,book_url,is_ebook,num_pages,country_code,language_code,user_id,review_text,review_id,n_votes,user_rating
0,6066814,"Crowner Royal (Crowner John Mystery, #13)","London, 1196. At the command of Richard the Li...",2009.0,Simon & Schuster UK,186,3.93,https://images.gr-assets.com/books/1328724803m...,https://www.goodreads.com/book/show/6066814-cr...,False,400.0,US,en-US,9ab00cb54e02704c5a7bd5ca90564e2e,I enjoyed this book. It kept me guessing until...,8ee2d0644aa02cdf9a40dde4adaade04,0,5
1,29074697,The Slaughtered Virgin of Zenopolis (Inspector...,"BATHS, BANKS AND ROMAN INSURRECTION\nDetective...",2011.0,Amazon Digital Services,192,3.49,https://s.gr-assets.com/assets/nophoto/book/11...,https://www.goodreads.com/book/show/29074697-t...,True,304.0,US,eng,42dabb3a33d9a437387496a2dd50a247,It seems to take place in a parallel world ful...,c0e3b3879264cb7a888c5da4f5bf25c4,0,2
2,1902202,"Dead in the Morning (Patrick Grant, #1)","Gerald breezily introduced his wife, Helen, to...",1975.0,Ulverscroft,52,3.3,https://s.gr-assets.com/assets/nophoto/book/11...,https://www.goodreads.com/book/show/1902202.De...,False,304.0,US,en-US,1e7bca7d5f8e3d6460320704c423c12e,Lively,22d5b7a4c8c7a48ec09b8779ecc76221,0,4
3,1902202,"Dead in the Morning (Patrick Grant, #1)","Gerald breezily introduced his wife, Helen, to...",1975.0,Ulverscroft,52,3.3,https://s.gr-assets.com/assets/nophoto/book/11...,https://www.goodreads.com/book/show/1902202.De...,False,304.0,US,en-US,6a352b99747118883209c79d4d1f587d,I'm always trying to find the perfect mystery ...,0b104fb43b46e65758a1837450fa90d0,0,4
4,2805495,Wycliffe and the Cycle of Death,A respectable bookseller is found bludgeoned a...,2001.0,Orion,58,3.61,https://images.gr-assets.com/books/1328819096m...,https://www.goodreads.com/book/show/2805495-wy...,False,320.0,US,en-US,5c4fd56b7e95eb4d92ee8814b883eab8,"I enjoy the Wycliffe books,simpley because,the...",cc5598497557cd09bdd6b6246104bf99,0,4


### num_pages

In [32]:
# first we need to work out the distribution of this to see if there are any outliers

# look at percentiles to work out this

for i in range(0, 110,10):
    print(f'{i}th percentile is', np.percentile(ratings['num_pages'], i))

for i in range(0, 11,1):
    print(f'{i}th percentile is', np.percentile(ratings['num_pages'], i))

for i in range(90, 101,1):
    print(f'{i}th percentile is', np.percentile(ratings['num_pages'], i))



0th percentile is 0.0
10th percentile is 196.0
20th percentile is 240.0
30th percentile is 272.0
40th percentile is 296.0
50th percentile is 304.0
60th percentile is 320.0
70th percentile is 340.0
80th percentile is 370.0
90th percentile is 416.0
100th percentile is 16925.0
0th percentile is 0.0
1th percentile is 12.0
2th percentile is 46.0
3th percentile is 80.0
4th percentile is 108.0
5th percentile is 130.0
6th percentile is 150.0
7th percentile is 164.0
8th percentile is 178.0
9th percentile is 189.0
10th percentile is 196.0
90th percentile is 416.0
91th percentile is 424.0
92th percentile is 432.0
93th percentile is 444.0
94th percentile is 452.0
95th percentile is 465.0
96th percentile is 480.0
97th percentile is 500.0
98th percentile is 528.0
99th percentile is 592.0
100th percentile is 16925.0


Can see that we have a few books with less than 11 pages, so we will drop below this, as this will be too short.

Also, we have a book with 16925 pages, so we will drop above 1000 pages. As this is only 260 ratings out of 312153

There are 1231 ratings swith books below 5 pages, so we will also drop these. As it is unlikely you want  abook woth less than 5 pages

In [33]:
ratings[ratings['num_pages']>=1000.0]['num_pages'].count()

259

In [34]:
ratings[ratings['num_pages']<=5.0]['num_pages'].count()

1229

In [35]:
ratings.drop(ratings[ratings['num_pages'] <= 10.0].index, inplace=True)
ratings.drop(ratings[ratings['num_pages'] >= 1000.0].index, inplace=True)

In [36]:
ratings.shape

(309344, 18)

In [37]:
# select the first 100,000 cases to use for the RS, as mentioned in the FAQs the way this is done doesn't matter
ratings_100k = ratings.head(100000)

# verify size
ratings_100k.shape

(100000, 18)

In [38]:
# turns the ratings into a .csv file to allow for the use in the command line interface
ratings_100k.to_csv('ratings_clean_100k.csv')

In [110]:
from sklearn.model_selection import train_test_split
# perform the train/test split of 80/20

train_df_1, test_df_1 = train_test_split(ratings_100k, train_size=0.8,test_size=0.2)

## Content-based filtering

Analyses features od the items/dpcuments, that have been previously rated by a user. Then builds a model of user interests

Vector space model; with selection metric and combination of feature vectors to then create a weighting scheme for the cbf.

To represent this VSM; use the TF-IDF weighting scheme, use normalisation so that all documents are represented by vectors with weights in a 0,1 interval.  

Filtering component; to predict ratings for new components, need user profile to predict ratings, what extent the user will like the unseen items  (how the comparison between the items is close) 

Can use a similariity / distance measure to find the nearest neighbours. E.g. cosine, pearson correlation, euclidean, etc. Look at a few + prevouos literature to find out the best measure 

In [132]:
# create a copy of the training ratings
content_ratings = ratings_100k.copy()

In [123]:
# takes about 12s
def create_tf_idf(data):

    # tokenize the text data e.g. descprtion
    # gets rid of the english stop words, to help give a better decsription
    tf_idf = TfidfVectorizer(stop_words='english')

    # fit data and transform to tdidf matrix
    tf_idf_matrix = tf_idf.fit_transform(data['book_description'])

    return tf_idf_matrix

To help with the cold start problem, we will recommend the books with the highest weighted average to the user

In [181]:
# cold start avoidance in the case of a new user
def cold_start_avoidance(data, n):
    # sorts the most popular books via weighted avg, to help with the good books with less reviews
    popular_books = data.sort_values(by='weighted_avg', ascending=False)
    # gets rid of any duplicates
    popular_books = popular_books.drop_duplicates(subset='title_without_series')
    # gets the top n books
    popular_n_books = popular_books.head(n)[['book_id', 'title_without_series']]
    # turns into dataframe for easy of use and consistency
    df = pd.DataFrame(popular_n_books, columns=['book_id', 'title_without_series'])
    return df

In [591]:
def get_recommendations(user_id, data, n):
    # first checks if the user is registered/ in the system:
    if user_id not in data['user_id'].unique():
        # if not we return the best books
        return cold_start_avoidance(data, n)
    
    # creates the tf_idf matrix dependent on the data given
    tf_idf_matrix = create_tf_idf(data)

    # Extract books liked by the user, if the id is valid 
    user_books = data[data['user_id'] == user_id]['title_without_series'].tolist()
        
    # create dataframe for storing recommmendations
    recommendations_df = pd.DataFrame(columns=['Cosine Similarity'])

    # Keep track of recommended books to avoid duplicates
    recommended_books_set = set()

    # looks through each book liked by the user gets similar books based on cosine similarities
    for book in user_books:
        # aggregates td-idf vectors for each book
        liked_book_idx = data.index[(data['title_without_series'] == book) & (data['user_id'] == user_id)].tolist()[0]
        liked_book_tfidf = tf_idf_matrix[liked_book_idx]

        # calculate cosine similarity of current book with the rest of the books
        cosine_similarities = cosine_similarity(liked_book_tfidf, tf_idf_matrix).flatten()

        # indices of 'close' books
        sim_indices = cosine_similarities.argsort()[::-1]

        # gets rid of any books already liked by the user
        sim_indices = [i for i in sim_indices if (data['title_without_series'].iloc[i] not in user_books) and 
                                                (data['title_without_series'].iloc[i] not in recommended_books_set)]

        # add the top n books to the dataframe
        top_recommendations = data['title_without_series'].iloc[sim_indices].tolist()
        cosine_sim_values = cosine_similarities[sim_indices]
        book_id = data['book_id'].iloc[sim_indices].tolist()

        recommendations_df = recommendations_df.append(pd.DataFrame({'book_id':book_id, 'title_without_series': top_recommendations, 'Cosine Similarity': cosine_sim_values}))

        recommended_books_set.update(top_recommendations)

    # sorts recommendations via cosine similarity
    recommendations_df = recommendations_df.sort_values(by=['Cosine Similarity'], ascending=False)

    # gets top n unique recommendations
    unique_recommendations = recommendations_df.drop_duplicates(subset=['title_without_series']).head(n)
    
    # gets rid of cosine similarity, to allow for ease of use for user
    unique_recommendations = unique_recommendations.drop('Cosine Similarity', axis=1)
    return unique_recommendations

In [593]:
# Example usage
user_id = '9b808f1cf7160f03647fb8b8aefd4ffb'
reced = get_recommendations(user_id, content_ratings, 10)

In [594]:
reced

Unnamed: 0,book_id,title_without_series
0,251808.0,"The Mosaic Crimes (Dante Alighieri, #2)"
34,259042.0,The Dante Club
41,29430732.0,Kill the Father
43,3890688.0,"Never Say Sty (Kendra Ballantyne, Pet-Sitter M..."
46,24450451.0,Inferno: A Novel Unabridged Edition
50,36070517.0,"Tramps and Thieves (Murder and Mayhem, #2)"
52,25926794.0,Duce (World's End #1)
67,30269117.0,Inferno
70,27161832.0,We Were Kings
79,24485919.0,"The Last Honeytrap (Florence Love, #1)"


## Collaborative Filtering

In [142]:
collab_ratings = ratings_100k.copy()

In [51]:
pip install surprise 


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.1.2[0m[39;49m -> [0m[32;49m23.3.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [339]:
# for collab filtering use the suprise module to allow
from surprise import SVD , SVDpp
from surprise import Dataset, Reader
from surprise.model_selection import train_test_split, cross_validate
from surprise import accuracy

adapted from this article; https://pub.aimind.so/implementing-a-collaborative-filtering-recommendation-system-using-surprise-a-step-by-step-guide-2e879a34e021

In [194]:
re = Reader(rating_scale=(0.0,5.0))

In [195]:
data = Dataset.load_from_df(collab_ratings[["user_id", "book_id", "book_average_rating"]], re)

In [341]:
# 80/20 split, can experiement and change
trainset, testset = train_test_split(data, test_size=0.2)

Comparison of SVD and SVD++ across 5 folds to see which is better

In [595]:
algo = SVD()
cross_validate(algo,data, measures=["RMSE"], cv=5, verbose=True)

Evaluating RMSE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.1887  0.1840  0.1824  0.1866  0.1857  0.1855  0.0021  
Fit time          1.30    1.27    1.24    1.33    1.25    1.28    0.03    
Test time         0.14    0.13    22.14   0.13    0.13    4.53    8.80    


{'test_rmse': array([0.18865167, 0.18396849, 0.18242575, 0.18663097, 0.18565618]),
 'fit_time': (1.3000288009643555,
  1.2701051235198975,
  1.2384250164031982,
  1.3263790607452393,
  1.2489640712738037),
 'test_time': (0.13849806785583496,
  0.13400602340698242,
  22.1388521194458,
  0.13197994232177734,
  0.13136577606201172)}

In [596]:
algo = SVDpp()
cross_validate(algo,data, measures=["RMSE"], cv=5, verbose=True)

Evaluating RMSE of algorithm SVDpp on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.1568  0.1568  0.1562  0.1580  0.1568  0.1569  0.0006  
Fit time          1.73    1.74    1.74    1.84    1.84    1.78    0.05    
Test time         0.45    0.47    11.68   0.55    0.47    2.72    4.48    


{'test_rmse': array([0.15683573, 0.15680932, 0.15615002, 0.15797296, 0.15677781]),
 'fit_time': (1.7334482669830322,
  1.7430388927459717,
  1.7411158084869385,
  1.8445050716400146,
  1.8381340503692627),
 'test_time': (0.45234179496765137,
  0.46922802925109863,
  11.683666944503784,
  0.5468008518218994,
  0.46802520751953125)}

In [597]:
# trains the algorithm with the test set
algo.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVDpp at 0x7f7c62e9be50>

In [598]:
# gets the predictions by evaulating the testset to the model
predictions = algo.test(testset)

In [599]:
# allows for the use in the the cli interface
pred = pd.DataFrame(predictions)
pred.to_csv('predictions_collab.csv')

In [610]:
def make_recommendations(data, n, predictions, user_id):
    # turns predictions into a dataframe for easy use
    pred = pd.DataFrame(predictions)
    # gets ratings for only specific user
    user_ratings = data[data['user_id'] == user_id]
    
    # merges predictions with the ratings on user_id and book_id
    merged_data = pd.merge(pred, user_ratings, left_on='iid', right_on='book_id')

    # sorts merged dataframe based on the estimated ratings
    merged_data.sort_values(by=['est'], inplace=True, ascending=False)
    
    unique_recommendations = []
    # looks through each prediction
    for book_id in merged_data['iid']:
        if book_id not in unique_recommendations:
            # adds to list if it is unique
            unique_recommendations.append(book_id)

    # makes a dataframe of the books and corresponding titles, to keep consistent with content based
    corresponding_titles = data[data['book_id'].isin(unique_recommendations)][['book_id', 'title_without_series']].drop_duplicates()

    return pd.DataFrame(corresponding_titles)

In [611]:
# example usage
user_id = '9b808f1cf7160f03647fb8b8aefd4ffb'
rec = make_recommendations(collab_ratings, 10, pred, user_id)

In [601]:
# Print the performance metrics -> low rmse, so shows that the recommender method makes good predictions as the rmse is low!
accuracy.rmse(predictions)

RMSE: 0.1578


0.15776811092415596

In [612]:
rec

Unnamed: 0,book_id,title_without_series
11494,25651905,Why You Were Taken (When Tomorrow Calls #1)
18937,34700191,Undercurrent
23073,27845484,"Eve: A Christmas Ghost Story (Psychic Surveys,..."
23308,28821606,The Last Thing I Remember
27278,30242428,Bloodwalker
32250,23664378,Guilt
52401,28351603,Twisted Justice
96085,35480285,A Deadly Game


## Hybrid System

Combine the 2 functions and then pick the best values

https://thecleverprogrammer.com/2023/06/05/hybrid-recommendation-system-using-python/

In [613]:
def hybrid_recommendations(user_id, data, n, predictions):
    # calls the collaborative and content based filtering functions
    content = get_recommendations(user_id, data, n)
    collab = make_recommendations(data, n, predictions, user_id)
    # join the 2 dataframes
    hybrid = pd.concat([content, collab])
     
    # use a weighted system based on number of ratings, to decide on the weighting to use for the hybrid approach

    # inspired by this paper: https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=9590147 

    v = data['ratings_count']
    R = data['book_average_rating']

    C = data['book_average_rating'].mean()

    m = data['ratings_count']>25
    
    data['weighted_avg'] = ((R*v) + (C*m)) / (v+m)

    # change column type of book_id to be a integer, as it was a float, so hard to look up the weighted average
    hybrid['book_id'] = hybrid['book_id'].astype(int)
    
    # look at how to use the book_id to then find the weighted average and then add this to hybrid table 
    id_weight = data.set_index('book_id')['weighted_avg'].to_dict()

    # maps the weighted_avg values to the 'book_id' column
    hybrid['weighted_avg'] = hybrid['book_id'].map(id_weight)

    # sorts the dataframe by 'weighted_avg'
    hybrid_sorted = hybrid.sort_values(by='weighted_avg', ascending=False)

    # Select the top n rows with unique values
    top_n_unique_values = hybrid_sorted.head(n).drop_duplicates(subset='title_without_series')
    # gets rid of the weighted_avg column to allow for user friendly format
    values = top_n_unique_values.drop('weighted_avg',axis=1)

    return values

In [615]:
# example usage
user_id = '9b808f1cf7160f03647fb8b8aefd4ffb'
n=10
hybrid_recs = hybrid_recommendations(user_id, ratings_100k, n, predictions)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['weighted_avg'] = ((R*v) + (C*m)) / (v+m)


In [616]:
hybrid_recs

Unnamed: 0,book_id,title_without_series
50,36070517,"Tramps and Thieves (Murder and Mayhem, #2)"
27278,30242428,Bloodwalker
23073,27845484,"Eve: A Christmas Ghost Story (Psychic Surveys,..."
52401,28351603,Twisted Justice
96085,35480285,A Deadly Game
11494,25651905,Why You Were Taken (When Tomorrow Calls #1)
41,29430732,Kill the Father
32250,23664378,Guilt
18937,34700191,Undercurrent
67,30269117,Inferno


Evaulation metric, precision@k

In [618]:
def precision_at_k2(recommendations,k, user_id):
    # get items that are relevant to user 
    user_rated_items = ratings_100k[ratings_100k['user_id'] == user_id]['book_id'].tolist()
    # then finds all the common books between the ones that the user has previously rated and then ones that the recommender has predicted
    # then divides by, the number of recommendations to get this value
    precision = len(set(recommendations['book_id']) & set(user_rated_items)) / k
    return precision

In [619]:
precision_hybrid = precision_at_k2(hybrid_recs,n,user_id)
print(f'Precision@{n} for hybrid recommendations: {precision_hybrid}')

Precision@10 for hybrid recommendations: 0.7


note low or 0 precision means that no relvant items that have been predicted to the user, hence a bad rs



work out how to compare the recommendations, this will allow us to then return the top n for the hybrid function. 

Then work on the methods for rs2

look at previous literature to work out what ways have been considered? maybe weighted 

# RS2 - Deep Learning + NLP approach 

In [63]:
# creates a copy of the 100k sample to allow for any preperation needed for rs2
rs2_ratings = ratings_100k.copy()

In [590]:
rs2_ratings.shape

(100000, 20)

In [65]:
rs2_ratings.head()

Unnamed: 0,book_id,title_without_series,book_description,publication_year,publisher,ratings_count,book_average_rating,cover_page,book_url,is_ebook,num_pages,country_code,language_code,user_id,review_text,review_id,n_votes,user_rating,weighted_avg
0,6066814,"Crowner Royal (Crowner John Mystery, #13)","London, 1196. At the command of Richard the Li...",2009.0,Simon & Schuster UK,186,3.93,https://images.gr-assets.com/books/1328724803m...,https://www.goodreads.com/book/show/6066814-cr...,False,400.0,US,en-US,9ab00cb54e02704c5a7bd5ca90564e2e,I enjoyed this book. It kept me guessing until...,8ee2d0644aa02cdf9a40dde4adaade04,0,5,3.929339
1,29074697,The Slaughtered Virgin of Zenopolis (Inspector...,"BATHS, BANKS AND ROMAN INSURRECTION\nDetective...",2011.0,Amazon Digital Services,192,3.49,https://s.gr-assets.com/assets/nophoto/book/11...,https://www.goodreads.com/book/show/29074697-t...,True,304.0,US,eng,42dabb3a33d9a437387496a2dd50a247,It seems to take place in a parallel world ful...,c0e3b3879264cb7a888c5da4f5bf25c4,0,2,3.491639
2,1902202,"Dead in the Morning (Patrick Grant, #1)","Gerald breezily introduced his wife, Helen, to...",1975.0,Ulverscroft,52,3.3,https://s.gr-assets.com/assets/nophoto/book/11...,https://www.goodreads.com/book/show/1902202.De...,False,304.0,US,en-US,1e7bca7d5f8e3d6460320704c423c12e,Lively,22d5b7a4c8c7a48ec09b8779ecc76221,0,4,3.309555
3,1902202,"Dead in the Morning (Patrick Grant, #1)","Gerald breezily introduced his wife, Helen, to...",1975.0,Ulverscroft,52,3.3,https://s.gr-assets.com/assets/nophoto/book/11...,https://www.goodreads.com/book/show/1902202.De...,False,304.0,US,en-US,6a352b99747118883209c79d4d1f587d,I'm always trying to find the perfect mystery ...,0b104fb43b46e65758a1837450fa90d0,0,4,3.309555
4,2805495,Wycliffe and the Cycle of Death,A respectable bookseller is found bludgeoned a...,2001.0,Orion,58,3.61,https://images.gr-assets.com/books/1328819096m...,https://www.goodreads.com/book/show/2805495-wy...,False,320.0,US,en-US,5c4fd56b7e95eb4d92ee8814b883eab8,"I enjoy the Wycliffe books,simpley because,the...",cc5598497557cd09bdd6b6246104bf99,0,4,3.613329


Text PreProcessing for Content based filtering rs2

In [620]:
# imports necessary for RS2
import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize
# removes stopwords in pre-processing
nltk.download('stopwords')
# uses this module for the text tokenization
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jacobdear/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/jacobdear/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [621]:
# creats a list of all stopwords from the nltk built-in library
stop_words_tpp = set(stopwords.words('english'))
print(stop_words_tpp)

{'own', 'his', 'do', 'any', 'during', 'out', 'just', 'while', 'of', 'yourselves', 'will', 'very', 'o', 'did', 'd', 'in', 'with', 'm', "you'd", 'is', "hadn't", 's', 'into', 'themselves', 'other', 'against', 'itself', 'not', 'only', 'because', "weren't", 'or', 'should', 'your', "shouldn't", "you'll", "doesn't", 'ours', 'if', 'hers', 'as', 'both', 'you', 'ourselves', 'some', 'haven', 'shouldn', 'off', "wouldn't", 'he', 'doing', 'wasn', 'once', "you've", 'hadn', 'where', "isn't", 'after', 'here', 'but', 'don', "wasn't", 'wouldn', 'hasn', 'by', 'to', "you're", 'how', 'theirs', "aren't", 'further', 'there', 'too', "haven't", "it's", 'about', "won't", 't', 'up', 'has', "she's", 'are', 'mustn', "didn't", 'doesn', 'for', 'why', 'below', 've', 'won', 'be', 'all', 'didn', "mustn't", 'no', 'yours', "needn't", 'a', 'mightn', 'on', 'yourself', 'between', "don't", 'herself', 'had', 'an', 'their', 'until', 'needn', 'her', 'being', 'at', 'what', 'same', 'each', 'having', 'most', 'when', 'whom', 'our', 

In [90]:
# adapted from https://github.com/ashok426/Recommender-System/blob/main/book_recommendation.ipynb 
rs2_ratings_copy = rs2_ratings.copy()
def nlp_preprocessing(total_text, index, column):
    if type(total_text) is not int:
        string = ""
        for words in total_text.split():
            # remove the special chars in review like '"#$@!%^&*()_+-~?>< etc.
            word = ("".join(e for e in words if e.isalnum()))
            # Conver all letters to lower-case
            word = word.lower()
            # stop-word removal
            if not word in stop_words_tpp:
                string += word + " "
        # updates the specified row and colummn pair with the new text
        rs2_ratings_copy[column][index] = string

In [91]:
# perform it for the book_description -> will take about 2 mins
for index, row in rs2_ratings.iterrows():
    nlp_preprocessing(row['book_description'],index, 'book_description')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rs2_ratings_copy[column][index] = string


In [92]:
# perform it for the book description
for index, row in rs2_ratings.iterrows():
    nlp_preprocessing(row['title_without_series'],index, 'title_without_series')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rs2_ratings_copy[column][index] = string


In [235]:
rs2_ratings_copy.head()

Unnamed: 0,book_id,title_without_series,book_description,publication_year,publisher,ratings_count,book_average_rating,cover_page,book_url,is_ebook,num_pages,country_code,language_code,user_id,review_text,review_id,n_votes,user_rating,weighted_avg
0,6066814,crowner royal crowner john mystery 13,london 1196 command richard lionheart sir john...,2009.0,Simon & Schuster UK,186,3.93,https://images.gr-assets.com/books/1328724803m...,https://www.goodreads.com/book/show/6066814-cr...,False,400.0,US,en-US,9ab00cb54e02704c5a7bd5ca90564e2e,I enjoyed this book. It kept me guessing until...,8ee2d0644aa02cdf9a40dde4adaade04,0,5,3.929339
1,29074697,slaughtered virgin zenopolis inspector capstan 1,baths banks roman insurrection detective inspe...,2011.0,Amazon Digital Services,192,3.49,https://s.gr-assets.com/assets/nophoto/book/11...,https://www.goodreads.com/book/show/29074697-t...,True,304.0,US,eng,42dabb3a33d9a437387496a2dd50a247,It seems to take place in a parallel world ful...,c0e3b3879264cb7a888c5da4f5bf25c4,0,2,3.491639
2,1902202,dead morning patrick grant 1,gerald breezily introduced wife helen mrs mack...,1975.0,Ulverscroft,52,3.3,https://s.gr-assets.com/assets/nophoto/book/11...,https://www.goodreads.com/book/show/1902202.De...,False,304.0,US,en-US,1e7bca7d5f8e3d6460320704c423c12e,Lively,22d5b7a4c8c7a48ec09b8779ecc76221,0,4,3.309555
3,1902202,dead morning patrick grant 1,gerald breezily introduced wife helen mrs mack...,1975.0,Ulverscroft,52,3.3,https://s.gr-assets.com/assets/nophoto/book/11...,https://www.goodreads.com/book/show/1902202.De...,False,304.0,US,en-US,6a352b99747118883209c79d4d1f587d,I'm always trying to find the perfect mystery ...,0b104fb43b46e65758a1837450fa90d0,0,4,3.309555
4,2805495,wycliffe cycle death,respectable bookseller found bludgeoned strang...,2001.0,Orion,58,3.61,https://images.gr-assets.com/books/1328819096m...,https://www.goodreads.com/book/show/2805495-wy...,False,320.0,US,en-US,5c4fd56b7e95eb4d92ee8814b883eab8,"I enjoy the Wycliffe books,simpley because,the...",cc5598497557cd09bdd6b6246104bf99,0,4,3.613329


In [475]:
# add a column that will allow us to tokenise the textual data, have already applied the preprocessing techniques above
rs2_ratings_copy['combined_text'] = rs2_ratings_copy['title_without_series'] + ' ' + rs2_ratings_copy['book_description']

In [622]:
# creates a new column that contains the tokenised text
rs2_ratings_copy['tokenised_text'] = rs2_ratings_copy['combined_text'].apply(word_tokenize)

In [623]:
rs2_ratings_copy.head()

Unnamed: 0,book_id,title_without_series,book_description,publication_year,publisher,ratings_count,book_average_rating,cover_page,book_url,is_ebook,...,country_code,language_code,user_id,review_text,review_id,n_votes,user_rating,weighted_avg,combined_text,tokenised_text
0,6066814,crowner royal crowner john mystery 13,london 1196 command richard lionheart sir john...,2009.0,Simon & Schuster UK,186,3.93,https://images.gr-assets.com/books/1328724803m...,https://www.goodreads.com/book/show/6066814-cr...,False,...,US,en-US,9ab00cb54e02704c5a7bd5ca90564e2e,I enjoyed this book. It kept me guessing until...,8ee2d0644aa02cdf9a40dde4adaade04,0,5,3.929339,crowner royal crowner john mystery 13 london ...,"[crowner, royal, crowner, john, mystery, 13, l..."
1,29074697,slaughtered virgin zenopolis inspector capstan 1,baths banks roman insurrection detective inspe...,2011.0,Amazon Digital Services,192,3.49,https://s.gr-assets.com/assets/nophoto/book/11...,https://www.goodreads.com/book/show/29074697-t...,True,...,US,eng,42dabb3a33d9a437387496a2dd50a247,It seems to take place in a parallel world ful...,c0e3b3879264cb7a888c5da4f5bf25c4,0,2,3.491639,slaughtered virgin zenopolis inspector capstan...,"[slaughtered, virgin, zenopolis, inspector, ca..."
2,1902202,dead morning patrick grant 1,gerald breezily introduced wife helen mrs mack...,1975.0,Ulverscroft,52,3.3,https://s.gr-assets.com/assets/nophoto/book/11...,https://www.goodreads.com/book/show/1902202.De...,False,...,US,en-US,1e7bca7d5f8e3d6460320704c423c12e,Lively,22d5b7a4c8c7a48ec09b8779ecc76221,0,4,3.309555,dead morning patrick grant 1 gerald breezily ...,"[dead, morning, patrick, grant, 1, gerald, bre..."
3,1902202,dead morning patrick grant 1,gerald breezily introduced wife helen mrs mack...,1975.0,Ulverscroft,52,3.3,https://s.gr-assets.com/assets/nophoto/book/11...,https://www.goodreads.com/book/show/1902202.De...,False,...,US,en-US,6a352b99747118883209c79d4d1f587d,I'm always trying to find the perfect mystery ...,0b104fb43b46e65758a1837450fa90d0,0,4,3.309555,dead morning patrick grant 1 gerald breezily ...,"[dead, morning, patrick, grant, 1, gerald, bre..."
4,2805495,wycliffe cycle death,respectable bookseller found bludgeoned strang...,2001.0,Orion,58,3.61,https://images.gr-assets.com/books/1328819096m...,https://www.goodreads.com/book/show/2805495-wy...,False,...,US,en-US,5c4fd56b7e95eb4d92ee8814b883eab8,"I enjoy the Wycliffe books,simpley because,the...",cc5598497557cd09bdd6b6246104bf99,0,4,3.613329,wycliffe cycle death respectable bookseller f...,"[wycliffe, cycle, death, respectable, booksell..."


In [97]:
# turns the ratings into a .csv file to allow for the use in the command line interface
rs2_ratings_copy.to_csv('rs2_clean_ratings.csv')

In [485]:
# split this dataset 80/20 to allow for train/testing

# gets 80% of books
rs2_train = [books[i] for i in range(round(0.8*len(books)))]

train_df = rs2_ratings_copy[rs2_ratings_copy['book_id'].isin(rs2_train)]
# all values not in training set, 20% for testing set
test_df = rs2_ratings_copy[~rs2_ratings_copy['book_id'].isin(rs2_train)]

In [254]:
# install this which has the built-in code for the word2vec model
pip install gensim 

Collecting gensim
  Downloading gensim-4.3.2-cp38-cp38-macosx_10_9_x86_64.whl (24.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.1/24.1 MB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting scipy>=1.7.0 (from gensim)
  Downloading scipy-1.10.1-cp38-cp38-macosx_10_9_x86_64.whl (35.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m35.0/35.0 MB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting smart-open>=1.8.1 (from gensim)
  Downloading smart_open-6.4.0-py3-none-any.whl (57 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.0/57.0 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: smart-open, scipy, gensim
  Attempting uninstall: scipy
    Found existing installation: scipy 1.6.2
    Uninstalling scipy-1.6.2:
      Successfully uninstalled scipy-1.6.2
Successfully installed gensim-4.3.2 scipy-1.10.1 smart-open-6.4.0

[1m[[0m[34;49mnotice[

In [104]:
# use this to implement Word2Vec 

# paper # use the NLP approach, with word2vec model / CBOW model from https://arxiv.org/pdf/1301.3781.pdf  
from gensim.models import Word2Vec


https://www.analyticsvidhya.com/blog/2019/07/how-to-build-recommendation-system-word2vec-python/

In [302]:
from tqdm import tqdm
# list to capture books read by the customers
books_read = []

# populate the list with the book codes
for index, row in tqdm(train_df.iterrows(), total=len(train_df)):
    temp = train_df[train_df["user_id"] == row['user_id']]['book_id'].tolist()
    books_read.append(temp)

100%|██████████| 80011/80011 [13:10<00:00, 101.26it/s] 


In [295]:
# list to capture books read by the customers
books_read_test = []

# populate the list with the book codes
for index, row in tqdm(test_df.iterrows(), total=len(test_df)):
    temp = test_df[test_df["user_id"] == row['user_id']]['book_id'].tolist()
    books_read_test.append(temp)

100%|██████████| 19989/19989 [00:53<00:00, 376.73it/s]


In [500]:
# train word2vec model
w2_model = Word2Vec(window = 10, sg = 1, hs = 0,
                 negative = 10, # for negative sampling
                 alpha=0.03, min_alpha=0.0007,
                 seed = 14)
# buils the vocabulary, using the book_ids and the tokenised text, so takes into account the book description and titles as well
w2_model.build_vocab(books_read + train_df['tokenised_text'].tolist())

In [505]:
from gensim.models.callbacks import CallbackAny2Vec
# allows us to print each epoch and the corresponding loss value
class EpochLogger(CallbackAny2Vec):
    def __init__(self):
        self.epoch = 0

    # when the epoch ends we print this out to see progress
    def on_epoch_end(self, model):
        print(f"Epoch {self.epoch}, Loss: {model.get_latest_training_loss()}")
        self.epoch += 1


In [506]:
# calls the class to log epochs
epoch = EpochLogger()
# trains on a combination of book_ids and tokenised text, for all the value in the corpus, which is the set of 
# vectors in the vocab, trains for 10 epochs
# compute_loss=True, so we can see the progress and then allows for the printing of epochs via callbacks
w2_model.train(books_read + train_df['tokenised_text'].tolist(),
               total_examples=w2_model.corpus_count,
               epochs=10,
               compute_loss=True,
               callbacks=[epoch]
              )

Epoch 0, Loss: 27273928.0
Epoch 1, Loss: 45090316.0
Epoch 2, Loss: 61044704.0
Epoch 3, Loss: 72315368.0
Epoch 4, Loss: 80555336.0
Epoch 5, Loss: 88334792.0
Epoch 6, Loss: 95885552.0
Epoch 7, Loss: 103129128.0
Epoch 8, Loss: 109449064.0
Epoch 9, Loss: 114134048.0


(82359891, 83396620)

In [558]:
# saves the model, allowing for the use in the cli 
w2_model.save("w2_model.model")

In [508]:
print(w2_model)

Word2Vec<vocab=60762, vector_size=100, alpha=0.03>


In [624]:
# create a book id and description to easy map the description to id and vice versa
trained_books = train_df[["book_id", "title_without_series"]]

# remove duplicates
trained_books = trained_books.drop_duplicates(inplace=True, subset='book_id', keep="last")

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  trained_books = trained_books.drop_duplicates(inplace=True, subset='book_id', keep="last")


In [581]:
# takes the average all previously rated items and produces a resultant vector
def aggregate_vectors(books):
    book_vec = []
    for i in books:
        try:
            book_vec.append(w2_model.wv[i])
        except KeyError:
            continue
        
    return np.mean(book_vec, axis=0)

In [627]:
def content_rs2(user_id, data, n):
    
    # extract liked books by user
    user_books = data[data['user_id'] == user_id]['book_id'].tolist()
    # calculate the resultant vector
    user_vector = aggregate_vectors(user_books)

    # extract most similar words/books for the input vector
    ms = w2_model.wv.most_similar(user_vector, topn= n+1)[1:]
    # extracts the top n most similar books, excluding itself
    book_ids = [int(j[0]) for j in ms]
    # creates a df of the most similar book_ids
    book_ids_df = pd.DataFrame({'book_id': book_ids})

    # merges the dataset with the original dataset to allow for the titles to be in proper formats
    # not lower and with all stopwords and punctutation removed like train_df
    new_ms = pd.merge(book_ids_df, rs2_ratings, how='left', on='book_id')
    # gets rid of the duplicated values when merging, can run into problems due to having mutliple instances 
    # of same book_id
    new_ms.drop_duplicates(inplace=True, subset='book_id', keep="last")
    
    # returns the dataframe but only the book_id and title column
    return new_ms[['book_id', 'title_without_series']]


In [579]:
test_df['user_id'].head()

80936    af129a3a15e5e989c4b753eca37d0bee
80937    b77fdab04b65257d9dcb89e47be44cda
80938    30c9e6cbe1794a885dee0ff3d8b58e19
80939    9aa1f348011c0f862b88bd6bba03d138
80940    f5343cf39c3f9eb8eeb281a7450f6f87
Name: user_id, dtype: object

In [628]:
content_rs2('9aa1f348011c0f862b88bd6bba03d138', train_df, 10)

Unnamed: 0,book_id,title_without_series
0,15797755,The Quick Red Fox: A Travis McGee Novel
1,1024338,"The Nemesis Mission (Aerospace Systems, #2)"
5,968971,Mr. Capone
7,8130469,House of Reckoning
11,3045999,"Bloodline (Repairman Jack, #11)"
13,7025145,The Adventure Of The Blue Carbuncle (The Adven...
14,15797756,A Deadly Shade of Gold: A Travis McGee Novel
17,5278136,The Crosskiller
19,2020191,A Very Private Gentleman
27,13642995,"The Deep Blue Good-by (Travis McGee, #1)"


In [584]:
content_rs2('a04990a9062191f84d05d44581872e86', train_df, 10)

Unnamed: 0,book_id,title_without_series
14,25817610,London Rain (Josephine Tey #6)
15,11100733,Rose Cottage
16,72772,Much Ado About Murder
18,31077045,212 (Ellie Hatcher #3)
20,1523559,Nine Coaches Waiting
27,13019553,Death Comes to Pemberley
28,6442485,"A Murder on London Bridge (Thomas Chaloner, #5)"
31,13644633,Death of an Old Git (The Falconer Files - File 1)
34,916671,Rosaura A las Diez
38,636680,Jane and the Unpleasantness at Scargrave Manor...


In [585]:
content_rs2('9b808f1cf7160f03647fb8b8aefd4ffb', train_df,10)

Unnamed: 0,book_id,title_without_series
3,34311689,White Bodies
8,28500919,Betrayal
10,33156619,The Bay of Shadows
11,34743122,Blood Money
14,35178144,Grievance
19,28351603,Twisted Justice
22,17881160,"A Long Walk Home: One Woman's Story of Kidnap,..."
23,29199942,What She Never Told Me
35,23664378,Guilt
36,20591541,The Eternal Chain


In [587]:
content_rs2('5c4fd56b7e95eb4d92ee8814b883eab8',train_df,10)

Unnamed: 0,book_id,title_without_series
1,2805495,Wycliffe and the Cycle of Death
7,907601,Helter Skelter: The True Story of the Manson M...
16,31143799,Reservoir 13
19,1502479,A.N.T.I.D.O.T.E.
24,28500919,Betrayal
27,6950999,The Scarpetta Factor (Kay Scarpetta #17)
37,23860310,The Last Confession of Thomas Hawkins (Tom Haw...
41,34311689,White Bodies
46,35271392,"Force of Nature (Aaron Falk, #2)"
49,11907159,V Is for Vengeance


In [588]:
content_rs2('2918efc49ff8f3ffba558afc91792892',train_df,20)

Unnamed: 0,book_id,title_without_series
10,30981004,Crazy For Alice
14,29976075,Serial Killers: The Colombian Monsters (Serial...
15,25840536,Women in Red
18,10231736,If I Did It: Confessions of the Killer
26,21393608,When The Circus Came To Town
36,18759375,Murder Is A Family Affair
37,19814507,"Want to Play? (Monkeewrench, #1)"
41,13554552,"Sapphire Reign (Royal Blood, #2)"
50,18072986,Embracing Hamilton (JackRabbit7 Series)
68,18148534,"The Twin Dragons (Golden Crown, #2)"


Hybrid System with word2vec content method applied.

In [631]:
def hybrid_recommendations(user_id, data, n, predictions):
    # calls the collaborative and content based filtering functions
    # calls rs2 for content rather than the original function
    content = content_rs2(user_id, data, n)
    collab = make_recommendations(data, n, predictions, user_id)
    # join the 2 dataframes
    hybrid = pd.concat([content, collab])
     
    # use a weighted system based on number of ratings, to decide on the weighting to use for the hybrid approach

    # inspired by this paper: https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=9590147 

    v = data['ratings_count']
    R = data['book_average_rating']

    C = data['book_average_rating'].mean()

    m = data['ratings_count']>25
    
    data['weighted_avg'] = ((R*v) + (C*m)) / (v+m)

    # change column type of book_id to be a integer, as it was a float, so hard to look up the weighted average
    hybrid['book_id'] = hybrid['book_id'].astype(int)
    
    # look at how to use the book_id to then find the weighted average and then add this to hybrid table 
    id_weight = data.set_index('book_id')['weighted_avg'].to_dict()

    # maps the weighted_avg values to the 'book_id' column
    hybrid['weighted_avg'] = hybrid['book_id'].map(id_weight)

    # sorts the dataframe by 'weighted_avg'
    hybrid_sorted = hybrid.sort_values(by='weighted_avg', ascending=False)

    # Select the top n rows with unique values
    top_n_unique_values = hybrid_sorted.head(n).drop_duplicates(subset='title_without_series')
    # gets rid of the weighted_avg column to allow for user friendly format
    values = top_n_unique_values.drop('weighted_avg',axis=1)

    return values

In [632]:
# example usage
user_id = '9b808f1cf7160f03647fb8b8aefd4ffb'
n = 10
hybrid_recs_rs2 = hybrid_recommendations(user_id, ratings_100k, n, predictions)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['weighted_avg'] = ((R*v) + (C*m)) / (v+m)


In [633]:
hybrid_recs_rs2

Unnamed: 0,book_id,title_without_series
36,20591541,The Eternal Chain
27278,30242428,Bloodwalker
23073,27845484,"Eve: A Christmas Ghost Story (Psychic Surveys,..."
19,28351603,Twisted Justice
96085,35480285,A Deadly Game
11,34743122,Blood Money
11494,25651905,Why You Were Taken (When Tomorrow Calls #1)
23,29199942,What She Never Told Me
35,23664378,Guilt


In [634]:
hybrid_recs

Unnamed: 0,book_id,title_without_series
50,36070517,"Tramps and Thieves (Murder and Mayhem, #2)"
27278,30242428,Bloodwalker
23073,27845484,"Eve: A Christmas Ghost Story (Psychic Surveys,..."
52401,28351603,Twisted Justice
96085,35480285,A Deadly Game
11494,25651905,Why You Were Taken (When Tomorrow Calls #1)
41,29430732,Kill the Father
32250,23664378,Guilt
18937,34700191,Undercurrent
67,30269117,Inferno


In [636]:
precision_at_k2(hybrid_recs,n,user_id)

0.7

In [637]:
precision_at_k2(hybrid_recs_rs2,n,user_id)

0.6