## Book recommendation system, specifically for Mystery, Crime and Thrillers

In [3]:
# import necessary packages to interpret data
import gzip
import json
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


## Loading the Dataset

The datasets have been downloaded from; https://mengtingwan.github.io/data/goodreads. There are 219,235 books, 1,849,236 detailed reviews and 24,799,896 interactions, but for the purpose of this system we will sample 100k.

In [2]:
# looks at the format of the data, so we can load necessary columns
with gzip.open('./data/goodreads_books_mystery_thriller_crime.json.gz') as f:
  line = f.readline()
json.loads(line)  

{'isbn': '184737297X',
 'text_reviews_count': '15',
 'series': ['169353'],
 'country_code': 'US',
 'language_code': '',
 'popular_shelves': [{'count': '159', 'name': 'to-read'},
  {'count': '12', 'name': 'historical-fiction'},
  {'count': '11', 'name': 'mystery'},
  {'count': '10', 'name': 'historical-mystery'},
  {'count': '7', 'name': 'medieval'},
  {'count': '6', 'name': 'historical'},
  {'count': '5', 'name': 'crime'},
  {'count': '3', 'name': 'series'},
  {'count': '3', 'name': 'ebook'},
  {'count': '3', 'name': 'fiction'},
  {'count': '3', 'name': 'crowner-john'},
  {'count': '2', 'name': 'default'},
  {'count': '2', 'name': 'books'},
  {'count': '2', 'name': 'owned'},
  {'count': '2', 'name': 'mystery-thrillers'},
  {'count': '2', 'name': 'c'},
  {'count': '2', 'name': 'library'},
  {'count': '2', 'name': 'wish-list'},
  {'count': '2', 'name': 'england'},
  {'count': '2', 'name': 'medieval-england'},
  {'count': '2', 'name': 'mystery-historical'},
  {'count': '2', 'name': 'audio

In [3]:
with gzip.open('./data/goodreads_reviews_mystery_thriller_crime.json.gz') as f:
  line = f.readline()
json.loads(line)  

{'user_id': '8842281e1d1347389f2ab93d60773d4d',
 'book_id': '6392944',
 'review_id': '5e212a62bced17b4dbe41150e5bb9037',
 'rating': 3,
 'review_text': "I haven't read a fun mystery book in a while and not sure I've ever read Poirot. Was looking for a fun read set in France while I was on holiday there and this didn't disappoint! Fast paced and good mystery. \n One that struck me was how similar Poirot is to Sherlock. They are both detectives, have a ex-military sidekick who is telling the story, and solve mysteries using their superior wit. Poirot seems like a French Sherlock. I'm curious if he was inspired by Sherlock.",
 'date_added': 'Mon Jul 24 02:48:17 -0700 2017',
 'date_updated': 'Sun Jul 30 09:28:03 -0700 2017',
 'read_at': 'Tue Jul 25 00:00:00 -0700 2017',
 'started_at': 'Mon Jul 24 00:00:00 -0700 2017',
 'n_votes': 6,
 'n_comments': 0}

In [4]:
with gzip.open('./data/goodreads_interactions_mystery_thriller_crime.json.gz') as f:
  line = f.readline()
json.loads(line)  

{'user_id': '8842281e1d1347389f2ab93d60773d4d',
 'book_id': '6392944',
 'review_id': '5e212a62bced17b4dbe41150e5bb9037',
 'is_read': True,
 'rating': 3,
 'review_text_incomplete': "I haven't read a fun mystery book in a while and not sure I've ever read Poirot. Was looking for a fun read set in France while I was on holiday there and this didn't disappoint! Fast paced and good mystery.<br /><br />One that struck me was how similar Poirot is to Sherlock. They are both detectives, have a...",
 'date_added': 'Mon Jul 24 02:48:17 -0700 2017',
 'date_updated': 'Sun Jul 30 09:28:03 -0700 2017',
 'read_at': 'Tue Jul 25 00:00:00 -0700 2017',
 'started_at': 'Mon Jul 24 00:00:00 -0700 2017'}

## Parsers

The code for the parsers and loading the data is based on this github repo: https://github.com/amitmldlai/Book-Recommendation-System/blob/main/Notebook/Book-Recommendation.ipynb, as I found it was the quickest way to do this due to the size of the various files. It also inspired some of my data preperation approaches

In [4]:
def user_parser(line):
  data = {
      'book_id': line['book_id'],
      'user_id': line['user_id'],
      'review_text': line['review_text'],
      'review_id' : line['review_id'],
      'n_votes': line['n_votes'],
      'user_rating': line['rating']
  } 
  return data

def book_parser(line):
  data = {
      'book_id': line['book_id'],
      'title_without_series': line['title_without_series'],
      'book_description': line['description'],
      'publication_year': line['publication_year'],
      'publisher': line['publisher'],
      'ratings_count': line['ratings_count'],
      'book_average_rating': line['average_rating'],
      'cover_page': line['image_url'],
      'book_url': line['url'],
      'is_ebook': line['is_ebook'],
      'num_pages': line['num_pages'],
      'country_code': line['country_code'],
      'language_code': line['language_code'],
  }
  return data


### Books

In [119]:
# 35s to convert
books = list()
with gzip.open('./data/goodreads_books_mystery_thriller_crime.json.gz', 'r') as f:
  while True:
    line = f.readline()
    if not line:
      break
    book = book_parser(json.loads(line))
    try:
      # makes sure the book has sufficient ratings, so will discount a lot of the low rating books
      # as this could create unessecary bias within the system
      if int(book['ratings_count'])>25 and int(book['ratings_count'])<1000:
        books.append(book)
    except Exception:
      continue    

df_books = pd.DataFrame.from_dict(books)
df_books.to_csv('df_books.csv.gz', index=False, compression='gzip') 

In [5]:
books = pd.read_csv('df_books.csv.gz')
books.head()

Unnamed: 0,book_id,title_without_series,book_description,publication_year,publisher,ratings_count,book_average_rating,cover_page,book_url,is_ebook,num_pages,country_code,language_code
0,6066814,"Crowner Royal (Crowner John Mystery, #13)","London, 1196. At the command of Richard the Li...",2009.0,Simon & Schuster UK,186,3.93,https://images.gr-assets.com/books/1328724803m...,https://www.goodreads.com/book/show/6066814-cr...,False,400.0,US,
1,33394837,The House of Memory (Pluto's Snitch #2),,,,269,4.33,https://images.gr-assets.com/books/1493114742m...,https://www.goodreads.com/book/show/33394837-t...,True,318.0,US,eng
2,29074697,The Slaughtered Virgin of Zenopolis (Inspector...,"BATHS, BANKS AND ROMAN INSURRECTION\nDetective...",,Amazon Digital Services,192,3.49,https://s.gr-assets.com/assets/nophoto/book/11...,https://www.goodreads.com/book/show/29074697-t...,True,,US,eng
3,1902202,"Dead in the Morning (Patrick Grant, #1)","Gerald breezily introduced his wife, Helen, to...",1975.0,Ulverscroft,52,3.3,https://s.gr-assets.com/assets/nophoto/book/11...,https://www.goodreads.com/book/show/1902202.De...,False,,US,
4,2805495,Wycliffe and the Cycle of Death,A respectable bookseller is found bludgeoned a...,2001.0,Orion,58,3.61,https://images.gr-assets.com/books/1328819096m...,https://www.goodreads.com/book/show/2805495-wy...,False,320.0,US,


In [6]:
books.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 111654 entries, 0 to 111653
Data columns (total 13 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   book_id               111654 non-null  int64  
 1   title_without_series  111654 non-null  object 
 2   book_description      100901 non-null  object 
 3   publication_year      81189 non-null   float64
 4   publisher             79129 non-null   object 
 5   ratings_count         111654 non-null  int64  
 6   book_average_rating   111654 non-null  float64
 7   cover_page            111654 non-null  object 
 8   book_url              111654 non-null  object 
 9   is_ebook              111654 non-null  bool   
 10  num_pages             78226 non-null   float64
 11  country_code          111654 non-null  object 
 12  language_code         71690 non-null   object 
dtypes: bool(1), float64(3), int64(2), object(7)
memory usage: 10.3+ MB


### Users

In [9]:
# 3 mins to convert

users = list()
with gzip.open('./data/goodreads_reviews_mystery_thriller_crime.json.gz', 'r') as f:
  while True:
    line = f.readline()
    if not line:
      break
    user = user_parser(json.loads(line))
    users.append(user)

df_users = pd.DataFrame.from_dict(users)
df_users.to_csv('df_users.csv.gz', index=False, compression='gzip')

In [7]:
users = pd.read_csv('df_users.csv.gz')
users.head()

Unnamed: 0,book_id,user_id,review_text,review_id,n_votes,user_rating
0,6392944,8842281e1d1347389f2ab93d60773d4d,I haven't read a fun mystery book in a while a...,5e212a62bced17b4dbe41150e5bb9037,6,3
1,28684704,8842281e1d1347389f2ab93d60773d4d,"A fun, fast paced science fiction thriller. I ...",2ede853b14dc4583f96cf5d120af636f,22,3
2,32283133,8842281e1d1347389f2ab93d60773d4d,http://www.telegraph.co.uk/culture/10...,8e4d61801907e591018bdc3442a9cf2b,9,0
3,17860739,8842281e1d1347389f2ab93d60773d4d,An amazing and unique creation: JJ Abrams and ...,022bb6daffa49adc27f6b20b6ebeb37d,7,4
4,8694005,8842281e1d1347389f2ab93d60773d4d,The Name of the Rose is a thrilling Dan Brown-...,0e317947e1fd341f573192111bb2921d,17,3


In [8]:
users.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1849236 entries, 0 to 1849235
Data columns (total 6 columns):
 #   Column       Dtype 
---  ------       ----- 
 0   book_id      int64 
 1   user_id      object
 2   review_text  object
 3   review_id    object
 4   n_votes      int64 
 5   user_rating  int64 
dtypes: int64(3), object(3)
memory usage: 84.7+ MB


## Data Preparation 

In [9]:
# make copies of both datasets;
users_copy = users.copy()
books_copy = books.copy()

Check for any occurences of books that aren't english

In [10]:
books_copy.describe()

Unnamed: 0,book_id,publication_year,ratings_count,book_average_rating,num_pages
count,111654.0,81189.0,111654.0,111654.0,78226.0
mean,14158790.0,2008.296851,167.516891,3.838677,322.160727
std,10541810.0,130.550807,192.067462,0.310052,164.473045
min,164.0,2.0,26.0,1.45,0.0
25%,3257460.0,2005.0,46.0,3.66,240.0
50%,13330320.0,2011.0,87.0,3.86,318.0
75%,22916000.0,2014.0,203.0,4.04,392.0
max,36496900.0,20113.0,999.0,4.95,16925.0


In [11]:
# lists all unique vales 
for col in list(books_copy):
    # looks at unique values of these 2 to make sure we only have english descriptions
    if col == 'language_code' or col == 'country_code':
        print(col,books_copy[col].unique())

country_code ['US']
language_code [nan 'eng' 'en-CA' 'gre' 'fin' 'en-GB' 'en-US' 'ita' 'spa' 'ger' 'swe'
 'nl' 'en' 'ind' 'rum' 'fre' 'por' 'afr' 'dan' 'srp' 'ben' 'pol' 'vie'
 'nor' 'tha' 'bul' 'nob' 'cze' 'ara' 'tur' 'kan' 'rus' 'msa' 'est' 'isl'
 'jpn' 'slo' 'scr' 'lav' 'heb' 'lit' 'tam' 'zho' 'cat' 'ukr' 'hin' 'hun'
 'vls' 'per' 'din' 'kat' 'glg' 'nno' 'frs' 'urd' '--' 'nld' 'fil' 'mlt'
 'kor' 'es-MX' 'egy' 'mul' 'mal' 'sin' 'tel' 'ady' 'mus' 'aus']


can keep 'eng', 'en-CA', 'en-GB', 'en-US', 'en', 'aus', as these are all english and nan values

In [12]:
# keeps only the valid language codes, allows for proper content based filtering
# used this to work out valid codes https://www.loc.gov/standards/iso639-2/php/English_list.php 
valid_language_codes = ['eng', 'en-CA', 'en-GB', 'en-US', 'en', 'aus', np.nan]

books_copy = books_copy[books_copy['language_code'].isin(valid_language_codes)]

In [13]:
books_copy.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 96357 entries, 0 to 111652
Data columns (total 13 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   book_id               96357 non-null  int64  
 1   title_without_series  96357 non-null  object 
 2   book_description      86230 non-null  object 
 3   publication_year      67302 non-null  float64
 4   publisher             65045 non-null  object 
 5   ratings_count         96357 non-null  int64  
 6   book_average_rating   96357 non-null  float64
 7   cover_page            96357 non-null  object 
 8   book_url              96357 non-null  object 
 9   is_ebook              96357 non-null  bool   
 10  num_pages             64402 non-null  float64
 11  country_code          96357 non-null  object 
 12  language_code         56393 non-null  object 
dtypes: bool(1), float64(3), int64(2), object(7)
memory usage: 9.6+ MB


### Checks for any duplicate entries, and removes if they occur

In [14]:
users[['user_id','book_id','user_rating']].duplicated().sum()

0

In [15]:
books_copy[['title_without_series','book_average_rating']].duplicated().sum()

23707

In [16]:
# gets rid of any duplicate ratings within the books
books_copy.drop_duplicates(subset=['title_without_series','book_average_rating'], keep='first', inplace=True)

books_copy.shape

(72650, 13)

In [17]:
books_copy.isnull().any(axis=0)

book_id                 False
title_without_series    False
book_description         True
publication_year         True
publisher                True
ratings_count           False
book_average_rating     False
cover_page              False
book_url                False
is_ebook                False
num_pages                True
country_code            False
language_code            True
dtype: bool

In [18]:
books_copy[books_copy.isnull().any(axis=1)][['num_pages', 'book_average_rating','publication_year', 'publisher']].describe()

Unnamed: 0,num_pages,book_average_rating,publication_year
count,24439.0,49505.0,26747.0
mean,304.813986,3.842399,2005.762702
std,144.261588,0.331202,33.003229
min,0.0,1.92,2.0
25%,231.0,3.64,2002.0
50%,304.0,3.85,2009.0
75%,368.0,4.06,2013.0
max,5104.0,4.95,2911.0


In [19]:
books_copy.shape

(72650, 13)

In [20]:
# num_pages has 8304 null values and publication_year has 10663 null values, so we take the median value of each column and place this in

books_copy['publication_year'] = books_copy['publication_year'].fillna(books_copy['publication_year'].median())
books_copy['num_pages'] = books_copy['num_pages'].fillna(books_copy['num_pages'].median())
books_copy['language_code'] = books_copy['language_code'].fillna('en-US')

In [21]:
books_copy.isnull().any(axis=0)

book_id                 False
title_without_series    False
book_description         True
publication_year        False
publisher                True
ratings_count           False
book_average_rating     False
cover_page              False
book_url                False
is_ebook                False
num_pages               False
country_code            False
language_code           False
dtype: bool

In [22]:
# not sure how to process the text columns to fill the null/missing values? -> just drop these as these have no real importance to the ratings_count?
books_copy.dropna(subset=['publisher', 'book_description'],axis=0,inplace=True)

# subset allows for quicker operations on mutliple columns

In [23]:
books_copy.isnull().any(axis=0)

book_id                 False
title_without_series    False
book_description        False
publication_year        False
publisher               False
ratings_count           False
book_average_rating     False
cover_page              False
book_url                False
is_ebook                False
num_pages               False
country_code            False
language_code           False
dtype: bool

### Check the users/ratings now

In [24]:
users_copy.isnull().any(axis=0)

book_id        False
user_id        False
review_text     True
review_id      False
n_votes        False
user_rating    False
dtype: bool

In [25]:
users_copy.dropna(subset=['review_text'],axis=0,inplace=True)

In [26]:
users_copy.isnull().any(axis=0)

book_id        False
user_id        False
review_text    False
review_id      False
n_votes        False
user_rating    False
dtype: bool

In [27]:
users_copy.shape

(1848810, 6)

In [28]:
books_copy.shape

(45256, 13)

## Merge the datasets

In [29]:
ratings = pd.merge(books_copy, users_copy, on='book_id')

In [30]:
ratings.shape

# have 312k entries for the ratings, so take a sample of the first 100k that is the same each time

(312110, 18)

In [31]:
ratings.columns

Index(['book_id', 'title_without_series', 'book_description',
       'publication_year', 'publisher', 'ratings_count', 'book_average_rating',
       'cover_page', 'book_url', 'is_ebook', 'num_pages', 'country_code',
       'language_code', 'user_id', 'review_text', 'review_id', 'n_votes',
       'user_rating'],
      dtype='object')

In [32]:
ratings.head()

Unnamed: 0,book_id,title_without_series,book_description,publication_year,publisher,ratings_count,book_average_rating,cover_page,book_url,is_ebook,num_pages,country_code,language_code,user_id,review_text,review_id,n_votes,user_rating
0,6066814,"Crowner Royal (Crowner John Mystery, #13)","London, 1196. At the command of Richard the Li...",2009.0,Simon & Schuster UK,186,3.93,https://images.gr-assets.com/books/1328724803m...,https://www.goodreads.com/book/show/6066814-cr...,False,400.0,US,en-US,9ab00cb54e02704c5a7bd5ca90564e2e,I enjoyed this book. It kept me guessing until...,8ee2d0644aa02cdf9a40dde4adaade04,0,5
1,29074697,The Slaughtered Virgin of Zenopolis (Inspector...,"BATHS, BANKS AND ROMAN INSURRECTION\nDetective...",2011.0,Amazon Digital Services,192,3.49,https://s.gr-assets.com/assets/nophoto/book/11...,https://www.goodreads.com/book/show/29074697-t...,True,304.0,US,eng,42dabb3a33d9a437387496a2dd50a247,It seems to take place in a parallel world ful...,c0e3b3879264cb7a888c5da4f5bf25c4,0,2
2,1902202,"Dead in the Morning (Patrick Grant, #1)","Gerald breezily introduced his wife, Helen, to...",1975.0,Ulverscroft,52,3.3,https://s.gr-assets.com/assets/nophoto/book/11...,https://www.goodreads.com/book/show/1902202.De...,False,304.0,US,en-US,1e7bca7d5f8e3d6460320704c423c12e,Lively,22d5b7a4c8c7a48ec09b8779ecc76221,0,4
3,1902202,"Dead in the Morning (Patrick Grant, #1)","Gerald breezily introduced his wife, Helen, to...",1975.0,Ulverscroft,52,3.3,https://s.gr-assets.com/assets/nophoto/book/11...,https://www.goodreads.com/book/show/1902202.De...,False,304.0,US,en-US,6a352b99747118883209c79d4d1f587d,I'm always trying to find the perfect mystery ...,0b104fb43b46e65758a1837450fa90d0,0,4
4,2805495,Wycliffe and the Cycle of Death,A respectable bookseller is found bludgeoned a...,2001.0,Orion,58,3.61,https://images.gr-assets.com/books/1328819096m...,https://www.goodreads.com/book/show/2805495-wy...,False,320.0,US,en-US,5c4fd56b7e95eb4d92ee8814b883eab8,"I enjoy the Wycliffe books,simpley because,the...",cc5598497557cd09bdd6b6246104bf99,0,4


### num_pages

In [33]:
# first we need to work out the distribution of this to see if there are any outliers

# look at percentiles to work out this

for i in range(0, 110,10):
    print(f'{i}th percentile is', np.percentile(ratings['num_pages'], i))

for i in range(0, 11,1):
    print(f'{i}th percentile is', np.percentile(ratings['num_pages'], i))

for i in range(90, 101,1):
    print(f'{i}th percentile is', np.percentile(ratings['num_pages'], i))



0th percentile is 0.0
10th percentile is 196.0
20th percentile is 240.0
30th percentile is 272.0
40th percentile is 296.0
50th percentile is 304.0
60th percentile is 320.0
70th percentile is 340.0
80th percentile is 370.0
90th percentile is 416.0
100th percentile is 16925.0
0th percentile is 0.0
1th percentile is 12.0
2th percentile is 46.0
3th percentile is 80.0
4th percentile is 108.0
5th percentile is 130.0
6th percentile is 150.0
7th percentile is 164.0
8th percentile is 178.0
9th percentile is 189.0
10th percentile is 196.0
90th percentile is 416.0
91th percentile is 424.0
92th percentile is 432.0
93th percentile is 444.0
94th percentile is 452.0
95th percentile is 465.0
96th percentile is 480.0
97th percentile is 500.0
98th percentile is 528.0
99th percentile is 592.0
100th percentile is 16925.0


Can see that we have a few books with less than 11 pages, so we will drop below this, as this will be too short.

Also, we have a book with 16925 pages, so we will drop above 1000 pages. As this is only 260 ratings out of 312153

There are 1231 ratings swith books below 5 pages, so we will also drop these. As it is unlikely you want  abook woth less than 5 pages

In [34]:
ratings[ratings['num_pages']>=1000.0]['num_pages'].count()

259

In [35]:
ratings[ratings['num_pages']<=5.0]['num_pages'].count()

1229

In [36]:
ratings.drop(ratings[ratings['num_pages'] <= 10.0].index, inplace=True)
ratings.drop(ratings[ratings['num_pages'] >= 1000.0].index, inplace=True)

In [37]:
ratings.shape

(309344, 18)

In [38]:
# select the first 100,000 cases to use for the RS, as mentioned in the FAQs the way this is done doesn't matter
ratings_100k = ratings.head(100000)

# verify size
ratings_100k.shape

(100000, 18)

In [41]:
# turns the ratings into a .csv file to allow for the use in the command line interface
ratings_100k.to_csv('ratings_clean_100k.csv')

In [325]:
from sklearn.model_selection import train_test_split
# perform the train/test split of 80/20

train_df, test_df = train_test_split(ratings_100k, train_size=0.8,test_size=0.2)

In [326]:
train_df

Unnamed: 0,book_id,title_without_series,book_description,publication_year,publisher,ratings_count,book_average_rating,cover_page,book_url,is_ebook,num_pages,country_code,language_code,user_id,review_text,review_id,n_votes,user_rating,weighted_avg
89048,24611949,"Just Killing Time (Clock Shop Mystery, #1)",First in a new cozy series that ticks with exc...,2015.0,Berkley,237,4.04,https://images.gr-assets.com/books/1427846109m...,https://www.goodreads.com/book/show/24611949-j...,False,294.0,US,en-GB,fe7e61537dffd723c765be23c40cea40,Really enjoyed this debut of a series... The m...,b059eb3bc39d81498c7a017911991ad8,0,4,4.039019
12965,15742835,Town in a Pumpkin Bash (A Candy Holliday Myst...,In the quaint seaside villageof Cape Willingto...,2013.0,Penguin Group (USA),467,3.85,https://images.gr-assets.com/books/1349938045m...,https://www.goodreads.com/book/show/15742835-t...,False,336.0,US,en-US,1f920e1dda9ebc58fc6e5f1a3e6a7acf,I really liked this book because it was festiv...,e1f0ab61a37ead27c16a646e3dde4137,3,4,3.849907
94584,15768116,"The Red Pole of Macau (Ava Lee, #4)","In The Red Pole of Macau, Ava's half-brother M...",2012.0,House of Anansi,838,3.87,https://images.gr-assets.com/books/1348432330m...,https://www.goodreads.com/book/show/15768116-t...,False,336.0,US,eng,961c9deb14d8f2be94876eba803d6bf1,"Book 4, in the Ava Lee series \n This series i...",214775b756f8e6bea145dd966d4bd6a0,0,3,3.869924
68298,10329868,October Fest (Murder-by-Month Mystery #6),Beer and polka music reign supreme at Octoberf...,2011.0,Midnight Ink,388,3.72,https://s.gr-assets.com/assets/nophoto/book/11...,https://www.goodreads.com/book/show/10329868-o...,False,229.0,US,en-US,eb0a72ec26fb13d99388bfd8141ab95b,I have not read a Mira James book that I haven...,a19e17230164350e1b984e3119afa522,0,5,3.720222
559,23252909,"Crucifixion Creek (The Belltree Trilogy, #1)",Homicide detective Harry Belltree wouldn't usu...,2014.0,Text Publishing,331,3.54,https://images.gr-assets.com/books/1411188356m...,https://www.goodreads.com/book/show/23252909-c...,False,255.0,US,eng,dd75a7c8b35b9500914d4dec9b427197,What is this about: Harry Belltree is a Sydney...,c228893171903f9a2b2ba4b0d196cc11,0,3,3.540802
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37283,31208858,The Woman Who Couldn't Scream,From the New York Timesbestselling author come...,2017.0,St. Martin's Press,401,4.19,https://images.gr-assets.com/books/1482527141m...,https://www.goodreads.com/book/show/31208858-t...,False,352.0,US,eng,32da874ae1d31c0cb392afa3a8e001f3,"In my review of ""Obsession Falls"" (Virtue Fall...",cb4d0258cd8d0d8c2f63b407b19ab347,1,5,4.189046
3384,12438022,"The Anatomy of Death (Dr Dody McCleland, #1)","At the turn of the twentieth century, London's...",2012.0,Berkley Trade,169,3.67,https://images.gr-assets.com/books/1328141839m...,https://www.goodreads.com/book/show/12438022-t...,False,307.0,US,eng,aeada51312c0f5625c52b549a13c92c8,The Anatomy of Death is the first in a new ser...,ab08033969441b7ede6eb680a79edfe8,1,3,3.670802
40139,22055854,A Head Full Of Knives,THE NEW NOVEL FROM THE AUTHOR OF THE #1 BESTSE...,2011.0,Amazon Digital Services,278,3.88,https://s.gr-assets.com/assets/nophoto/book/11...,https://www.goodreads.com/book/show/22055854-a...,True,304.0,US,eng,9437002ebd6b22743003edfac9676fa5,Wow. Luke (may I call you Luke?) has done it a...,5f91009af3ee2900e06bf950531b9c31,0,5,3.879736
37571,28502862,Incensed (A Taipei Night Market #2),"In Taiwan, the Mid-Autumn Festival is a time f...",2016.0,Soho Crime,37,3.37,https://images.gr-assets.com/books/1470765236m...,https://www.goodreads.com/book/show/28502862-i...,True,304.0,US,en-US,8a7311cf27149551e157f48c941006d4,I love reading mysteries set in different plac...,dc482aca54b5d8ff66bc85f3d51574d0,1,2,3.381485


## Content-based filtering


In [46]:
# create a copy of the training ratings
content_ratings = ratings_100k.copy()

In [303]:
# takes about 12s
def create_tf_idf(data):

    # tokenize the text data e.g. descprtion
    # gets rid of the english stop words, to help give a better decsription
    tf_idf = TfidfVectorizer(stop_words='english')

    # fit data and transform to tdidf matrix
    tf_idf_matrix = tf_idf.fit_transform(data['book_description'])

    return tf_idf_matrix

# creates the matrix of all ratings to allow for the use in the system
matrix = create_tf_idf(content_ratings)

To help with the cold start problem, we will recommend the books with the highest weighted average to the user

In [48]:
# cold start avoidance in the case of a new user
def cold_start_avoidance(data, n):
    # sorts the most popular books via weighted avg, to help with the good books with less reviews
    popular_books = data.sort_values(by='weighted_avg', ascending=False)
    # gets rid of any duplicates
    popular_books = popular_books.drop_duplicates(subset='title_without_series')
    # gets the top n books
    popular_n_books = popular_books.head(n)[['book_id', 'title_without_series']]
    # turns into dataframe for easy of use and consistency
    df = pd.DataFrame(popular_n_books, columns=['book_id', 'title_without_series'])
    return df

In [447]:
def get_recommendations(user_id, data, n,matrix, full_ratings):
    # first checks if the user is registered/ in the system:
    if user_id not in data['user_id'].unique():
        # if not we return the best books
        return cold_start_avoidance(data, n)
    
    # creates the tf_idf matrix dependent on the data given
    #tf_idf_matrix = create_tf_idf(data)
    tf_idf_matrix = matrix

    # Extract books liked by the user, if the id is valid 
    user_books = data[data['user_id'] == user_id]['title_without_series'].tolist()
        
    # create dataframe for storing recommmendations
    recommendations_df = pd.DataFrame(columns=['Cosine Similarity'])

    # keep track of recommended books to avoid duplicates
    recommended_books_set = set()

    # looks through each book liked by the user gets similar books based on cosine similarities
    for book in user_books:
        # aggregates td-idf vectors for each book
        liked_book_idx = data.index[(data['title_without_series'] == book) & (data['user_id'] == user_id)].tolist()[0]
        liked_book_tfidf = tf_idf_matrix[liked_book_idx]

        # calculate cosine similarity of current book with the rest of the books
        cosine_similarities = cosine_similarity(liked_book_tfidf, tf_idf_matrix).flatten()

        # indices of 'close' books
        sim_indices = cosine_similarities.argsort()[::-1]

        # gets rid of any books already liked by the user
        # have to pass through the full dataset otherwise can given iloc errors due to indexs not being in the correct
        # train/test split
        sim_indices = [i for i in sim_indices if (full_ratings['title_without_series'].iloc[i] not in user_books) and 
                                                (full_ratings['title_without_series'].iloc[i] not in recommended_books_set)]

        # add the top n books to the dataframe
        top_recommendations = full_ratings['title_without_series'].iloc[sim_indices].tolist()
        cosine_sim_values = cosine_similarities[sim_indices]
        book_id = full_ratings['book_id'].iloc[sim_indices].tolist()

        recommendations_df = recommendations_df.append(pd.DataFrame({'book_id':book_id, 'title_without_series': top_recommendations, 'Cosine Similarity': cosine_sim_values}))

        recommended_books_set.update(top_recommendations)

    # sorts recommendations via cosine similarity
    recommendations_df = recommendations_df.sort_values(by=['Cosine Similarity'], ascending=False)

    # gets top n unique recommendations
    unique_recommendations = recommendations_df.drop_duplicates(subset=['title_without_series']).head(n)
    
    # gets rid of cosine similarity, to allow for ease of use for user
    unique_recommendations = unique_recommendations.drop('Cosine Similarity', axis=1)
    return unique_recommendations

In [572]:
# example usage
user_id = '9b808f1cf7160f03647fb8b8aefd4ffb'
get_recommendations(user_id, ratings_100k, 10, matrix, content_ratings)

Unnamed: 0,book_id,title_without_series
0,251808.0,"The Mosaic Crimes (Dante Alighieri, #2)"
34,259042.0,The Dante Club
41,29430732.0,Kill the Father
43,3890688.0,"Never Say Sty (Kendra Ballantyne, Pet-Sitter M..."
46,24450451.0,Inferno: A Novel Unabridged Edition
50,36070517.0,"Tramps and Thieves (Murder and Mayhem, #2)"
52,25926794.0,Duce (World's End #1)
67,30269117.0,Inferno
70,27161832.0,We Were Kings
79,24485919.0,"The Last Honeytrap (Florence Love, #1)"


## Collaborative Filtering

In [51]:
collab_ratings = ratings_100k.copy()

In [51]:
pip install surprise 


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.1.2[0m[39;49m -> [0m[32;49m23.3.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [416]:
# for collab filtering use the suprise module to allow
from surprise import SVD , SVDpp
from surprise import Dataset, Reader
from surprise.model_selection import cross_validate
from surprise.model_selection import train_test_split as tts2

The collaboartive filtering recommendation code has been adapted using the logicand examples presented here; https://pub.aimind.so/implementing-a-collaborative-filtering-recommendation-system-using-surprise-a-step-by-step-guide-2e879a34e021

In [53]:
re = Reader(rating_scale=(0.0,5.0))

In [361]:
trainset = Dataset.load_from_df(train_df[["user_id", "book_id", "book_average_rating"]], re)
testset = Dataset.load_from_df(test_df[["user_id", "book_id", "book_average_rating"]], re)
full_data = Dataset.load_from_df(collab_ratings[["user_id", "book_id", "book_average_rating"]], re)

In [421]:
trainset2, testset2 = tts2(full_data, test_size=0.2)

Comparison of SVD and SVD++ across 5 folds to see which is better

In [595]:
algo = SVD()
cross_validate(algo,full_data, measures=["RMSE"], cv=5, verbose=True)

Evaluating RMSE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.1887  0.1840  0.1824  0.1866  0.1857  0.1855  0.0021  
Fit time          1.30    1.27    1.24    1.33    1.25    1.28    0.03    
Test time         0.14    0.13    22.14   0.13    0.13    4.53    8.80    


{'test_rmse': array([0.18865167, 0.18396849, 0.18242575, 0.18663097, 0.18565618]),
 'fit_time': (1.3000288009643555,
  1.2701051235198975,
  1.2384250164031982,
  1.3263790607452393,
  1.2489640712738037),
 'test_time': (0.13849806785583496,
  0.13400602340698242,
  22.1388521194458,
  0.13197994232177734,
  0.13136577606201172)}

In [596]:
algo = SVDpp()
cross_validate(algo,full_data, measures=["RMSE"], cv=5, verbose=True)

Evaluating RMSE of algorithm SVDpp on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.1568  0.1568  0.1562  0.1580  0.1568  0.1569  0.0006  
Fit time          1.73    1.74    1.74    1.84    1.84    1.78    0.05    
Test time         0.45    0.47    11.68   0.55    0.47    2.72    4.48    


{'test_rmse': array([0.15683573, 0.15680932, 0.15615002, 0.15797296, 0.15677781]),
 'fit_time': (1.7334482669830322,
  1.7430388927459717,
  1.7411158084869385,
  1.8445050716400146,
  1.8381340503692627),
 'test_time': (0.45234179496765137,
  0.46922802925109863,
  11.683666944503784,
  0.5468008518218994,
  0.46802520751953125)}

In [570]:
# fits the SVD++ algo, and then uses the test set for predictions
algo2=SVDpp()
algo2.fit(trainset2)
predictions_2 = algo2.test(testset2)

In [599]:
# allows for the use in the the cli interface
pred = pd.DataFrame(predictions_2)
pred.to_csv('predictions_collab.csv')

In [426]:
def make_recommendations(data_df, n, predictions, user_id):
    
    # turns predictions into a dataframe for easy use
    pred = pd.DataFrame(predictions)
    # gets ratings for only specific user
    user_ratings = data_df[data_df['user_id'] == user_id]
    
    # merges predictions with the ratings on user_id and book_id
    merged_data = pd.merge(pred, user_ratings, left_on='iid', right_on='book_id')

    # sorts merged dataframe based on the estimated ratings
    merged_data.sort_values(by=['est'], inplace=True, ascending=False)
    
    unique_recommendations = []
    # looks through each prediction
    for book_id in merged_data['iid']:
        if book_id not in unique_recommendations:
            # adds to list if it is unique
            unique_recommendations.append(book_id)

    # makes a dataframe of the books and corresponding titles, to keep consistent with content based
    corresponding_titles = data_df[data_df['book_id'].isin(unique_recommendations)][['book_id', 'title_without_series']].drop_duplicates()

    return pd.DataFrame(corresponding_titles)

In [569]:
# example usage
user_id = '9b808f1cf7160f03647fb8b8aefd4ffb'
make_recommendations(ratings_100k, 10, predictions_2,user_id)

Unnamed: 0,book_id,title_without_series
11494,25651905,Why You Were Taken (When Tomorrow Calls #1)
23073,27845484,"Eve: A Christmas Ghost Story (Psychic Surveys,..."
23308,28821606,The Last Thing I Remember
27278,30242428,Bloodwalker
32250,23664378,Guilt
52401,28351603,Twisted Justice
96085,35480285,A Deadly Game
98948,32766340,The Hunger Within


## Hybrid System

Combine the 2 functions and then pick the best values

The code and weighted average approach has been inspired by this example https://thecleverprogrammer.com/2023/06/05/hybrid-recommendation-system-using-python/ and this paper https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=9590147 


In [584]:
def hybrid_recommendations_rs1(user_id, data, n, algorithm, matrix, full_ratings):
    # calls the collaborative and content based filtering functions
    content = get_recommendations(user_id, data, n, matrix, full_ratings)
    collab = make_recommendations(data, n, algorithm, user_id)
    # join the 2 dataframes
    hybrid = pd.concat([content, collab])

    # use a weighted system based on number of ratings, to decide on the weighting to use for the hybrid approach

    v = data['ratings_count']
    R = data['book_average_rating']

    C = data['book_average_rating'].mean()

    m = data['ratings_count']>25
    
    data_copy = data.copy()
    data_copy.loc[:,'weighted_avg'] = ((R*v) + (C*m)) / (v+m)


    # change column type of book_id to be a integer, as it was a float, so hard to look up the weighted average
    hybrid['book_id'] = hybrid['book_id'].astype(int)
    
    # look at how to use the book_id to then find the weighted average and then add this to hybrid table 
    id_weight = data_copy.set_index('book_id')['weighted_avg'].to_dict()

    # maps the weighted_avg values to the 'book_id' column
    hybrid['weighted_avg'] = hybrid['book_id'].map(id_weight)

    # sorts the dataframe by 'weighted_avg'
    hybrid_sorted = hybrid.sort_values(by='weighted_avg', ascending=False)

    # Select the top n rows with unique values
    top_n_unique_values = hybrid_sorted.head(n).drop_duplicates(subset='title_without_series')
    # gets rid of the weighted_avg column to allow for user friendly format
    values = top_n_unique_values.drop('weighted_avg',axis=1)

    return values

In [585]:
# example usage
user_id = '9b808f1cf7160f03647fb8b8aefd4ffb'
n=10
hybrid_recs = hybrid_recommendations_rs1(user_id, ratings_100k, n, predictions_2, matrix, ratings_100k)

In [586]:
hybrid_recs

Unnamed: 0,book_id,title_without_series
50,36070517,"Tramps and Thieves (Murder and Mayhem, #2)"
27278,30242428,Bloodwalker
23073,27845484,"Eve: A Christmas Ghost Story (Psychic Surveys,..."
52401,28351603,Twisted Justice
96085,35480285,A Deadly Game
11494,25651905,Why You Were Taken (When Tomorrow Calls #1)
41,29430732,Kill the Father
32250,23664378,Guilt
98948,32766340,The Hunger Within
67,30269117,Inferno


Evaulation metric, precision@k

In [450]:
# pick a user_id in the test set to allow for evaulation
test_df['user_id']

6741     fca26c34be8fe623ee340061f1281796
99445    1e665751171140504fe141bc0b32d561
52630    7a59f54e4132d7cee8e35e0fc27b911e
34331    fe4b4225b3abb22d1fa8a2f084afd3d1
7934     d90a5f4e9c9f218b502e0256051e10f9
                       ...               
41194    e74bf12dfe132ed793fba533926c6ce6
74352    795595616d3dbd81bd16b617c9a1fa48
87690    e4e71efd6de21e8dab4b1e6c73640221
28998    24cc9173407cba53efb8a93a98052644
4781     ed4add87b3c704b27a027e00c5e88fa5
Name: user_id, Length: 20000, dtype: object

Using the user_id of 1e665751171140504fe141bc0b32d561 for evaulation

In [453]:
def precision_at_k2(recommendations,k, user_id):
    # get items that are relevant to user 
    user_rated_items = test_df[test_df['user_id'] == user_id]['book_id'].tolist()
    # then finds all the common books between the ones that the user has previously rated and then ones that the recommender has predicted
    # then divides by, the number of recommendations to get this value
    precision = len(set(recommendations['book_id']) & set(user_rated_items)) / k
    return precision

In [547]:
user_id = '1e665751171140504fe141bc0b32d561'
n=5
eval_recs_5 = hybrid_recommendations_rs1(user_id, test_df, n, predictions_2, matrix, ratings_100k)

In [548]:
user_id = '1e665751171140504fe141bc0b32d561'
n=10
eval_recs_10 = hybrid_recommendations_rs1(user_id, test_df, n, predictions_2, matrix, ratings_100k)

In [554]:
user_id = '1e665751171140504fe141bc0b32d561'
n=15
eval_recs_15 = hybrid_recommendations_rs1(user_id, test_df, n, predictions_2, matrix, ratings_100k)

In [549]:
user_id = '1e665751171140504fe141bc0b32d561'
n=20
eval_recs_20 = hybrid_recommendations_rs1(user_id, test_df, n, predictions_2, matrix, ratings_100k)

In [555]:
precision_hybrid = precision_at_k2(eval_recs_5,5,user_id)
print(f'Precision@{5} for hybrid recommendations: {precision_hybrid}')

Precision@5 for hybrid recommendations: 0.4


In [556]:
precision_hybrid = precision_at_k2(eval_recs_10,10,user_id)
print(f'Precision@{10} for hybrid recommendations: {precision_hybrid}')

Precision@10 for hybrid recommendations: 0.3


In [557]:
precision_hybrid = precision_at_k2(eval_recs_15,15,user_id)
print(f'Precision@{15} for hybrid recommendations: {precision_hybrid}')

Precision@15 for hybrid recommendations: 0.26666666666666666


In [552]:
precision_hybrid = precision_at_k2(eval_recs_20,20,user_id)
print(f'Precision@{20} for hybrid recommendations: {precision_hybrid}')

Precision@20 for hybrid recommendations: 0.25


# RS2 - Deep Learning + NLP approach 

In [74]:
# creates a copy of the 100k sample to allow for any preperation needed for rs2
rs2_ratings = ratings_100k.copy()

In [75]:
rs2_ratings.shape

(100000, 19)

In [76]:
rs2_ratings.head()

Unnamed: 0,book_id,title_without_series,book_description,publication_year,publisher,ratings_count,book_average_rating,cover_page,book_url,is_ebook,num_pages,country_code,language_code,user_id,review_text,review_id,n_votes,user_rating,weighted_avg
0,6066814,"Crowner Royal (Crowner John Mystery, #13)","London, 1196. At the command of Richard the Li...",2009.0,Simon & Schuster UK,186,3.93,https://images.gr-assets.com/books/1328724803m...,https://www.goodreads.com/book/show/6066814-cr...,False,400.0,US,en-US,9ab00cb54e02704c5a7bd5ca90564e2e,I enjoyed this book. It kept me guessing until...,8ee2d0644aa02cdf9a40dde4adaade04,0,5,3.929339
1,29074697,The Slaughtered Virgin of Zenopolis (Inspector...,"BATHS, BANKS AND ROMAN INSURRECTION\nDetective...",2011.0,Amazon Digital Services,192,3.49,https://s.gr-assets.com/assets/nophoto/book/11...,https://www.goodreads.com/book/show/29074697-t...,True,304.0,US,eng,42dabb3a33d9a437387496a2dd50a247,It seems to take place in a parallel world ful...,c0e3b3879264cb7a888c5da4f5bf25c4,0,2,3.491639
2,1902202,"Dead in the Morning (Patrick Grant, #1)","Gerald breezily introduced his wife, Helen, to...",1975.0,Ulverscroft,52,3.3,https://s.gr-assets.com/assets/nophoto/book/11...,https://www.goodreads.com/book/show/1902202.De...,False,304.0,US,en-US,1e7bca7d5f8e3d6460320704c423c12e,Lively,22d5b7a4c8c7a48ec09b8779ecc76221,0,4,3.309555
3,1902202,"Dead in the Morning (Patrick Grant, #1)","Gerald breezily introduced his wife, Helen, to...",1975.0,Ulverscroft,52,3.3,https://s.gr-assets.com/assets/nophoto/book/11...,https://www.goodreads.com/book/show/1902202.De...,False,304.0,US,en-US,6a352b99747118883209c79d4d1f587d,I'm always trying to find the perfect mystery ...,0b104fb43b46e65758a1837450fa90d0,0,4,3.309555
4,2805495,Wycliffe and the Cycle of Death,A respectable bookseller is found bludgeoned a...,2001.0,Orion,58,3.61,https://images.gr-assets.com/books/1328819096m...,https://www.goodreads.com/book/show/2805495-wy...,False,320.0,US,en-US,5c4fd56b7e95eb4d92ee8814b883eab8,"I enjoy the Wycliffe books,simpley because,the...",cc5598497557cd09bdd6b6246104bf99,0,4,3.613329


Text PreProcessing for Content based filtering rs2

In [85]:
# imports necessary for RS2
import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize
# removes stopwords in pre-processing
nltk.download('stopwords')
# uses this module for the text tokenization
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jacobdear/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/jacobdear/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [86]:
# creats a list of all stopwords from the nltk built-in library
stop_words_tpp = set(stopwords.words('english'))
print(stop_words_tpp)

{'will', 'both', 'no', 'after', "hadn't", 'herself', 'y', 'hasn', 'have', 'but', 'over', 'above', 'your', 'be', 'do', 'below', 'hadn', 'isn', 'our', 'each', 'most', 'into', 'yourselves', 'who', 'having', "doesn't", 'during', 'just', "it's", 'she', 'are', 'by', 'off', 'then', 'before', 'there', 'needn', 'ma', 'won', 'under', 'because', 'again', 'her', 'were', 'me', 'while', 'ourselves', "you'll", 'very', 'not', 'that', 'we', "won't", 'how', 'shouldn', 'until', 'he', "shan't", 'd', "she's", 'here', 'doesn', 'wasn', "needn't", "haven't", 'himself', 'for', 'own', 'out', 'now', 'nor', 'if', "wouldn't", 'other', 'those', 'once', 'mustn', 'all', 'don', 'myself', 'i', 'whom', "isn't", 'the', 'where', 'some', 'themselves', 'more', 'down', 'further', 't', 's', 'and', 'can', 'same', 'been', 'of', "wasn't", "you'd", 'their', 'too', 'between', 'few', 'any', "hasn't", 'theirs', 'through', 'them', 'couldn', 'than', 'didn', "mightn't", 'haven', 'has', 're', 'its', 'about', 'you', 'mightn', 'from', 'm'

This pre-processing function has been used from here, but been made to work with my own textual data; https://github.com/ashok426/Recommender-System/blob/main/book_recommendation.ipynb 

In [130]:
rs2_ratings_copy = rs2_ratings.copy()
def nlp_preprocessing(total_text, index, column):
    if type(total_text) is not int:
        string = ""
        for words in total_text.split():
            # remove the special chars in review like '"#$@!%^&*()_+-~?>< etc.
            word = ("".join(e for e in words if e.isalnum()))
            # Conver all letters to lower-case
            word = word.lower()
            # stop-word removal
            if not word in stop_words_tpp:
                string += word + " "
        # updates the specified row and colummn pair with the new text
        rs2_ratings_copy.loc[index,column] = string

In [131]:
# perform it for the book_description
for index, row in rs2_ratings.iterrows():
    nlp_preprocessing(row['book_description'],index, 'book_description')

In [132]:
# perform it for the book description
for index, row in rs2_ratings.iterrows():
    nlp_preprocessing(row['title_without_series'],index, 'title_without_series')

In [193]:
rs2_ratings_copy.head()

Unnamed: 0,book_id,title_without_series,book_description,publication_year,publisher,ratings_count,book_average_rating,cover_page,book_url,is_ebook,...,country_code,language_code,user_id,review_text,review_id,n_votes,user_rating,weighted_avg,combined_text,tokenised_text
0,6066814,crowner royal crowner john mystery 13,london 1196 command richard lionheart sir john...,2009.0,Simon & Schuster UK,186,3.93,https://images.gr-assets.com/books/1328724803m...,https://www.goodreads.com/book/show/6066814-cr...,False,...,US,en-US,9ab00cb54e02704c5a7bd5ca90564e2e,I enjoyed this book. It kept me guessing until...,8ee2d0644aa02cdf9a40dde4adaade04,0,5,3.929339,crowner royal crowner john mystery 13 london ...,"[crowner, royal, crowner, john, mystery, 13, l..."
1,29074697,slaughtered virgin zenopolis inspector capstan 1,baths banks roman insurrection detective inspe...,2011.0,Amazon Digital Services,192,3.49,https://s.gr-assets.com/assets/nophoto/book/11...,https://www.goodreads.com/book/show/29074697-t...,True,...,US,eng,42dabb3a33d9a437387496a2dd50a247,It seems to take place in a parallel world ful...,c0e3b3879264cb7a888c5da4f5bf25c4,0,2,3.491639,slaughtered virgin zenopolis inspector capstan...,"[slaughtered, virgin, zenopolis, inspector, ca..."
2,1902202,dead morning patrick grant 1,gerald breezily introduced wife helen mrs mack...,1975.0,Ulverscroft,52,3.3,https://s.gr-assets.com/assets/nophoto/book/11...,https://www.goodreads.com/book/show/1902202.De...,False,...,US,en-US,1e7bca7d5f8e3d6460320704c423c12e,Lively,22d5b7a4c8c7a48ec09b8779ecc76221,0,4,3.309555,dead morning patrick grant 1 gerald breezily ...,"[dead, morning, patrick, grant, 1, gerald, bre..."
3,1902202,dead morning patrick grant 1,gerald breezily introduced wife helen mrs mack...,1975.0,Ulverscroft,52,3.3,https://s.gr-assets.com/assets/nophoto/book/11...,https://www.goodreads.com/book/show/1902202.De...,False,...,US,en-US,6a352b99747118883209c79d4d1f587d,I'm always trying to find the perfect mystery ...,0b104fb43b46e65758a1837450fa90d0,0,4,3.309555,dead morning patrick grant 1 gerald breezily ...,"[dead, morning, patrick, grant, 1, gerald, bre..."
4,2805495,wycliffe cycle death,respectable bookseller found bludgeoned strang...,2001.0,Orion,58,3.61,https://images.gr-assets.com/books/1328819096m...,https://www.goodreads.com/book/show/2805495-wy...,False,...,US,en-US,5c4fd56b7e95eb4d92ee8814b883eab8,"I enjoy the Wycliffe books,simpley because,the...",cc5598497557cd09bdd6b6246104bf99,0,4,3.613329,wycliffe cycle death respectable bookseller f...,"[wycliffe, cycle, death, respectable, booksell..."


In [134]:
# add a column that will allow us to tokenise the textual data, have already applied the preprocessing techniques above
rs2_ratings_copy['combined_text'] = rs2_ratings_copy['title_without_series'] + ' ' + rs2_ratings_copy['book_description']

In [135]:
# creates a new column that contains the tokenised text
rs2_ratings_copy['tokenised_text'] = rs2_ratings_copy['combined_text'].apply(word_tokenize)

In [262]:
rs2_ratings_copy.head()

Unnamed: 0,book_id,title_without_series,book_description,publication_year,publisher,ratings_count,book_average_rating,cover_page,book_url,is_ebook,...,country_code,language_code,user_id,review_text,review_id,n_votes,user_rating,weighted_avg,combined_text,tokenised_text
0,6066814,crowner royal crowner john mystery 13,london 1196 command richard lionheart sir john...,2009.0,Simon & Schuster UK,186,3.93,https://images.gr-assets.com/books/1328724803m...,https://www.goodreads.com/book/show/6066814-cr...,False,...,US,en-US,9ab00cb54e02704c5a7bd5ca90564e2e,I enjoyed this book. It kept me guessing until...,8ee2d0644aa02cdf9a40dde4adaade04,0,5,3.929339,crowner royal crowner john mystery 13 london ...,"[crowner, royal, crowner, john, mystery, 13, l..."
1,29074697,slaughtered virgin zenopolis inspector capstan 1,baths banks roman insurrection detective inspe...,2011.0,Amazon Digital Services,192,3.49,https://s.gr-assets.com/assets/nophoto/book/11...,https://www.goodreads.com/book/show/29074697-t...,True,...,US,eng,42dabb3a33d9a437387496a2dd50a247,It seems to take place in a parallel world ful...,c0e3b3879264cb7a888c5da4f5bf25c4,0,2,3.491639,slaughtered virgin zenopolis inspector capstan...,"[slaughtered, virgin, zenopolis, inspector, ca..."
2,1902202,dead morning patrick grant 1,gerald breezily introduced wife helen mrs mack...,1975.0,Ulverscroft,52,3.3,https://s.gr-assets.com/assets/nophoto/book/11...,https://www.goodreads.com/book/show/1902202.De...,False,...,US,en-US,1e7bca7d5f8e3d6460320704c423c12e,Lively,22d5b7a4c8c7a48ec09b8779ecc76221,0,4,3.309555,dead morning patrick grant 1 gerald breezily ...,"[dead, morning, patrick, grant, 1, gerald, bre..."
3,1902202,dead morning patrick grant 1,gerald breezily introduced wife helen mrs mack...,1975.0,Ulverscroft,52,3.3,https://s.gr-assets.com/assets/nophoto/book/11...,https://www.goodreads.com/book/show/1902202.De...,False,...,US,en-US,6a352b99747118883209c79d4d1f587d,I'm always trying to find the perfect mystery ...,0b104fb43b46e65758a1837450fa90d0,0,4,3.309555,dead morning patrick grant 1 gerald breezily ...,"[dead, morning, patrick, grant, 1, gerald, bre..."
4,2805495,wycliffe cycle death,respectable bookseller found bludgeoned strang...,2001.0,Orion,58,3.61,https://images.gr-assets.com/books/1328819096m...,https://www.goodreads.com/book/show/2805495-wy...,False,...,US,en-US,5c4fd56b7e95eb4d92ee8814b883eab8,"I enjoy the Wycliffe books,simpley because,the...",cc5598497557cd09bdd6b6246104bf99,0,4,3.613329,wycliffe cycle death respectable bookseller f...,"[wycliffe, cycle, death, respectable, booksell..."


In [195]:
# turns the ratings into a .csv file to allow for the use in the command line interface
rs2_ratings_copy.to_csv('rs2_clean_ratings.csv')

In [230]:
# split this dataset 80/20 to allow for train/testing
books = rs2_ratings_copy['book_id'].unique().tolist()
# gets 80% of books
rs2_train = [books[i] for i in range(round(0.8*len(books)))]

train_df = rs2_ratings_copy[rs2_ratings_copy['book_id'].isin(rs2_train)]
# all values not in training set, 20% for testing set
test_df = rs2_ratings_copy[~rs2_ratings_copy['book_id'].isin(rs2_train)]

train_df, test_df = train_test_split(rs2_ratings_copy)

In [270]:
train_df_1, test_df_1 = train_test_split(rs2_ratings_copy, test_size=0.2)

In [271]:
train_df_1.shape

(80000, 21)

In [565]:
pip install gensim 


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.1.2[0m[39;49m -> [0m[32;49m23.3.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [255]:
# use this to implement Word2Vec 

# paper # use the NLP approach, with word2vec model / CBOW model from https://arxiv.org/pdf/1301.3781.pdf  
from gensim.models import Word2Vec


This repository inspired me as to how to implemenent the word2vec model https://www.analyticsvidhya.com/blog/2019/07/how-to-build-recommendation-system-word2vec-python/

In [258]:
from tqdm import tqdm
# list to capture books read by the customers
books_read = []

# populate the list with the book codes
for index, row in tqdm(train_df_1.iterrows(), total=len(train_df_1)):
    temp = train_df_1[train_df_1["user_id"] == row['user_id']]['book_id'].tolist()
    books_read.append(temp)

100%|██████████| 80000/80000 [11:33<00:00, 115.28it/s] 


In [259]:
# list to capture books read by the customers
books_read_test = []

# populate the list with the book codes
for index, row in tqdm(test_df_1.iterrows(), total=len(test_df_1)):
    temp = test_df_1[test_df_1["user_id"] == row['user_id']]['book_id'].tolist()
    books_read_test.append(temp)

100%|██████████| 20000/20000 [00:50<00:00, 396.82it/s]


In [263]:
# train word2vec model
w2_model = Word2Vec(vector_size=100, window = 10, sg = 1,
                 negative = 10, # for negative sampling
                 alpha=0.025,
                 seed = 42)
# buils the vocabulary, using the book_ids and the tokenised text, so takes into account the book description and titles as well
w2_model.build_vocab(books_read + train_df['tokenised_text'].tolist())

Function interpreted from https://stackoverflow.com/questions/52038651/loss-does-not-decrease-during-training-word2vec-gensim

Allows for the training loss at each step rather than the total thing

In [260]:
from gensim.models.callbacks import CallbackAny2Vec
# allows us to print each epoch and the corresponding loss value
class callback(CallbackAny2Vec):
    def __init__(self):
        self.epoch = 1
        self.loss_previous_step = 0

    # when the epoch ends we print this out to see progress
    def on_epoch_end(self, model):
        loss = model.get_latest_training_loss()
        if self.epoch == 0:
            print('Loss after epoch {}: {}'.format(self.epoch, loss))
        else:
            print(f"Epoch {self.epoch}, Loss: {loss - self.loss_previous_step}")
        self.epoch += 1
        self.loss_previous_step = loss


In [831]:
#skipgram model - allow for finding which is best
w2_model.train(books_read + train_df['tokenised_text'].tolist(),
               total_examples=w2_model.corpus_count,
               epochs=10,
               compute_loss=True,
               callbacks=[callback()]
              )

Epoch 1, Loss: 29598854.0
Epoch 2, Loss: 18148558.0
Epoch 3, Loss: 16627320.0
Epoch 4, Loss: 9504900.0
Epoch 5, Loss: 8064872.0
Epoch 6, Loss: 7859680.0
Epoch 7, Loss: 7595984.0
Epoch 8, Loss: 6912848.0
Epoch 9, Loss: 5759848.0
Epoch 10, Loss: 4118480.0


(82358363, 83396620)

In [832]:
# train word2vec model, using cbow -> compare with
w4_model = Word2Vec(vector_size=100, window = 10, sg = 0,
                 negative = 10, # for negative sampling
                 alpha=0.025,
                 seed = 42)
# buils the vocabulary, using the book_ids and the tokenised text, so takes into account the book description and titles as well
w4_model.build_vocab(books_read + train_df['tokenised_text'].tolist())
# calls the class to log epochs
# trains on a combination of book_ids and tokenised text, for all the value in the corpus, which is the set of 
# vectors in the vocab, trains for 10 epochs
# compute_loss=True, so we can see the progress and then allows for the printing of epochs via callbacks
w4_model.train(books_read + train_df['tokenised_text'].tolist(),
               total_examples=w4_model.corpus_count,
               epochs=100,
               compute_loss=True,
               callbacks=[callback()]
              )

Epoch 1, Loss: 5202126.0
Epoch 2, Loss: 3383501.0
Epoch 3, Loss: 2873230.0
Epoch 4, Loss: 2779805.0
Epoch 5, Loss: 2708960.0
Epoch 6, Loss: 2187416.0
Epoch 7, Loss: 2140948.0
Epoch 8, Loss: 2116866.0
Epoch 9, Loss: 2169410.0
Epoch 10, Loss: 2472746.0


(82359252, 83396620)

In [290]:
books_read + train_df_1['tokenised_text'].tolist()

[[21524042, 31247797, 31212885, 23014725, 21525975, 18296034, 31450942],
 [13139541,
  25926794,
  20559181,
  18478288,
  17189646,
  32065756,
  33349745,
  17666614,
  22010295,
  25620200,
  31421768],
 [25553902, 20739400],
 [28700203, 28439261, 28700205, 28352399, 30753708, 31163136],
 [29619495],
 [178187, 15818484, 2458493, 3078243, 16171164, 745597],
 [2675253],
 [17210630],
 [33602157, 13418181, 33517546, 7777102],
 [3798689],
 [33287719],
 [16044661],
 [16135291,
  17159217,
  15728523,
  17325144,
  23462659,
  18835612,
  16131037,
  399488,
  13496561,
  1745179,
  24943335,
  20735474,
  24430636,
  13144471,
  18752899,
  13453467,
  8474440,
  20727623,
  6065908,
  3340430,
  17910157,
  610943,
  21432392,
  17231578,
  18305462,
  13227592,
  23348689,
  9627683,
  6971840,
  15897054,
  24982722,
  845874],
 [6801218, 7777102, 13418181],
 [21897920,
  31818197,
  28541293,
  34040919,
  9286133,
  33113495,
  35063606,
  32591869,
  35222416,
  33545061,
  32198502

In [272]:
# train word2vec model, using cbow, this is the model used in rs2
w3_model = Word2Vec(vector_size=100, window = 10, sg = 0,
                 negative = 10, # for negative sampling
                 alpha=0.025,
                 seed = 42)
# buils the vocabulary, using the book_ids and the tokenised text, so takes into account the book description and titles as well
w3_model.build_vocab(books_read + train_df_1['tokenised_text'].tolist())
# calls the class to log epochs
# trains on a combination of book_ids and tokenised text, for all the value in the corpus, which is the set of 
# vectors in the vocab, trains for 10 epochs
# compute_loss=True, so we can see the progress and then allows for the printing of epochs via callbacks
w3_model.train(books_read + train_df_1['tokenised_text'].tolist(),
               total_examples=w3_model.corpus_count,
               epochs=100,
               compute_loss=True,
               callbacks=[callback()]
              )

Epoch 1, Loss: 6816385.5
Epoch 2, Loss: 4216818.5
Epoch 3, Loss: 3825105.0
Epoch 4, Loss: 3365591.0
Epoch 5, Loss: 2926982.0
Epoch 6, Loss: 2908826.0
Epoch 7, Loss: 2895336.0
Epoch 8, Loss: 2904184.0
Epoch 9, Loss: 2920082.0
Epoch 10, Loss: 2346922.0
Epoch 11, Loss: 2147456.0
Epoch 12, Loss: 2167228.0
Epoch 13, Loss: 2188012.0
Epoch 14, Loss: 2205416.0
Epoch 15, Loss: 2231648.0
Epoch 16, Loss: 2238860.0
Epoch 17, Loss: 2259172.0
Epoch 18, Loss: 2251700.0
Epoch 19, Loss: 2287796.0
Epoch 20, Loss: 2289868.0
Epoch 21, Loss: 2301368.0
Epoch 22, Loss: 2303076.0
Epoch 23, Loss: 2316916.0
Epoch 24, Loss: 2310936.0
Epoch 25, Loss: 1367740.0
Epoch 26, Loss: 1115624.0
Epoch 27, Loss: 1121912.0
Epoch 28, Loss: 1124160.0
Epoch 29, Loss: 1119896.0
Epoch 30, Loss: 1123616.0
Epoch 31, Loss: 1122984.0
Epoch 32, Loss: 1128296.0
Epoch 33, Loss: 1128472.0
Epoch 34, Loss: 1130256.0
Epoch 35, Loss: 1129224.0
Epoch 36, Loss: 1115424.0
Epoch 37, Loss: 1118544.0
Epoch 38, Loss: 1110544.0
Epoch 39, Loss: 11175

(824915521, 836570400)

In [273]:
# saves the model, allowing for the use in the cli 
w3_model.save("w3_model.model")

In [274]:
print(w3_model)

Word2Vec<vocab=64215, vector_size=100, alpha=0.025>


In [508]:
# takes the average all previously rated items and produces a resultant vector
def aggregate_vectors(books):
    book_vec = []
    for i in books:
        try:
            book_vec.append(w3_model.wv[i])
        except KeyError:
            continue
        
    return np.mean(book_vec, axis=0)

In [563]:
def content_rs2(user_id, data, n):
    
    # extract liked books by user
    user_books = data[data['user_id'] == user_id]['book_id'].tolist()
    # calculate the resultant vector
    user_vector = aggregate_vectors(user_books)
    # extract most similar words/books for the input vector
    ms = w3_model.wv.most_similar(user_vector, topn= n+1)[1:]
    #print(ms)
    # extracts the top n most similar books, excluding itself
    book_ids = [int(j[0]) for j in ms]
    # creates a df of the most similar book_ids
    book_ids_df = pd.DataFrame({'book_id': book_ids})

    # merges the dataset with the original dataset to allow for the titles to be in proper formats
    # not lower and with all stopwords and punctutation removed like train_df
    new_ms = pd.merge(book_ids_df, rs2_ratings, how='left', on='book_id')
    # gets rid of the duplicated values when merging, can run into problems due to having mutliple instances 
    # of same book_id
    new_ms.drop_duplicates(inplace=True, subset='book_id', keep="last")
    
    # returns the dataframe but only the book_id and title column
    return new_ms[['book_id', 'title_without_series']]


In [574]:
# example usage
content_rs2('9b808f1cf7160f03647fb8b8aefd4ffb', ratings_100k,10)

Unnamed: 0,book_id,title_without_series
11,23664378,Guilt
20,22079279,"Behind a Closed Door (The Estate, #2)"
30,34146224,"Robbing the Dead (Inspector Jim Carruthers, #1)"
54,27845484,"Eve: A Christmas Ghost Story (Psychic Surveys,..."
65,29073299,A Necessary Act
70,28500919,Betrayal
74,32766340,The Hunger Within
79,28351603,Twisted Justice
85,22079303,"Fighting for Survival (The Estate, #3)"
120,27237074,The Theseus Paradox (DI Jake Flannagan #1)


In [581]:
def hybrid_recommendations(user_id, data, n, predictions):
    # calls the collaborative and content based filtering functions
    # calls rs2 for content rather than the original function
    content = content_rs2(user_id, data, n)
    collab = make_recommendations(data, n, predictions, user_id)
    # join the 2 dataframes
    hybrid = pd.concat([content, collab])
     
    # use a weighted system based on number of ratings, to decide on the weighting to use for the hybrid approach

    # inspired by this paper: https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=9590147 

    v = data['ratings_count']
    R = data['book_average_rating']

    C = data['book_average_rating'].mean()

    m = data['ratings_count']>25
    data_copy = data.copy()
    data_copy.loc[:,'weighted_avg'] = ((R*v) + (C*m)) / (v+m)

    # change column type of book_id to be a integer, as it was a float, so hard to look up the weighted average
    hybrid['book_id'] = hybrid['book_id'].astype(int)
    
    # look at how to use the book_id to then find the weighted average and then add this to hybrid table 
    id_weight = data_copy.set_index('book_id')['weighted_avg'].to_dict()

    # maps the weighted_avg values to the 'book_id' column
    hybrid['weighted_avg'] = hybrid['book_id'].map(id_weight)

    # sorts the dataframe by 'weighted_avg'
    hybrid_sorted = hybrid.sort_values(by='weighted_avg', ascending=False)
    # drops any duplicates
    top_n_unique_values = hybrid_sorted.drop_duplicates(subset='title_without_series')
    # gets rid of the weighted_avg column to allow for user friendly format
    values = top_n_unique_values.drop('weighted_avg',axis=1)

    # returns top n values
    return values.head(n)

In [582]:
# example usage
user_id = '9b808f1cf7160f03647fb8b8aefd4ffb'
n = 10
hybrid_recommendations(user_id, ratings_100k, n, predictions_2)

Unnamed: 0,book_id,title_without_series
27278,30242428,Bloodwalker
23073,27845484,"Eve: A Christmas Ghost Story (Psychic Surveys,..."
85,22079303,"Fighting for Survival (The Estate, #3)"
20,22079279,"Behind a Closed Door (The Estate, #2)"
52401,28351603,Twisted Justice
120,27237074,The Theseus Paradox (DI Jake Flannagan #1)
96085,35480285,A Deadly Game
11494,25651905,Why You Were Taken (When Tomorrow Calls #1)
32250,23664378,Guilt
74,32766340,The Hunger Within


Evaulation metric for RS2, novelty

In [634]:
from recmetrics import novelty
from collections import Counter

In [617]:
# A dictionary of all items alongside of its occurrences counter in the training data example, allows for us in novelty metric
book_id_counter = Counter(train_df_1['book_id'])
# turns the elements into a dict to allow for the correct use in function
occurrences_dict = dict(book_id_counter)

In [618]:
# gets the unique number of users in the training set
num_users = train_df_1['user_id'].nunique()

Using these recommendations for the metrics

In [530]:
user_id = '1e665751171140504fe141bc0b32d561'
n = 5
eval_rs2_5 = hybrid_recommendations(user_id, test_df, n, predictions_2)

In [644]:
user_id = '1e665751171140504fe141bc0b32d561'
n = 10
eval_rs2_10 = hybrid_recommendations(user_id, test_df, n, predictions_2)

In [560]:
user_id = '1e665751171140504fe141bc0b32d561'
n = 15
eval_rs2_15 = hybrid_recommendations(user_id, test_df, n, predictions_2)

In [559]:
user_id = '1e665751171140504fe141bc0b32d561'
n = 20
eval_rs2_20 = hybrid_recommendations(user_id, test_df, n, predictions_2)

Novelty@k for RS2

In [646]:
novelty([eval_rs2_5['book_id'].tolist()],occurrences_dict,num_users,5)

(11.865801902493896, [11.865801902493896])

In [645]:
novelty([eval_rs2_10['book_id'].tolist()],occurrences_dict,num_users,10)

(12.58109376010756, [12.58109376010756])

In [647]:
novelty([eval_rs2_15['book_id'].tolist()],occurrences_dict,num_users,15)

(13.096324585541785, [13.096324585541785])

In [648]:
novelty([eval_rs2_20['book_id'].tolist()],occurrences_dict,num_users,20)

(13.245443748186778, [13.245443748186778])

Precision @k for RS2

In [639]:
precision_hybrid = precision_at_k2(eval_rs2_5,5,user_id)
print(f'Precision@{5} for rs2 hybrid recommendations: {precision_hybrid}')

Precision@5 for rs2 hybrid recommendations: 1.0


In [640]:
precision_hybrid = precision_at_k2(eval_rs2_10,10,user_id)
print(f'Precision@{10} for rs2 hybrid recommendations : {precision_hybrid}')

Precision@10 for rs2 hybrid recommendations : 0.5


In [641]:
precision_hybrid = precision_at_k2(eval_rs2_15,15,user_id)
print(f'Precision@{15} for rs2 hybrid recommendations: {precision_hybrid}')

Precision@15 for rs2 hybrid recommendations: 0.3333333333333333


In [643]:
precision_hybrid = precision_at_k2(eval_rs2_20,20,user_id)
print(f'Precision@{20} for rs2 hybrid recommendations: {precision_hybrid}')

Precision@20 for rs2 hybrid recommendations: 0.25


Novelty@K for RS1

In [649]:
novelty([eval_recs_5['book_id'].tolist()],occurrences_dict,num_users,5)

(11.213195021327136, [11.213195021327136])

In [642]:
novelty([eval_recs_10['book_id'].tolist()],occurrences_dict,num_users,10)

(11.931908450474591, [11.931908450474591])

In [650]:
novelty([eval_recs_15['book_id'].tolist()],occurrences_dict,num_users,15)

(12.007517485614589, [12.007517485614589])

In [651]:
novelty([eval_recs_20['book_id'].tolist()],occurrences_dict,num_users,20)

(12.098266687637265, [12.098266687637265])

## Code for command line interface

Below is the code I used for the 2 recommender systems.
Note; recommender.py and cli.py are for RS1 and rs2_recommender.py and rs2_cli.py are for RS2 as these load different values

recommender.py

In [652]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# content based recommendation system
# for td_idf matrix
def create_tf_idf(data):

    # tokenize the text data e.g. descprtion
    # gets rid of the english stop words, to help give a better decsription
    tf_idf = TfidfVectorizer(stop_words='english')

    # fit data and transform to tdidf matrix
    tf_idf_matrix = tf_idf.fit_transform(data['book_description'])

    return tf_idf_matrix

# cold start avoidance in the case of a new user
def cold_start_avoidance(data, n,previous_recommendations):
    # makes sure these aren't already in the set
    popular_books = data[~data['book_id'].isin(previous_recommendations)]
    # gets rid of any duplicates
    popular_books = popular_books.drop_duplicates(subset='title_without_series')
    # sorts the most popular books via weighted avg, to help with the good books with less reviews
    popular_books = data.sort_values(by='weighted_avg', ascending=False)
    # gets the top n books
    popular_n_books = popular_books.head(n)[['book_id', 'title_without_series']]
    # turns into dataframe for easy of use and consistency
    df = pd.DataFrame(popular_n_books, columns=['book_id', 'title_without_series'])
    return df

# function for content-based recommendations
def get_content_recommendations(user_id, data, n, previous_recommendations):
    # first checks if the user is registered/ in the system:
    if user_id not in data['user_id'].unique():
        return cold_start_avoidance(data, n,previous_recommendations)
    
    # creates the tf_idf matrix dependent on the data given
    tf_idf_matrix = create_tf_idf(data)

    
    # Extract books liked by the user, if the id is valid 
    user_books = data[data['user_id'] == user_id]['title_without_series'].tolist()
        
    # Create a DataFrame to store recommendations and their cosine similarities
    recommendations_df = pd.DataFrame(columns=['Cosine Similarity'])

    # Keep track of recommended books to avoid duplicates
    recommended_books_set = set()

    # Look through each book liked by the user and find similar books based on cosine similarities
    for book in user_books:
        # Aggregate TF-IDF vectors for liked books
        liked_book_idx = data.index[(data['title_without_series'] == book) & (data['user_id'] == user_id)].tolist()[0]
        liked_book_tfidf = tf_idf_matrix[liked_book_idx]

        # Calculate cosine similarity between the liked book and all other books
        cosine_similarities = cosine_similarity(liked_book_tfidf, tf_idf_matrix).flatten()

        # Get indices of books sorted by similarity (excluding liked books)
        sim_indices = cosine_similarities.argsort()[::-1]

        # Filter out books the user has already liked and those already recommended
        sim_indices = [i for i in sim_indices if (data['title_without_series'].iloc[i] not in user_books) and 
                                                (data['title_without_series'].iloc[i] not in recommended_books_set)
                                                and(data['title_without_series'].iloc[i] not in previous_recommendations)]

        # Add top n recommendations to the DataFrame, for each book
        top_recommendations = data['title_without_series'].iloc[sim_indices].tolist()
        cosine_sim_values = cosine_similarities[sim_indices]
        book_id = data['book_id'].iloc[sim_indices].tolist()

        recommendations_df = recommendations_df.append(pd.DataFrame({'book_id':book_id, 'title_without_series': top_recommendations, 'Cosine Similarity': cosine_sim_values}))

        recommended_books_set.update(top_recommendations)

    # Sort recommendations by cosine similarity in descending order
    recommendations_df = recommendations_df.sort_values(by=['Cosine Similarity'], ascending=False)

    # Get unique top n recommendations
    unique_recommendations = recommendations_df.drop_duplicates(subset=['title_without_series']).head(n)

    # drops the Cosine Similarity column as it is no longer needed
    unique_recommendations = unique_recommendations.drop('Cosine Similarity', axis=1)

    return unique_recommendations

# collaborative filtering
def make_recommendations(data, n, predictions, user_id, previous_recommendations):
    #pred = pd.DataFrame(predictions)
    # Filter collab_ratings for the specified user_id
    user_ratings = data[data['user_id'] == user_id]
    
    # Merge predictions with user_ratings based on the book_id
    merged_data = pd.merge(predictions, user_ratings, left_on='iid', right_on='book_id')

    # Sorting the merged DataFrame based on estimated ratings
    merged_data.sort_values(by=['est'], inplace=True, ascending=False)

    unique_books_set = set()
    unique_recommendations = []

    for book_id in merged_data['iid']:
        # doesn't contain any duplicate or previous recs
        if book_id not in unique_books_set and book_id not in previous_recommendations:
            unique_books_set.add(book_id)
            unique_recommendations.append(book_id)

    corresponding_titles = data[data['book_id'].isin(unique_recommendations)][['book_id', 'title_without_series']].drop_duplicates()

    return corresponding_titles


# hybrid recommendation system
def hybrid_recommendations(user_id, data, n, predictions,previous_recommendations):
    # call the 2 functions incase they haven't already been run
    content = get_content_recommendations(user_id, data, n,previous_recommendations)
    collab = make_recommendations(data, n, predictions, user_id,previous_recommendations)
    # join the 2 dataframes
    hybrid = pd.concat([content, collab])

    # gets rid of any books that may have already been predicted
    hybrid = hybrid[~hybrid['book_id'].isin(previous_recommendations)]
     
    # use a weighted system based on number of ratings, to decide on the weighting to use for the hybrid approach

    # inspired by this paper: https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=9590147 

    v = data['ratings_count']
    R = data['book_average_rating']

    C = data['book_average_rating'].mean()

    m = data['ratings_count']>25
    
    data['weighted_avg'] = ((R*v) + (C*m)) / (v+m)

    # change column type of book_id to be a integer, as it was a float, so hard to look up the weighted average
    hybrid['book_id'] = hybrid['book_id'].astype(int)
    
    # look at how to use the book_id to then find the weighted average and then add this to hybrid table 
    id_weight = data.set_index('book_id')['weighted_avg'].to_dict()

    # maps the weighted_avg values to the 'book_id' column in the hybrid system
    hybrid['weighted_avg'] = hybrid['book_id'].map(id_weight)

    # sorts the df by 'weighted_avg' in descending order
    hybrid_sorted = hybrid.sort_values(by='weighted_avg', ascending=False)

    # selecs the top n rows with unique values
    top_n_unique_values = hybrid_sorted.head(n).drop_duplicates(subset='title_without_series')
    # gets rid of the weighted_avg column to allow for user friendly format
    
    values = top_n_unique_values.drop('weighted_avg',axis=1)

    # updates so that previous recommendations aren't given
    previous_recommendations.update(values['book_id'].tolist())

    print('here')
    
    # check incase we can't get enough recommendations due to content and collab not producing anymore
    if len(values) <n:
        needed = n - len(values)
        # makes sure these aren't already in the set
        possible_recs = data[~data['book_id'].isin(previous_recommendations)]
        # drops duplicates
        possible_recs = possible_recs.drop_duplicates(subset='title_without_series')
        # sorts the rest by the weighted avg, as want the best ones of these
        possible_recs = possible_recs.sort_values(by='weighted_avg', ascending=False)
        # takes the top needed values
        new_values = possible_recs.head(needed)[['book_id', 'title_without_series']]
        #new_values = new_values[~new_values['book_id'].isin(previous_recommendations)]
        # concat with the previous values
        values = pd.concat([values, new_values])

    return values, previous_recommendations


cli.py

In [654]:
# pandas to read the relevant data in 
import pandas as pd
# imports the function to allow for recommendations
from recommender import hybrid_recommendations

def main():
    print('Welcome to this Book Recommendation System!')
    print('===========================================')
    user_id = input("Enter your id to login: ")

    # load the data needed for the system to work
    ratings_data = pd.read_csv('./ratings_100k_weighted_avg.csv')
    predictions_data = pd.read_csv('./predictions_collab.csv')

    # intialise set
    previous_recommendations = set()


    n=10
    # next call the hybrid recommendation function
    hybrid_recs, prev_r = hybrid_recommendations(user_id,ratings_data,n,predictions_data,previous_recommendations)
    print(f"\nTop {n} recommendations for User {user_id}:\n")
    print(hybrid_recs)
    # set to hold previous recommendations, adds the previous recommendations
    previous_recommendations.update(prev_r)

    # then we give the option to change the number of recommendations, e.g. from a default 10 to 20 for example
    while True:
        print('==================================================================')
        choice = input("Do you want to recommend more books? (y/n) ")

        # if not then we exit
        if choice == "n":
            print("Thank you for using, this Book Recommendation System!")
            break
        
        if choice == "y":
            number = input("How many recommendations would you like? ")
            # calls the recommender system again, with the new number of recommendations
            recommendations, previous_r = hybrid_recommendations(user_id, ratings_data,int(number),predictions_data,previous_recommendations)
            print(f"\nTop {number} recommendations for User {user_id}:\n")
            print(recommendations)
            # udpates the set of previous recommendations, so they aren't duplicated
            previous_recommendations.update(previous_r)
if __name__ == '__main__':
    main()

Welcome to this Book Recommendation System!


here

Top 10 recommendations for User 9b808f1cf7160f03647fb8b8aefd4ffb:

        book_id                               title_without_series
26935  30242428                                        Bloodwalker
22818  27845484  Eve: A Christmas Ghost Story (Psychic Surveys,...
12     30357487        Guns n' Boys: He Is Mine (Guns n' Boys, #2)
51730  28351603                                    Twisted Justice
95014  35480285                                      A Deadly Game
11351  25651905        Why You Were Taken (When Tomorrow Calls #1)
0        288525                     Last Call (The Party Room, #2)
31836  23664378                                              Guilt
59     27135688                     Shady Hollow: A Murder Mystery
18698  34700191                                       Undercurrent
Thank you for using, this Book Recommendation System!


rs2_recommender.py


In [655]:
import numpy as np
import pandas as pd

# takes the average all previously rated items and produces a resultant vector
def aggregate_vectors(products, w2_model):
    product_vec = []
    for i in products:
        try:
            product_vec.append(w2_model.wv[i])
        except KeyError:
            continue
    if product_vec:
        return np.mean(product_vec, axis=0)
    else:
        return np.zeros(w2_model.vector_size)

def content_rs2(user_id, data, n,w2_model, previous_recommendations):
    
    # extract liked books by user
    user_books = data[data['user_id'] == user_id]['book_id'].tolist()
    # calculate the resultant vector
    user_vector = aggregate_vectors(user_books, w2_model)

    #print(user_books, user_vector)

    # extract most similar products for the input vector, adds the length of previous recommednations as well
    # as these are still most similar, want the next n tho!
    ms = w2_model.wv.most_similar([user_vector], topn=n + len(previous_recommendations))
    # extracts the top n most similar books, excluding itself and the previous recommendations
    book_ids = [int(j[0]) for j in ms if int(j[0]) not in previous_recommendations]
    # creates a df of the most similar book_ids
    book_ids_df = pd.DataFrame({'book_id': book_ids})
    # merges the dataset with 

    new_ms = pd.merge(book_ids_df, data, how='left', on='book_id')
    # gets rid of the duplicated values when merging, can run into problems due to having mutliple instances 
    # of same book_id
    new_ms.drop_duplicates(inplace=True, subset='book_id', keep="last")
    
    # returns the dataframe but only the book_id and title column
    return new_ms[['book_id', 'title_without_series']]


# collaborative filtering
def make_recommendations(data, n, predictions, user_id, previous_recommendations):
    #pred = pd.DataFrame(predictions)
    # Filter collab_ratings for the specified user_id
    user_ratings = data[data['user_id'] == user_id]
    
    # Merge predictions with user_ratings based on the book_id
    merged_data = pd.merge(predictions, user_ratings, left_on='iid', right_on='book_id')

    # Sorting the merged DataFrame based on estimated ratings
    merged_data.sort_values(by=['est'], inplace=True, ascending=False)

    unique_books_set = set()
    unique_recommendations = []

    for book_id in merged_data['iid']:
        # doesn't contain any duplicate or previous recs
        if book_id not in unique_books_set and book_id not in previous_recommendations:
            unique_books_set.add(book_id)
            unique_recommendations.append(book_id)

    corresponding_titles = data[data['book_id'].isin(unique_recommendations)][['book_id', 'title_without_series']].drop_duplicates()

    return corresponding_titles


# hybrid recommendation system
def hybrid_recommendations(user_id, data, n, predictions,previous_recommendations, w2_model):
    # call the 2 functions incase they haven't already been run
    content = content_rs2(user_id, data, n, w2_model,previous_recommendations)
    collab = make_recommendations(data, n, predictions, user_id,previous_recommendations)
    # join the 2 dataframes
    hybrid = pd.concat([content, collab])

    # gets rid of any books that may have already been predicted
    hybrid = hybrid[~hybrid['book_id'].isin(previous_recommendations)]
     
    # use a weighted system based on number of ratings, to decide on the weighting to use for the hybrid approach

    # inspired by this paper: https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=9590147 

    v = data['ratings_count']
    R = data['book_average_rating']

    C = data['book_average_rating'].mean()

    m = data['ratings_count']>25
    
    data['weighted_avg'] = ((R*v) + (C*m)) / (v+m)

    # change column type of book_id to be a integer, as it was a float, so hard to look up the weighted average
    hybrid['book_id'] = hybrid['book_id'].astype(int)
    
    # look at how to use the book_id to then find the weighted average and then add this to hybrid table 
    id_weight = data.set_index('book_id')['weighted_avg'].to_dict()

    # maps the weighted_avg values to the 'book_id' column in the hybrid system
    hybrid['weighted_avg'] = hybrid['book_id'].map(id_weight)

    # sorts the df by 'weighted_avg' in descending order
    hybrid_sorted = hybrid.sort_values(by='weighted_avg', ascending=False)

    # selects the top n rows with unique values
    top_n_unique_values = hybrid_sorted.head(n).drop_duplicates(subset='title_without_series')
    # gets rid of the weighted_avg column to allow for user friendly format
    
    values = top_n_unique_values.drop('weighted_avg',axis=1)

    # updates so that previous recommendations aren't given
    previous_recommendations.update(values['book_id'].tolist())

    print('here')
    
    # check incase we can't get enough recommendations due to content and collab not producing anymore
    if len(values) <n:
        needed = n - len(values)
        # makes sure these aren't already in the set
        possible_recs = data[~data['book_id'].isin(previous_recommendations)]
        # drops duplicates
        possible_recs = possible_recs.drop_duplicates(subset='title_without_series')
        # sorts the rest by the weighted avg, as want the best ones of these
        possible_recs = possible_recs.sort_values(by='weighted_avg', ascending=False)
        # takes the top needed values
        new_values = possible_recs.head(needed)[['book_id', 'title_without_series']]
        #new_values = new_values[~new_values['book_id'].isin(previous_recommendations)]
        # concat with the previous values
        values = pd.concat([values, new_values])

    return values, previous_recommendations


rs2_cli.py

In [656]:
# pandas to read the relevant data in 
import pandas as pd
# imports the function to allow for recommendations
from rs2_recommender import hybrid_recommendations
from gensim.models import Word2Vec

def main():
    print('Welcome to this Book Recommendation System!')
    print('===========================================')
    user_id = input("Enter your id to login: ")

    # load the data needed for the system to work
    ratings_data = pd.read_csv('./ratings_100k_weighted_avg.csv')
    predictions_data = pd.read_csv('./predictions_collab.csv')
    model = Word2Vec.load('./w3_model.model')

    # intialise set
    previous_recommendations = set()


    n=10
    # next call the hybrid recommendation function
    hybrid_recs, prev_r = hybrid_recommendations(user_id,ratings_data,n,predictions_data,previous_recommendations, model)
    print(f"\nTop {n} recommendations for User {user_id}:\n")
    print(hybrid_recs)
    # set to hold previous recommendations, adds the previous recommendations
    previous_recommendations.update(prev_r)

    # then we give the option to change the number of recommendations, e.g. from a default 10 to 20 for example
    while True:
        print('==================================================================')
        choice = input("Do you want to recommend more books? (y/n) ")

        # if not then we exit
        if choice == "n":
            print("Thank you for using, this Book Recommendation System!")
            break
        
        if choice == "y":
            number = input("How many recommendations would you like? ")
            # calls the recommender system again, with the new number of recommendations
            recommendations, previous_r = hybrid_recommendations(user_id, ratings_data,int(number),predictions_data,previous_recommendations,model)
            print(f"\nTop {number} recommendations for User {user_id}:\n")
            print(recommendations)
            # udpates the set of previous recommendations, so they aren't duplicated
            previous_recommendations.update(previous_r)
if __name__ == '__main__':
    main()

Welcome to this Book Recommendation System!
here

Top 10 recommendations for User 9b808f1cf7160f03647fb8b8aefd4ffb:

        book_id                               title_without_series
26935  30242428                                        Bloodwalker
59     27845484  Eve: A Christmas Ghost Story (Psychic Surveys,...
90     22079303             Fighting for Survival (The Estate, #3)
25     22079279              Behind a Closed Door (The Estate, #2)
84     28351603                                    Twisted Justice
95014  35480285                                      A Deadly Game
11351  25651905        Why You Were Taken (When Tomorrow Calls #1)
16     23664378                                              Guilt
43289  35706468                               When Time Is A River
28429  13554552                   Sapphire Reign (Royal Blood, #2)
here

Top 10 recommendations for User 9b808f1cf7160f03647fb8b8aefd4ffb:

        book_id                               title_without_series
116   