# Imports

In [1]:
# Import all required libraries
# You might need to install the surprise package (pip install scikit-surprise)
import numpy as np
import pandas as pd
from tqdm import tqdm
import re
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

In [2]:
try:
    from surprise import NormalPredictor, KNNBasic, NMF, SlopeOne, SVD, Dataset
    from surprise.model_selection import cross_validate
except ModuleNotFoundError:
    !pip install surprise
    from surprise import NormalPredictor, KNNBasic, NMF, SlopeOne, SVD, Dataset
    from surprise.model_selection import cross_validate

Collecting surprise
  Using cached surprise-0.1-py2.py3-none-any.whl.metadata (327 bytes)
Collecting scikit-surprise (from surprise)
  Using cached scikit_surprise-1.1.4.tar.gz (154 kB)
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
Using cached surprise-0.1-py2.py3-none-any.whl (1.8 kB)
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (pyproject.toml) ... [?25ldone
[?25h  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.4-cp311-cp311-linux_x86_64.whl size=597048 sha256=c585dbba1ed7250a2b38de2db91791e6fbcf2f68aa97043cdc70e59a84c7e18b
  Stored in directory: /home/jovyan/.cache/pip/wheels/2a/8f/6e/7e2899163e2d85d8266daab4aa1cdabec7a6c56f83c015b5af
Successfully built scikit-surprise
Installing collected packages: scikit-surprise, surprise
Successfully installed scikit-surprise-1.1.4 surprise-0.1


# Load datasets

In [3]:
# Location of the cleaned book-sales CSV, relative to the notebook's working directory.
sales_path = "../data/sales_dataset/Books_Data_Cleaner.csv"
# The file's columns are snake_case (e.g. 'book_name', 'gross_sales'); see sales_df.columns below.
sales_df = pd.read_csv(sales_path)
# Preview the first three rows.
sales_df.head(3)

Unnamed: 0,index,publishing_year,book_name,author,language_code,author_rating,book_average_rating,book_ratings_count,genre,gross_sales,publisher_revenue,sale_price,sales_rank,publisher,units_sold
0,0,1975.0,Beowulf,"Unknown, Seamus Heaney",en-US,Novice,3.42,155903,genre fiction,34160.0,20496.0,4.88,1,HarperCollins Publishers,7000
1,1,1987.0,Batman: Year One,"Frank Miller, David Mazzucchelli, Richmond Lew...",eng,Intermediate,4.23,145267,genre fiction,12437.5,7462.5,1.99,2,HarperCollins Publishers,6250
2,2,2015.0,Go Set a Watchman,Harper Lee,eng,Novice,3.31,138669,genre fiction,47795.0,28677.0,8.69,3,"Amazon Digital Services, Inc.",5500


In [4]:
# List the exact column labels that later cells reference.
sales_df.columns

Index(['index', 'publishing_year', 'book_name', 'author', 'language_code',
       'author_rating', 'book_average_rating', 'book_ratings_count', 'genre',
       'gross_sales', 'publisher_revenue', 'sale_price', 'sales_rank',
       'publisher', 'units_sold'],
      dtype='object')

In [5]:
# Summary statistics for the numeric columns (quick check of ranges and missing counts).
sales_df.describe()

Unnamed: 0,index,publishing_year,book_average_rating,book_ratings_count,gross_sales,publisher_revenue,sale_price,sales_rank,units_sold
count,1070.0,1069.0,1070.0,1070.0,1070.0,1070.0,1070.0,1070.0,1070.0
mean,534.5,1971.377923,4.007,94909.913084,1856.622944,843.28103,4.869561,611.652336,9676.980374
std,309.026698,185.080257,0.247244,31513.242518,3936.92424,2257.596743,3.559919,369.84983,15370.571306
min,0.0,-560.0,2.97,27308.0,104.94,0.0,0.99,1.0,106.0
25%,267.25,1985.0,3.85,70398.0,372.465,0.0,1.99,287.5,551.25
50%,534.5,2003.0,4.015,89309.0,809.745,273.078,3.99,595.5,3924.0
75%,801.75,2010.0,4.17,113906.5,1487.9575,721.1805,6.99,932.5,5312.25
max,1069.0,2016.0,4.77,206792.0,47795.0,28677.0,33.86,1273.0,61560.0


# TF-IDF Vectorization

In [6]:
# Columns that must be non-null for a book to take part in the recommender.
subset = [
          'book_name',
          'author',
          'publisher',
          'publishing_year',
          # 'language_code',
          # 'author_rating',
          'book_average_rating',
          'book_ratings_count',
          'genre',
          'gross_sales',
          'publisher_revenue',
          'sale_price',
          'sales_rank',
          'units_sold'
         ]

# Derive the working frame without touching sales_df: dropna() returns a filtered
# copy, so the raw data stays intact and this cell gives the same result on re-run.
# The index is reset so that row labels equal row positions (0..n-1).
vector_sales = sales_df.dropna(subset=subset).reset_index(drop=True)

# Plan: build two text fields per book, one from the descriptive (word) columns and
# one from the numeric columns, and apply a similarity measure to each.

In [7]:
# Turn each multi-word value into a single token: first delete the spaces inside a
# value ('Seamus Heaney' -> 'SeamusHeaney'), then replace the commas that separated
# entries with spaces ('Unknown,SeamusHeaney' -> 'Unknown SeamusHeaney').
# 'book_name' is left untouched, so title words remain separate tokens.
# NOTE: not idempotent. A second run would also delete the spaces created here.
for _column in ('author', 'publisher', 'genre'):
    vector_sales[_column] = [
        re.sub(',', ' ', re.sub(' ', '', value))
        for value in vector_sales[_column]
    ]

In [8]:
# Text profile of each book: descriptive fields joined by single spaces, in this order.
# Numeric fields are stringified first; the sales figures are kept out of this profile
# and concatenated separately into 'combined_numbers'.
_text_parts = [
    vector_sales['book_name'],
    vector_sales['author'],
    vector_sales['publisher'],
    vector_sales['publishing_year'].astype(str),
    vector_sales['book_average_rating'].astype(str),
    vector_sales['genre'],
]

_profile = _text_parts[0]
for _part in _text_parts[1:]:
    _profile = _profile + ' ' + _part

vector_sales['combined'] = _profile

In [9]:
# Numeric profile of each book: the sales/ratings figures rendered as strings and
# joined by single spaces. content_recommender, as defined in this notebook, reads
# only the 'combined' column, not this one.
_numeric_columns = [
    'book_ratings_count',
    'gross_sales',
    'publisher_revenue',
    'sale_price',
    'sales_rank',
    'units_sold',
]

_numbers = vector_sales[_numeric_columns[0]].astype(str)
for _name in _numeric_columns[1:]:
    _numbers = _numbers + ' ' + vector_sales[_name].astype(str)

vector_sales['combined_numbers'] = _numbers

In [10]:
# Spot-check the text profile built for the row labelled 0.
vector_sales['combined'][0]

'Beowulf Unknown SeamusHeaney HarperCollinsPublishers 1975.0 3.42 genrefiction'

In [11]:
# Spot-check the numeric profile built for the row labelled 0.
vector_sales['combined_numbers'][0]

'155903 34160.0 20496.0 4.88 1 7000'

In [26]:
# Two further spot checks. Note that they look at different books:
# the text profile of row 51 and the numeric profile of row 1.
print(vector_sales['combined'][51])
print(vector_sales['combined_numbers'][1])

The Magicians LevGrossman Macmillan 2009.0 3.47 genrefiction
145267 12437.5 7462.5 1.99 2 6250


In [46]:
# vectorizer = TfidfVectorizer()
# matrix = vectorizer.fit_transform(vector_sales['combined'])
# similarities = linear_kernel(matrix,matrix)
# book_title = vector_sales['book_name']
# indices = pd.Series(vector_sales.index, index=vector_sales['book_name'])

# Recommender

In [12]:
# Module-level objects read by content_recommender.
# Titles, in the same row order as vector_sales.
book_title = vector_sales['book_name']
# Reverse lookup: book title -> row label in vector_sales.
indices = pd.Series(vector_sales.index, index=book_title)
# TF-IDF vectorizer that ignores scikit-learn's built-in English stop words.
vectorizer = TfidfVectorizer(stop_words='english')

In [13]:
# Display the title Series (pandas truncates long output).
book_title

0                               Beowulf
1                      Batman: Year One
2                     Go Set a Watchman
3       When You Are Engulfed in Flames
4              Daughter of Smoke & Bone
                     ...               
1042                      Gray Mountain
1043                   The Power of One
1044                 The Maltese Falcon
1045                         Night Road
1046                           Tripwire
Name: book_name, Length: 1047, dtype: object

In [14]:
def content_recommender(title, num_to_rec=20, data=vector_sales, similarity_function='cosine'):
    """Recommend the books whose text profile is most similar to ``title``.

    The ``combined`` column of ``data`` is vectorised with the module-level
    ``vectorizer`` and every book is scored against the query book with a
    linear kernel. Assuming the vectorizer keeps TfidfVectorizer's default L2
    row normalisation, that score equals the cosine similarity.

    Parameters
    ----------
    title : str
        Exact value of ``data['book_name']`` to find neighbours for. If the
        title occurs more than once, the first occurrence is used.
    num_to_rec : int, default 20
        Number of recommendations to return; the query book itself is never
        included. Values below 1 produce an empty frame.
    data : pandas.DataFrame
        Frame providing the ``book_name`` and ``combined`` columns.
    similarity_function : str, default 'cosine'
        Accepted for interface compatibility; currently not read, since the
        linear-kernel score is the only similarity implemented.

    Returns
    -------
    pandas.DataFrame
        Columns ``Books`` and ``Similarity Score`` (rounded to 3 decimals),
        ordered from most to least similar and indexed by the rows' labels
        in ``data``.

    Raises
    ------
    KeyError
        If ``title`` does not occur in ``data['book_name']``.
    """
    titles = data['book_name']

    # Resolve the title against the frame that was actually passed in, and work
    # with row positions so they line up with the rows of the TF-IDF matrix.
    positions = [pos for pos, name in enumerate(titles) if name == title]
    if not positions:
        raise KeyError(f"{title!r} is not present in data['book_name']")
    idx = positions[0]

    matrix = vectorizer.fit_transform(data['combined'])
    # Only the query row is needed; the slice keeps it two-dimensional.
    scores = linear_kernel(matrix[idx:idx + 1], matrix).ravel()

    # Stable descending sort: equal scores keep their original row order.
    ranked = sorted(range(len(scores)), key=lambda pos: scores[pos], reverse=True)
    # Remove the query book explicitly instead of assuming it sorts first, then
    # keep exactly num_to_rec neighbours.
    limit = max(int(num_to_rec), 0)
    ranked = [pos for pos in ranked if pos != idx][:limit]

    recommendations = pd.DataFrame({
        'Books': titles.iloc[ranked],
        'Similarity Score': [round(scores[pos], 3) for pos in ranked],
    })
    return recommendations

In [15]:
# Shared arguments for the example queries that follow.
num_to_rec = 20
# Passed through to content_recommender, whose body does not currently read it.
similarity_function = 'cosine'

In [63]:
# Example query. Positional arguments follow the signature order:
# title, num_to_rec, data, similarity_function.
content_recommender('Pretty Little Liars', num_to_rec, vector_sales, similarity_function)

Unnamed: 0,Books,Similarity Score
802,Pretty Girls,0.233
776,All the Pretty Horses,0.232
524,The Summer I Turned Pretty,0.212
656,A Little Life,0.186
680,Stuart Little,0.159
565,Little Earthquakes,0.145
9,A Little Princess,0.144
962,Infidel,0.139
247,For One More Day,0.137
147,Little House In The Big Woods,0.136


In [64]:
# Example query: neighbours of 'The Bourne Supremacy'.
content_recommender('The Bourne Supremacy', num_to_rec, vector_sales, similarity_function)

Unnamed: 0,Books,Similarity Score
458,Tales of a Fourth Grade Nothing,0.241
841,Eleven on Top,0.207
394,Fire,0.191
208,The Power of Now,0.181
247,For One More Day,0.177
1031,Forever,0.176
725,To the Nines,0.173
431,Steelheart,0.172
133,Along for the Ride,0.171
258,About a Boy,0.167


In [65]:
# Example query: neighbours of 'The Magicians'.
content_recommender('The Magicians', num_to_rec, vector_sales, similarity_function)

Unnamed: 0,Books,Similarity Score
799,Night Watch,0.215
372,Gerald's Game,0.214
221,Lonesome Dove,0.203
932,"Lover Enshrined, part two",0.173
228,The White Queen,0.123
394,Fire,0.121
141,Firestarter,0.118
624,The Last Straw,0.118
583,Two States,0.116
685,The Awakening,0.116


In [61]:
# NOTE(review): same call as the previous cell. The saved scores differ from the run
# above, presumably because the two cells were executed against different notebook
# states. Re-run or remove this duplicate.
content_recommender('The Magicians', num_to_rec, vector_sales, similarity_function)

Unnamed: 0,Books,Similarity Score
799,Night Watch,0.211
372,Gerald's Game,0.211
221,Lonesome Dove,0.2
932,"Lover Enshrined, part two",0.149
228,The White Queen,0.142
685,The Awakening,0.135
856,The Year of the Flood,0.13
624,The Last Straw,0.124
428,The Dark Half,0.123
141,Firestarter,0.117


In [67]:
# Example query: neighbours of 'Me and Earl and the Dying Girl'.
content_recommender('Me and Earl and the Dying Girl', num_to_rec, vector_sales, similarity_function)

Unnamed: 0,Books,Similarity Score
476,The Girl You Left Behind,0.256
349,As I Lay Dying,0.219
846,The Miniaturist,0.212
225,Twenties Girl,0.186
392,Not that Kind of Girl,0.186
274,The Good Girl,0.185
1006,Rapture,0.163
445,The Girl Who Loved Tom Gordon,0.145
71,"Girl, Interrupted",0.138
259,On Dublin Street,0.138


In [56]:
# Count rows per distinct author string (co-author combinations count as their own value).
sales_df['author'].value_counts()

author
Stephen King                                              29
John Grisham                                              13
Jim Butcher                                               10
Jodi Picoult                                              10
Charlaine Harris                                          10
                                                          ..
John Scalzi                                                1
Irma S. Rombauer, Marion Rombauer Becker, Ethan Becker     1
Joe Haldeman                                               1
Stephen King, Bettina Blanch Tyroller                      1
Dashiell Hammett                                           1
Name: count, Length: 722, dtype: int64

In [57]:
# Rows whose author field is exactly "Stephen King"; co-authored entries do not match.
sales_df.loc[sales_df['author'] == "Stephen King"]

Unnamed: 0,index,publishing_year,book_name,author,language_code,author_rating,book_average_rating,book_ratings_count,genre,gross_sales,publisher_revenue,sale_price,sales_rank,publisher,units_sold
22,22,1991.0,Needful Things,Stephen King,,Intermediate,3.87,153821,genre fiction,2772.0,0.0,0.99,23,"Amazon Digital Services, Inc.",2800
25,25,1981.0,Cujo,Stephen King,spa,Intermediate,3.65,158215,genre fiction,7774.0,0.0,2.99,26,"Amazon Digital Services, Inc.",2600
78,78,2006.0,Cell,Stephen King,en-US,Intermediate,3.64,144591,genre fiction,5933.78,3560.268,4.54,80,HarperCollins Publishers,1307
81,81,1987.0,The Drawing of the Three,Stephen King,eng,Intermediate,4.23,139052,nonfiction,2539.24,1523.544,1.99,82,HarperCollins Publishers,1276
92,92,2014.0,Mr. Mercedes,Stephen King,eng,Intermediate,3.92,125847,genre fiction,2202.93,1321.758,1.99,93,Macmillan,1107
139,139,1979.0,The Dead Zone,Stephen King,en-US,Intermediate,3.9,132723,genre fiction,1512.4,907.44,1.99,148,HarperCollins Publishers,760
144,144,1980.0,Firestarter,Stephen King,eng,Intermediate,3.85,142213,genre fiction,2952.6,1771.56,3.99,152,Macmillan,740
160,160,1999.0,On Writing: A Memoir of the Craft,Stephen King,eng,Intermediate,4.31,131481,genre fiction,1943.5,0.0,2.99,170,"Amazon Digital Services, Inc.",650
172,172,1998.0,Bag of Bones,Stephen King,en-US,Intermediate,3.87,131004,genre fiction,569.25,0.0,0.99,185,"Amazon Digital Services, Inc.",575
208,208,1982.0,Different Seasons,Stephen King,eng,Intermediate,4.34,105961,genre fiction,5000.45,3000.27,10.99,227,Simon and Schuster Digital Sales Inc,455


In [58]:
# Sanity check with a Stephen King title (it appears in the author filter above).
content_recommender("Firestarter", num_to_rec, vector_sales, similarity_function)

Unnamed: 0,Books,Similarity Score
796,The Mist,0.343
90,Mr. Mercedes,0.232
946,Hearts in Atlantis,0.225
815,Duma Key,0.224
428,The Dark Half,0.221
126,Congo,0.196
151,Gone,0.192
520,The Heir,0.185
716,The Twits,0.182
693,Cosmos,0.181


In [54]:
# Sanity check with a Nicholas Sparks title (see the author filter in the next cell).
content_recommender("The Longest Ride", num_to_rec, vector_sales, similarity_function)

Unnamed: 0,Books,Similarity Score
38,The Guardian,0.419
21,The Rescue,0.282
182,The Choice,0.265
152,The Wedding,0.262
133,Along for the Ride,0.25
865,True Believer,0.236
284,The Best of Me,0.233
908,At First Sight,0.198
195,The Exorcist,0.194
89,Nights in Rodanthe,0.191


In [55]:
# Rows whose author field is exactly "Nicholas Sparks", for comparison with the recommendations above.
sales_df.loc[sales_df['author'] == "Nicholas Sparks"]

Unnamed: 0,index,publishing_year,book_name,author,language_code,author_rating,book_average_rating,book_ratings_count,genre,gross_sales,publisher_revenue,sale_price,sales_rank,publisher,units_sold
21,21,2000.0,The Rescue,Nicholas Sparks,en-US,Intermediate,4.1,142092,genre fiction,8569.34,0.0,2.99,22,"Amazon Digital Services, Inc.",2866
40,40,2003.0,The Guardian,Nicholas Sparks,en-US,Intermediate,4.14,136427,genre fiction,1873.08,0.0,0.99,42,"Amazon Digital Services, Inc.",1892
91,91,2002.0,Nights in Rodanthe,Nicholas Sparks,eng,Intermediate,3.82,129807,genre fiction,5603.77,3362.262,4.99,92,"Amazon Digital Services, Inc.",1123
152,152,2001.0,A Bend in the Road,Nicholas Sparks,eng,Intermediate,4.02,116800,genre fiction,1353.2,811.92,1.99,164,HarperCollins Publishers,680
155,155,2003.0,The Wedding,Nicholas Sparks,en-US,Intermediate,3.98,118045,genre fiction,658.35,395.01,0.99,167,"Amazon Digital Services, Inc.",665
185,185,2007.0,The Choice,Nicholas Sparks,en-US,Intermediate,3.99,93479,genre fiction,1483.04,0.0,2.99,202,"Amazon Digital Services, Inc.",496
290,290,2010.0,The Best of Me,Nicholas Sparks,en-US,Intermediate,3.91,103915,genre fiction,307.89,184.734,0.99,313,"Amazon Digital Services, Inc.",311
567,567,2012.0,The Longest Ride,Nicholas Sparks,eng,Excellent,4.14,71276,genre fiction,144.54,0.0,0.99,631,"Amazon Digital Services, Inc.",3942
883,883,2003.0,True Believer,Nicholas Sparks,en-US,Intermediate,3.8,62840,genre fiction,558.88,0.0,4.99,1035,"Amazon Digital Services, Inc.",4440
928,928,2004.0,At First Sight,Nicholas Sparks,en-US,Intermediate,3.81,62219,genre fiction,108.9,65.34,0.99,1091,"Amazon Digital Services, Inc.",4320


In [18]:
# Example query: neighbours of 'The Hobbit and The Lord of the Rings'.
content_recommender("The Hobbit and The Lord of the Rings", num_to_rec, vector_sales, similarity_function)

Unnamed: 0,Books,Similarity Score
587,"The History of the Hobbit, Part One: Mr. Baggins",0.296
849,Extras,0.208
281,Dreamcatcher,0.193
654,A Wind in the Door,0.165
358,Rendezvous with Rama,0.164
692,Sybil,0.162
78,The Silmarillion,0.156
694,Haunted: A Novel of Stories,0.151
33,After You,0.029
964,The Twelve,0.026


In [19]:
# Look up the raw sales_df row for 'The Magicians' (exact title match).
sales_df.loc[sales_df['book_name'] == "The Magicians"]

Unnamed: 0,index,publishing_year,book_name,author,language_code,author_rating,book_average_rating,book_ratings_count,genre,gross_sales,publisher_revenue,sale_price,sales_rank,publisher,units_sold
53,53,2009.0,The Magicians,Lev Grossman,eng,Novice,3.47,147908,genre fiction,1675.08,1005.048,0.99,55,Macmillan,1692


In [140]:
# NOTE(review): repeat of an earlier 'Pretty Little Liars' query. The saved output under
# this cell (raw (index, score) tuples and titles with spaces stripped) does not match
# the DataFrame that content_recommender returns as defined in this notebook, so it is
# presumably left over from an older revision. Re-run or remove.
content_recommender('Pretty Little Liars', num_to_rec, vector_sales, similarity_function)

[(16, 0.3533060618752304), (19, 0.13418085889237835), (429, 0.13041151799786596), (369, 0.12143160275036838), (304, 0.10618367813503904), (912, 0.0852537519782556), (146, 0.08518809997287166), (397, 0.08477365427712057), (1045, 0.08343138751791479), (962, 0.08298761104241659), (737, 0.08268934652888144), (525, 0.07840266291600931), (816, 0.07766863347059427), (439, 0.07573888460286328), (247, 0.07569378282494743), (957, 0.0743849690161382), (339, 0.07415084178477559), (849, 0.07207397945366964), (848, 0.0720123388688297)]


16                             TheWaroftheWorlds
19                          TheOmnivore'sDilemma
429                              Cleopatra:ALife
369                               TheTenthCircle
304     LoverAwakened(BlackDaggerBrotherhood #3)
912                           Enchanters'EndGame
146                             ThinkandGrowRich
397                             WolvesoftheCalla
1045                                   NightRoad
962                                      Infidel
737                                       Ã†neis
525                                  FourtoScore
816                               TheShiftingFog
439                                   MoonCalled
247                                ForOneMoreDay
957                                   FairyTail1
339                      DieunendlicheGeschichte
849                                       Extras
848                                TheLostColony
Name: book_name, dtype: object

In [None]:
# Item-based collaborative filtering: RMSE and MAE computed for a 10-fold cross-validation exercise
# FIXME(review): this cell has no execution count and does not look runnable as written:
#   * `sales_df` is the pandas DataFrame returned by pd.read_csv, whereas Surprise's
#     cross_validate expects a surprise Dataset (e.g. built with Dataset.load_from_df
#     from user / item / rating columns). Confirm against the Surprise docs. The sales
#     columns listed earlier in the notebook show no per-user ratings to build one from.
#   * `results` is not defined in any visible cell; presumably a dict created
#     elsewhere. Verify before running.
# user_based=False asks KNNBasic to compute similarities between items rather than users.
algo = KNNBasic(verbose=False,sim_options={'user_based':False})
scores = cross_validate(algo, sales_df, measures=['RMSE','MAE'], cv=10, n_jobs=-1, verbose=True)
results['Item-based Collaborative Filtering'] = scores