# Imports

In [27]:
# Import all required libraries
# You might need to install the surprise package (pip install scikit-surprise)
import numpy as np
import pandas as pd
from tqdm import tqdm
import re
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

In [3]:
try:
    from surprise import NormalPredictor, KNNBasic, NMF, SlopeOne, SVD, Dataset
    from surprise.model_selection import cross_validate
except ModuleNotFoundError:
    !pip install surprise
    from surprise import NormalPredictor, KNNBasic, NMF, SlopeOne, SVD, Dataset
    from surprise.model_selection import cross_validate

# Load datasets

In [36]:
# Location of the book-sales CSV, relative to the notebook's working directory.
sales_path = "../data/sales_dataset/Books_Data_Cleaner.csv"
# sales_cols = ['index', 'Publishing Year', 'Book Name', 'Author', 'language_code', 'Author_Rating', 'Book_average_rating', 'Book_ratings_count', 'genre', 'gross sales', 'publisher revenue', 'sale price', 'sales rank', 'Publisher', 'units sold']
# Read the whole file with pandas' default parsing options.
sales_df = pd.read_csv(sales_path)
# Preview the first three rows.
sales_df.head(3)

Unnamed: 0,index,publishing_year,book_name,author,language_code,author_rating,book_average_rating,book_ratings_count,genre,gross_sales,publisher_revenue,sale_price,sales_rank,publisher,units_sold
0,0,1975.0,Beowulf,"Unknown, Seamus Heaney",en-US,Novice,3.42,155903,genre fiction,34160.0,20496.0,4.88,1,HarperCollins Publishers,7000
1,1,1987.0,Batman: Year One,"Frank Miller, David Mazzucchelli, Richmond Lew...",eng,Intermediate,4.23,145267,genre fiction,12437.5,7462.5,1.99,2,HarperCollins Publishers,6250
2,2,2015.0,Go Set a Watchman,Harper Lee,eng,Novice,3.31,138669,genre fiction,47795.0,28677.0,8.69,3,"Amazon Digital Services, Inc.",5500


In [37]:
# List the column labels of the loaded sales table.
sales_df.columns

Index(['index', 'publishing_year', 'book_name', 'author', 'language_code',
       'author_rating', 'book_average_rating', 'book_ratings_count', 'genre',
       'gross_sales', 'publisher_revenue', 'sale_price', 'sales_rank',
       'publisher', 'units_sold'],
      dtype='object')

In [38]:
# Summary statistics (count, mean, std, quartiles, min/max) for the sales table.
sales_df.describe()

Unnamed: 0,index,publishing_year,book_average_rating,book_ratings_count,gross_sales,publisher_revenue,sale_price,sales_rank,units_sold
count,1070.0,1069.0,1070.0,1070.0,1070.0,1070.0,1070.0,1070.0,1070.0
mean,534.5,1971.377923,4.007,94909.913084,1856.622944,843.28103,4.869561,611.652336,9676.980374
std,309.026698,185.080257,0.247244,31513.242518,3936.92424,2257.596743,3.559919,369.84983,15370.571306
min,0.0,-560.0,2.97,27308.0,104.94,0.0,0.99,1.0,106.0
25%,267.25,1985.0,3.85,70398.0,372.465,0.0,1.99,287.5,551.25
50%,534.5,2003.0,4.015,89309.0,809.745,273.078,3.99,595.5,3924.0
75%,801.75,2010.0,4.17,113906.5,1487.9575,721.1805,6.99,932.5,5312.25
max,1069.0,2016.0,4.77,206792.0,47795.0,28677.0,33.86,1273.0,61560.0


# TF-IDF Vectorization

In [185]:
# Columns that must be non-null for a book to be used by the recommender.
# 'language_code' and 'author_rating' are intentionally not required.
subset = [
    'book_name',
    'author',
    'publisher',
    'publishing_year',
    'book_average_rating',
    'book_ratings_count',
    'genre',
    'gross_sales',
    'publisher_revenue',
    'sale_price',
    'sales_rank',
    'units_sold',
]

# Build the working frame as a NEW object: drop incomplete rows and renumber
# them 0..n-1. `sales_df` itself is left untouched, so re-running this cell
# (or any earlier cell that displays `sales_df`) always gives the same result.
vector_sales = sales_df.dropna(subset=subset).reset_index(drop=True)

In [186]:
# Normalise the free-text fields: first delete every space, then turn each
# comma into a space. Multi-word values collapse into one token while
# comma-separated values (e.g. several authors) stay separate tokens.
for text_column in ('book_name', 'author', 'publisher', 'genre'):
    vector_sales[text_column] = [
        re.sub(',', ' ', re.sub(' ', '', value))
        for value in vector_sales[text_column]
    ]

In [188]:
# Fields that make up each book's text "document", in concatenation order.
# 'language_code' and 'author_rating' are intentionally left out.
combined_columns = [
    'book_name',
    'author',
    'publisher',
    'publishing_year',
    'book_average_rating',
    'book_ratings_count',
    'genre',
    'gross_sales',
    'publisher_revenue',
    'sale_price',
    'sales_rank',
    'units_sold',
]

# Render every field as text and join the fields of each row with single spaces.
vector_sales['combined'] = (
    vector_sales[combined_columns].astype(str).agg(' '.join, axis=1)
)

In [189]:
# Show the combined text built for the row labelled 0.
vector_sales.loc[0, 'combined']

'Beowulf Unknown SeamusHeaney HarperCollinsPublishers 1975.0 3.42 155903 genrefiction 34160.0 20496.0 4.88 1 7000'

In [190]:
# vectorizer = TfidfVectorizer()
# matrix = vectorizer.fit_transform(vector_sales['combined'])
# similarities = linear_kernel(matrix,matrix)
# book_title = vector_sales['book_name']
# indices = pd.Series(vector_sales.index, index=vector_sales['book_name'])

# Recommender

In [191]:
# Normalised titles, one per row of vector_sales.
book_title = vector_sales['book_name']

# Lookup table: normalised title -> row label in vector_sales.
# NOTE(review): if titles are not unique, indices[title] yields several entries — confirm.
indices = pd.Series(vector_sales.index, index=book_title)

# TF-IDF vectorizer with default settings, shared by the recommender function.
vectorizer = TfidfVectorizer()

In [192]:
# Display the normalised titles (pandas truncates the long Series in the output).
book_title

0                          Beowulf
1                   Batman:YearOne
2                   GoSetaWatchman
3       WhenYouAreEngulfedinFlames
4             DaughterofSmoke&Bone
                   ...            
1042                  GrayMountain
1043                 ThePowerofOne
1044              TheMalteseFalcon
1045                     NightRoad
1046                      Tripwire
Name: book_name, Length: 1047, dtype: object

In [193]:
def content_recommender(title, num_to_rec=20, data=vector_sales, similarity_function='cosine'):
    """Recommend the books whose combined text is most similar to ``title``.

    Parameters
    ----------
    title : str
        Human-readable book title. It is normalised the same way as the
        'book_name' column (spaces removed, commas turned into spaces).
    num_to_rec : int, default 20
        Number of recommendations to return. Values <= 0 give an empty frame.
    data : pandas.DataFrame
        Frame with a normalised 'book_name' column and a 'combined' text
        column. Both the title lookup and the TF-IDF matrix are derived from
        this argument, so they always refer to the same rows.
    similarity_function : str, default 'cosine'
        Kept for interface compatibility; only the linear kernel of the TF-IDF
        rows (treated as cosine similarity in this notebook) is implemented,
        and the value is currently not inspected.

    Returns
    -------
    pandas.DataFrame
        Columns 'Books' and 'Similarity Score' (rounded to 3 decimals), ordered
        from most to least similar, indexed by the row position in ``data``.
        The queried book itself is never included.

    Raises
    ------
    KeyError
        If the normalised title does not occur in ``data['book_name']``.
    """
    title = re.sub(',', ' ', re.sub(' ', '', title))

    # Work with positions 0..n-1 so the lookup matches the TF-IDF matrix rows
    # regardless of how `data` is indexed.
    titles = data['book_name'].reset_index(drop=True)
    matches = np.flatnonzero(titles.to_numpy() == title)
    if matches.size == 0:
        raise KeyError(f"Book title {title!r} not found in data['book_name']")
    # If the same title appears more than once, use its first occurrence.
    idx = int(matches[0])

    matrix = vectorizer.fit_transform(data['combined'])

    # Only the query row is needed, so compare that single row against all
    # rows instead of building the full n x n similarity matrix.
    scores = linear_kernel(matrix[idx], matrix).ravel()

    # Stable descending order (ties keep their original row order), then drop
    # the query book explicitly and keep exactly `num_to_rec` entries.
    ranked = np.argsort(-scores, kind='stable')
    ranked = ranked[ranked != idx][:max(num_to_rec, 0)]

    recommendations = pd.DataFrame({
        'Books': titles.iloc[ranked],
        'Similarity Score': np.round(scores[ranked], 3),
    })
    return recommendations

In [194]:
# Arguments passed to the content_recommender example calls below.
num_to_rec = 20
similarity_function = 'cosine'

In [195]:
# Example query: books most similar to 'Pretty Little Liars'.
content_recommender(
    'Pretty Little Liars',
    num_to_rec=num_to_rec,
    data=vector_sales,
    similarity_function=similarity_function,
)

Unnamed: 0,Books,Similarity Score
16,TheWaroftheWorlds,0.356
19,TheOmnivore'sDilemma,0.138
429,Cleopatra:ALife,0.134
369,TheTenthCircle,0.125
304,LoverAwakened(BlackDaggerBrotherhood #3),0.11
912,Enchanters'EndGame,0.09
146,ThinkandGrowRich,0.09
397,WolvesoftheCalla,0.089
962,Infidel,0.088
1045,NightRoad,0.088


In [196]:
# Example query: books most similar to 'The Bourne Supremacy'.
content_recommender(
    'The Bourne Supremacy',
    num_to_rec=num_to_rec,
    data=vector_sales,
    similarity_function=similarity_function,
)

Unnamed: 0,Books,Similarity Score
56,Easy,0.179
458,TalesofaFourthGradeNothing,0.178
25,TheVirginSuicides,0.135
181,DeathofaSalesman,0.129
783,DeadWake:TheLastCrossingoftheLusitania,0.125
346,HollowCity,0.12
839,TheLastJuror,0.117
985,ASpoolofBlueThread,0.116
208,ThePowerofNow,0.115
550,SycamoreRow,0.114


In [198]:
# Example query: books most similar to 'The Magicians'.
content_recommender(
    'The Magicians',
    num_to_rec=num_to_rec,
    data=vector_sales,
    similarity_function=similarity_function,
)

Unnamed: 0,Books,Similarity Score
821,TheOnceandFutureKing,0.142
524,TheSummerITurnedPretty,0.113
839,TheLastJuror,0.102
639,HighFive,0.097
133,AlongfortheRide,0.088
415,TwilightDirector'sNotebook,0.087
932,LoverEnshrined parttwo,0.074
221,LonesomeDove,0.07
73,JonathanLivingstonSeagull,0.067
893,HamonRye,0.066


In [140]:
# Same 'Pretty Little Liars' query as the earlier example cell.
content_recommender(
    'Pretty Little Liars',
    num_to_rec=num_to_rec,
    data=vector_sales,
    similarity_function=similarity_function,
)

[(16, 0.3533060618752304), (19, 0.13418085889237835), (429, 0.13041151799786596), (369, 0.12143160275036838), (304, 0.10618367813503904), (912, 0.0852537519782556), (146, 0.08518809997287166), (397, 0.08477365427712057), (1045, 0.08343138751791479), (962, 0.08298761104241659), (737, 0.08268934652888144), (525, 0.07840266291600931), (816, 0.07766863347059427), (439, 0.07573888460286328), (247, 0.07569378282494743), (957, 0.0743849690161382), (339, 0.07415084178477559), (849, 0.07207397945366964), (848, 0.0720123388688297)]


16                             TheWaroftheWorlds
19                          TheOmnivore'sDilemma
429                              Cleopatra:ALife
369                               TheTenthCircle
304     LoverAwakened(BlackDaggerBrotherhood #3)
912                           Enchanters'EndGame
146                             ThinkandGrowRich
397                             WolvesoftheCalla
1045                                   NightRoad
962                                      Infidel
737                                       Ã†neis
525                                  FourtoScore
816                               TheShiftingFog
439                                   MoonCalled
247                                ForOneMoreDay
957                                   FairyTail1
339                      DieunendlicheGeschichte
849                                       Extras
848                                TheLostColony
Name: book_name, dtype: object

In [None]:
# Item-based collaborative filtering: RMSE and MAE computed for a 10-fold cross-validation exercise
# sim_options={'user_based': False} asks KNNBasic for item-item rather than user-user similarities.
algo = KNNBasic(verbose=False,sim_options={'user_based':False})
# NOTE(review): sales_df is a pandas DataFrame; Surprise's cross_validate presumably expects a
# surprise Dataset (e.g. built with Dataset.load_from_df from user/item/rating columns) — confirm
# before running; this cell has no execution count.
scores = cross_validate(algo, sales_df, measures=['RMSE','MAE'], cv=10, n_jobs=-1, verbose=True)
# NOTE(review): `results` is not defined anywhere in the visible part of the notebook — confirm
# it is created (e.g. as a dict) before this assignment.
results['Item-based Collaborative Filtering'] = scores