In [1]:
from datetime import datetime
import os
import matplotlib
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')

from scipy import sparse
from scipy.sparse import csc_matrix
from sklearn.decomposition import TruncatedSVD

np.random.seed(0)

### Book  Dataset
https://zenodo.org/record/4265096#.YtXPf3ZBy5c


In [2]:
df = pd.read_csv('books_1.Best_Books_Ever.csv')

df.head(1)

Unnamed: 0,bookId,title,series,author,rating,description,language,isbn,genres,characters,...,firstPublishDate,awards,numRatings,ratingsByStars,likedPercent,setting,coverImg,bbeScore,bbeVotes,price
0,2767052-the-hunger-games,The Hunger Games,The Hunger Games #1,Suzanne Collins,4.33,WINNING MEANS FAME AND FORTUNE.LOSING MEANS CE...,English,9780439023481,"['Young Adult', 'Fiction', 'Dystopia', 'Fantas...","['Katniss Everdeen', 'Peeta Mellark', 'Cato (H...",...,,['Locus Award Nominee for Best Young Adult Boo...,6376780,"['3444695', '1921313', '745221', '171994', '93...",96.0,"['District 12, Panem', 'Capitol, Panem', 'Pane...",https://i.gr-assets.com/images/S/compressed.ph...,2993816,30516,5.09


### EDA on Rating dataset
- This dataset contain user rating information which will be very handy for recommendation system

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 52478 entries, 0 to 52477
Data columns (total 25 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   bookId            52478 non-null  object 
 1   title             52478 non-null  object 
 2   series            23470 non-null  object 
 3   author            52478 non-null  object 
 4   rating            52478 non-null  float64
 5   description       51140 non-null  object 
 6   language          48672 non-null  object 
 7   isbn              52478 non-null  object 
 8   genres            52478 non-null  object 
 9   characters        52478 non-null  object 
 10  bookFormat        51005 non-null  object 
 11  edition           4955 non-null   object 
 12  pages             50131 non-null  object 
 13  publisher         48782 non-null  object 
 14  publishDate       51598 non-null  object 
 15  firstPublishDate  31152 non-null  object 
 16  awards            52478 non-null  object

### About dataset
- This dataset have multiple columns describing details about books.These columns can be used to find similar books.So we can use content based filtering on the dataset.


In [4]:

df=df[['title','author','description','genres','awards']]
df['description']=df['description'].str.lower()
df.head(5)

Unnamed: 0,title,author,description,genres,awards
0,The Hunger Games,Suzanne Collins,winning means fame and fortune.losing means ce...,"['Young Adult', 'Fiction', 'Dystopia', 'Fantas...",['Locus Award Nominee for Best Young Adult Boo...
1,Harry Potter and the Order of the Phoenix,"J.K. Rowling, Mary GrandPré (Illustrator)",there is a door at the end of a silent corrido...,"['Fantasy', 'Young Adult', 'Fiction', 'Magic',...",['Bram Stoker Award for Works for Young Reader...
2,To Kill a Mockingbird,Harper Lee,the unforgettable novel of a childhood in a sl...,"['Classics', 'Fiction', 'Historical Fiction', ...","['Pulitzer Prize for Fiction (1961)', 'Audie A..."
3,Pride and Prejudice,"Jane Austen, Anna Quindlen (Introduction)",alternate cover edition of isbn 9780679783268s...,"['Classics', 'Fiction', 'Romance', 'Historical...",[]
4,Twilight,Stephenie Meyer,about three things i was absolutely positive.\...,"['Young Adult', 'Fantasy', 'Romance', 'Vampire...","['Georgia Peach Book Award (2007)', 'Buxtehude..."


In [7]:
df['tag'] = df['author'] + df['description'] + df['genres'] + df['awards']


In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 52478 entries, 0 to 52477
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   title        52478 non-null  object
 1   author       52478 non-null  object
 2   description  51140 non-null  object
 3   genres       52478 non-null  object
 4   awards       52478 non-null  object
 5   tag          51140 non-null  object
dtypes: object(6)
memory usage: 2.4+ MB


In [13]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=5000,stop_words='english')
vector = cv.fit_transform(df['tag'].values.astype('U')).toarray()

In [14]:
from sklearn.metrics.pairwise import cosine_similarity
cosine_similarity = cosine_similarity(vector)

In [23]:
def RecommendBook(book,df,similarity):
    index  = df[df['title'] == book].index [0]
    distance = sorted(list(enumerate(similarity[index])),reverse=True,key = lambda x: x[1])
    result=[]
    for i in distance[1:11]:
        result.append(df.iloc[i[0]].title)
    return result    

In [36]:
df[df["title"]=='Harry Potter and the Prisoner of Azkaban']

Unnamed: 0,title,author,description,genres,awards,tag
93,Harry Potter and the Prisoner of Azkaban,"J.K. Rowling, Mary GrandPré (Illustrator)",harry potter's third year at hogwarts is full ...,"['Fantasy', 'Young Adult', 'Fiction', 'Magic',...",['Bram Stoker Award for Best Work for Young Re...,"J.K. Rowling, Mary GrandPré (Illustrator)harry..."


In [33]:
resuls=RecommendBook('Harry Potter and the Prisoner of Azkaban',df,cosine_similarity)

In [34]:
results_df=df[df["title"].isin(resuls)]

In [35]:
results_df

Unnamed: 0,title,author,description,genres,awards,tag
1,Harry Potter and the Order of the Phoenix,"J.K. Rowling, Mary GrandPré (Illustrator)",there is a door at the end of a silent corrido...,"['Fantasy', 'Young Adult', 'Fiction', 'Magic',...",['Bram Stoker Award for Works for Young Reader...,"J.K. Rowling, Mary GrandPré (Illustrator)there..."
32,Harry Potter and the Sorcerer's Stone,"J.K. Rowling, Mary GrandPré (Illustrator)",harry potter's life is miserable. his parents ...,"['Fantasy', 'Fiction', 'Young Adult', 'Magic',...","[""Mythopoeic Fantasy Award for Children's Lite...","J.K. Rowling, Mary GrandPré (Illustrator)harry..."
61,The Giver,Lois Lowry (Goodreads Author),twelve-year-old jonas lives in a seemingly ide...,"['Young Adult', 'Fiction', 'Classics', 'Dystop...","['Newbery Medal (1994)', ""Mythopoeic Fantasy A...",Lois Lowry (Goodreads Author)twelve-year-old j...
103,Harry Potter and the Goblet of Fire,"J.K. Rowling, Mary GrandPré (Illustrator)",harry potter is midway through his training as...,"['Fantasy', 'Young Adult', 'Fiction', 'Magic',...","['Hugo Award for Best Novel (2001)', ""Mythopoe...","J.K. Rowling, Mary GrandPré (Illustrator)harry..."
113,Eragon,Christopher Paolini (Goodreads Author),an alternate cover edition for isbn 9780375826...,"['Fantasy', 'Young Adult', 'Fiction', 'Dragons...","[""Book Sense Book of the Year Award for Childr...",Christopher Paolini (Goodreads Author)an alter...
126,Harry Potter and the Chamber of Secrets,"J.K. Rowling, Mary GrandPré (Illustrator)",ever since harry potter had come home for the ...,"['Fantasy', 'Young Adult', 'Fiction', 'Magic',...","[""Mythopoeic Fantasy Award for Children's Lite...","J.K. Rowling, Mary GrandPré (Illustrator)ever ..."
138,Holes,Louis Sachar (Goodreads Author),stanley yelnats is under a curse. a curse that...,"['Young Adult', 'Fiction', 'Childrens', 'Middl...","['Newbery Medal (1999)', ""National Book Award ...",Louis Sachar (Goodreads Author)stanley yelnats...
397,The Amber Spyglass,Philip Pullman,"will is the bearer of the knife. now, accompan...","['Fantasy', 'Young Adult', 'Fiction', 'Science...","['Booker Prize Nominee for Longlist (2001)', '...",Philip Pullmanwill is the bearer of the knife....
822,The City of Ember,Jeanne DuPrau,an alternate cover edition can be found here.m...,"['Young Adult', 'Dystopia', 'Fantasy', 'Scienc...",['California Book Award for Young Adult (Silve...,Jeanne DuPrauan alternate cover edition can be...
5355,Escape from Mr. Lemoncello's Library,Chris Grabenstein (Goodreads Author),a new york times bestsellerkyle keeley is the ...,"['Middle Grade', 'Mystery', 'Fiction', 'Childr...","[""Anthony Award Nominee for Best Children's or...",Chris Grabenstein (Goodreads Author)a new york...
