In [51]:
import numpy as np
import pandas as pd
import pickle
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [52]:
df=pd.read_csv('books.csv')
df.head()

Unnamed: 0,isbn13,isbn10,title,subtitle,authors,categories,thumbnail,description,published_year,average_rating,num_pages,ratings_count
0,9780002005883,2005883,Gilead,,Marilynne Robinson,Fiction,http://books.google.com/books/content?id=KQZCP...,A NOVEL THAT READERS and critics have been eag...,2004.0,3.85,247.0,361.0
1,9780002261982,2261987,Spider's Web,A Novel,Charles Osborne;Agatha Christie,Detective and mystery stories,http://books.google.com/books/content?id=gA5GP...,A new 'Christie for Christmas' -- a full-lengt...,2000.0,3.83,241.0,5164.0
2,9780006163831,6163831,The One Tree,,Stephen R. Donaldson,American fiction,http://books.google.com/books/content?id=OmQaw...,Volume Two of Stephen Donaldson's acclaimed se...,1982.0,3.97,479.0,172.0
3,9780006178736,6178731,Rage of angels,,Sidney Sheldon,Fiction,http://books.google.com/books/content?id=FKo2T...,"A memorable, mesmerizing heroine Jennifer -- b...",1993.0,3.93,512.0,29532.0
4,9780006280897,6280897,The Four Loves,,Clive Staples Lewis,Christian life,http://books.google.com/books/content?id=XhQ5X...,Lewis' work on the nature of love divides love...,2002.0,4.15,170.0,33684.0


In [53]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6810 entries, 0 to 6809
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   isbn13          6810 non-null   int64  
 1   isbn10          6810 non-null   object 
 2   title           6810 non-null   object 
 3   subtitle        2381 non-null   object 
 4   authors         6738 non-null   object 
 5   categories      6711 non-null   object 
 6   thumbnail       6481 non-null   object 
 7   description     6548 non-null   object 
 8   published_year  6804 non-null   float64
 9   average_rating  6767 non-null   float64
 10  num_pages       6767 non-null   float64
 11  ratings_count   6767 non-null   float64
dtypes: float64(4), int64(1), object(7)
memory usage: 638.6+ KB


In [54]:
books=df.drop(columns=['isbn10','subtitle','thumbnail','published_year','num_pages'])
books.head()

Unnamed: 0,isbn13,title,authors,categories,description,average_rating,ratings_count
0,9780002005883,Gilead,Marilynne Robinson,Fiction,A NOVEL THAT READERS and critics have been eag...,3.85,361.0
1,9780002261982,Spider's Web,Charles Osborne;Agatha Christie,Detective and mystery stories,A new 'Christie for Christmas' -- a full-lengt...,3.83,5164.0
2,9780006163831,The One Tree,Stephen R. Donaldson,American fiction,Volume Two of Stephen Donaldson's acclaimed se...,3.97,172.0
3,9780006178736,Rage of angels,Sidney Sheldon,Fiction,"A memorable, mesmerizing heroine Jennifer -- b...",3.93,29532.0
4,9780006280897,The Four Loves,Clive Staples Lewis,Christian life,Lewis' work on the nature of love divides love...,4.15,33684.0


In [55]:
books['popularity']=(books['average_rating']*books['ratings_count'])/100000
books.head()

Unnamed: 0,isbn13,title,authors,categories,description,average_rating,ratings_count,popularity
0,9780002005883,Gilead,Marilynne Robinson,Fiction,A NOVEL THAT READERS and critics have been eag...,3.85,361.0,0.013899
1,9780002261982,Spider's Web,Charles Osborne;Agatha Christie,Detective and mystery stories,A new 'Christie for Christmas' -- a full-lengt...,3.83,5164.0,0.197781
2,9780006163831,The One Tree,Stephen R. Donaldson,American fiction,Volume Two of Stephen Donaldson's acclaimed se...,3.97,172.0,0.006828
3,9780006178736,Rage of angels,Sidney Sheldon,Fiction,"A memorable, mesmerizing heroine Jennifer -- b...",3.93,29532.0,1.160608
4,9780006280897,The Four Loves,Clive Staples Lewis,Christian life,Lewis' work on the nature of love divides love...,4.15,33684.0,1.397886


In [56]:
books.dropna(inplace=True)
books.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6412 entries, 0 to 6809
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   isbn13          6412 non-null   int64  
 1   title           6412 non-null   object 
 2   authors         6412 non-null   object 
 3   categories      6412 non-null   object 
 4   description     6412 non-null   object 
 5   average_rating  6412 non-null   float64
 6   ratings_count   6412 non-null   float64
 7   popularity      6412 non-null   float64
dtypes: float64(3), int64(1), object(4)
memory usage: 450.8+ KB


In [57]:
books.describe()

Unnamed: 0,isbn13,average_rating,ratings_count,popularity
count,6412.0,6412.0,6412.0,6412.0
mean,9780665000000.0,3.931648,21950.72,0.886942
std,580676400.0,0.322645,141238.3,5.845063
min,9780002000000.0,0.0,0.0,0.0
25%,9780316000000.0,3.77,176.75,0.006892
50%,9780522000000.0,3.95,1087.0,0.042899
75%,9780806000000.0,4.13,6443.25,0.25353
max,9789042000000.0,5.0,5629932.0,251.65796


In [58]:
(books['popularity']>0.001).sum()

5889

In [59]:
def changeRating(num):
    new='badr'
    if num>=0.001 and num<0.01:
        new='lowr'
    elif num>=0.01 and num<0.1:
        new='poorr'
    elif num>=0.1 and num<1:
        new='decentr'
    elif num>=1 and num<10:
        new='goodr'
    elif num>=10 and num<50:
        new='outstandingr'
    elif num>=50:
        new='greatr'
    return new

In [60]:
books['popularity']=books['popularity'].apply(changeRating)

In [61]:
books.head(10)

Unnamed: 0,isbn13,title,authors,categories,description,average_rating,ratings_count,popularity
0,9780002005883,Gilead,Marilynne Robinson,Fiction,A NOVEL THAT READERS and critics have been eag...,3.85,361.0,poorr
1,9780002261982,Spider's Web,Charles Osborne;Agatha Christie,Detective and mystery stories,A new 'Christie for Christmas' -- a full-lengt...,3.83,5164.0,decentr
2,9780006163831,The One Tree,Stephen R. Donaldson,American fiction,Volume Two of Stephen Donaldson's acclaimed se...,3.97,172.0,lowr
3,9780006178736,Rage of angels,Sidney Sheldon,Fiction,"A memorable, mesmerizing heroine Jennifer -- b...",3.93,29532.0,goodr
4,9780006280897,The Four Loves,Clive Staples Lewis,Christian life,Lewis' work on the nature of love divides love...,4.15,33684.0,goodr
5,9780006280934,The Problem of Pain,Clive Staples Lewis,Christian life,"""In The Problem of Pain, C.S. Lewis, one of th...",4.09,37569.0,goodr
6,9780006353287,An Autobiography,Agatha Christie,"Authors, English",Donation.,4.27,3975.0,decentr
7,9780006380832,Empires of the Monsoon,Richard Hall,"Africa, East",Until Vasco da Gama discovered the sea-route t...,4.41,65.0,lowr
8,9780006470229,The Gap Into Madness,Stephen R. Donaldson,"Hyland, Morn (Fictitious character)",A new-cover reissue of the fourth book in the ...,4.15,103.0,lowr
9,9780006472612,Master of the Game,Sidney Sheldon,Adventure stories,Kate Blackwell is an enigma and one of the mos...,4.11,43540.0,goodr


In [62]:
books=books.drop(columns=['average_rating','ratings_count'])

In [63]:
import re
def makeLists(text):
    results=re.split(';|&|and|\*|\n', text)
    lists=[]
    for r in results:
        lists.append((r.replace(" ","")).lower())
    return lists

In [64]:
books['authors']=books['authors'].apply(makeLists)
books['categories']=books['categories'].apply(makeLists)

In [65]:
books['popularity']=books['popularity'].apply(lambda a:a.split())

In [66]:
from nltk.stem.porter import PorterStemmer
def changeDescription(text):
    res=text.lower()
    res=re.sub(r'[^\w]', ' ', res)
    l=[]
    ps=PorterStemmer()
    for word in res.split():
        l.append(ps.stem(word))
    return l

In [67]:
books['description']=books['description'].apply(changeDescription)

In [69]:
books['label']=books['authors']+books['categories']+books['description']+books['popularity']

In [71]:
books=books.drop(columns=['authors','categories','description','popularity'])

In [78]:
books['label']=books['label'].apply(lambda w:" ".join(w))
books.head()

Unnamed: 0,isbn13,title,label
0,9780002005883,Gilead,marilynnerobinson fiction a novel that reader ...
1,9780002261982,Spider's Web,charlesosborne agathachristie detective myster...
2,9780006163831,The One Tree,stephenr.donaldson americanfiction volum two o...
3,9780006178736,Rage of angels,sidneysheldon fiction a memor mesmer heroin je...
4,9780006280897,The Four Loves,clivestapleslewis christianlife lewi work on t...


In [81]:
vect=CountVectorizer(max_features=4000,stop_words='english')
vector=vect.fit_transform(books['label']).toarray()
vector.shape

(6412, 4000)

In [83]:
proximityVector=cosine_similarity(vector)
proximityVector

array([[1.        , 0.02503915, 0.        , ..., 0.24286823, 0.07109956,
        0.        ],
       [0.02503915, 1.        , 0.02397317, ..., 0.07644956, 0.07121091,
        0.02397317],
       [0.        , 0.02397317, 1.        , ..., 0.0265747 , 0.02475369,
        0.        ],
       ...,
       [0.24286823, 0.07644956, 0.0265747 , ..., 1.        , 0.11051407,
        0.1062988 ],
       [0.07109956, 0.07121091, 0.02475369, ..., 0.11051407, 1.        ,
        0.02475369],
       [0.        , 0.02397317, 0.        , ..., 0.1062988 , 0.02475369,
        1.        ]])