## **Research Notebook for Book Recommender :**

In [310]:
import pandas as pd
import numpy as np

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

nltk.download("punkt")
nltk.download("stopwords")

[nltk_data] Downloading package punkt to /Users/kapilojha/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/kapilojha/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

### **Exploratory Data Analysis :**

In [311]:
# read in the data
books = pd.read_csv("/Users/kapilojha/Desktop/ml/projects/book-recommender/data/books.csv",
                encoding="latin-1",
                on_bad_lines="skip",
                sep=";",
                low_memory=False
        )

In [312]:
!ls

research.ipynb


In [313]:
# view the data
books.head()

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...
4,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...


In [314]:
print("Large URl: ", books["Image-URL-L"].iloc[0])
print("Medium URl: ", books["Image-URL-M"].iloc[0])
print("Small URl: ", books["Image-URL-S"].iloc[0])

Large URl:  http://images.amazon.com/images/P/0195153448.01.LZZZZZZZ.jpg
Medium URl:  http://images.amazon.com/images/P/0195153448.01.MZZZZZZZ.jpg
Small URl:  http://images.amazon.com/images/P/0195153448.01.THUMBZZZ.jpg


Each book has 3 diffrent Image url :
- depending on our requirements we can choose any of the three
- small, medium and large.

In [315]:
books.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 271360 entries, 0 to 271359
Data columns (total 8 columns):
 #   Column               Non-Null Count   Dtype 
---  ------               --------------   ----- 
 0   ISBN                 271360 non-null  object
 1   Book-Title           271360 non-null  object
 2   Book-Author          271358 non-null  object
 3   Year-Of-Publication  271360 non-null  object
 4   Publisher            271358 non-null  object
 5   Image-URL-S          271360 non-null  object
 6   Image-URL-M          271360 non-null  object
 7   Image-URL-L          271357 non-null  object
dtypes: object(8)
memory usage: 16.6+ MB


In [316]:
books.shape

(271360, 8)

We have total of 2,71,360 rows and 8 features.

Since we do not need the small and medium url's we should remove them from
our dataset.

In [317]:
books.drop(["Image-URL-S", "Image-URL-M"],
           axis=1,
           inplace=True)
books.head()

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-L
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,http://images.amazon.com/images/P/0060973129.0...
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,http://images.amazon.com/images/P/0374157065.0...
4,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company,http://images.amazon.com/images/P/0393045218.0...


In [318]:
# rename the column names for easier processing
books.rename(columns={
    "ISBN" : "isbn",
    "Book-Title" : "title",
    "Book-Author" : "author",
    "Year-Of-Publication" : "year",
    "Publisher" : "publisher",
    "Image-URL-L" : "image-url"
}, inplace=True)
books.head()

Unnamed: 0,isbn,title,author,year,publisher,image-url
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,http://images.amazon.com/images/P/0060973129.0...
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,http://images.amazon.com/images/P/0374157065.0...
4,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company,http://images.amazon.com/images/P/0393045218.0...


In [319]:
# import users data
users = pd.read_csv("/Users/kapilojha/Desktop/ml/projects/book-recommender/data/users.csv",
                    sep=";",
                    encoding="latin-1",
                    on_bad_lines="skip")
users["Age"] = users["Age"].bfill()
users.head()

Unnamed: 0,User-ID,Location,Age
0,1,"nyc, new york, usa",18.0
1,2,"stockton, california, usa",18.0
2,3,"moscow, yukon territory, russia",17.0
3,4,"porto, v.n.gaia, portugal",17.0
4,5,"farnborough, hants, united kingdom",61.0


In [320]:
# rename users columns for easier processing
users.rename(columns={
    "User-ID" : "id",
    "Location" : "location",
    "Age" : "age"
}, inplace=True)

In [321]:
users.head()

Unnamed: 0,id,location,age
0,1,"nyc, new york, usa",18.0
1,2,"stockton, california, usa",18.0
2,3,"moscow, yukon territory, russia",17.0
3,4,"porto, v.n.gaia, portugal",17.0
4,5,"farnborough, hants, united kingdom",61.0


In [322]:
# import the book ratings data
ratings = pd.read_csv("/Users/kapilojha/Desktop/ml/projects/book-recommender/data/ratings.csv",
                      sep=";",
                      encoding="latin-1",
                      on_bad_lines="skip")
ratings.head()

Unnamed: 0,User-ID,ISBN,Book-Rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276729,052165615X,3
4,276729,0521795028,6


In [323]:
# rename ratings columns names for easier processing
ratings.rename(columns={
    "User-ID" : "id",
    "ISBN" : "isbn",
    "Book-Rating" : "rating"
}, inplace=True)
ratings.head()

Unnamed: 0,id,isbn,rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276729,052165615X,3
4,276729,0521795028,6


In [324]:
# view the data shapes together
print(books.shape, users.shape, ratings.shape)

(271360, 6) (278858, 3) (1149780, 3)


In [325]:
# no. of books an individual user read
ratings["id"].value_counts()[:10]

id
11676     13602
198711     7550
153662     6109
98391      5891
35859      5850
212898     4785
278418     4533
76352      3367
110973     3100
235105     3067
Name: count, dtype: int64

As you can see `user 11676` has read `13,602 books`.

In [326]:
# users who have at least read more than 200 books
x = ratings["id"].value_counts() > 200

In [327]:
x.value_counts()

count
False    104384
True        899
Name: count, dtype: int64

We can see only 899 users have read more than 200 books.

In [328]:
# extracting those users id's 
y = x[x].index
y

Index([ 11676, 198711, 153662,  98391,  35859, 212898, 278418,  76352, 110973,
       235105,
       ...
       260183,  73681,  44296, 155916,   9856, 274808,  28634,  59727, 268622,
       188951],
      dtype='int64', name='id', length=899)

In [329]:
# select only those rows with people who read more than 200 books in ratings dataset
ratings = ratings[ratings["id"].isin(y)]
ratings.head()

Unnamed: 0,id,isbn,rating
1456,277427,002542730X,10
1457,277427,0026217457,0
1458,277427,003008685X,8
1459,277427,0030615321,0
1460,277427,0060002050,0


In [330]:
books.head()

Unnamed: 0,isbn,title,author,year,publisher,image-url
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,http://images.amazon.com/images/P/0060973129.0...
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,http://images.amazon.com/images/P/0374157065.0...
4,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company,http://images.amazon.com/images/P/0393045218.0...


In [331]:
users.head()

Unnamed: 0,id,location,age
0,1,"nyc, new york, usa",18.0
1,2,"stockton, california, usa",18.0
2,3,"moscow, yukon territory, russia",17.0
3,4,"porto, v.n.gaia, portugal",17.0
4,5,"farnborough, hants, united kingdom",61.0


In [332]:
ratings.head()

Unnamed: 0,id,isbn,rating
1456,277427,002542730X,10
1457,277427,0026217457,0
1458,277427,003008685X,8
1459,277427,0030615321,0
1460,277427,0060002050,0


In [333]:
# merge the ratings with books dataset using isbn number
ratings_with_books = ratings.merge(books, on="isbn")
ratings_with_books.head()

Unnamed: 0,id,isbn,rating,title,author,year,publisher,image-url
0,277427,002542730X,10,Politically Correct Bedtime Stories: Modern Ta...,James Finn Garner,1994,John Wiley &amp; Sons Inc,http://images.amazon.com/images/P/002542730X.0...
1,277427,0026217457,0,Vegetarian Times Complete Cookbook,Lucy Moll,1995,John Wiley &amp; Sons,http://images.amazon.com/images/P/0026217457.0...
2,277427,003008685X,8,Pioneers,James Fenimore Cooper,1974,Thomson Learning,http://images.amazon.com/images/P/003008685X.0...
3,277427,0030615321,0,"Ask for May, Settle for June (A Doonesbury book)",G. B. Trudeau,1982,Henry Holt &amp; Co,http://images.amazon.com/images/P/0030615321.0...
4,277427,0060002050,0,On a Wicked Dawn (Cynster Novels),Stephanie Laurens,2002,Avon Books,http://images.amazon.com/images/P/0060002050.0...


In [334]:
# create a dataframe with book title and no. of review of that titled book
number_of_ratings_per_book = ratings_with_books.groupby("title")["rating"].count().reset_index()
number_of_ratings_per_book.rename(columns={"rating" : "total_ratings"}, inplace=True)
number_of_ratings_per_book.head()

Unnamed: 0,title,total_ratings
0,A Light in the Storm: The Civil War Diary of ...,2
1,Always Have Popsicles,1
2,Apple Magic (The Collector's series),1
3,Beyond IBM: Leadership Marketing and Finance ...,1
4,Clifford Visita El Hospital (Clifford El Gran...,1


In [335]:
# create a final dataset called book_ratings with the above dataset
book_ratings = ratings_with_books.merge(number_of_ratings_per_book, on="title")
book_ratings.head()

Unnamed: 0,id,isbn,rating,title,author,year,publisher,image-url,total_ratings
0,277427,002542730X,10,Politically Correct Bedtime Stories: Modern Ta...,James Finn Garner,1994,John Wiley &amp; Sons Inc,http://images.amazon.com/images/P/002542730X.0...,82
1,277427,0026217457,0,Vegetarian Times Complete Cookbook,Lucy Moll,1995,John Wiley &amp; Sons,http://images.amazon.com/images/P/0026217457.0...,7
2,277427,003008685X,8,Pioneers,James Fenimore Cooper,1974,Thomson Learning,http://images.amazon.com/images/P/003008685X.0...,1
3,277427,0030615321,0,"Ask for May, Settle for June (A Doonesbury book)",G. B. Trudeau,1982,Henry Holt &amp; Co,http://images.amazon.com/images/P/0030615321.0...,1
4,277427,0060002050,0,On a Wicked Dawn (Cynster Novels),Stephanie Laurens,2002,Avon Books,http://images.amazon.com/images/P/0060002050.0...,13


In [336]:
# use only that rows which have more than 50 ratings per book
book_ratings = book_ratings[book_ratings["total_ratings"] >= 50]
book_ratings.head()

Unnamed: 0,id,isbn,rating,title,author,year,publisher,image-url,total_ratings
0,277427,002542730X,10,Politically Correct Bedtime Stories: Modern Ta...,James Finn Garner,1994,John Wiley &amp; Sons Inc,http://images.amazon.com/images/P/002542730X.0...,82
13,277427,0060930535,0,The Poisonwood Bible: A Novel,Barbara Kingsolver,1999,Perennial,http://images.amazon.com/images/P/0060930535.0...,133
15,277427,0060934417,0,Bel Canto: A Novel,Ann Patchett,2002,Perennial,http://images.amazon.com/images/P/0060934417.0...,108
18,277427,0061009059,9,One for the Money (Stephanie Plum Novels (Pape...,Janet Evanovich,1995,HarperTorch,http://images.amazon.com/images/P/0061009059.0...,108
24,277427,006440188X,0,The Secret Garden,Frances Hodgson Burnett,1998,HarperTrophy,http://images.amazon.com/images/P/006440188X.0...,79


In [337]:
print("Total no. of rows:", book_ratings.shape[0])

Total no. of rows: 61853


In [338]:
# drop the duplicate rows
book_ratings.drop_duplicates(["id", "title"], inplace=True)
book_ratings.head()

Unnamed: 0,id,isbn,rating,title,author,year,publisher,image-url,total_ratings
0,277427,002542730X,10,Politically Correct Bedtime Stories: Modern Ta...,James Finn Garner,1994,John Wiley &amp; Sons Inc,http://images.amazon.com/images/P/002542730X.0...,82
13,277427,0060930535,0,The Poisonwood Bible: A Novel,Barbara Kingsolver,1999,Perennial,http://images.amazon.com/images/P/0060930535.0...,133
15,277427,0060934417,0,Bel Canto: A Novel,Ann Patchett,2002,Perennial,http://images.amazon.com/images/P/0060934417.0...,108
18,277427,0061009059,9,One for the Money (Stephanie Plum Novels (Pape...,Janet Evanovich,1995,HarperTorch,http://images.amazon.com/images/P/0061009059.0...,108
24,277427,006440188X,0,The Secret Garden,Frances Hodgson Burnett,1998,HarperTrophy,http://images.amazon.com/images/P/006440188X.0...,79


In [339]:
print("Total no. of rows:", book_ratings.shape[0])

Total no. of rows: 59850


In [340]:
# check how many null values are still there in the dataset
book_ratings.isna().sum()

id               0
isbn             0
rating           0
title            0
author           0
year             0
publisher        0
image-url        0
total_ratings    0
dtype: int64

In [341]:
book_ratings["title"].isna().sum()

np.int64(0)

In [342]:
# initialize stopwords
stop_words = set(stopwords.words("english"))

# preprocessing the title column for easier search
book_ratings_title = book_ratings["title"]
processed_titles = []
for title in book_ratings_title.values:
    tokens = word_tokenize(title)
    tokens = [token.lower() for token in tokens if token not in stop_words]
    result = " ".join(tokens)
    processed_titles.append(result)

processed_titles

['politically correct bedtime stories : modern tales our life times',
 'the poisonwood bible : a novel',
 'bel canto : a novel',
 'one money ( stephanie plum novels ( paperback ) )',
 'the secret garden',
 'the tao pooh',
 'girl hyacinth blue',
 'chocolat',
 'the secret life bees',
 'three to get deadly : a stephanie plum novel ( a stephanie plum novel )',
 "full tilt ( janet evanovich 's full series )",
 'lucky : a memoir',
 'the dogs babel ( today show book club # 12 )',
 "white oleander : a novel ( oprah 's book club )",
 'white oleander : a novel',
 'the jester',
 'the lovely bones : a novel',
 'me talk pretty one day',
 'naked',
 'the hobbit : the enchanting prelude the lord rings',
 'a prayer owen meany',
 'silent witness',
 'no safe place',
 'the murder book',
 'middlesex : a novel',
 'postmortem',
 'all that remains ( kay scarpetta mysteries ( paperback ) )',
 'the rainmaker',
 "tuesdays morrie : an old man , young man , life 's greatest lesson",
 'into wild',
 'oryx crake',
 '

In [343]:
book_ratings["title"] = processed_titles
book_ratings.head()

Unnamed: 0,id,isbn,rating,title,author,year,publisher,image-url,total_ratings
0,277427,002542730X,10,politically correct bedtime stories : modern t...,James Finn Garner,1994,John Wiley &amp; Sons Inc,http://images.amazon.com/images/P/002542730X.0...,82
13,277427,0060930535,0,the poisonwood bible : a novel,Barbara Kingsolver,1999,Perennial,http://images.amazon.com/images/P/0060930535.0...,133
15,277427,0060934417,0,bel canto : a novel,Ann Patchett,2002,Perennial,http://images.amazon.com/images/P/0060934417.0...,108
18,277427,0061009059,9,one money ( stephanie plum novels ( paperback ) ),Janet Evanovich,1995,HarperTorch,http://images.amazon.com/images/P/0061009059.0...,108
24,277427,006440188X,0,the secret garden,Frances Hodgson Burnett,1998,HarperTrophy,http://images.amazon.com/images/P/006440188X.0...,79


In [344]:
def process_title(title):
    ''' 
    Preprocess a single title string
    '''
    tokens = word_tokenize(title)
    tokens = [token.lower() for token in tokens if token not in stop_words]
    title = " ".join(tokens)
    return title

#### **🎯 Pivot Table for Collaborative Filtering :**
Collaborative filtering works by finding similar users or similar items based on rating patterns.
You need a matrix where you can compute similarity between:
* Rows (item-based CF: similar books)
* that is why we need pivot table to compute similarity between diffrent users
* in pivot table books become index user-id become columns and ratings become values.



In [345]:
book_ratings.pivot_table(
    values="rating",
    index="title",
    columns="id"
)

id,254,2276,2766,2977,3363,3757,4017,4385,6242,6251,...,274004,274061,274301,274308,274808,275970,277427,277478,277639,278418
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1984,9.0,,,,,,,,,,...,,,,,,0.0,,,,
1st die : a novel,,,,,,,,,,,...,,,,,,,,,,
2nd chance,,10.0,,,,,,,,,...,,,,0.0,,,,,0.0,
4 blondes,,,,,,,,,,0.0,...,,,,,,,,,,
84 charing cross road,,,,,,,,,,,...,,,,,,10.0,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
wuthering heights,,,,,,,,,,,...,,,,0.0,,,,,,
year wonders,,,,7.0,,,,,7.0,,...,,,,,,0.0,,,,
you belong to me,,,,,,,,,,,...,,,,,,,,,,
zen art motorcycle maintenance : an inquiry values,,,,,0.0,,,,,0.0,...,,,,,,0.0,,,,


In [346]:
# create a pivot table from book ratings
book_pivot = book_ratings.pivot_table(
    index="title",
    columns="id",
    values="rating"
)
book_pivot.head(10)

id,254,2276,2766,2977,3363,3757,4017,4385,6242,6251,...,274004,274061,274301,274308,274808,275970,277427,277478,277639,278418
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1984,9.0,,,,,,,,,,...,,,,,,0.0,,,,
1st die : a novel,,,,,,,,,,,...,,,,,,,,,,
2nd chance,,10.0,,,,,,,,,...,,,,0.0,,,,,0.0,
4 blondes,,,,,,,,,,0.0,...,,,,,,,,,,
84 charing cross road,,,,,,,,,,,...,,,,,,10.0,,,,
\o\ '' is outlaw '',,,,,,,,,,,...,,,8.0,,,,,,,
a bend road,0.0,,7.0,,,,,,,,...,,,,,,,,,,
a case need,,,0.0,,,,,,,0.0,...,,0.0,,7.0,,,,,,0.0
a child called \it\ '' : one child 's courage survive '',,,,,,,,,,,...,,,,,,,,,,
a civil action,,,,,0.0,,,,,,...,,,,,,,,,,


In [347]:
# replace the null values with 0
book_pivot = book_pivot.fillna(0)
book_pivot.head(10)

id,254,2276,2766,2977,3363,3757,4017,4385,6242,6251,...,274004,274061,274301,274308,274808,275970,277427,277478,277639,278418
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1984,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1st die : a novel,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2nd chance,0.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4 blondes,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
84 charing cross road,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,10.0,0.0,0.0,0.0,0.0
\o\ '' is outlaw '',0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
a bend road,0.0,0.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
a case need,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0
a child called \it\ '' : one child 's courage survive '',0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
a civil action,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [348]:
book_pivot.iloc[0]

id
254       9.0
2276      0.0
2766      0.0
2977      0.0
3363      0.0
         ... 
275970    0.0
277427    0.0
277478    0.0
277639    0.0
278418    0.0
Name: 1984, Length: 888, dtype: float64

In [349]:
# now before training we need to
# convert pivot table to csr matrix
# which will make it more efficient to preprocess
from scipy.sparse import csr_matrix

books_sparse = csr_matrix(book_pivot)
books_sparse

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 14955 stored elements and shape (737, 888)>

### **Training Model :**

In [350]:
# initialize the model
from sklearn.neighbors import NearestNeighbors
model = NearestNeighbors(algorithm="brute",
                        metric="cosine")

In [351]:
# fit the model
model.fit(books_sparse)

0,1,2
,n_neighbors,5
,radius,1.0
,algorithm,'brute'
,leaf_size,30
,metric,'cosine'
,p,2
,metric_params,
,n_jobs,


In [352]:
book_pivot.iloc[237, :]

id
254       9.0
2276      0.0
2766      0.0
2977      0.0
3363      0.0
         ... 
275970    0.0
277427    0.0
277478    0.0
277639    0.0
278418    0.0
Name: harry potter goblet fire ( book 4 ), Length: 888, dtype: float64

In [353]:
distance, suggestion = model.kneighbors(book_pivot.iloc[237, :].values.reshape(1, -1),
                                        n_neighbors=6)

In [354]:
distance

array([[0.        , 0.36914631, 0.42346873, 0.45015559, 0.58315122,
        0.64810686]])

In [355]:
suggestion

array([[237, 239, 236, 238, 240, 241]])

In [357]:
suggested_books = [book_pivot.index[suggestion[i]] for i in range(len(suggestion))]
suggested_books

[Index(['harry potter goblet fire ( book 4 )',
        'harry potter prisoner azkaban ( book 3 )',
        'harry potter chamber secrets ( book 2 )',
        'harry potter order phoenix ( book 5 )',
        'harry potter sorcerer 's stone ( book 1 )',
        'harry potter sorcerer 's stone ( harry potter ( paperback ) )'],
       dtype='object', name='title')]

In [358]:
# book names idx : book name
book_names = {}
for idx, name in enumerate(book_pivot.index):
    book_names[idx] = name

In [359]:
for idx in suggestion[0]:
    print(book_names[idx])

harry potter goblet fire ( book 4 )
harry potter prisoner azkaban ( book 3 )
harry potter chamber secrets ( book 2 )
harry potter order phoenix ( book 5 )
harry potter sorcerer 's stone ( book 1 )
harry potter sorcerer 's stone ( harry potter ( paperback ) )


#### **How to extract the image URL :**

In [360]:
np.where(book_pivot.index == "4 blondes")[0][0]

np.int64(3)

In [362]:
ids = np.where(book_ratings["title"] == "harry potter prisoner azkaban ( book 3 )")[0][0]

In [363]:
book_ratings.iloc[ids]["image-url"]

'http://images.amazon.com/images/P/0439136369.01.LZZZZZZZ.jpg'

In [364]:
# extract book names
books_names = []
for idx in suggestion[0]:
    books_names.append(book_names[idx])
books_names

['harry potter goblet fire ( book 4 )',
 'harry potter prisoner azkaban ( book 3 )',
 'harry potter chamber secrets ( book 2 )',
 'harry potter order phoenix ( book 5 )',
 "harry potter sorcerer 's stone ( book 1 )",
 "harry potter sorcerer 's stone ( harry potter ( paperback ) )"]

In [365]:
# extract all image-urls
image_urls = []
for book in books_names:
    ids = np.where(book_ratings["title"] == book)[0][0]
    image_urls.append(book_ratings.iloc[ids]["image-url"])
image_urls

['http://images.amazon.com/images/P/0439139597.01.LZZZZZZZ.jpg',
 'http://images.amazon.com/images/P/0439136369.01.LZZZZZZZ.jpg',
 'http://images.amazon.com/images/P/0439064872.01.LZZZZZZZ.jpg',
 'http://images.amazon.com/images/P/043935806X.01.LZZZZZZZ.jpg',
 'http://images.amazon.com/images/P/043936213X.01.LZZZZZZZ.jpg',
 'http://images.amazon.com/images/P/059035342X.01.LZZZZZZZ.jpg']

In [367]:
import pickle

pickle.dump(model, open("artifacts/model.pkl", "wb"))
pickle.dump(book_names, open("artifacts/book_names.pkl", "wb"))
pickle.dump(book_ratings, open("artifacts/book_ratings.pkl", "wb"))
pickle.dump(book_pivot, open("artifacts/book_pivot.pkl", "wb"))