### Books Recommendation System using Clustering | Collaborative Filtering

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import csv

In [2]:
books = pd.read_csv('data//BX-Books.csv', sep=';', encoding='latin-1', quoting=1, dtype=str, on_bad_lines='skip')
books.head(2)

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...


In [3]:
books.columns

Index(['ISBN', 'Book-Title', 'Book-Author', 'Year-Of-Publication', 'Publisher',
       'Image-URL-S', 'Image-URL-M', 'Image-URL-L'],
      dtype='object')

In [4]:
books_final = books[['ISBN', 'Book-Title', 'Book-Author', 'Year-Of-Publication', 'Publisher','Image-URL-L']]

In [5]:
books_final.head(2)	

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-L
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...


In [6]:
books_final.rename(columns={'Book-Title': 'title', 'Book-Author': 'author', 'Year-Of-Publication': 'year', 'Publisher': 'publisher', 'Image-URL-L': 'image_url'}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  books_final.rename(columns={'Book-Title': 'title', 'Book-Author': 'author', 'Year-Of-Publication': 'year', 'Publisher': 'publisher', 'Image-URL-L': 'image_url'}, inplace=True)


In [7]:
books_final.head()

Unnamed: 0,ISBN,title,author,year,publisher,image_url
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,http://images.amazon.com/images/P/0060973129.0...
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,http://images.amazon.com/images/P/0374157065.0...
4,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company,http://images.amazon.com/images/P/0393045218.0...


In [8]:
users = pd.read_csv('data//BX-Users.csv', sep=';', encoding='latin-1', quoting=1, dtype=str, on_bad_lines='skip')
users.head(2)

Unnamed: 0,User-ID,Location,Age
0,1,"nyc, new york, usa",
1,2,"stockton, california, usa",18.0


In [9]:
users.shape

(278858, 3)

In [10]:
ratings = pd.read_csv('data//BX-Book-Ratings.csv', sep=';', encoding='latin-1', quoting=1, dtype=str, on_bad_lines='skip')
ratings.head(2)

Unnamed: 0,User-ID,ISBN,Book-Rating
0,276725,034545104X,0
1,276726,0155061224,5


In [11]:
ratings.shape

(1149780, 3)

In [12]:
print(books_final.shape)
print(users.shape)
print(ratings.shape)

(271360, 6)
(278858, 3)
(1149780, 3)


In [13]:
ratings.rename(columns={'User-ID': 'user_id', 'Book-Rating': 'rating'}, inplace=True)

ratings.head(2)

Unnamed: 0,user_id,ISBN,rating
0,276725,034545104X,0
1,276726,0155061224,5


In [14]:
users.rename(columns={'User-ID': 'user_id', 'Location': 'location', 'Age': 'age'}, inplace=True)

users.head(2)

Unnamed: 0,user_id,location,age
0,1,"nyc, new york, usa",
1,2,"stockton, california, usa",18.0


In [15]:
ratings['user_id'].value_counts()

user_id
11676     13602
198711     7550
153662     6109
98391      5891
35859      5850
          ...  
116180        1
116166        1
116154        1
116137        1
276723        1
Name: count, Length: 105283, dtype: int64

In [16]:
x = ratings['user_id'].value_counts() > 200

In [17]:
y = x[x].index

In [18]:
ratings = ratings[ratings['user_id'].isin(y)]

In [19]:
ratings.head(2)

Unnamed: 0,user_id,ISBN,rating
1456,277427,002542730X,10
1457,277427,0026217457,0


In [20]:
ratings.shape

(526356, 3)

In [21]:
rating_with_books = ratings.merge(books_final, on='ISBN')
rating_with_books.head(2)

Unnamed: 0,user_id,ISBN,rating,title,author,year,publisher,image_url
0,277427,002542730X,10,Politically Correct Bedtime Stories: Modern Ta...,James Finn Garner,1994,John Wiley &amp; Sons Inc,http://images.amazon.com/images/P/002542730X.0...
1,277427,0026217457,0,Vegetarian Times Complete Cookbook,Lucy Moll,1995,John Wiley &amp; Sons,http://images.amazon.com/images/P/0026217457.0...


In [22]:
rating_with_books.shape

(487671, 8)

In [23]:
num_rating = rating_with_books.groupby('title')['rating'].count().reset_index()

In [24]:
num_rating.head()

Unnamed: 0,title,rating
0,A Light in the Storm: The Civil War Diary of ...,2
1,Always Have Popsicles,1
2,Apple Magic (The Collector's series),1
3,Beyond IBM: Leadership Marketing and Finance ...,1
4,Clifford Visita El Hospital (Clifford El Gran...,1


In [25]:
num_rating.rename(columns={'rating': 'num_ratings'}, inplace=True)

In [26]:
num_rating.head(2)

Unnamed: 0,title,num_ratings
0,A Light in the Storm: The Civil War Diary of ...,2
1,Always Have Popsicles,1


In [27]:
final_ratings = rating_with_books.merge(num_rating, on='title')

In [28]:
final_ratings.head(2)

Unnamed: 0,user_id,ISBN,rating,title,author,year,publisher,image_url,num_ratings
0,277427,002542730X,10,Politically Correct Bedtime Stories: Modern Ta...,James Finn Garner,1994,John Wiley &amp; Sons Inc,http://images.amazon.com/images/P/002542730X.0...,82
1,277427,0026217457,0,Vegetarian Times Complete Cookbook,Lucy Moll,1995,John Wiley &amp; Sons,http://images.amazon.com/images/P/0026217457.0...,7


In [29]:
final_rating = final_ratings[final_ratings['num_ratings'] >= 50]

In [30]:
final_rating.sample(10)

Unnamed: 0,user_id,ISBN,rating,title,author,year,publisher,image_url,num_ratings
195162,111174,037570504X,0,"Breath, Eyes, Memory",Edwidge Danticat,1998,Vintage Books USA,http://images.amazon.com/images/P/037570504X.0...,68
301031,174304,0804106304,0,The Joy Luck Club,Amy Tan,1994,Prentice Hall (K-12),http://images.amazon.com/images/P/0804106304.0...,181
386624,225199,0062502182,5,The Alchemist: A Fable About Following Your Dream,Paulo Coelho,1995,HarperSanFrancisco,http://images.amazon.com/images/P/0062502182.0...,58
57244,30735,0446394521,9,Daisy Fay and the Miracle Man,Fannie Flagg,1992,Warner Books,http://images.amazon.com/images/P/0446394521.0...,77
74368,36836,0060928336,0,Divine Secrets of the Ya-Ya Sisterhood: A Novel,Rebecca Wells,1997,Perennial,http://images.amazon.com/images/P/0060928336.0...,228
263772,153662,0425178579,10,Betrayal in Death,Nora Roberts,2001,Berkley Publishing Group,http://images.amazon.com/images/P/0425178579.0...,62
381997,223087,0380807343,0,Coraline,Neil Gaiman,2003,HarperTrophy,http://images.amazon.com/images/P/0380807343.0...,59
60938,31846,0446612790,0,2nd Chance,James Patterson,2003,Warner Vision,http://images.amazon.com/images/P/0446612790.0...,124
134736,76942,0553580930,0,A Man in Full,Tom Wolfe,1999,Bantam,http://images.amazon.com/images/P/0553580930.0...,65
374711,214786,0425146413,0,Night Prey,John Sandford,2004,Berkley Publishing Group,http://images.amazon.com/images/P/0425146413.0...,61


In [31]:
final_rating.shape

(61853, 9)

In [32]:
final_rating.drop_duplicates(['title', 'user_id'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_rating.drop_duplicates(['title', 'user_id'], inplace=True)


In [33]:
final_rating.shape

(59850, 9)

In [35]:
final_rating.info()

<class 'pandas.core.frame.DataFrame'>
Index: 59850 entries, 0 to 487619
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   user_id      59850 non-null  object
 1   ISBN         59850 non-null  object
 2   rating       59850 non-null  object
 3   title        59850 non-null  object
 4   author       59850 non-null  object
 5   year         59850 non-null  object
 6   publisher    59850 non-null  object
 7   image_url    59850 non-null  object
 8   num_ratings  59850 non-null  int64 
dtypes: int64(1), object(8)
memory usage: 4.6+ MB


In [37]:
final_rating['rating'] = pd.to_numeric(final_rating['rating'], errors='coerce')


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_rating['rating'] = pd.to_numeric(final_rating['rating'], errors='coerce')


In [38]:
book_pivot = final_rating.pivot_table(index='title', columns='user_id', values='rating')

In [39]:
book_pivot.head(2)

user_id,100459,100644,100846,100906,101209,101305,101851,101876,102275,102359,...,95932,95991,96054,96448,97874,98297,98391,9856,98741,98758
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1984,,,,,,,,,,,...,,,,,,,,,,
1st to Die: A Novel,,,,,,,,,,,...,,,,,,,,,,


In [40]:
book_pivot.fillna(0, inplace = True)

In [41]:
book_pivot.head(1)

user_id,100459,100644,100846,100906,101209,101305,101851,101876,102275,102359,...,95932,95991,96054,96448,97874,98297,98391,9856,98741,98758
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1984,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [42]:
from scipy.sparse import csr_matrix

In [43]:
book_sparse = csr_matrix(book_pivot)

In [44]:
book_sparse

<742x888 sparse matrix of type '<class 'numpy.float64'>'
	with 14961 stored elements in Compressed Sparse Row format>

In [45]:
from sklearn.neighbors import NearestNeighbors
model = NearestNeighbors(algorithm='brute')

In [46]:
model.fit(book_sparse)

In [47]:
distance, suggestion = model.kneighbors(book_pivot.iloc[237,:].values.reshape(1,-1), n_neighbors = 6)

In [48]:
distance

array([[ 0.        , 67.75691847, 68.05145112, 72.277244  , 75.81556568,
        76.30203143]])

In [49]:
suggestion

array([[237, 238, 240, 241, 184, 536]], dtype=int64)

In [50]:
for i in range(len(suggestion)):
    print(book_pivot.index[suggestion[i]])

Index(['Harry Potter and the Chamber of Secrets (Book 2)',
       'Harry Potter and the Goblet of Fire (Book 4)',
       'Harry Potter and the Prisoner of Azkaban (Book 3)',
       'Harry Potter and the Sorcerer's Stone (Book 1)', 'Exclusive',
       'The Cradle Will Fall'],
      dtype='object', name='title')


In [51]:
books_name = book_pivot.index

In [52]:
import pickle
pickle.dump(model, open('artifacts/model.pkl', 'wb'))
pickle.dump(books_name, open('artifacts/books_name.pkl', 'wb'))
pickle.dump(final_rating, open('artifacts/final_rating.pkl', 'wb'))
pickle.dump(book_pivot, open('artifacts/book_pivot.pkl', 'wb'))