In [4]:
import pickle

import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

In [5]:
# Load the three raw tables.
# NOTE(review): the recorded run emitted a warning on the books.csv read --
# presumably pandas' mixed-dtype warning from chunked type inference (confirm).
# low_memory=False makes pandas infer each column's dtype from the whole file,
# so every column ends up with one consistent dtype.
books = pd.read_csv('books.csv', encoding='utf-8', low_memory=False)
ratings = pd.read_csv('ratings.csv', encoding='utf-8')
users = pd.read_csv('users.csv', encoding='utf-8')

  books = pd.read_csv('books.csv', encoding='utf-8')


In [6]:
# Row and column counts of the three raw tables (books, ratings, users).
books.shape, ratings.shape, users.shape

((271360, 8), (1149780, 3), (278858, 3))

In [7]:
# Missing-value count per column for each of the three raw tables.
books.isnull().sum(), ratings.isnull().sum(), users.isnull().sum()

(ISBN                   0
 Book-Title             0
 Book-Author            2
 Year-Of-Publication    0
 Publisher              2
 Image-URL-S            0
 Image-URL-M            0
 Image-URL-L            3
 dtype: int64,
 User-ID        0
 ISBN           0
 Book-Rating    0
 dtype: int64,
 User-ID          0
 Location         0
 Age         110762
 dtype: int64)

## Popularity based recommendation of top 50 books

In [8]:
# Attach book metadata to every rating. The join is pandas' default inner
# join, so ratings whose ISBN has no match in `books` are dropped.
merged_ratings = pd.merge(ratings, books, on='ISBN')

In [9]:
# Missing-value count per column after joining ratings with book metadata.
merged_ratings.isnull().sum()

User-ID                0
ISBN                   0
Book-Rating            0
Book-Title             0
Book-Author            2
Year-Of-Publication    0
Publisher              2
Image-URL-S            0
Image-URL-M            0
Image-URL-L            4
dtype: int64

In [10]:
# Mean rating for each distinct title, as a two-column frame.
merged_ratings.groupby('Book-Title')['Book-Rating'].mean().reset_index()

Unnamed: 0,Book-Title,Book-Rating
0,A Light in the Storm: The Civil War Diary of ...,2.250000
1,Always Have Popsicles,0.000000
2,Apple Magic (The Collector's series),0.000000
3,"Ask Lily (Young Women of Faith: Lily Series, ...",8.000000
4,Beyond IBM: Leadership Marketing and Finance ...,0.000000
...,...,...
241066,Ã?Â?lpiraten.,0.000000
241067,Ã?Â?rger mit Produkt X. Roman.,5.250000
241068,Ã?Â?sterlich leben.,7.000000
241069,Ã?Â?stlich der Berge.,2.666667


In [11]:
# Popularity ranking: titles with enough ratings, ordered by mean rating.
MIN_NUM_RATINGS = 250  # a title needs at least this many ratings to qualify
TOP_N = 50             # size of the final popularity list

# Group once and aggregate only the rating column; counting every column of
# the merged frame just to keep one of them is wasted work.
ratings_by_title = merged_ratings.groupby('Book-Title')['Book-Rating']

num_ratings = ratings_by_title.count().rename('Num-Ratings').reset_index()
avg_ratings = ratings_by_title.mean().rename('Avg-Rating').reset_index()

# NOTE(review): rows with Book-Rating == 0 are included in the mean. If 0
# denotes "no explicit rating" in this dataset, it drags averages down --
# confirm against the data description before relying on Avg-Rating.
popular_books = (
    num_ratings
    .merge(avg_ratings, on='Book-Title')
    .loc[lambda stats: stats['Num-Ratings'] >= MIN_NUM_RATINGS]
    .sort_values(by='Avg-Rating', ascending=False)
)

top_50_books = popular_books.head(TOP_N)
top_50_books

Unnamed: 0,Book-Title,Num-Ratings,Avg-Rating
80434,Harry Potter and the Prisoner of Azkaban (Book 3),428,5.852804
80422,Harry Potter and the Goblet of Fire (Book 4),387,5.824289
80441,Harry Potter and the Sorcerer's Stone (Book 1),278,5.73741
80426,Harry Potter and the Order of the Phoenix (Boo...,347,5.501441
80414,Harry Potter and the Chamber of Secrets (Book 2),556,5.183453
191612,The Hobbit : The Enchanting Prelude to The Lor...,281,5.007117
187377,The Fellowship of the Ring (The Lord of the Ri...,368,4.94837
80445,Harry Potter and the Sorcerer's Stone (Harry P...,575,4.895652
211384,"The Two Towers (The Lord of the Rings, Part 2)",260,4.880769
219741,To Kill a Mockingbird,510,4.7


In [12]:
# Attach author and cover image to each popular title.
# `books` can hold several rows per title, so the merge fans out; keeping the
# first match per title restores one row per book. The original (gapped) row
# index is left untouched.
popular_df = (
    top_50_books
    .merge(books, on='Book-Title')
    .drop_duplicates(subset='Book-Title')
    .loc[:, ['Book-Title', 'Book-Author', 'Avg-Rating', 'Num-Ratings', 'Image-URL-M']]
)

In [75]:
# Display the popularity table (the frame later pickled to popular.pkl).
popular_df

Unnamed: 0,Book-Title,Book-Author,Avg-Rating,Num-Ratings,Image-URL-M
0,Harry Potter and the Prisoner of Azkaban (Book 3),J. K. Rowling,5.852804,428,http://images.amazon.com/images/P/0439136350.0...
3,Harry Potter and the Goblet of Fire (Book 4),J. K. Rowling,5.824289,387,http://images.amazon.com/images/P/0439139597.0...
5,Harry Potter and the Sorcerer's Stone (Book 1),J. K. Rowling,5.73741,278,http://images.amazon.com/images/P/0590353403.0...
9,Harry Potter and the Order of the Phoenix (Boo...,J. K. Rowling,5.501441,347,http://images.amazon.com/images/P/043935806X.0...
13,Harry Potter and the Chamber of Secrets (Book 2),J. K. Rowling,5.183453,556,http://images.amazon.com/images/P/0439064872.0...
16,The Hobbit : The Enchanting Prelude to The Lor...,J.R.R. TOLKIEN,5.007117,281,http://images.amazon.com/images/P/0345339681.0...
17,The Fellowship of the Ring (The Lord of the Ri...,J.R.R. TOLKIEN,4.94837,368,http://images.amazon.com/images/P/0345339703.0...
26,Harry Potter and the Sorcerer's Stone (Harry P...,J. K. Rowling,4.895652,575,http://images.amazon.com/images/P/059035342X.0...
28,"The Two Towers (The Lord of the Rings, Part 2)",J.R.R. TOLKIEN,4.880769,260,http://images.amazon.com/images/P/0345339711.0...
39,To Kill a Mockingbird,Harper Lee,4.7,510,http://images.amazon.com/images/P/0446310786.0...


## Collaborative filtering based recommendations

In [16]:
# Number of ratings each user has given, as columns User-ID / Num-Ratings.
user_ratings = (
    merged_ratings
    .groupby('User-ID')['Book-Rating']
    .count()
    .rename('Num-Ratings')
    .reset_index()
)

### List of User-IDs who have rated more than 200 books

In [None]:
# Keep only users with more than 200 ratings, then restrict the merged
# ratings to rows written by those users.
has_over_200 = user_ratings['Num-Ratings'] > 200
user_ratings_over200 = user_ratings[has_over_200]
user_over200 = user_ratings_over200['User-ID'].tolist()

from_heavy_rater = merged_ratings['User-ID'].isin(user_over200)
user_books = merged_ratings[from_heavy_rater]

### List of famous books that have >= 50 ratings from those users

In [48]:
# y: one row per title with its rating count among the selected users
# (the count lives in the 'Book-Rating' column).
y = (
    user_books
    .groupby('Book-Title')['Book-Rating']
    .count()
    .reset_index()
)
# famous_books holds row positions in y (not titles) of books with >= 50 ratings.
has_enough_ratings = (y['Book-Rating'] >= 50).to_numpy()
famous_books = y.index[has_enough_ratings].tolist()

In [50]:
# Translate the stored row positions back into titles, then keep only the
# ratings that belong to those titles.
famous_titles = y['Book-Title'].iloc[famous_books]
is_famous = user_books['Book-Title'].isin(famous_titles)
top_books = user_books[is_famous]
top_books

Unnamed: 0,User-ID,ISBN,Book-Rating,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
1150,277427,002542730X,10,Politically Correct Bedtime Stories: Modern Ta...,James Finn Garner,1994,John Wiley &amp; Sons Inc,http://images.amazon.com/images/P/002542730X.0...,http://images.amazon.com/images/P/002542730X.0...,http://images.amazon.com/images/P/002542730X.0...
1163,277427,0060930535,0,The Poisonwood Bible: A Novel,Barbara Kingsolver,1999,Perennial,http://images.amazon.com/images/P/0060930535.0...,http://images.amazon.com/images/P/0060930535.0...,http://images.amazon.com/images/P/0060930535.0...
1165,277427,0060934417,0,Bel Canto: A Novel,Ann Patchett,2002,Perennial,http://images.amazon.com/images/P/0060934417.0...,http://images.amazon.com/images/P/0060934417.0...,http://images.amazon.com/images/P/0060934417.0...
1168,277427,0061009059,9,One for the Money (Stephanie Plum Novels (Pape...,Janet Evanovich,1995,HarperTorch,http://images.amazon.com/images/P/0061009059.0...,http://images.amazon.com/images/P/0061009059.0...,http://images.amazon.com/images/P/0061009059.0...
1174,277427,006440188X,0,The Secret Garden,Frances Hodgson Burnett,1998,HarperTrophy,http://images.amazon.com/images/P/006440188X.0...,http://images.amazon.com/images/P/006440188X.0...,http://images.amazon.com/images/P/006440188X.0...
...,...,...,...,...,...,...,...,...,...,...
1029196,275970,1400031354,0,Tears of the Giraffe (No.1 Ladies Detective Ag...,Alexander McCall Smith,2002,Anchor,http://images.amazon.com/images/P/1400031354.0...,http://images.amazon.com/images/P/1400031354.0...,http://images.amazon.com/images/P/1400031354.0...
1029197,275970,1400031362,0,Morality for Beautiful Girls (No.1 Ladies Dete...,Alexander McCall Smith,2002,Anchor,http://images.amazon.com/images/P/1400031362.0...,http://images.amazon.com/images/P/1400031362.0...,http://images.amazon.com/images/P/1400031362.0...
1029270,275970,1573229725,0,Fingersmith,Sarah Waters,2002,Riverhead Books,http://images.amazon.com/images/P/1573229725.0...,http://images.amazon.com/images/P/1573229725.0...,http://images.amazon.com/images/P/1573229725.0...
1029309,275970,1586210661,9,Me Talk Pretty One Day,David Sedaris,2001,Time Warner Audio Major,http://images.amazon.com/images/P/1586210661.0...,http://images.amazon.com/images/P/1586210661.0...,http://images.amazon.com/images/P/1586210661.0...


### Creating pivot table to get ratings matrix for every user-book pair

In [55]:
# Book x user ratings matrix. pivot_table averages (its default aggregation)
# when a user has several ratings for one title; pairs with no rating become 0.
final_pivot_matrix = (
    top_books
    .pivot_table(values='Book-Rating', index='Book-Title', columns='User-ID')
    .fillna(0)
)

In [56]:
# Display the book-by-user ratings matrix (unrated pairs were filled with 0).
final_pivot_matrix

User-ID,254,2276,2766,2977,3363,4017,4385,6251,6323,6543,...,271705,273979,274004,274061,274301,274308,275970,277427,277639,278418
Book-Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1984,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1st to Die: A Novel,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2nd Chance,0.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4 Blondes,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A Bend in the Road,0.0,0.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Year of Wonders,0.0,0.0,0.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
You Belong To Me,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Zen and the Art of Motorcycle Maintenance: An Inquiry into Values,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Zoya,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Calculating cosine similarity between books in higher dimensions (d = 810)

In [57]:
# Pairwise cosine similarity between the rows (books) of the ratings matrix.
# `cosine_similarity` is imported in the notebook's top import cell.
book_similarity = cosine_similarity(final_pivot_matrix)

# Same values, labelled with book titles on both axes so that similarities
# can be looked up by title.
book_similarity_df = pd.DataFrame(
    book_similarity,
    index=final_pivot_matrix.index,
    columns=final_pivot_matrix.index,
)

In [58]:
# Display the raw pairwise similarity array.
book_similarity

array([[1.        , 0.10255025, 0.01220856, ..., 0.12110367, 0.07347567,
        0.04316046],
       [0.10255025, 1.        , 0.2364573 , ..., 0.07446129, 0.16773875,
        0.14263397],
       [0.01220856, 0.2364573 , 1.        , ..., 0.04558758, 0.04938579,
        0.10796119],
       ...,
       [0.12110367, 0.07446129, 0.04558758, ..., 1.        , 0.07085128,
        0.0196177 ],
       [0.07347567, 0.16773875, 0.04938579, ..., 0.07085128, 1.        ,
        0.10602962],
       [0.04316046, 0.14263397, 0.10796119, ..., 0.0196177 , 0.10602962,
        1.        ]])

In [59]:
# Preview the title-labelled similarity table.
book_similarity_df.head()

Book-Title,1984,1st to Die: A Novel,2nd Chance,4 Blondes,A Bend in the Road,A Case of Need,"A Child Called \It\"": One Child's Courage to Survive""",A Civil Action,A Day Late and a Dollar Short,A Fine Balance,...,Winter Solstice,Wish You Well,Without Remorse,"Wizard and Glass (The Dark Tower, Book 4)",Wuthering Heights,Year of Wonders,You Belong To Me,Zen and the Art of Motorcycle Maintenance: An Inquiry into Values,Zoya,"\O\"" Is for Outlaw"""
Book-Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1984,1.0,0.10255,0.012209,0.0,0.053672,0.027749,0.082165,0.137329,0.032617,0.036676,...,0.052372,0.011402,0.009687,0.125306,0.009099,0.058641,0.017696,0.121104,0.073476,0.04316
1st to Die: A Novel,0.10255,1.0,0.236457,0.0,0.109538,0.100929,0.117862,0.18762,0.096589,0.047161,...,0.066278,0.077286,0.15255,0.084055,0.063633,0.052353,0.193097,0.074461,0.167739,0.142634
2nd Chance,0.012209,0.236457,1.0,0.0,0.06909,0.105591,0.0,0.107744,0.067022,0.041682,...,0.153473,0.183089,0.019906,0.14858,0.018697,0.068549,0.165654,0.045588,0.049386,0.107961
4 Blondes,0.0,0.0,0.0,1.0,0.0,0.111582,0.0,0.0,0.074396,0.0,...,0.0,0.0,0.0,0.011948,0.123486,0.108638,0.076242,0.0,0.0,0.0
A Bend in the Road,0.053672,0.109538,0.06909,0.0,1.0,0.101331,0.09627,0.039933,0.074522,0.0,...,0.076387,0.082158,0.022133,0.075816,0.025587,0.0,0.098488,0.040017,0.112841,0.015204


### Recommending top 5 books with their image links

In [68]:
def get_book_recommendations(book_title, num_recommendations=5):
    """Return the books most similar to ``book_title`` with their cover links.

    Reads the notebook-level ``book_similarity_df`` (title x title similarity
    table) and ``top_books`` (rating rows carrying ``Image-URL-M``).

    Args:
        book_title: Title to look up; must be a label of ``book_similarity_df``.
        num_recommendations: Maximum number of similar books to return.

    Returns:
        A list of ``(title, image_url)`` tuples ordered from most to least
        similar, or an explanatory string when ``book_title`` is unknown.
    """
    if book_title not in book_similarity_df.index:
        return f"Book '{book_title}' not found in the dataset."
    if num_recommendations <= 0:
        return []

    # Remove the query book *before* taking the top N. Relying on it sorting
    # first (and trimming it afterwards) returns N + 1 items whenever the
    # book is not among the top N + 1, e.g. on ties.
    similar_books = (
        book_similarity_df[book_title]
        .drop(labels=book_title)
        .sort_values(ascending=False)
        .head(num_recommendations)
    )

    # One pass over top_books instead of one full scan per recommendation;
    # the first row per title supplies the link, as `.values[0]` did.
    wanted = top_books['Book-Title'].isin(similar_books.index)
    image_by_title = (
        top_books.loc[wanted, ['Book-Title', 'Image-URL-M']]
        .drop_duplicates(subset='Book-Title')
        .set_index('Book-Title')['Image-URL-M']
    )

    return [(title, image_by_title[title]) for title in similar_books.index]

    
    

In [71]:
# Example: the five books most similar to '1984', with cover-image links.
get_book_recommendations('1984', num_recommendations=5)

[('Animal Farm',
  'http://images.amazon.com/images/P/0451526341.01.MZZZZZZZ.jpg'),
 ("The Handmaid's Tale",
  'http://images.amazon.com/images/P/0449212602.01.MZZZZZZZ.jpg'),
 ('Brave New World',
  'http://images.amazon.com/images/P/0060929871.01.MZZZZZZZ.jpg'),
 ('The Vampire Lestat (Vampire Chronicles, Book II)',
  'http://images.amazon.com/images/P/0345313860.01.MZZZZZZZ.jpg'),
 ('The Hours : A Novel',
  'http://images.amazon.com/images/P/0312243022.01.MZZZZZZZ.jpg')]

In [72]:
# Keep the first row per title. Rebinding (rather than inplace=True) leaves
# the originally loaded frame object untouched.
books = books.drop_duplicates(subset='Book-Title')

In [73]:
# Display the book catalogue after de-duplicating titles.
books

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
0,0195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...
1,0002005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...
2,0060973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...
3,0374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...
4,0393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...
...,...,...,...,...,...,...,...,...
271354,0449906736,Flashpoints: Promise and Peril in a New World,Robin Wright,1993,Ballantine Books,http://images.amazon.com/images/P/0449906736.0...,http://images.amazon.com/images/P/0449906736.0...,http://images.amazon.com/images/P/0449906736.0...
271356,0525447644,From One to One Hundred,Teri Sloat,1991,Dutton Books,http://images.amazon.com/images/P/0525447644.0...,http://images.amazon.com/images/P/0525447644.0...,http://images.amazon.com/images/P/0525447644.0...
271357,006008667X,Lily Dale : The True Story of the Town that Ta...,Christine Wicker,2004,HarperSanFrancisco,http://images.amazon.com/images/P/006008667X.0...,http://images.amazon.com/images/P/006008667X.0...,http://images.amazon.com/images/P/006008667X.0...
271358,0192126040,Republic (World's Classics),Plato,1996,Oxford University Press,http://images.amazon.com/images/P/0192126040.0...,http://images.amazon.com/images/P/0192126040.0...,http://images.amazon.com/images/P/0192126040.0...


In [74]:
# Persist the artefacts, one pickle file per object.
# `pickle` is imported in the notebook's top import cell.
# similarity.pkl stores the bare array (no title labels); rows follow the
# order of final_pivot_matrix's index, which is saved alongside it.
artifacts = {
    'similarity.pkl': book_similarity,
    'final_pivot_matrix.pkl': final_pivot_matrix,
    'books.pkl': books,
    'popular.pkl': popular_df,
}

for filename, obj in artifacts.items():
    with open(filename, 'wb') as f:
        pickle.dump(obj, f)

### Recommend function for app.py file

In [None]:
def recommend_function(user_input, num_recommendations=4):
    """Print title, author and cover link of the books most similar to ``user_input``.

    Reads the notebook-level ``book_similarity_df`` (title x title similarity
    table) and ``books`` (catalogue with ``Book-Author`` and ``Image-URL-M``).

    Args:
        user_input: Title to look up; must be a label of ``book_similarity_df``.
        num_recommendations: Maximum number of similar books to list.

    Returns:
        None. The result is printed as a list of
        ``[title, author, image_url]`` lists; a title missing from ``books``
        is reported as ``[title, None, None]``. An unknown ``user_input``
        prints a message instead.

    NOTE(review): this prints rather than returns; a caller that needs the
    data (presumably app.py) would have to return ``data`` -- confirm.
    """
    # Check if the book exists in the similarity dataframe
    if user_input not in book_similarity_df.index:
        print(f"Book '{user_input}' not found in book_similarity_df.")
        return

    # Drop the query title explicitly instead of assuming it sorts first;
    # slicing off position 0 would otherwise discard the best match and keep
    # the query itself whenever it is not ranked on top (e.g. on ties).
    similar_books = (
        book_similarity_df[user_input]
        .drop(labels=user_input)
        .sort_values(ascending=False)
        .head(max(num_recommendations, 0))
    )

    # Single filtered, de-duplicated lookup table instead of scanning `books`
    # and de-duplicating three times per recommended title.
    wanted = books['Book-Title'].isin(similar_books.index)
    book_details = (
        books.loc[wanted, ['Book-Title', 'Book-Author', 'Image-URL-M']]
        .drop_duplicates(subset='Book-Title')
        .set_index('Book-Title')
    )

    data = []
    for book_title in similar_books.index:
        if book_title in book_details.index:
            details = book_details.loc[book_title]
            data.append([book_title, details['Book-Author'], details['Image-URL-M']])
        else:
            data.append([book_title, None, None])

    print(data)

# Example call; prints the recommendation list for '1984'.
recommend_function('1984')

[['Animal Farm', 'George Orwell', 'http://images.amazon.com/images/P/0451526341.01.MZZZZZZZ.jpg'], ["The Handmaid's Tale", 'Margaret Atwood', 'http://images.amazon.com/images/P/0449212602.01.MZZZZZZZ.jpg'], ['Brave New World', 'Aldous Huxley', 'http://images.amazon.com/images/P/0060809833.01.MZZZZZZZ.jpg'], ['The Vampire Lestat (Vampire Chronicles, Book II)', 'ANNE RICE', 'http://images.amazon.com/images/P/0345313860.01.MZZZZZZZ.jpg']]
