In [None]:
# Download these files

# books_titles.json
# https://drive.google.com/file/d/1Iqv9TROqNgYbUDijSaDegv4EPpxO97t3/view?usp=sharing

# goodreads_interactions.csv
# https://drive.google.com/open?id=1zmylV7XW2dfQVCLeg1LbllfQtHD2KUon

# book_id_map.csv
# https://drive.google.com/uc?id=1CHTAaNwyzvbi1TR08MJrJ03BxA266Yxr

# liked_books.csv
# https://drive.google.com/file/d/1dhPhfD5hAOJjrdf8JhvbOPxDpF4qWYnb/view?usp=sharing

# Full code is at https://github.com/dataquestio/project-walkthroughs/tree/master/books

import pandas as pd

my_books = pd.read_csv("liked_books.csv", index_col=0)
my_books["book_id"] = my_books["book_id"].astype(str)


In [385]:
my_books


Unnamed: 0,user_id,book_id,rating,title,cover_image,url,num_pages
0,-1,17184521,710,Algo más que vecinos,https://images.gr-assets.com/books/1356611655m...,https://www.goodreads.com/book/show/17184521-a...,180
1,-1,25574593,82,"Algo Maravilhoso (Sequels, #2)",https://images.gr-assets.com/books/1432723574m...,https://www.goodreads.com/book/show/25574593-a...,496
2,-1,31392898,19,"Algo más que amistad, algo menos que amor",https://images.gr-assets.com/books/1474740506m...,https://www.goodreads.com/book/show/31392898-a...,128
3,-1,17726266,18,Algo más que vecinos,https://images.gr-assets.com/books/1364918036m...,https://www.goodreads.com/book/show/17726266-a...,240


In [386]:
my_books = my_books[['user_id','book_id','rating','title']]


In [387]:
my_books


Unnamed: 0,user_id,book_id,rating,title
0,-1,17184521,710,Algo más que vecinos
1,-1,25574593,82,"Algo Maravilhoso (Sequels, #2)"
2,-1,31392898,19,"Algo más que amistad, algo menos que amor"
3,-1,17726266,18,Algo más que vecinos


In [388]:
csv_book_mapping = {}

with open("book_id_map.csv", "r") as f:
    while True:
        line = f.readline()
        if not line:
            break
        csv_id, book_id = line.strip().split(",")
        csv_book_mapping[csv_id] = book_id


In [389]:
book_set = set(my_books["book_id"])


In [390]:
overlap_users = {}

with open("goodreads_interactions.csv", 'r') as f:
    while True:
        line = f.readline()
        if not line:
            break
        user_id, csv_id, _, rating, _ = line.split(",")
        
        book_id = csv_book_mapping.get(csv_id)
        
        if book_id in book_set:
            if user_id not in overlap_users:
                overlap_users[user_id] = 1
            else:
                overlap_users[user_id] += 1


In [391]:
len(overlap_users)


15

In [392]:
overlap_users


{'1122': 1,
 '3331': 1,
 '3961': 1,
 '10873': 1,
 '14781': 1,
 '17813': 1,
 '18356': 1,
 '25112': 1,
 '28587': 1,
 '40368': 1,
 '42801': 1,
 '43252': 1,
 '44173': 1,
 '47296': 1,
 '50957': 1}

In [393]:
filtered_overlap_users = set([k for k in overlap_users if overlap_users[k] > my_books.shape[0]/5])


In [394]:
len(filtered_overlap_users)


15

In [395]:
interactions_list = []

with open("goodreads_interactions.csv", 'r') as f:
    while True:
        line = f.readline()
        if not line:
            break
        user_id, csv_id, _, rating, _ = line.split(",")
        
        if user_id in filtered_overlap_users:
            book_id = csv_book_mapping[csv_id]
            interactions_list.append([user_id, book_id, rating])


In [396]:
len(interactions_list)


31568

In [397]:
interactions = pd.DataFrame(interactions_list, columns=["user_id", "book_id", "rating"])


In [398]:
interactions = pd.concat([my_books[["user_id", "book_id", "rating"]], interactions])


In [399]:
interactions


Unnamed: 0,user_id,book_id,rating
0,-1,17184521,710
1,-1,25574593,82
2,-1,31392898,19
3,-1,17726266,18
0,1122,3860140,4
...,...,...,...
31563,50957,455592,0
31564,50957,25043529,0
31565,50957,17851885,0
31566,50957,25717793,0


In [400]:
interactions["book_id"] = interactions["book_id"].astype(str)
interactions["user_id"] = interactions["user_id"].astype(str)
interactions["ratings"] = pd.to_numeric(interactions["rating"])


In [401]:
interactions["user_index"] = interactions["user_id"].astype("category").cat.codes


In [None]:
interactions["book_index"] = interactions["book_id"].astype("category").cat.codes


In [403]:
from scipy.sparse import coo_matrix

ratings_mat_coo = coo_matrix((interactions["ratings"], (interactions["user_index"], interactions["book_index"])))


In [404]:
ratings_mat_coo.shape


(16, 28540)

In [405]:
ratings_mat = ratings_mat_coo.tocsr()


In [406]:
ratings_mat


<16x28540 sparse matrix of type '<class 'numpy.int64'>'
	with 31572 stored elements in Compressed Sparse Row format>

In [407]:
interactions[interactions["user_id"] == "-1"]


Unnamed: 0,user_id,book_id,rating,ratings,user_index,book_index
0,-1,17184521,710,710,0,7994
1,-1,25574593,82,82,0,17417
2,-1,31392898,19,19,0,19737
3,-1,17726266,18,18,0,9341


In [408]:
my_index = 0


In [409]:
from sklearn.metrics.pairwise import cosine_similarity


In [410]:
similarity = cosine_similarity(ratings_mat[my_index,:], ratings_mat).flatten()


In [411]:
similarity[0]


1.0000000000000002

In [412]:
len(similarity)


16

In [413]:
import numpy as np

# Check if the length of similarity is greater than 15
if len(similarity) > 15:
    indices = np.argpartition(similarity, -15)[-15:]
else:
    # If less than 15, return all indices
    indices = np.arange(len(similarity))


In [414]:
indices


array([ 5,  0,  3,  4,  1,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15],
      dtype=int64)

In [415]:
similar_users = interactions[interactions["user_index"].isin(indices)].copy()


In [416]:
similar_users = similar_users[similar_users["user_id"]!="-1"]


In [417]:
similar_users


Unnamed: 0,user_id,book_id,rating,ratings,user_index,book_index
1859,3331,1885,5,5,8,12158
1860,3331,13413899,5,5,8,3759
1861,3331,18135,5,5,8,10529
1862,3331,1934,5,5,8,12498
1863,3331,117251,5,5,8,1781
...,...,...,...,...,...,...
31563,50957,455592,0,0,15,21923
31564,50957,25043529,0,0,15,16924
31565,50957,17851885,0,0,15,9654
31566,50957,25717793,0,0,15,17558


In [418]:
book_recs = similar_users.groupby("book_id").rating.agg(['count', 'mean'])


TypeError: agg function failed [how->mean,dtype->object]

In [None]:
book_recs


Unnamed: 0_level_0,count,mean
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,4,2.0
1000846,1,1.0
100142,1,0.0
1001896,2,4.5
10021420,1,0.0
...,...,...
9994633,1,3.0
99952,1,0.0
99953,1,4.0
99955,1,4.0


In [None]:
books_titles = pd.read_json("books_titles.json")
books_titles["book_id"] = books_titles["book_id"].astype(str)


In [None]:
book_recs = book_recs.merge(books_titles, how="inner", on="book_id")


In [None]:
book_recs


Unnamed: 0,book_id,count,mean,title,ratings,url,cover_image,mod_title
0,1,4,2.0,Harry Potter and the Half-Blood Prince (Harry ...,1713866,https://www.goodreads.com/book/show/1.Harry_Po...,https://images.gr-assets.com/books/1361039191m...,harry potter and the halfblood prince harry po...
1,1000846,1,1.0,"Rex Libris, Volume I: I, Librarian (Rex Libris...",628,https://www.goodreads.com/book/show/1000846.Re...,https://s.gr-assets.com/assets/nophoto/book/11...,rex libris volume i i librarian rex libris 15
2,100142,1,0.0,The Art of Love,2669,https://www.goodreads.com/book/show/100142.The...,https://images.gr-assets.com/books/1320447371m...,the art of love
3,1001896,2,4.5,The Real Mother Goose,31911,https://www.goodreads.com/book/show/1001896.Th...,https://s.gr-assets.com/assets/nophoto/book/11...,the real mother goose
4,10021420,1,0.0,Sex on the Moon: The Amazing Story Behind the ...,5642,https://www.goodreads.com/book/show/10021420-s...,https://images.gr-assets.com/books/1320558894m...,sex on the moon the amazing story behind the m...
...,...,...,...,...,...,...,...,...
12369,9994633,1,3.0,No Room for Dessert,33,https://www.goodreads.com/book/show/9994633-no...,https://s.gr-assets.com/assets/nophoto/book/11...,no room for dessert
12370,99952,1,0.0,"Season of Life: A Football Star, a Boy, a Jour...",1055,https://www.goodreads.com/book/show/99952.Seas...,https://s.gr-assets.com/assets/nophoto/book/11...,season of life a football star a boy a journey...
12371,99953,1,4.0,Collected Writings: Common Sense/The Crisis/Ri...,1503,https://www.goodreads.com/book/show/99953.Coll...,https://s.gr-assets.com/assets/nophoto/book/11...,collected writings common sensethe crisisright...
12372,99955,1,4.0,"Common Sense, The Rights of Man and Other Esse...",13207,https://www.goodreads.com/book/show/99955.Comm...,https://images.gr-assets.com/books/1309203355m...,common sense the rights of man and other essen...


In [None]:
book_recs["adjusted_count"] = book_recs["count"] * (book_recs["count"] / book_recs["ratings"])


In [None]:
book_recs


Unnamed: 0,book_id,count,mean,title,ratings,url,cover_image,mod_title,adjusted_count
0,1,4,2.0,Harry Potter and the Half-Blood Prince (Harry ...,1713866,https://www.goodreads.com/book/show/1.Harry_Po...,https://images.gr-assets.com/books/1361039191m...,harry potter and the halfblood prince harry po...,0.000009
1,1000846,1,1.0,"Rex Libris, Volume I: I, Librarian (Rex Libris...",628,https://www.goodreads.com/book/show/1000846.Re...,https://s.gr-assets.com/assets/nophoto/book/11...,rex libris volume i i librarian rex libris 15,0.001592
2,100142,1,0.0,The Art of Love,2669,https://www.goodreads.com/book/show/100142.The...,https://images.gr-assets.com/books/1320447371m...,the art of love,0.000375
3,1001896,2,4.5,The Real Mother Goose,31911,https://www.goodreads.com/book/show/1001896.Th...,https://s.gr-assets.com/assets/nophoto/book/11...,the real mother goose,0.000125
4,10021420,1,0.0,Sex on the Moon: The Amazing Story Behind the ...,5642,https://www.goodreads.com/book/show/10021420-s...,https://images.gr-assets.com/books/1320558894m...,sex on the moon the amazing story behind the m...,0.000177
...,...,...,...,...,...,...,...,...,...
12369,9994633,1,3.0,No Room for Dessert,33,https://www.goodreads.com/book/show/9994633-no...,https://s.gr-assets.com/assets/nophoto/book/11...,no room for dessert,0.030303
12370,99952,1,0.0,"Season of Life: A Football Star, a Boy, a Jour...",1055,https://www.goodreads.com/book/show/99952.Seas...,https://s.gr-assets.com/assets/nophoto/book/11...,season of life a football star a boy a journey...,0.000948
12371,99953,1,4.0,Collected Writings: Common Sense/The Crisis/Ri...,1503,https://www.goodreads.com/book/show/99953.Coll...,https://s.gr-assets.com/assets/nophoto/book/11...,collected writings common sensethe crisisright...,0.000665
12372,99955,1,4.0,"Common Sense, The Rights of Man and Other Esse...",13207,https://www.goodreads.com/book/show/99955.Comm...,https://images.gr-assets.com/books/1309203355m...,common sense the rights of man and other essen...,0.000076


In [None]:
book_recs["score"] = book_recs["mean"] * book_recs["adjusted_count"]


In [None]:
book_recs = book_recs[~book_recs["book_id"].isin(my_books["book_id"])]


In [None]:
my_books["mod_title"] = my_books["title"].str.replace("[^a-zA-Z0-9 ]", "", regex=True).str.lower()


In [None]:
my_books["mod_title"] = my_books["mod_title"].str.replace("\s+", " ", regex=True)


In [None]:
book_recs = book_recs[~book_recs["mod_title"].isin(my_books["mod_title"])]


In [None]:
book_recs = book_recs[book_recs["mean"] >=4]


In [None]:
book_recs = book_recs[book_recs["count"]>2]


In [None]:
top_recs = book_recs.sort_values("mean", ascending=False)


In [None]:
def make_clickable(val):
    return '<a target="_blank" href="{}">Goodreads</a>'.format(val, val)

def show_image(val):
    return '<a href="{}"><img src="{}" width=50></img></a>'.format(val, val)

top_recs.style.format({'url': make_clickable, 'cover_image': show_image})


Unnamed: 0,book_id,count,mean,title,ratings,url,cover_image,mod_title,adjusted_count,score
736,11588,3,5.0,The Shining,804918,Goodreads,,the shining,1.1e-05,5.6e-05
2031,149267,4,5.0,The Stand,449730,Goodreads,,the stand,3.6e-05,0.000178
3828,19543,3,4.666667,Where the Wild Things Are,635947,Goodreads,,where the wild things are,1.4e-05,6.6e-05
10614,72441,3,4.666667,Bronx Masquerade,5270,Goodreads,,bronx masquerade,0.001708,0.00797
6663,30119,3,4.666667,Where the Sidewalk Ends,1029527,Goodreads,,where the sidewalk ends,9e-06,4.1e-05
3858,197084,3,4.666667,Are You My Mother?,177134,Goodreads,,are you my mother,5.1e-05,0.000237
80,10210,4,4.5,Jane Eyre,1207986,Goodreads,,jane eyre,1.3e-05,6e-05
8705,46677,4,4.5,"Alexander and the Terrible, Horrible, No Good, Very Bad Day",148206,Goodreads,,alexander and the terrible horrible no good very bad day,0.000108,0.000486
4404,22034,4,4.5,The Godfather,259150,Goodreads,,the godfather,6.2e-05,0.000278
5861,2657,6,4.5,To Kill a Mockingbird,3255518,Goodreads,,to kill a mockingbird,1.1e-05,5e-05
