In [1]:
import pandas as pd

my_books = pd.read_csv("liked_books.csv", index_col = 0)

In [2]:
my_books

Unnamed: 0,user_id,book_id,rating,title
0,-1,2517439,5,"The Forever War (The Forever War, #1)"
1,-1,113576,5,The Smartest Guys in the Room: The Amazing Ris...
2,-1,35100,5,Battle Cry of Freedom
3,-1,228221,5,The Mask of Command
5,-1,17662739,5,"2001: A Space Odyssey (Space Odyssey, #1)"
6,-1,356824,5,India After Gandhi: The History of the World's...
7,-1,12125412,5,The Lady or the Tiger?: and Other Logic Puzzles
8,-1,139069,5,Endurance: Shackleton's Incredible Voyage
10,-1,76680,5,"Foundation (Foundation, #1)"
11,-1,1898,5,Into Thin Air: A Personal Account of the Mount...


In [3]:
my_books["book_id"] = my_books["book_id"].astype(str)

In [4]:
csv_book_mapping = {}
with open("book_id_map.csv", "r") as f:
    while True:
        line = f.readline()
        if not line:
            break
        csv_id, book_id = line.strip().split(",")
        csv_book_mapping[csv_id] = book_id

In [5]:
book_set = set(my_books["book_id"])

In [6]:
overlap_users = {}

with open("goodreads_interactions.csv", "r") as f:
    while True:
        line = f.readline()
        if not line:
            break
        user_id, csv_id, _, rating, _ = line.strip().split(",")
        book_id = csv_book_mapping.get(csv_id)
        if book_id in book_set:
            if user_id not in overlap_users:
                overlap_users[user_id] = 1
            else:
                overlap_users[user_id] += 1

In [7]:
len(overlap_users)

316341

In [8]:
filter_overlap_users = set([k for k in overlap_users if overlap_users[k] > my_books.shape[0]/10])

In [9]:
interactions_list = []

with open("goodreads_interactions.csv", "r") as f:
    while True:
        line = f.readline()
        if not line:
            break
        user_id, csv_id, _, rating, _ = line.strip().split(",")
        if user_id in filter_overlap_users:
            book_id = csv_book_mapping.get(csv_id)
            interactions_list.append([user_id, book_id, rating])

In [10]:
len(interactions_list)

42786499

In [11]:
interactions_list[0]

['0', '12', '5']

In [12]:
interactions = pd.DataFrame(interactions_list, columns = ["user_id", "book_id", "rating"])

In [13]:
interactions = pd.concat([my_books[["user_id", "book_id", "rating"]], interactions])

In [14]:
interactions

Unnamed: 0,user_id,book_id,rating
0,-1,2517439,5
1,-1,113576,5
2,-1,35100,5
3,-1,228221,5
5,-1,17662739,5
...,...,...,...
42786494,876022,94053,4
42786495,876022,16071764,5
42786496,876022,32938155,5
42786497,876022,25937671,0


In [15]:
interactions["book_id"] = interactions["book_id"].astype(str)
interactions["user_id"] = interactions["user_id"].astype(str)
interactions["rating"] = pd.to_numeric(interactions["rating"])

In [16]:
interactions["user_id"].unique()

array(['-1', '0', '15', ..., '875781', '875835', '876022'], dtype=object)

In [17]:
interactions["user_index"] = interactions["user_id"].astype("category").cat.codes

In [18]:
interactions["book_index"] = interactions["book_id"].astype("category").cat.codes

In [19]:
len(interactions["book_index"].unique())

1466746

In [20]:
from scipy.sparse import coo_matrix

ratings_mat_coo = coo_matrix((interactions["rating"], (interactions["user_index"], interactions["book_index"])))

In [21]:
ratings_mat_coo

<46465x1466746 sparse matrix of type '<class 'numpy.int64'>'
	with 42786526 stored elements in COOrdinate format>

In [22]:
ratings_mat = ratings_mat_coo.tocsr()

In [23]:
interactions[interactions["user_id"] == "-1"]

Unnamed: 0,user_id,book_id,rating,user_index,book_index
0,-1,2517439,5,0,757635
1,-1,113576,5,0,70745
2,-1,35100,5,0,1077282
3,-1,228221,5,0,648675
5,-1,17662739,5,0,386625
6,-1,356824,5,0,1090831
7,-1,12125412,5,0,108535
8,-1,139069,5,0,225316
10,-1,76680,5,0,1324012
11,-1,1898,5,0,498398


In [24]:
my_index = 0

In [25]:
from sklearn.metrics.pairwise import cosine_similarity

similarity = cosine_similarity(ratings_mat[my_index,:], ratings_mat).flatten()

In [26]:
similarity[1]

0.019866980414254484

In [27]:
import numpy as np

indices = np.argpartition(similarity, -30)[30:]

In [28]:
indices

array([   30,    31, 46450, ..., 39380, 30432,     0], dtype=int64)

In [29]:
similar_users = interactions[interactions["user_index"].isin(indices)].copy()

In [30]:
similar_users = similar_users[similar_users["user_id"] != "-1"]

In [31]:
similar_users

Unnamed: 0,user_id,book_id,rating,user_index,book_index
0,0,12,5,1,102205
1,0,21,5,1,573452
2,0,30,5,1,942427
3,0,45,5,1,1140017
4,0,1,5,1,0
...,...,...,...,...,...
42786494,876022,94053,4,45028,1430294
42786495,876022,16071764,5,45028,305760
42786496,876022,32938155,5,45028,1021642
42786497,876022,25937671,0,45028,808500


In [32]:
book_recs = similar_users.groupby("book_id").rating.agg(['count', 'mean'])

In [33]:
book_recs

Unnamed: 0_level_0,count,mean
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,27475,3.917671
10,452,4.095133
100,5,1.600000
1000,34,0.441176
10000,227,0.475771
...,...,...
9999934,2,0.000000
9999954,3,2.000000
9999969,4,0.750000
999999,19,1.473684


In [34]:
book_titles = pd.read_json("books_titles.json")
book_titles["book_id"] = book_titles["book_id"].astype(str)

In [35]:
book_recs = book_recs.merge(book_titles, how="inner", on="book_id")

In [36]:
book_recs

Unnamed: 0,book_id,count,mean,title,ratings,url,cover_image,mod_title
0,1,27475,3.917671,Harry Potter and the Half-Blood Prince (Harry ...,1713866,https://www.goodreads.com/book/show/1.Harry_Po...,https://images.gr-assets.com/books/1361039191m...,harry potter and the halfblood prince harry po...
1,10,452,4.095133,"Harry Potter Collection (Harry Potter, #1-6)",25245,https://www.goodreads.com/book/show/10.Harry_P...,https://images.gr-assets.com/books/1328867351m...,harry potter collection harry potter 16
2,100,5,1.600000,Simply Beautiful Beading,75,https://www.goodreads.com/book/show/100.Simply...,https://s.gr-assets.com/assets/nophoto/book/11...,simply beautiful beading
3,1000,34,0.441176,Millionaire Women Next Door: The Many Journeys...,460,https://www.goodreads.com/book/show/1000.Milli...,https://s.gr-assets.com/assets/nophoto/book/11...,millionaire women next door the many journeys ...
4,10000,227,0.475771,The Face of Another,2079,https://www.goodreads.com/book/show/10000.The_...,https://images.gr-assets.com/books/1320415026m...,the face of another
...,...,...,...,...,...,...,...,...
1094619,9999934,2,0.000000,Ten Moonstruck Piglets,61,https://www.goodreads.com/book/show/9999934-te...,https://s.gr-assets.com/assets/nophoto/book/11...,ten moonstruck piglets
1094620,9999954,3,2.000000,Dijkshoorn,175,https://www.goodreads.com/book/show/9999954-di...,https://s.gr-assets.com/assets/nophoto/book/11...,dijkshoorn
1094621,9999969,4,0.750000,Alien Aberrations,17,https://www.goodreads.com/book/show/9999969-al...,https://images.gr-assets.com/books/1293137133m...,alien aberrations
1094622,999999,19,1.473684,Roommates,46,https://www.goodreads.com/book/show/999999.Roo...,https://s.gr-assets.com/assets/nophoto/book/11...,roommates


In [37]:
book_recs["adjusted_count"] = book_recs["count"] * (book_recs["count"] / book_recs["ratings"])

In [38]:
book_recs["score"] = book_recs["mean"] * book_recs["adjusted_count"]

In [39]:
book_recs = book_recs[~book_recs["book_id"].isin(my_books["book_id"])]

In [40]:
my_books["mod_title"] = my_books["title"].str.replace("[^a-zA-Z0-9]", "", regex = True).str.lower()

In [41]:
my_books["mod_title"] = my_books["mod_title"].str.replace("\s+"," ", regex = True)

In [42]:
book_recs = book_recs[~book_recs["mod_title"].isin(my_books["mod_title"])]

In [43]:
book_recs = book_recs[book_recs["count"] > 10]

In [44]:
book_recs = book_recs[book_recs["mean"] > 4]

In [45]:
top_recs = book_recs.sort_values("score", ascending = False)

In [46]:
top_recs

Unnamed: 0,book_id,count,mean,title,ratings,url,cover_image,mod_title,adjusted_count,score
847638,5,28048,4.005419,Harry Potter and the Prisoner of Azkaban (Harr...,1876252,https://www.goodreads.com/book/show/5.Harry_Po...,https://images.gr-assets.com/books/1499277281m...,harry potter and the prisoner of azkaban harry...,419.288189,1679.424998
694489,3,34576,4.125000,Harry Potter and the Sorcerer's Stone (Harry P...,4765497,https://www.goodreads.com/book/show/3.Harry_Po...,https://images.gr-assets.com/books/1474154022m...,harry potter and the sorcerers stone harry pot...,250.865707,1034.821043
961156,72193,776,4.051546,Harry Potter and the Philosopher's Stone (Harr...,31614,https://www.goodreads.com/book/show/72193.Harr...,https://images.gr-assets.com/books/1327190600m...,harry potter and the philosophers stone harry ...,19.047764,77.172898
1050789,90072,1228,4.182410,"Dr. Seuss's Green Eggs and Ham: For Soprano, B...",91573,https://www.goodreads.com/book/show/90072.Dr_S...,https://s.gr-assets.com/assets/nophoto/book/11...,dr seusss green eggs and ham for soprano boy s...,16.467561,68.874100
836827,464164,596,4.068792,Harry Potter and the Prisoner of Azkaban (Harr...,22794,https://www.goodreads.com/book/show/464164.Har...,https://images.gr-assets.com/books/1310384602m...,harry potter and the prisoner of azkaban harry...,15.583750,63.407037
...,...,...,...,...,...,...,...,...,...,...
683683,2954458,11,4.090909,Harry Potter en de Halfbloed Prins (Harry Pott...,1254,https://www.goodreads.com/book/show/2954458-ha...,https://images.gr-assets.com/books/1372329105m...,harry potter en de halfbloed prins harry potter 6,0.096491,0.394737
665964,28767931,16,4.250000,Harry Potter and the Half-Blood Prince (Harry ...,2784,https://www.goodreads.com/book/show/28767931-h...,https://s.gr-assets.com/assets/nophoto/book/11...,harry potter and the halfblood prince harry po...,0.091954,0.390805
1021193,840577,17,4.058824,Harry Potter und der Halbblutprinz (Harry Pott...,3353,https://www.goodreads.com/book/show/840577.Har...,https://s.gr-assets.com/assets/nophoto/book/11...,harry potter und der halbblutprinz harry potter 6,0.086191,0.349836
920824,6573344,11,4.363636,Blood Work (Terry McCaleb #1),1605,https://www.goodreads.com/book/show/6573344-bl...,https://images.gr-assets.com/books/1328202999m...,blood work terry mccaleb 1,0.075389,0.328972


In [47]:
def make_clickable(val):
    return '<a target="_blank" href="{}">Goodreads</a>'.format(val)

def show_image(val):
    return '<img src="{}" width=50></img>'.format(val)

In [48]:
top_recs.style.format({'url': make_clickable, 'cover_image': show_image})

Unnamed: 0,book_id,count,mean,title,ratings,url,cover_image,mod_title,adjusted_count,score
847638,5,28048,4.005419,"Harry Potter and the Prisoner of Azkaban (Harry Potter, #3)",1876252,Goodreads,,harry potter and the prisoner of azkaban harry potter 3,419.288189,1679.424998
694489,3,34576,4.125,"Harry Potter and the Sorcerer's Stone (Harry Potter, #1)",4765497,Goodreads,,harry potter and the sorcerers stone harry potter 1,250.865707,1034.821043
961156,72193,776,4.051546,"Harry Potter and the Philosopher's Stone (Harry Potter, #1)",31614,Goodreads,,harry potter and the philosophers stone harry potter 1,19.047764,77.172898
1050789,90072,1228,4.18241,"Dr. Seuss's Green Eggs and Ham: For Soprano, Boy Soprano, and Orchestra",91573,Goodreads,,dr seusss green eggs and ham for soprano boy soprano and orchestra,16.467561,68.8741
836827,464164,596,4.068792,"Harry Potter and the Prisoner of Azkaban (Harry Potter, #3)",22794,Goodreads,,harry potter and the prisoner of azkaban harry potter 3,15.58375,63.407037
1092065,99298,731,4.179207,"The Harry Potter Collection 1-4 (Harry Potter, #1-4)",44587,Goodreads,,the harry potter collection 14 harry potter 14,11.984682,50.08646
1009618,818056,371,4.061995,"Harry Potter and the Deathly Hallows (Harry Potter, #7)",13938,Goodreads,,harry potter and the deathly hallows harry potter 7,9.875233,40.113144
1,10,452,4.095133,"Harry Potter Collection (Harry Potter, #1-6)",25245,Goodreads,,harry potter collection harry potter 16,8.09285,33.141295
1035393,864890,298,4.07047,"Harry Potter and the Goblet of Fire (Harry Potter, #4)",11031,Goodreads,,harry potter and the goblet of fire harry potter 4,8.050403,32.768924
281159,17347383,139,4.309353,"Harry Potter and the Prisoner of Azkaban (Harry Potter, #3)",3311,Goodreads,,harry potter and the prisoner of azkaban harry potter 3,5.835397,25.146783
