In [604]:
# Download these files

# books_titles.json
# https://drive.google.com/file/d/1Iqv9TROqNgYbUDijSaDegv4EPpxO97t3/view?usp=sharing

# goodreads_interactions.csv
# https://drive.google.com/open?id=1zmylV7XW2dfQVCLeg1LbllfQtHD2KUon

# book_id_map.csv
# https://drive.google.com/uc?id=1CHTAaNwyzvbi1TR08MJrJ03BxA266Yxr

# liked_books.csv
# https://drive.google.com/file/d/1dhPhfD5hAOJjrdf8JhvbOPxDpF4qWYnb/view?usp=sharing

# Full code is at https://github.com/dataquestio/project-walkthroughs/tree/master/books

import pandas as pd

my_books = pd.read_csv("liked_books.csv", index_col=0)
my_books["book_id"] = my_books["book_id"].astype(str)


In [605]:
my_books


Unnamed: 0,user_id,book_id,ratings,title,cover_image,url,num_pages,rating
0,-1,17184521,,Algo más que vecinos,https://images.gr-assets.com/books/1356611655m...,https://www.goodreads.com/book/show/17184521-a...,180,710.0
0,-1,4066338,,The History of Java,https://images.gr-assets.com/books/1395827710m...,https://www.goodreads.com/book/show/4066338-th...,938,268.0
0,-1,305773,,Beyond Java,https://images.gr-assets.com/books/1384259050m...,https://www.goodreads.com/book/show/305773.Bey...,208,52.0


In [606]:
my_books = my_books[['user_id','book_id','rating','title']]


In [607]:
my_books


Unnamed: 0,user_id,book_id,rating,title
0,-1,17184521,710.0,Algo más que vecinos
0,-1,4066338,268.0,The History of Java
0,-1,305773,52.0,Beyond Java


In [608]:
csv_book_mapping = {}

with open("book_id_map.csv", "r") as f:
    while True:
        line = f.readline()
        if not line:
            break
        csv_id, book_id = line.strip().split(",")
        csv_book_mapping[csv_id] = book_id


In [609]:
book_set = set(my_books["book_id"])


In [610]:
overlap_users = {}

with open("goodreads_interactions.csv", 'r') as f:
    while True:
        line = f.readline()
        if not line:
            break
        user_id, csv_id, _, rating, _ = line.split(",")
        
        book_id = csv_book_mapping.get(csv_id)
        
        if book_id in book_set:
            if user_id not in overlap_users:
                overlap_users[user_id] = 1
            else:
                overlap_users[user_id] += 1


In [611]:
len(overlap_users)


26

In [612]:
filtered_overlap_users = set([k for k in overlap_users if overlap_users[k] > my_books.shape[0]/5])


In [613]:
len(filtered_overlap_users)


26

In [614]:
interactions_list = []

with open("goodreads_interactions.csv", 'r') as f:
    while True:
        line = f.readline()
        if not line:
            break
        user_id, csv_id, _, rating, _ = line.split(",")
        
        if user_id in filtered_overlap_users:
            book_id = csv_book_mapping[csv_id]
            interactions_list.append([user_id, book_id, rating])


In [615]:
len(interactions_list)


34759

In [616]:
interactions_list[0]


['3287', '1301625', '5']

In [617]:
interactions = pd.DataFrame(interactions_list, columns=["user_id", "book_id", "rating"])


In [618]:
interactions = pd.concat([my_books[["user_id", "book_id", "rating"]], interactions])


In [619]:
interactions


Unnamed: 0,user_id,book_id,rating
0,-1,17184521,710.0
0,-1,4066338,268.0
0,-1,305773,52.0
0,3287,1301625,5
1,3287,1499340,5
...,...,...,...
34754,51287,28015354,0
34755,51287,29589269,0
34756,51287,26026930,0
34757,51287,1378968,4


In [620]:
interactions["book_id"] = interactions["book_id"].astype(str)
interactions["user_id"] = interactions["user_id"].astype(str)
interactions["rating"] = pd.to_numeric(interactions["rating"])


In [621]:
interactions["user_index"] = interactions["user_id"].astype("category").cat.codes


In [622]:
interactions["book_index"] = interactions["book_id"].astype("category").cat.codes


In [623]:
from scipy.sparse import coo_matrix

ratings_mat_coo = coo_matrix((interactions["rating"], (interactions["user_index"], interactions["book_index"])))


In [624]:
ratings_mat_coo.shape


(27, 30433)

In [625]:
ratings_mat = ratings_mat_coo.tocsr()


In [626]:
interactions[interactions["user_id"] == "-1"]


Unnamed: 0,user_id,book_id,rating,user_index,book_index
0,-1,17184521,710.0,0,8634
0,-1,4066338,268.0,0,22408
0,-1,305773,52.0,0,20213


In [627]:
my_index = 0


In [628]:
from sklearn.metrics.pairwise import cosine_similarity

similarity = cosine_similarity(ratings_mat[my_index,:], ratings_mat).flatten()


In [629]:
similarity[0]


0.9999999999999999

In [630]:
import numpy as np

# Check if the length of similarity is greater than 15
if len(similarity) > 15:
    indices = np.argpartition(similarity, -15)[-15:]
else:
    # If less than 15, return all indices
    indices = np.arange(len(similarity))


In [631]:
indices


array([ 8,  5, 11, 10, 16, 14, 18,  3, 20,  6, 15, 26,  1,  2,  0],
      dtype=int64)

In [632]:
similar_users = interactions[interactions["user_index"].isin(indices)].copy()


In [633]:
similar_users = similar_users[similar_users["user_id"]!="-1"]


In [634]:
similar_users


Unnamed: 0,user_id,book_id,rating,user_index,book_index
120,3331,1885,5.0,14,12793
121,3331,13413899,5.0,14,4036
122,3331,18135,5.0,14,11173
123,3331,1934,5.0,14,13132
124,3331,117251,5.0,14,1880
...,...,...,...,...,...
33932,47296,27190170,0.0,20,18884
33933,47296,29634788,2.0,20,19855
33934,47296,22082914,0.0,20,15143
33935,47296,27795147,0.0,20,19106


In [635]:
book_recs = similar_users.groupby("book_id").rating.agg(['count', 'mean'])


In [636]:
book_recs


Unnamed: 0_level_0,count,mean
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,3,3.666667
10,1,4.000000
10018310,1,3.000000
10021157,1,0.000000
10048888,1,3.000000
...,...,...
9973644,1,0.000000
9973902,1,0.000000
9974297,1,0.000000
999480,1,5.000000


In [637]:
books_titles = pd.read_json("books_titles.json")
books_titles["book_id"] = books_titles["book_id"].astype(str)


In [638]:
book_recs = book_recs.merge(books_titles, how="inner", on="book_id")


In [639]:
book_recs


Unnamed: 0,book_id,count,mean,title,ratings,url,cover_image,mod_title
0,1,3,3.666667,Harry Potter and the Half-Blood Prince (Harry ...,1713866,https://www.goodreads.com/book/show/1.Harry_Po...,https://images.gr-assets.com/books/1361039191m...,harry potter and the halfblood prince harry po...
1,10,1,4.000000,"Harry Potter Collection (Harry Potter, #1-6)",25245,https://www.goodreads.com/book/show/10.Harry_P...,https://images.gr-assets.com/books/1328867351m...,harry potter collection harry potter 16
2,10018310,1,3.000000,Shin Suikoden 1,270,https://www.goodreads.com/book/show/10018310-s...,https://s.gr-assets.com/assets/nophoto/book/11...,shin suikoden 1
3,10048888,1,3.000000,Kisah-kisah Tengah Malam,436,https://www.goodreads.com/book/show/10048888-k...,https://images.gr-assets.com/books/1293528971m...,kisahkisah tengah malam
4,10049436,2,0.000000,The Single Girl's To-Do List,10026,https://www.goodreads.com/book/show/10049436-t...,https://images.gr-assets.com/books/1327955203m...,the single girls todo list
...,...,...,...,...,...,...,...,...
6188,9959500,1,4.000000,"Los versos del destino (Cuentos de Bereth, #3)",155,https://www.goodreads.com/book/show/9959500-lo...,https://images.gr-assets.com/books/1299054794m...,los versos del destino cuentos de bereth 3
6189,9961796,1,4.000000,Lola and the Boy Next Door (Anna and the Frenc...,110284,https://www.goodreads.com/book/show/9961796-lo...,https://images.gr-assets.com/books/1358271832m...,lola and the boy next door anna and the french...
6190,9966140,1,4.000000,The White Lama #1 : Reinkarnasi,17,https://www.goodreads.com/book/show/9966140-th...,https://images.gr-assets.com/books/1292836984m...,the white lama 1 reinkarnasi
6191,999480,1,5.000000,Violent Volcanoes,236,https://www.goodreads.com/book/show/999480.Vio...,https://s.gr-assets.com/assets/nophoto/book/11...,violent volcanoes


In [640]:
book_recs["adjusted_count"] = book_recs["count"] * (book_recs["count"] / book_recs["ratings"])


In [641]:
book_recs["score"] = book_recs["mean"] * book_recs["adjusted_count"]


In [642]:
book_recs = book_recs[~book_recs["book_id"].isin(my_books["book_id"])]


In [643]:
my_books["mod_title"] = my_books["title"].str.replace("[^a-zA-Z0-9 ]", "", regex=True).str.lower()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  my_books["mod_title"] = my_books["title"].str.replace("[^a-zA-Z0-9 ]", "", regex=True).str.lower()


In [644]:
my_books["mod_title"] = my_books["mod_title"].str.replace("\s+", " ", regex=True)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  my_books["mod_title"] = my_books["mod_title"].str.replace("\s+", " ", regex=True)


In [645]:
book_recs = book_recs[~book_recs["mod_title"].isin(my_books["mod_title"])]


In [646]:
book_recs = book_recs[book_recs["mean"] >=4]


In [647]:
book_recs = book_recs[book_recs["count"]>2]


In [648]:
top_recs = book_recs.sort_values("mean", ascending=False)


In [652]:
if len(top_recs) > 10 :
    top_recs = top_recs.head(10)


In [None]:
len(top_recs)


10

In [649]:
def make_clickable(val):
    return '<a target="_blank" href="{}">Goodreads</a>'.format(val, val)

def show_image(val):
    return '<a href="{}"><img src="{}" width=50></img></a>'.format(val, val)

top_recs.style.format({'url': make_clickable, 'cover_image': show_image})


Unnamed: 0,book_id,count,mean,title,ratings,url,cover_image,mod_title,adjusted_count,score
5469,7260188,3,5.0,"Mockingjay (The Hunger Games, #3)",1743362,Goodreads,,mockingjay the hunger games 3,5e-06,2.6e-05
5006,6148028,3,5.0,"Catching Fire (The Hunger Games, #2)",1854746,Goodreads,,catching fire the hunger games 2,5e-06,2.4e-05
3579,2767052,3,5.0,"The Hunger Games (The Hunger Games, #1)",4899965,Goodreads,,the hunger games the hunger games 1,2e-06,9e-06
1256,157993,4,4.75,The Little Prince,763309,Goodreads,,the little prince,2.1e-05,0.0001
21,101558,3,4.666667,"Harry Potter y el cáliz de fuego (Harry Potter, #4)",4627,Goodreads,,harry potter y el cliz de fuego harry potter 4,0.001945,0.009077
1090,147865,3,4.333333,"Love, Rosie",56328,Goodreads,,love rosie,0.00016,0.000692
1238,15760001,3,4.333333,"On Dublin Street (On Dublin Street, #1)",138479,Goodreads,,on dublin street on dublin street 1,6.5e-05,0.000282
190,10959,3,4.333333,Sophie's World,111215,Goodreads,,sophies world,8.1e-05,0.000351
4978,6036910,3,4.333333,Nyanyi Sunyi Seorang Bisu 2,329,Goodreads,,nyanyi sunyi seorang bisu 2,0.027356,0.118541
4793,52529,3,4.0,"The Secret (The Secret, #1)",247542,Goodreads,,the secret the secret 1,3.6e-05,0.000145
