In [1]:
import pandas as pd

# Load the books the current user liked. book_id is cast to str so it can be
# compared with the string ids parsed from the raw CSV files further down.
my_books = (
    pd.read_csv("liked_books.csv", index_col=0)
    .astype({"book_id": str})
)

In [2]:
my_books

Unnamed: 0,user_id,book_id,rating,title
0,-1,2517439,5,"The Forever War (The Forever War, #1)"
1,-1,113576,5,The Smartest Guys in the Room: The Amazing Ris...
2,-1,35100,5,Battle Cry of Freedom
3,-1,228221,5,The Mask of Command
5,-1,17662739,5,"2001: A Space Odyssey (Space Odyssey, #1)"
6,-1,356824,5,India After Gandhi: The History of the World's...
7,-1,12125412,5,The Lady or the Tiger?: and Other Logic Puzzles
8,-1,139069,5,Endurance: Shackleton's Incredible Voyage
10,-1,76680,5,"Foundation (Foundation, #1)"
11,-1,1898,5,Into Thin Air: A Personal Account of the Mount...


In [3]:
def show_image(val):
    """Return an HTML snippet showing ``val`` as a 50px-wide linked thumbnail.

    Parameters
    ----------
    val : object
        Image URL (any value is accepted and converted with ``str``).

    Returns
    -------
    str
        An ``<a>`` element wrapping an ``<img>``; both point at ``val``.
    """
    import html

    # Escape the value before placing it inside the quoted attributes so that
    # characters such as `"` or `&` in a URL cannot break out of the markup.
    safe = html.escape(str(val), quote=True)
    return '<a href="{}"><img src="{}" width=50></img></a>'.format(safe, safe)

In [4]:
# Lookup table from the id used in interactions.csv (first column of
# book_id_map.csv) to the book_id (second column).
csv_book_mapping = {}

with open("book_id_map.csv", "r") as f:
    # Stream the file line by line; every line must contain exactly one comma.
    for line in f:
        csv_id, book_id = line.strip().split(",")
        csv_book_mapping[csv_id] = book_id

In [5]:
# Set of liked book ids (strings) for fast membership tests in the scan below.
book_set = set(my_books["book_id"].tolist())

In [6]:
# user_id -> number of interaction rows whose book is one of the liked books.
overlap_users = {}

with open("interactions.csv", 'r') as f:
    for line in f:
        # Five comma-separated fields per line; only the user id and the
        # csv-side book id are needed for counting.
        user_id, csv_id, _, rating, _ = line.split(",")

        # Unknown csv ids map to None, which is never in book_set.
        book_id = csv_book_mapping.get(csv_id)
        if book_id not in book_set:
            continue

        overlap_users[user_id] = overlap_users.get(user_id, 0) + 1

In [7]:
len(overlap_users)

316341

In [8]:
# Keep only users who share more than a fifth of the liked books.
min_overlap = my_books.shape[0] / 5
filtered_overlap_users = {
    user for user, n_shared in overlap_users.items() if n_shared > min_overlap
}

In [9]:
# Second pass over interactions.csv: collect every [user_id, book_id, rating]
# row that belongs to one of the overlapping users.
interactions_list = []

with open("interactions.csv", 'r') as f:
    for line in f:
        user_id, csv_id, _, rating, _ = line.split(",")

        if user_id not in filtered_overlap_users:
            continue

        # Translate the csv-side id to the real book_id (KeyError if unmapped).
        book_id = csv_book_mapping[csv_id]
        interactions_list.append([user_id, book_id, rating])

In [10]:
len(interactions_list)

5638701

In [11]:
interactions_list[0]

['282', '627206', '4']

In [12]:
# Build a DataFrame from the [user_id, book_id, rating] rows collected from
# interactions.csv. All three columns hold strings at this point because they
# come straight from splitting CSV text.
interactions = pd.DataFrame(interactions_list, columns=["user_id", "book_id", "rating"])

In [13]:
# Prepend the current user's liked books (user_id -1) so they live in the
# same table as the overlapping users' interactions.
# NOTE(review): this cell rebinds `interactions` from itself, so re-running it
# without first re-running the cell that builds `interactions` adds the liked
# books a second time. Both frames keep their own row labels (no
# ignore_index), so index labels are not unique afterwards.
interactions = pd.concat([my_books[["user_id", "book_id", "rating"]], interactions])

In [14]:
interactions

Unnamed: 0,user_id,book_id,rating
0,-1,2517439,5
1,-1,113576,5
2,-1,35100,5
3,-1,228221,5
5,-1,17662739,5
...,...,...,...
5638696,804100,475178,0
5638697,804100,186074,0
5638698,804100,153008,0
5638699,804100,45107,0


In [15]:
# Normalise dtypes after the concat: both id columns become strings and the
# rating column becomes numeric.
for id_col in ("book_id", "user_id"):
    interactions[id_col] = interactions[id_col].astype(str)
interactions["rating"] = pd.to_numeric(interactions["rating"])

In [16]:
# Map every distinct user_id string to an integer category code. These codes
# are used as the row coordinates of the sparse ratings matrix.
interactions["user_index"] = interactions["user_id"].astype("category").cat.codes

In [17]:
# Map every distinct book_id string to an integer category code. These codes
# are used as the column coordinates of the sparse ratings matrix.
interactions["book_index"] = interactions["book_id"].astype("category").cat.codes

In [18]:
from scipy.sparse import coo_matrix

# Sparse user x book matrix in COO form: one entry per interaction row, with
# the rating as value at position (user_index, book_index). No explicit shape
# is passed, so the shape is inferred from the largest indices.
ratings_mat_coo = coo_matrix((interactions["rating"], (interactions["user_index"], interactions["book_index"])))

In [19]:
ratings_mat_coo.shape

(1259, 802870)

In [20]:
# Convert to CSR so that a single user's row can be sliced out
# (ratings_mat[my_index, :] is used for the similarity computation).
# NOTE(review): per the scipy docs, duplicate (row, col) entries are summed
# during this conversion — confirm a user cannot appear twice for one book.
ratings_mat = ratings_mat_coo.tocsr()

In [21]:
interactions[interactions["user_id"] == "-1"]

Unnamed: 0,user_id,book_id,rating,user_index,book_index
0,-1,2517439,5,0,414880
1,-1,113576,5,0,38971
2,-1,35100,5,0,575858
3,-1,228221,5,0,356004
5,-1,17662739,5,0,214285
6,-1,356824,5,0,581743
7,-1,12125412,5,0,59763
8,-1,139069,5,0,124430
10,-1,76680,5,0,722098
11,-1,1898,5,0,276178


In [22]:
# Row of the ratings matrix that belongs to the current user (user_id "-1").
# Looked up from the data instead of hard-coding 0, so it stays correct no
# matter how the category codes for user_id happen to be ordered.
my_index = int(interactions.loc[interactions["user_id"] == "-1", "user_index"].iloc[0])

In [23]:
from sklearn.metrics.pairwise import cosine_similarity

# Cosine similarity between the current user's row and every row of the
# ratings matrix (including the user's own row), flattened to a 1-D array
# with one score per user_index.
similarity = cosine_similarity(ratings_mat[my_index,:], ratings_mat).flatten()

In [46]:
similarity[3]

0.0

In [25]:
import numpy as np

# Positions of the 15 highest similarity scores (unordered within the group;
# the current user's own row is among them because self-similarity is maximal).
top_k = 15
indices = np.argpartition(similarity, -top_k)[-top_k:]

In [26]:
indices

array([1188,  942,  218,  129,  496,  435, 1208,  795, 1213, 1210, 1143,
        321,  294,  862,    0], dtype=int64)

In [27]:
# All interaction rows that belong to the most similar users (copied so later
# edits do not touch `interactions`).
is_neighbour = interactions["user_index"].isin(indices)
similar_users = interactions[is_neighbour].copy()

In [28]:
# Drop the current user's own rows (user_id "-1") from the neighbour set.
not_me = similar_users["user_id"] != "-1"
similar_users = similar_users[not_me]

In [29]:
similar_users

Unnamed: 0,user_id,book_id,rating,user_index,book_index
45312,4133,5359,3,942,632143
45313,4133,10464963,4,942,13492
45314,4133,3858,3,942,593622
45315,4133,11827808,4,942,51904
45316,4133,7913305,4,942,732465
...,...,...,...,...,...
5638521,712588,32388712,3,1143,543119
5638522,712588,16322,5,1143,183365
5638523,712588,860543,0,1143,759827
5638524,712588,853510,5,1143,756768


In [30]:
# Per book: how many neighbour rows exist for it and their mean rating.
book_recs = similar_users.groupby("book_id")["rating"].agg(["count", "mean"])

In [31]:
book_recs

Unnamed: 0_level_0,count,mean
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,6,3.833333
100322,1,0.000000
100365,1,0.000000
10046142,1,0.000000
1005,3,0.000000
...,...,...
99561,2,2.500000
99610,1,3.000000
99664,1,4.000000
9969571,3,2.333333


In [32]:
# Book metadata; book_id is cast to str so it joins against the string ids
# used in book_recs.
books_titles = pd.read_json("books_titles.json").astype({"book_id": str})

In [33]:
# Attach the metadata columns from books_titles; the inner join drops any
# recommended book that has no metadata row.
book_recs = pd.merge(book_recs, books_titles, how="inner", on="book_id")

In [34]:
# Preview the first ten rows, passing each cover_image value through
# show_image for display.
book_recs.head(10).style.format({'cover_image': show_image})

Unnamed: 0,book_id,count,mean,title,ratings,cover_image,mod_title
0,1,6,3.833333,"Harry Potter and the Half-Blood Prince (Harry Potter, #6)",1713866,,harry potter and the halfblood prince harry potter 6
1,100322,1,0.0,Assata: An Autobiography,11057,,assata an autobiography
2,100365,1,0.0,The Mote in God's Eye,48736,,the mote in gods eye
3,10046142,1,0.0,Dancing in the Glory of Monsters: The Collapse of the Congo and the Great War of Africa,2391,,dancing in the glory of monsters the collapse of the congo and the great war of africa
4,1005,3,0.0,Think and Grow Rich,87634,,think and grow rich
5,10054335,1,0.0,Rules of Civility,78912,,rules of civility
6,10058,1,3.0,Flags of Our Fathers,39034,,flags of our fathers
7,100629,1,0.0,The Universe in a Single Atom: The Convergence of Science and Spirituality,6310,,the universe in a single atom the convergence of science and spirituality
8,10079321,1,5.0,"The Magician King (The Magicians, #2)",53532,,the magician king the magicians 2
9,1008101,1,0.0,Made in Japan: Akio Morita and Sony,2833,,made in japan akio morita and sony


In [35]:
# Scale `count` (rows from the similar users) by count / ratings, i.e.
# count**2 / ratings, so books with a large `ratings` value are damped.
# NOTE(review): `ratings` comes from books_titles.json and is presumably the
# book's overall number of ratings — confirm. A value of 0 would yield inf/NaN.
book_recs["adjusted_count"] = book_recs["count"] * (book_recs["count"] / book_recs["ratings"])

In [36]:
# Combined score: mean rating among the similar users times adjusted_count.
# NOTE(review): no visible later cell ranks by `score` (the final sort uses
# "mean") — confirm whether that is intended.
book_recs["score"] = book_recs["mean"] * book_recs["adjusted_count"]

In [37]:
# Remove books the current user already has in the liked list (matched by id).
already_liked = book_recs["book_id"].isin(my_books["book_id"])
book_recs = book_recs[~already_liked]

In [38]:
# Normalised title: drop everything except ASCII letters, digits and spaces,
# then lower-case the result.
alnum_titles = my_books["title"].str.replace("[^a-zA-Z0-9 ]", "", regex=True)
my_books["mod_title"] = alnum_titles.str.lower()

In [39]:
# Collapse every run of whitespace in the normalised title to a single space.
# The pattern is a raw string: "\s" in a normal string literal is an invalid
# escape sequence and triggers a warning on current Python versions.
my_books["mod_title"] = my_books["mod_title"].str.replace(r"\s+", " ", regex=True)

In [40]:
# Remove books whose normalised title matches a liked book (catches other
# editions that carry a different book_id).
same_title = book_recs["mod_title"].isin(my_books["mod_title"])
book_recs = book_recs[~same_title]

In [41]:
# Keep only books the similar users rated 4 or higher on average.
book_recs = book_recs[book_recs["mean"].ge(4)]

In [42]:
# Require more than two rows from similar users per book.
book_recs = book_recs[book_recs["count"].gt(2)]

In [43]:
# Final ranking: highest mean rating among the similar users first.
# NOTE(review): the "score" column is not used for ordering here.
top_recs = book_recs.sort_values(by="mean", ascending=False)

In [44]:
# Render the final recommendations with cover thumbnails. show_image is
# defined once near the top of the notebook; the second, identical definition
# that used to live in this cell was removed so only one definition exists.
top_recs.style.format({'cover_image': show_image})

Unnamed: 0,book_id,count,mean,title,ratings,cover_image,mod_title,adjusted_count,score
2260,62291,5,4.8,"A Storm of Swords (A Song of Ice and Fire, #3)",477834,,a storm of swords a song of ice and fire 3,5.2e-05,0.000251
600,157993,3,4.333333,The Little Prince,763309,,the little prince,1.2e-05,5.1e-05
1100,22034,3,4.333333,The Godfather,259150,,the godfather,3.5e-05,0.00015
1173,2318271,3,4.333333,The Last Lecture,245804,,the last lecture,3.7e-05,0.000159
1906,4381,3,4.333333,Fahrenheit 451,591506,,fahrenheit 451,1.5e-05,6.6e-05
243,119322,4,4.25,"The Golden Compass (His Dark Materials, #1)",973154,,the golden compass his dark materials 1,1.6e-05,7e-05
1441,2767793,4,4.25,"The Hero of Ages (Mistborn, #3)",149260,,the hero of ages mistborn 3,0.000107,0.000456
2558,78983,4,4.25,"Kane and Abel (Kane and Abel, #1)",75215,,kane and abel kane and abel 1,0.000213,0.000904
244,119324,3,4.0,"The Subtle Knife (His Dark Materials, #2)",246697,,the subtle knife his dark materials 2,3.6e-05,0.000146
398,13497,4,4.0,"A Feast for Crows (A Song of Ice and Fire, #4)",437398,,a feast for crows a song of ice and fire 4,3.7e-05,0.000146
