# Recommendations: Collaborative Filtering

This notebook reads in a .csv file of a user's ratings across several books (low and/or high ratings) to generate book recommendations using a collaborative filtering method.

In [11]:
# Load my own book ratings; the first CSV column becomes the row index.
import pandas as pd

# book_id is cast to str so it compares equal to the string IDs that the
# later cells read from the mapping / interactions files.
my_books = pd.read_csv("liked_books_kevin.csv", index_col=0).astype({"book_id": str})
my_books

Unnamed: 0,user_id,book_id,rating,title
0,-1,1334340,5,The Hobbit
1,-1,224516,5,The Falcon's Malteser
2,-1,12170167,5,A Study in Scarlet
3,-1,241798,5,The Hitchhiker's Guide to the Galaxy (Hitchhik...
4,-1,437143,5,A Short History of Nearly Everything
6,-1,3869,5,A Brief History of Time
7,-1,8492907,5,Physics of the Future: How Science Will Shape ...
8,-1,22543,5,Death by Black Hole - And Other Cosmic Quandaries
9,-1,1125545,5,The Wind in the Willows
10,-1,22841994,5,Harry Potter and the Philosopher's Stone


In [12]:
# Unique IDs of my books, kept as a set for fast membership checks.
book_set = {book_id for book_id in my_books["book_id"]}

In [13]:
# Read in book ID mapping: first CSV column (csv_id) -> second column (book_id).
# Every non-blank line is stored as-is, so a header row (if the file has one)
# simply becomes one more ordinary entry.
csv_book_mapping = {}

with open("book_id_map.csv", 'r') as f:
    for line in f:
        line = line.strip()
        if not line:
            # A blank line (e.g. a trailing newline at end of file) would
            # otherwise fail the two-value unpack below.
            continue
        csv_id, book_id = line.split(",")
        csv_book_mapping[csv_id] = book_id

In [14]:
# Count, for every Goodreads user, how many interactions they have with
# books that are also in my list (user_id -> number of overlapping rows).
overlap_users = {}

with open("goodreads_interactions.csv", 'r') as f:
    for line in f:
        line = line.strip()
        if not line:
            # Skip blank lines instead of failing the five-value unpack.
            continue
        # Exactly five comma-separated fields are expected; only the first
        # two (user and CSV book ID) are needed for counting.
        user_id, csv_id, _, _, _ = line.split(",")

        # An ID missing from the mapping yields None, which is never in book_set.
        book_id = csv_book_mapping.get(csv_id)

        if book_id in book_set:
            overlap_users[user_id] = overlap_users.get(user_id, 0) + 1

len(overlap_users)

36159

In [51]:
# Keep only the users whose overlap count exceeds one tenth of the number of
# books in my list. This is a threshold on the overlap count, not a
# percentile of the users.
filtered_overlap_users = {
    user_id
    for user_id, n_shared in overlap_users.items()
    if n_shared > len(my_books) / 10
}
len(filtered_overlap_users)

2825

In [52]:
# Collect every [user_id, book_id, rating] row that belongs to one of the
# filtered overlapping users. All three values are kept as strings here.
interactions_list = []

with open("goodreads_interactions.csv", "r") as f:
    for line in f:
        line = line.strip()
        if not line:
            # Skip blank lines instead of failing the five-value unpack.
            continue
        user_id, csv_id, _, rating, _ = line.split(",")
        if user_id in filtered_overlap_users:
            # Direct lookup: a csv_id that is missing from the mapping raises
            # KeyError rather than being silently kept.
            book_id = csv_book_mapping[csv_id]
            interactions_list.append([user_id, book_id, rating])

len(interactions_list)

4627351

In [96]:
# Stack my own ratings on top of the collected Goodreads interactions, then
# normalise dtypes: IDs as strings, ratings as numbers (the ratings collected
# from the file are still strings at this point).
# The row labels of both frames are kept as they are (no index reset).
interactions = (
    pd.concat(
        [
            my_books[["user_id", "book_id", "rating"]],
            pd.DataFrame(interactions_list, columns=["user_id", "book_id", "rating"]),
        ]
    )
    .astype({"book_id": str, "user_id": str})
    .assign(rating=lambda frame: pd.to_numeric(frame["rating"]))
)
interactions

Unnamed: 0,user_id,book_id,rating
0,-1,1334340,5
1,-1,224516,5
2,-1,12170167,5
3,-1,241798,5
4,-1,437143,5
...,...,...,...
4627346,875462,5600892,5
4627347,875462,10290474,5
4627348,875462,10290471,5
4627349,875462,2961821,5


In [54]:
# Give every distinct user and book an integer code, used below as the
# row / column position in the ratings matrix.
interactions = interactions.assign(
    user_index=lambda frame: frame["user_id"].astype("category").cat.codes,
    book_index=lambda frame: frame["book_id"].astype("category").cat.codes,
)
interactions

Unnamed: 0,user_id,book_id,rating,user_index,book_index
0,-1,1334340,5,0,95847
1,-1,224516,5,0,329566
2,-1,12170167,5,0,59883
3,-1,241798,5,0,376495
4,-1,437143,5,0,580779
...,...,...,...,...,...
4627346,875462,5600892,5,2742,610108
4627347,875462,10290474,5,2742,7909
4627348,875462,10290471,5,2742,7908
4627349,875462,2961821,5,2742,482938


In [55]:
# Build the sparse user x book ratings matrix: assemble it in COO form, then
# convert to CSR so that a single user's row can be sliced out.
from scipy.sparse import coo_matrix

entries = interactions["rating"]
row_positions = interactions["user_index"]
col_positions = interactions["book_index"]

# NOTE(review): if the same (user, book) pair occurs more than once, the
# conversion to CSR adds the ratings together -- confirm there are no duplicates.
ratings_mat_coo = coo_matrix((entries, (row_positions, col_positions)))
ratings_mat = ratings_mat_coo.tocsr()

In [56]:
# Show my own rows (user_id "-1") to see which user_index was assigned to me.
interactions.loc[interactions["user_id"].eq("-1")]

Unnamed: 0,user_id,book_id,rating,user_index,book_index
0,-1,1334340,5,0,95847
1,-1,224516,5,0,329566
2,-1,12170167,5,0,59883
3,-1,241798,5,0,376495
4,-1,437143,5,0,580779
6,-1,3869,5,0,567687
7,-1,8492907,5,0,724471
8,-1,22543,5,0,331972
9,-1,1125545,5,0,35230
10,-1,22841994,5,0,341706


In [57]:
# Generate similarities of users compared to us (my_index)
from sklearn.metrics.pairwise import cosine_similarity

# Look up our own matrix row from the data instead of hard-coding 0: the
# category codes follow the sort order of the user IDs, so a fixed 0 is only
# right while "-1" happens to sort before every other ID.
my_index = int(interactions.loc[interactions["user_id"] == "-1", "user_index"].iloc[0])

# One similarity value per matrix row (user), flattened to a 1-D array.
similarity = cosine_similarity(ratings_mat[my_index, :], ratings_mat).flatten()

In [70]:
# Find the matrix-row positions of the N_SIMILAR_USERS most similar users.
# Our own row is part of the similarity vector, so it can appear in the result;
# our interactions are removed in a later step.
import numpy as np

N_SIMILAR_USERS = 20

# Never ask for more rows than there are users; argpartition raises otherwise.
n_top = min(N_SIMILAR_USERS, len(similarity))

# argpartition only guarantees that the last n_top positions hold the largest
# values; it does not sort them among themselves.
indices = np.argpartition(similarity, -n_top)[-n_top:]
indices

array([2422, 2242, 2529, 2686, 2685, 2320, 2689, 2427, 2436, 2446, 2520,
       2684, 2397, 2261, 2687, 2661, 2351, 2447, 2402,    0], dtype=int64)

In [71]:
# Keep the interactions of the most similar users, leaving out our own
# rows (user_id "-1").
similar_users = interactions[
    interactions["user_index"].isin(indices) & (interactions["user_id"] != "-1")
].copy()
similar_users

Unnamed: 0,user_id,book_id,rating,user_index,book_index
4613994,465435,11084145,0,2242,30118
4613995,465435,1845,0,2242,243138
4613996,465435,10884,0,2242,24951
4613997,465435,40929,0,2242,573917
4613998,465435,960,4,2242,758616
...,...,...,...,...,...
4625304,848000,4005310,3,2689,571655
4625305,848000,98685,3,2689,768740
4625306,848000,2330343,4,2689,355903
4625307,848000,150129,4,2689,135239


In [85]:
# Aggregate per book: how many rows the similar users have for it ("count")
# and the average of their ratings ("mean").
# NOTE(review): ratings of 0 are included in the mean; if 0 stands for
# "read but not rated" in this dataset, it drags the average down -- confirm.
book_recs = similar_users.groupby("book_id")["rating"].agg(count="count", mean="mean")
book_recs

Unnamed: 0_level_0,count,mean
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1
100006,1,0.0
100641,1,5.0
100915,1,3.0
10210,1,0.0
10238,1,0.0
...,...,...
98685,1,3.0
99218,1,4.0
99219,1,4.0
99220,1,4.0


In [86]:
# Attach the title metadata to every candidate book. The inner join drops
# books that have no entry in books_titles.json.
# Note: this cell overwrites book_recs, so running it again without first
# re-running the aggregation cell merges the metadata a second time.
books_titles = pd.read_json("books_titles.json").astype({"book_id": str})
book_recs = pd.merge(book_recs, books_titles, how="inner", on="book_id")
book_recs

Unnamed: 0,book_id,count,mean,title,ratings,url,cover_image,mod_title
0,100006,1,0.0,Schrodinger's Kittens and the Search for Reali...,1095,https://www.goodreads.com/book/show/100006.Sch...,https://images.gr-assets.com/books/1344269008m...,schrodingers kittens and the search for realit...
1,100641,1,5.0,Beyond Star Trek: From Alien Invasions to the ...,334,https://www.goodreads.com/book/show/100641.Bey...,https://s.gr-assets.com/assets/nophoto/book/11...,beyond star trek from alien invasions to the e...
2,100915,1,3.0,"The Lion, the Witch, and the Wardrobe (Chronic...",1575387,https://www.goodreads.com/book/show/100915.The...,https://images.gr-assets.com/books/1353029077m...,the lion the witch and the wardrobe chronicles...
3,10210,1,0.0,Jane Eyre,1207986,https://www.goodreads.com/book/show/10210.Jane...,https://images.gr-assets.com/books/1327867269m...,jane eyre
4,10238,1,0.0,The Tao of Physics: An Exploration of the Para...,12814,https://www.goodreads.com/book/show/10238.The_...,https://images.gr-assets.com/books/1327908532m...,the tao of physics an exploration of the paral...
...,...,...,...,...,...,...,...,...
412,98685,1,3.0,Genius: The Life and Science of Richard Feynman,15189,https://www.goodreads.com/book/show/98685.Genius,https://images.gr-assets.com/books/1320409497m...,genius the life and science of richard feynman
413,99218,1,4.0,"The Machine Crusade (Legends of Dune, #2)",10250,https://www.goodreads.com/book/show/99218.The_...,https://images.gr-assets.com/books/1412547890m...,the machine crusade legends of dune 2
414,99219,1,4.0,"The Butlerian Jihad (Legends of Dune, #1)",15003,https://www.goodreads.com/book/show/99219.The_...,https://images.gr-assets.com/books/1505458671m...,the butlerian jihad legends of dune 1
415,99220,1,4.0,"The Battle of Corrin (Legends of Dune, #3)",8895,https://www.goodreads.com/book/show/99220.The_...,https://images.gr-assets.com/books/1317792692m...,the battle of corrin legends of dune 3


In [87]:
# Down-weight books that are popular in general so that more niche titles
# can surface: adjusted_count = count * (count / ratings), and
# score = mean * adjusted_count. Books that are in my own list (matched by
# book_id) are then removed.
book_recs = book_recs.assign(
    adjusted_count=lambda frame: frame["count"] * (frame["count"] / frame["ratings"]),
    score=lambda frame: frame["mean"] * frame["adjusted_count"],
)
book_recs = book_recs[~book_recs["book_id"].isin(my_books["book_id"])]
book_recs

Unnamed: 0,book_id,count,mean,title,ratings,url,cover_image,mod_title,adjusted_count,score
0,100006,1,0.0,Schrodinger's Kittens and the Search for Reali...,1095,https://www.goodreads.com/book/show/100006.Sch...,https://images.gr-assets.com/books/1344269008m...,schrodingers kittens and the search for realit...,9.132420e-04,0.000000
1,100641,1,5.0,Beyond Star Trek: From Alien Invasions to the ...,334,https://www.goodreads.com/book/show/100641.Bey...,https://s.gr-assets.com/assets/nophoto/book/11...,beyond star trek from alien invasions to the e...,2.994012e-03,0.014970
2,100915,1,3.0,"The Lion, the Witch, and the Wardrobe (Chronic...",1575387,https://www.goodreads.com/book/show/100915.The...,https://images.gr-assets.com/books/1353029077m...,the lion the witch and the wardrobe chronicles...,6.347647e-07,0.000002
3,10210,1,0.0,Jane Eyre,1207986,https://www.goodreads.com/book/show/10210.Jane...,https://images.gr-assets.com/books/1327867269m...,jane eyre,8.278242e-07,0.000000
4,10238,1,0.0,The Tao of Physics: An Exploration of the Para...,12814,https://www.goodreads.com/book/show/10238.The_...,https://images.gr-assets.com/books/1327908532m...,the tao of physics an exploration of the paral...,7.803964e-05,0.000000
...,...,...,...,...,...,...,...,...,...,...
412,98685,1,3.0,Genius: The Life and Science of Richard Feynman,15189,https://www.goodreads.com/book/show/98685.Genius,https://images.gr-assets.com/books/1320409497m...,genius the life and science of richard feynman,6.583712e-05,0.000198
413,99218,1,4.0,"The Machine Crusade (Legends of Dune, #2)",10250,https://www.goodreads.com/book/show/99218.The_...,https://images.gr-assets.com/books/1412547890m...,the machine crusade legends of dune 2,9.756098e-05,0.000390
414,99219,1,4.0,"The Butlerian Jihad (Legends of Dune, #1)",15003,https://www.goodreads.com/book/show/99219.The_...,https://images.gr-assets.com/books/1505458671m...,the butlerian jihad legends of dune 1,6.665334e-05,0.000267
415,99220,1,4.0,"The Battle of Corrin (Legends of Dune, #3)",8895,https://www.goodreads.com/book/show/99220.The_...,https://images.gr-assets.com/books/1317792692m...,the battle of corrin legends of dune 3,1.124227e-04,0.000450


In [88]:
# Normalise my titles: drop every character that is not a letter, digit or
# space, lower-case the result, then collapse runs of whitespace into a
# single space.
# NOTE(review): presumably this mirrors how the "mod_title" column in
# books_titles was produced -- confirm, since the two are compared later.
# The patterns are raw strings so that "\s" reaches the regex engine instead
# of being treated as an (invalid) string escape sequence.
my_books["mod_title"] = (
    my_books["title"]
    .str.replace(r"[^a-zA-Z0-9 ]", "", regex=True)
    .str.lower()
    .str.replace(r"\s+", " ", regex=True)
)

In [92]:
# Keep a book only if its normalised title is not one of mine, it has more
# than one row from the similar users, and its mean rating is above 4.
# The survivors are then ranked by score, best first.
book_recs = book_recs[
    ~book_recs["mod_title"].isin(my_books["mod_title"])
    & (book_recs["count"] > 1)
    & (book_recs["mean"] > 4)
]
top_recs = book_recs.sort_values("score", ascending=False)

In [93]:
def make_clickable(val):
    """Return an HTML anchor labelled "Goodreads" that opens ``val`` in a new tab.

    ``val`` is converted to text and HTML-escaped before it is placed in the
    ``href`` attribute, so characters such as ``"``, ``<`` or ``&`` in the
    data cannot break out of the attribute.
    """
    import html

    return '<a target="_blank" href="{}">Goodreads</a>'.format(html.escape(str(val), quote=True))

def show_image(val):
    """Return an HTML ``<img>`` tag, 50 wide, whose source is ``val``.

    ``val`` is converted to text and HTML-escaped before it is placed in the
    ``src`` attribute, so characters such as ``"``, ``<`` or ``&`` in the
    data cannot break out of the attribute.
    """
    import html

    return '<img src="{}" width=50></img>'.format(html.escape(str(val), quote=True))

In [94]:
# Render the recommendations with a clickable Goodreads link and a cover
# thumbnail in place of the raw URL strings.
top_recs.style.format(formatter={"url": make_clickable, "cover_image": show_image})

Unnamed: 0,book_id,count,mean,title,ratings,url,cover_image,mod_title,adjusted_count,score
327,5958783,2,5.0,Why Does E=mc²?,4951,Goodreads,,why does emc,0.000808,0.00404
206,2802,2,4.5,E=mc²: A Biography of the World's Most Famous Equation,5641,Goodreads,,emc a biography of the worlds most famous equation,0.000709,0.003191
146,2094,3,4.333333,A Briefer History of Time,20217,Goodreads,,a briefer history of time,0.000445,0.001929
364,73945,2,4.5,Apology,19111,Goodreads,,apology,0.000209,0.000942
46,12391521,2,4.5,The Psychopath Test: A Journey Through the Madness Industry,46907,Goodreads,,the psychopath test a journey through the madness industry,8.5e-05,0.000384
220,30289,2,4.5,The Republic,113894,Goodreads,,the republic,3.5e-05,0.000158
60,13335037,2,4.5,"Divergent (Divergent, #1)",1962813,Goodreads,,divergent divergent 1,2e-06,9e-06
