In [2]:
%matplotlib inline

import pandas as pd

r = pd.read_csv( 'ratings.csv' )
tr = pd.read_csv( 'to_read.csv' )
b = pd.read_csv( 'books.csv' )

t = pd.read_csv( 'tags.csv' )
bt = pd.read_csv( 'book_tags.csv')


# Cleaning up Data

## Removing rows that contain duplicate user_id, book_id

In [3]:
print(len(r))
r_duplicates_removed = r.drop_duplicates(['user_id','book_id'])
print(len(r_duplicates_removed))

5976479
5976479


## Removing users with no. of ratings below a threshold (20 here)

In [4]:
ruser = r.set_index('user_id',drop=False)
ruser['books_rated']=r['user_id'].value_counts()
print(len(ruser))
ruser.sort_values(by='books_rated').head()

5976479


Unnamed: 0_level_0,user_id,book_id,rating,books_rated
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
34590,34590,9967,5,19
34590,34590,852,5,19
34590,34590,35,5,19
34590,34590,8634,4,19
34590,34590,1180,5,19


In [5]:
ruser=ruser.drop(ruser[ruser.books_rated<20].index)
print(ruser.sort_values(by='books_rated').head())
r=ruser.reset_index(drop=True)
r.head()

         user_id  book_id  rating  books_rated
user_id                                       
43675      43675       27       5           20
43675      43675     8634       5           20
43675      43675      796       3           20
43675      43675       24       5           20
43675      43675      399       4           20


Unnamed: 0,user_id,book_id,rating,books_rated
0,1,258,5,117
1,2,4081,4,65
2,2,260,5,65
3,2,9296,5,65
4,2,2318,3,65


In [6]:
userbooks = r[['user_id','books_rated']].copy()
userbooks.drop_duplicates('user_id',inplace=True)
#userbooks.set_index('user_id',inplace=True,drop=False)
userbooks=userbooks.sort_values(by='books_rated',ascending=False)
userbooks.head()


Unnamed: 0,user_id,books_rated
2332463,30944,200
786481,12874,200
5884967,52036,199
1282352,12381,199
2073923,28158,199


## Extracting only English books 

In [7]:
b['is_english']=b.language_code=='eng'
b_lang=b[['book_id','is_english']]
rm=r.merge(b_lang,how='inner',on = 'book_id')
rm=rm.drop(rm[rm.is_english==False].index)
rm.head()

Unnamed: 0,user_id,book_id,rating,books_rated,is_english
0,1,258,5,117,True
1,11,258,3,120,True
2,143,258,4,159,True
3,242,258,5,123,True
4,325,258,4,132,True


## Make Ratings Matrix

In [8]:
import numpy as np
ratingMat = np.zeros((rm.user_id.max()+1,rm.book_id.max()+1), dtype=np.int8)
for idx,entry in rm.iterrows():
    if idx%100000 == 0:
        print("Doing iteration",idx)
    if(entry[4]):
        ratingMat[entry[0]][entry[1]] = entry[2]

('Doing iteration', 0)
('Doing iteration', 100000)
('Doing iteration', 200000)
('Doing iteration', 300000)
('Doing iteration', 400000)
('Doing iteration', 500000)
('Doing iteration', 700000)
('Doing iteration', 1000000)
('Doing iteration', 1100000)
('Doing iteration', 1200000)
('Doing iteration', 1300000)
('Doing iteration', 1400000)
('Doing iteration', 1600000)
('Doing iteration', 1700000)
('Doing iteration', 1800000)
('Doing iteration', 1900000)
('Doing iteration', 2000000)
('Doing iteration', 2100000)
('Doing iteration', 2200000)
('Doing iteration', 2300000)
('Doing iteration', 2400000)
('Doing iteration', 2500000)
('Doing iteration', 2600000)
('Doing iteration', 2700000)
('Doing iteration', 2900000)
('Doing iteration', 3000000)
('Doing iteration', 3200000)
('Doing iteration', 3600000)
('Doing iteration', 3700000)
('Doing iteration', 3800000)
('Doing iteration', 4000000)
('Doing iteration', 4100000)
('Doing iteration', 4200000)
('Doing iteration', 4400000)
('Doing iteration', 450000

# Collaborative Filtering k-Nearest-Neighbours

In [205]:
import scipy.stats as ss
def common_books_users(ratemat,tgtId):
    U,B = ratemat.shape
    comUsers = []
    userVec = ratemat[tgtId]
    for i in xrange(U):
        if i == tgtId:
            continue
        if (np.logical_and(ratemat[i],userVec)).any():
            comUsers.append(i)
#    print len(comUsers)
    return comUsers

In [197]:
def top_k_similar(simUsers,tgtId,ratemat,k=5):
    #simvec = ratemat[np.ix_(simUsers)]
    userVec = ratemat[tgtId]
    userWt= [] #userWt will have (user_id,weight) pairs for each common user
    for uid in simUsers:
        userWt.append((uid,np.corrcoef(ratemat[uid],userVec)[0,1]))
    userWt = sorted(userWt,key=lambda x:x[1])
    return userWt[-k:]

In [198]:
def recommend_books(userWt,tgtId,ratemat,k=10):
    U,B = ratemat.shape
    nbrRate = np.zeros(B)
    s=0.0
    for uid,wt in userWt:
        nbrRate+=wt*ratemat[uid,:]
        s+=wt
    nbrRate/=s
#     print "Neighbour rates"
#     print nbrRate
    recRate = np.where(ratemat[tgtId]>0,0.0,nbrRate)
#     print "Recommended rates"
#     print recRate
    recBooks = np.argsort(recRate)[-k:]
    return recBooks

In [204]:
def evaluate_in_topq(ratemat,tgtId,sim_users=20,q=100):
    tgtBooks = np.argwhere(ratemat[tgtId]==5).reshape(-1,)
    if len(tgtBooks)==0: 
        print "no top rated"
        return 0
    leftBook = np.random.choice(tgtBooks)
#     print leftBook
    loomat = np.copy(ratemat)
    loomat[t,leftBook] = 0
#     print loomat
    com=common_books_users(loomat,tgtId)
#     print "common_users"
#     print com
    wts = top_k_similar(com,tgtId,loomat,sim_users)
#     print "Similar user weights"
#     print wts
    recBooks = recommend_books(wts,tgtId,loomat,q)
#     print "Books recommended"
#     print recBooks
    if leftBook in recBooks:
        return 1
    else:
        return 0

In [203]:
a=ratingMat[54:64,30:40]
t=0
print a
print evaluate_in_topq(a,t,sim_users=4,q=2)

[[4 5 0 4 0 0 0 0 4 0]
 [0 3 0 0 0 0 0 0 0 0]
 [2 3 0 0 2 0 0 0 0 0]
 [0 3 0 0 0 0 0 1 3 0]
 [0 0 0 0 0 0 0 0 0 4]
 [0 0 3 4 0 1 0 0 0 0]
 [0 0 0 0 0 0 0 4 0 0]
 [3 0 0 3 0 0 0 5 4 4]
 [0 0 0 0 0 3 0 0 0 0]
 [0 0 3 4 0 3 0 0 0 0]]
5
0


In [209]:
U,B = ratingMat.shape
netEval = U/10
hr = []
for tgt in xrange(1,netEval):
    hr.append(evaluate_in_topq(ratingMat,tgt,sim_users=50,q=100))
print("Accuracy of top 100 hits %1.2f percent"%(np.sum(hr)/netEval*100))

no top rated
no top rated
no top rated


KeyboardInterrupt: 

In [210]:
hr

[0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0]