In [1]:
import pandas as pd
import numpy as np

In [2]:
def _read_bx_csv(path, columns):
    """Load one semicolon-separated, latin-1 encoded BX-*.csv file.

    Rows with the wrong number of fields are skipped and reported rather than
    aborting the whole load.

    Args:
        path: Path of the CSV file to read.
        columns: Column names to assign to the loaded frame, in file order.
    Returns:
        The loaded DataFrame with its columns renamed to ``columns``.
    """
    csv_options = dict(sep=';', encoding="latin-1")
    try:
        # pandas >= 1.3 spelling; 'warn' skips the malformed row and reports it,
        # which is what error_bad_lines=False did with its default warnings.
        frame = pd.read_csv(path, on_bad_lines='warn', **csv_options)
    except TypeError:
        # Older pandas does not know on_bad_lines; fall back to the legacy keyword.
        frame = pd.read_csv(path, error_bad_lines=False, **csv_options)
    frame.columns = columns
    return frame


items = _read_bx_csv('BX-Books.csv', ['ISBN', 'bookTitle', 'bookAuthor', 'yearOfPublication', 'publisher', 'imageUrlS', 'imageUrlM', 'imageUrlL'])
users = _read_bx_csv('BX-Users.csv', ['userID', 'Location', 'Age'])
ratings = _read_bx_csv('BX-Book-Ratings.csv', ['userID', 'ISBN', 'bookRating'])

b'Skipping line 6452: expected 8 fields, saw 9\nSkipping line 43667: expected 8 fields, saw 10\nSkipping line 51751: expected 8 fields, saw 9\n'
b'Skipping line 92038: expected 8 fields, saw 9\nSkipping line 104319: expected 8 fields, saw 9\nSkipping line 121768: expected 8 fields, saw 9\n'
b'Skipping line 144058: expected 8 fields, saw 9\nSkipping line 150789: expected 8 fields, saw 9\nSkipping line 157128: expected 8 fields, saw 9\nSkipping line 180189: expected 8 fields, saw 9\nSkipping line 185738: expected 8 fields, saw 9\n'
b'Skipping line 209388: expected 8 fields, saw 9\nSkipping line 220626: expected 8 fields, saw 9\nSkipping line 227933: expected 8 fields, saw 11\nSkipping line 228957: expected 8 fields, saw 10\nSkipping line 245933: expected 8 fields, saw 9\nSkipping line 251296: expected 8 fields, saw 9\nSkipping line 259941: expected 8 fields, saw 9\nSkipping line 261529: expected 8 fields, saw 9\n'
  interactivity=interactivity, compiler=compiler, result=result)


In [11]:
# Preview the first rows of the books table.
# NOTE: the saved output below was produced at execution count 11, i.e. after the
# image-URL columns had already been dropped by a cell further down, which is why
# those columns do not appear in it.
items.head()

Unnamed: 0,ISBN,bookTitle,bookAuthor,yearOfPublication,publisher
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux
4,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company


In [7]:
# Preview the first rows of the ratings table (userID, ISBN, bookRating).
ratings.head()

Unnamed: 0,userID,ISBN,bookRating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276729,052165615X,3
4,276729,0521795028,6


In [4]:
# The cover-image URL columns are not needed for the rating analysis.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps this
# cell re-runnable: running it again after the columns are gone is a no-op
# rather than a KeyError.
items = items.drop(columns=['imageUrlS', 'imageUrlM', 'imageUrlL'], errors='ignore')

In [10]:
# Report the row count of each table, plus how many distinct users and books
# actually occur in the ratings table.
size_report = [
    ('Number of Users: ', len(users)),
    ('Number of Items: ', len(items)),
    ('Number of Ratings: ', len(ratings)),
    ('Number of Users in Ratings: ', len(ratings['userID'].unique())),
    ('Number of Items in Ratings: ', len(ratings['ISBN'].unique())),
]
for label, count in size_report:
    print(label, count)

Number of Users:  278858
Number of Items:  271360
Number of Ratings:  433671
Number of Users in Ratings:  77805
Number of Items in Ratings:  185973


In [9]:
# Separate implicit feedback (bookRating == 0) from explicit ratings (> 0);
# only the explicit ratings are kept in `ratings` from here on.
book_rating = ratings['bookRating']
imp_ratings = ratings.loc[book_rating.eq(0)]
ratings = ratings.loc[book_rating.gt(0)]

In [12]:
# Every user in the ratings file is also present in the users file, but the
# ratings file references more ISBNs than the books file contains. Keep only
# the ratings whose ISBN exists in `items`.
known_isbn = ratings['ISBN'].isin(items['ISBN'])
ratings = ratings.loc[known_isbn]  # Users: 68091 items: 149836

In [13]:
# Preview the ratings that remain after dropping implicit (0) ratings and
# ratings whose ISBN is not in the books table.
ratings.head()

Unnamed: 0,userID,ISBN,bookRating
1,276726,0155061224,5
3,276729,052165615X,3
4,276729,0521795028,6
8,276744,038550120X,7
16,276747,0060517794,9


In [14]:
# Density of the user-item matrix: observed ratings divided by
# (distinct users * distinct books), expressed as a percentage.
n_users = len(np.unique(ratings['userID']))
n_items = len(np.unique(ratings['ISBN']))
density = (float(len(ratings)) / (n_users * n_items)) * 100
print("Density in percent: {}".format(density))
print("Users: {} items: {}".format(n_users, n_items))

Density in percent: 0.0037622409872253336
Users: 68091 items: 149836


In [15]:
#To reduce our dataset we are going to remove items which were rated less than 10 times
a = ratings.groupby('ISBN').filter(lambda x: len(x) >= 10)
densityi = (float(len(a))/(len(np.unique(a['userID']))*len(np.unique(a['ISBN']))))*100
print("Density after filtering items: "+str(densityi))
print("Users: "+str(len(np.unique(a['userID'])))+ " items: "+str(len(np.unique(a['ISBN']))))

Density after filtering items: 0.06499953850402322
Users: 39365 items: 5444


In [16]:
#Remove users who gave less than 20 ratings
b = a.groupby('userID').filter(lambda x: len(x) >= 20)
densityu = (float(len(b))/(len(np.unique(b['userID']))*len(np.unique(b['ISBN']))))*100
print("Density after filtering users: "+str(densityu))
print("Users: "+str(len(np.unique(b['userID'])))+ " items: "+str(len(np.unique(b['ISBN']))))

Density after filtering users: 0.8143211405243026
Users: 1117 items: 5356


In [51]:
from surprise import SVD, accuracy, SVDpp, SlopeOne, NMF, NormalPredictor, KNNBaseline, KNNBasic, KNNWithMeans, KNNWithZScore, BaselineOnly, CoClustering 
from surprise.model_selection import cross_validate, KFold
from surprise.model_selection import train_test_split
from surprise import Reader, Dataset
from surprise.model_selection import GridSearchCV


In [18]:
# Preview the fully filtered ratings (items with >= 10 ratings, users with >= 20)
# that are handed to Surprise in the next cell.
b.head()

Unnamed: 0,userID,ISBN,bookRating
1456,277427,002542730X,10
1465,277427,0060542128,7
1474,277427,0061009059,9
1497,277427,0152050167,10
1522,277427,0316776963,8


In [19]:
# Explicit ratings lie on a 1-10 scale (zeros were removed as implicit feedback).
reader = Reader(rating_scale=(1, 10))
# load_from_df reads the columns positionally as (user, item, rating), so the
# order is spelled out instead of relying on the frame's current layout.
data = Dataset.load_from_df(b[['userID', 'ISBN', 'bookRating']], reader)
# Hold out 25% of the ratings for testing; the fixed seed makes the split the
# same on every run, so results of later cells are comparable between runs.
trainset, testset = train_test_split(data, test_size=0.25, random_state=42)

In [49]:
# Compare several Surprise algorithms (default settings) by 3-fold
# cross-validated RMSE; one summary row is collected per algorithm.
benchmark = []
for algorithm in [SVD(), NormalPredictor(), KNNBaseline(), KNNBasic(), BaselineOnly(), CoClustering()]:
    # Perform cross validation
    results = cross_validate(algorithm, data, measures=['RMSE'], cv=3, verbose=False)

    # Average every reported measure (test_rmse, fit_time, test_time) over the folds.
    fold_means = pd.DataFrame.from_dict(results).mean(axis=0)
    # Label the row with the algorithm's class name. pd.concat replaces
    # Series.append, which no longer exists in pandas 2.x.
    algorithm_label = pd.Series([type(algorithm).__name__], index=['Algorithm'])
    benchmark.append(pd.concat([fold_means, algorithm_label]))

# Best (lowest) RMSE first.
pd.DataFrame(benchmark).set_index('Algorithm').sort_values('test_rmse')

Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...


Unnamed: 0_level_0,test_rmse,fit_time,test_time
Algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
BaselineOnly,1.510675,0.079455,0.069813
SVD,1.512058,2.109023,0.115691
CoClustering,1.678506,0.999666,0.118683
KNNBaseline,1.696198,0.142285,0.484042
KNNBasic,1.89054,0.072804,0.368683
NormalPredictor,2.327686,0.051527,0.080785


In [39]:
# Hold-out RMSE of NormalPredictor with default settings.
algo = NormalPredictor()
predictions = algo.fit(trainset).test(testset)
NP_acc = accuracy.rmse(predictions)

RMSE: 2.3377


In [40]:
# Hold-out RMSE of BaselineOnly with default settings.
algo = BaselineOnly()
predictions = algo.fit(trainset).test(testset)
BO_acc = accuracy.rmse(predictions)

Estimating biases using als...
RMSE: 1.5072


# KNN

In [20]:
# Hold-out RMSE of KNNBaseline with default settings.
algo = KNNBaseline()
predictions = algo.fit(trainset).test(testset)
KNN_Base = accuracy.rmse(predictions)

Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 1.6782


1.6781709525910349

In [73]:
# Hold-out RMSE of KNNBasic with default settings.
# NOTE(review): despite the `_items` suffix, no sim_options are passed here, so
# the library defaults apply (the log reports MSD similarity) - confirm whether
# an item-based configuration was intended.
algo = KNNBasic()
predictions = algo.fit(trainset).test(testset)
KNN_Basic_items = accuracy.rmse(predictions)

Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 1.8841


In [71]:
# #KNN Basic.  Item based similarity with cosine similarity
# sim_options = {'name': 'cosine',
#                'user_based': False  # compute  similarities between items
#                }
# algo = KNNBasic(sim_options = sim_options)
# algo.fit(trainset)
# predictions = algo.test(testset)
# KNN_Basic_items = accuracy.rmse(predictions)

In [72]:
# #KNN Basic.  User based similarity with cosine similarity
# sim_options = {'name': 'cosine',
#                'user_based': True  # compute  similarities between users
#                }
# algo = KNNBasic(sim_options = sim_options)
# algo.fit(trainset)
# predictions = algo.test(testset)
# KNN_BASIC_USERS = accuracy.rmse(predictions)

# SVD

In [25]:
# Fit SVD with default hyper-parameters on the training split.
# fit() is the cell's last expression, so the fitted estimator's repr is displayed.
algo = SVD()
algo.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x23ba8165a20>

In [26]:
# Predict every (user, item) pair of the held-out test split with the fitted model.
predictions = algo.test(testset)

In [27]:
# Prints the RMSE of the predictions and also returns it, so the value is shown
# a second time as the cell's result.
accuracy.rmse(predictions)

RMSE: 1.5076


1.5075921634544789

### Cross Validation

In [31]:
# Grid-search SVD over epochs, learning rate and regularisation strength
# (2 x 2 x 2 combinations), scoring each by 3-fold cross-validated RMSE.
param_grid = dict(
    n_epochs=[20, 30],
    lr_all=[0.001, 0.01],
    reg_all=[0.02, 0.5],
)
gs = GridSearchCV(SVD, param_grid, measures=['rmse'], cv=3)
gs.fit(data)

In [32]:
# Show the parameter combination that achieved the best RMSE in the grid search.
print(gs.best_params['rmse'])

{'n_epochs': 20, 'lr_all': 0.01, 'reg_all': 0.5}


In [35]:
# Re-evaluate the grid search's best SVD configuration on the hold-out split:
# train on the training split, predict the test split, report RMSE.
algo = gs.best_estimator['rmse']
predictions = algo.fit(trainset).test(testset)
accuracy.rmse(predictions)

RMSE: 1.5196


1.519632929419054

In [None]:
# Recomputes the RMSE of the current `predictions`; this repeats the call that
# ends the previous cell.
accuracy.rmse(predictions)

In [None]:
def get_Iu(uid):
    """Count the ratings the given user has in the training set.

    Args:
        uid: Raw (external) id of the user.
    Returns:
        Number of items the user rated in ``trainset``; 0 when the user does
        not occur in the training set.
    """
    try:
        inner_uid = trainset.to_inner_uid(uid)
        n_rated = len(trainset.ur[inner_uid])
    except ValueError:
        # The raw id could not be mapped: the user is not in the trainset.
        n_rated = 0
    return n_rated
    
def get_Ui(iid):
    """Count the ratings the given item has in the training set.

    Args:
        iid: Raw (external) id of the item.
    Returns:
        Number of users who rated the item in ``trainset``; 0 when the item
        does not occur in the training set.
    """
    try:
        inner_iid = trainset.to_inner_iid(iid)
        n_raters = len(trainset.ir[inner_iid])
    except ValueError:
        # The raw id could not be mapped: the item is not in the trainset.
        n_raters = 0
    return n_raters

# One row per prediction, enriched with how many training ratings the user (Iu)
# and the item (Ui) have, and with the absolute prediction error.
prediction_columns = ['uid', 'iid', 'rui', 'est', 'details']
df = pd.DataFrame(predictions, columns=prediction_columns)
df['Iu'] = df['uid'].apply(get_Iu)
df['Ui'] = df['iid'].apply(get_Ui)
df['err'] = (df['est'] - df['rui']).abs()

In [None]:
# Order predictions by absolute error once; the first ten rows are the most
# accurate predictions and the last ten the least accurate.
predictions_by_err = df.sort_values(by='err')
best_predictions = predictions_by_err.head(10)
worst_predictions = predictions_by_err.tail(10)

In [None]:
# Display the ten predictions with the largest absolute error.
worst_predictions

# Making Recommendations

In [None]:
# All distinct ISBNs in the item-filtered ratings, and the ISBNs already rated
# by user 37152.
# NOTE(review): these lookups use `a`, whereas the Surprise dataset was built
# from `b` - confirm that `a` is the intended source here.
iids = a['ISBN'].unique()
rated_by_37152 = a['userID'] == 37152
iids_37152 = a.loc[rated_by_37152, 'ISBN']

In [None]:
# Show every rating row of user 37152 in the item-filtered ratings.
a[a['userID'] == 37152]