In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
books_df = pd.read_csv('BX-Books.csv', encoding='latin-1')
users_df = pd.read_csv('BX-Users.csv', encoding='latin-1')
book_ratings_df = pd.read_csv('BX-Book-Ratings.csv', encoding='latin-1', nrows=10000)
recommend_df = pd.read_csv('Recommend.csv', encoding='latin-1', header = None)

#### Task 1: Read the books dataset and explore it

In [4]:
books_df.head()

Unnamed: 0,isbn,book_title,book_author,year_of_publication,publisher
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux
4,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company


In [5]:
users_df.head()

Unnamed: 0,user_id,Location,Age
0,1,"nyc, new york, usa",
1,2,"stockton, california, usa",18.0
2,3,"moscow, yukon territory, russia",
3,4,"porto, v.n.gaia, portugal",17.0
4,5,"farnborough, hants, united kingdom",


In [6]:
book_ratings_df.head()

Unnamed: 0,user_id,isbn,rating
0,276725,034545104X,0
1,276726,155061224,5
2,276727,446520802,0
3,276729,052165615X,3
4,276729,521795028,6


In [7]:
recommend_df.head()

Unnamed: 0,0,1,2,3
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


#### Task 2: Clean up NaN values

In [8]:
books_df.isnull().sum()

isbn                   0
book_title             0
book_author            1
year_of_publication    0
publisher              2
dtype: int64

In [9]:
users_df.isnull().sum()

user_id          0
Location         1
Age         110763
dtype: int64

In [10]:
book_ratings_df.isnull().sum()

user_id    0
isbn       0
rating     0
dtype: int64

In [11]:
recommend_df.isnull().sum()

0    0
1    0
2    0
3    0
dtype: int64

In [12]:
books_df.dropna(axis = 0, how = 'any', inplace = True)

In [13]:
users_df.drop('Age', axis = 1, inplace = True)

In [14]:
users_df.dropna(axis = 0, how = 'any', inplace = True)

In [15]:
books_df.isnull().sum()

isbn                   0
book_title             0
book_author            0
year_of_publication    0
publisher              0
dtype: int64

In [16]:
users_df.isnull().sum()

user_id     0
Location    0
dtype: int64

In [17]:
book_ratings_df.isnull().sum()

user_id    0
isbn       0
rating     0
dtype: int64

In [18]:
recommend_df.isnull().sum()

0    0
1    0
2    0
3    0
dtype: int64

#### Task 3 : Read the data where ratings are given by users

In [19]:
user_ratings_df = pd.merge(books_df, book_ratings_df, how = 'inner', on = 'isbn')

In [20]:
user_ratings_df.head(10)

Unnamed: 0,isbn,book_title,book_author,year_of_publication,publisher,user_id,rating
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,2,0
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,8,5
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,8,0
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,8,0
4,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company,8,0
5,399135782,The Kitchen God's Wife,Amy Tan,1991,Putnam Pub Group,8,0
6,425176428,What If?: The World's Foremost Military Histor...,Robert Cowley,2000,Berkley Publishing Group,8,0
7,671870432,PLEADING GUILTY,Scott Turow,1993,Audioworks,8,0
8,679425608,Under the Black Flag: The Romance and the Real...,David Cordingly,1996,Random House,8,0
9,074322678X,Where You'll Find Me: And Other Stories,Ann Beattie,2002,Scribner,8,5


#### Task 4: Take a quick look at the number of unique users and books

In [21]:
user_ratings_df['user_id'].nunique()

828

In [22]:
user_ratings_df['isbn'].nunique()

8051

In [39]:
n_users = user_ratings_df['user_id'].nunique()
n_books = user_ratings_df['isbn'].nunique()

#### Task 5: Convert ISBN variables to numeric numbers in the correct order¶

In [45]:
#Convert and print length of isbn list
isbn_list = user_ratings_df.isbn.unique()
print(" Length of isbn List:", len(isbn_list))
itemindex = 0
def get_isbn_numeric_id(isbn):
    #print ("  isbn is:" , isbn)
    itemindex = np.where(isbn_list==isbn)
    return itemindex[0][0]

 Length of isbn List: 8051


In [43]:
isbn_list

array(['195153448', '2005018', '60973129', ..., '912333022', '1569661057',
       '345251547'], dtype=object)

#### Task 6: Convert the user_id variable to numeric numbers in the correct order

In [44]:
#Convert and print length of user_id list
userid_list = user_ratings_df.user_id.unique()
print(" Length of user_id List:", len(userid_list))
itemindex = 0
def get_user_id_numeric_id(user_id):
    #print ("  isbn is:" , isbn)
    itemindex = np.where(userid_list==user_id)
    return itemindex[0][0]

 Length of user_id List: 828


#### Task 7: Convert both user_id and ISBN to the ordered list, i.e., from 0...n-1

In [46]:
user_ratings_df['user_id_order'] = user_ratings_df['user_id'].apply(get_user_id_numeric_id)

In [47]:
user_ratings_df['isbn_id'] = user_ratings_df['isbn'].apply(get_isbn_numeric_id)


In [48]:
user_ratings_df.head()

Unnamed: 0,user_id_order,isbn_id,rating,book_title,book_author,year_of_publication,publisher,isbn,user_id
0,0,0,0,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,195153448,2
1,1,1,5,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,2005018,8
2,1,2,0,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,60973129,8
3,1,3,0,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,374157065,8
4,1,4,0,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company,393045218,8


#### Task 8 : Re-index the columns to build a matrix


In [49]:
#Reindexing the columns
new_col_order = ['user_id_order', 'isbn_id', 'rating', 'book_title', 'book_author','year_of_publication','publisher','isbn','user_id']
user_ratings_df = user_ratings_df.reindex(columns= new_col_order)
user_ratings_df.head()

Unnamed: 0,user_id_order,isbn_id,rating,book_title,book_author,year_of_publication,publisher,isbn,user_id
0,0,0,0,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,195153448,2
1,1,1,5,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,2005018,8
2,1,2,0,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,60973129,8
3,1,3,0,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,374157065,8
4,1,4,0,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company,393045218,8


In [29]:
user_ratings_df.drop_duplicates(inplace = True)

In [30]:
user_ratings_df = user_ratings_df.loc[:,~user_ratings_df.columns.duplicated()]

#### Task 9: Split your data into two sets (training and testing)

In [50]:
from sklearn.model_selection import train_test_split


In [51]:
train_data, test_data = train_test_split(user_ratings_df, test_size=0.20)

#### Task 10 : Make predictions based on user and item variables

In [52]:
#Create user-book matrix for training 
train_data_matrix = np.zeros((n_users, n_books))
for line in train_data.itertuples():
    train_data_matrix[line[1]-1, line[2]-1] = line[3]  
    
#Create user-book matrix for testing
test_data_matrix = np.zeros((n_users, n_books))
for line in test_data.itertuples():
    test_data_matrix[line[1]-1, line[2]-1] = line[3]

In [53]:
#Importing pairwise_distances function
from sklearn.metrics.pairwise import pairwise_distances
user_similarity = pairwise_distances(train_data_matrix, metric='cosine')
item_similarity = pairwise_distances(train_data_matrix.T, metric='cosine')

In [54]:
#Defining custom function to make predictions
def predict(ratings, similarity, type='user'):
    if type == 'user':
        mean_user_rating = ratings.mean(axis=1)
        #You use np.newaxis so that mean_user_rating has same format as ratings
        ratings_diff = (ratings - mean_user_rating[:, np.newaxis]) 
        pred = mean_user_rating[:, np.newaxis] + similarity.dot(ratings_diff) / np.array([np.abs(similarity).sum(axis=1)]).T
    elif type == 'item':
        pred = ratings.dot(similarity) / np.array([np.abs(similarity).sum(axis=1)])     
    return pred

In [55]:
item_prediction = predict(train_data_matrix, item_similarity, type='item')
user_prediction = predict(train_data_matrix, user_similarity, type='user')

#### Task 11: Use RMSE to evaluate the predictions

In [56]:
#Importing RMSE function 
from sklearn.metrics import mean_squared_error
from math import sqrt

#Defining custom function to filter out elements with ground_truth.nonzero
def rmse(prediction, ground_truth):
    prediction = prediction[ground_truth.nonzero()].flatten() 
    ground_truth = ground_truth[ground_truth.nonzero()].flatten()
    return sqrt(mean_squared_error(prediction, ground_truth))

In [57]:
print('User-based CF RMSE: ' + str(rmse(user_prediction, test_data_matrix)))
print('Item-based CF RMSE: ' + str(rmse(item_prediction, test_data_matrix)))

User-based CF RMSE: 7.610667694837018
Item-based CF RMSE: 7.610181303694166
