In [1]:
#Importing Libraries
import ast
import pandas as pd
import numpy as np
from scipy.sparse import isspmatrix
from scipy.sparse import csr_matrix
import random
from sklearn.feature_extraction import DictVectorizer
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics.pairwise import cosine_similarity

#Function which imports dataset and transforms into matrix of items and users
def load_data_into_matrix(sparse=True):
    """Load the averaged ratings CSV and vectorize it into a rating matrix.

    Every row of ``csv/averaged_num_ratings.csv`` carries, in its
    ``averaged_num_ratings`` column, the string form of a ``{user: rating}``
    dictionary. Each string is parsed with ``ast.literal_eval`` and the
    resulting dictionaries are passed to ``DictVectorizer``, giving one
    matrix row per CSV row (rows = books, columns = users).

    Parameters
    ----------
    sparse : bool, default True
        Forwarded to ``DictVectorizer(sparse=...)``.

    Returns
    -------
    The matrix returned by ``DictVectorizer.fit_transform``.
    """
    ratings_frame = pd.read_csv('csv/averaged_num_ratings.csv')

    # Parse each stringified {user: rating} dictionary, preserving row order.
    rating_dicts = [
        ast.literal_eval(raw_ratings)
        for raw_ratings in ratings_frame['averaged_num_ratings']
    ]

    vectorizer = DictVectorizer(sparse=sparse)
    return vectorizer.fit_transform(rating_dicts)


#Function removes rated books, sorts top recommendations and indices along with original book ids and titles
def top_recommendations(predicted_rating_list, rated_books, num_recommendations):
    """Return the best-scoring unrated books with their Goodreads ids and titles.

    Parameters
    ----------
    predicted_rating_list : sequence of float
        One predicted rating per book, ordered by matrix row index.
    rated_books : iterable of int
        Matrix row indices the user has already rated; these are excluded.
    num_recommendations : int
        Maximum number of rows to return.

    Returns
    -------
    pandas.DataFrame
        Columns ``matrix_index``, ``predicted_rating``, ``good_bid`` and
        ``title``, sorted by ``predicted_rating`` in descending order.
    """
    # Size the index from the input rather than hard-coding the catalogue
    # size (5993), so the function keeps working when the dataset changes.
    candidates = pd.DataFrame({
        'matrix_index': list(range(len(predicted_rating_list))),
        'predicted_rating': predicted_rating_list,
    })
    candidates = candidates[~candidates['matrix_index'].isin(rated_books)]
    candidates = candidates.sort_values(by=['predicted_rating'], ascending=False)

    # Copy the slice so the columns added below are written to an independent
    # frame instead of a view of `candidates` (avoids SettingWithCopyWarning).
    top = candidates.iloc[:num_recommendations].copy()

    # Look up the Goodreads book id and title for each recommended row.
    good_bid_list, title_list = get_books_details(list(top['matrix_index']))
    top['good_bid'] = good_bid_list
    top['title'] = title_list
    return top

#Function to get book meta data(id and title) from goodreads dataset
def get_books_details(index_list):
    """Map matrix row indices to Goodreads book ids and titles.

    Reads ``csv/Final_Dataset.csv`` (columns ``good_bid`` and ``good_title``)
    and ``csv/good_bid_index_bid.csv`` (column ``good_bid``, looked up by row
    label) on every call.

    Parameters
    ----------
    index_list : iterable of int
        Row labels into ``csv/good_bid_index_bid.csv``.

    Returns
    -------
    (good_bid_list, title_list)
        Two lists aligned with ``index_list``. Each title entry is the string
        form of the list of all titles whose ``good_bid`` matches, e.g.
        ``"['Jane Eyre']"`` (or ``"[]"`` when nothing matches).
    """
    books = pd.read_csv('csv/Final_Dataset.csv')
    bid_by_index = pd.read_csv('csv/good_bid_index_bid.csv')['good_bid']

    good_bid_list = [bid_by_index.loc[position] for position in index_list]
    title_list = [
        str(list(books.loc[books['good_bid'] == bid, 'good_title']))
        for bid in good_bid_list
    ]
    return good_bid_list, title_list

#Function to predict the recommendations for a user
def predict(matrix, sim_matrix, user_id, num_recommendations=10):
    """Recommend books for one user from item-item similarities.

    For every book the user has not rated (rating equal to 0) the predicted
    rating is ``dot(user_ratings, sim_matrix[book]) / sum(sim_matrix[book])``,
    or 0 when that similarity row sums to 0. Books the user has rated keep
    their actual rating in the score list and are then filtered out by
    ``top_recommendations``.

    Parameters
    ----------
    matrix : numpy.ndarray
        Dense 2-D rating array indexed ``[book, user]``.
    sim_matrix : numpy.ndarray
        Dense 2-D array indexed ``[book, book]`` of item similarities.
    user_id : int
        Column of ``matrix`` identifying the user.
    num_recommendations : int, default 10
        Number of recommendations to return.

    Returns
    -------
    pandas.DataFrame
        The frame produced by ``top_recommendations``.
    """
    user_ratings = matrix.T[user_id]
    is_rated = user_ratings != 0
    rated_books = list(np.where(is_rated)[0])

    # Same formula as a per-book loop, evaluated for all books at once:
    # one matrix-vector product for the numerators, one row-sum for the
    # denominators.
    weighted_sums = sim_matrix @ user_ratings
    similarity_totals = sim_matrix.sum(axis=1)

    # Books whose similarity row sums to zero keep a predicted rating of 0,
    # which also avoids dividing by zero.
    scores = np.zeros(weighted_sums.shape, dtype=float)
    np.divide(weighted_sums, similarity_totals, out=scores,
              where=similarity_totals != 0)

    # Rated books carry their actual rating (they are dropped downstream).
    predicted_ratings = np.where(is_rated, user_ratings, scores)

    return top_recommendations(list(predicted_ratings), rated_books,
                               num_recommendations)

#Function to randomly manipulate ratings of highly rated books to evaluate the model
def manipulate(user_id, dense_matrix, max_rating=3, num_books=3):
    """Zero out a random sample of one user's highly rated books.

    Despite its name, ``max_rating`` is used as a lower bound: a book is a
    candidate when the user's rating is ``>= max_rating``.

    Parameters
    ----------
    user_id : int
        Column of ``dense_matrix`` identifying the user.
    dense_matrix : numpy.ndarray
        Dense rating array indexed ``[book, user]``; modified in place.
    max_rating : number, default 3
        Minimum rating a book needs to be eligible for selection.
    num_books : int, default 3
        How many eligible books to pick and reset to 0.

    Returns
    -------
    (selected_books, dense_matrix)
        The chosen book indices and the same (mutated) array object.

    Raises
    ------
    ValueError
        From ``random.sample`` when fewer than ``num_books`` books qualify.
    """
    # Row view of the transposed array: writes go through to dense_matrix.
    user_ratings = dense_matrix.T[user_id]
    eligible_books = list(np.where(user_ratings >= max_rating)[0])
    selected_books = random.sample(eligible_books, num_books)
    for book_index in selected_books:
        user_ratings[book_index] = 0
    return selected_books, dense_matrix

def similarity(matrix):
    """Return the pairwise cosine similarity between the rows of ``matrix``.

    Input that is not already a SciPy sparse matrix is wrapped in
    ``csr_matrix`` before being handed to ``cosine_similarity``.
    """
    rows = matrix if isspmatrix(matrix) else csr_matrix(matrix)
    return cosine_similarity(rows)

In [2]:
# Build the rating matrix in sparse form (rows = books, columns = users).
sparse_matrix = load_data_into_matrix(sparse=True)
# Print both orientations: the transpose is indexed user-by-book.
print(sparse_matrix.shape)
print(sparse_matrix.T.shape)

(5993, 433249)
(433249, 5993)


In [3]:
# Display the sparse matrix summary (shape, dtype, stored-element count, format).
sparse_matrix

<5993x433249 sparse matrix of type '<class 'numpy.float64'>'
	with 3261272 stored elements in Compressed Sparse Row format>

###### Recompute `sim_matrix` after changing the ratings

In [4]:
# Cosine similarity between every pair of books (rows of the rating matrix).
sim_matrix = similarity(sparse_matrix)

In [6]:
# Target user (column index of the rating matrix) and number of books to recommend.
user_id=0
num_recommendations=10

In [7]:
# predict() and manipulate() index the ratings as a dense NumPy array.
# NOTE(review): at shape (5993, 433249) with float64 values the dense copy is
# roughly 20 GB — confirm the kernel has that much memory available.
dense_matrix = sparse_matrix.toarray()
dense_matrix.shape

(5993, 433249)

In [8]:
# Top recommendations for the user, computed from the unmodified ratings.
predict(dense_matrix, sim_matrix, user_id, num_recommendations)

Unnamed: 0,matrix_index,predicted_rating,good_bid,title
3950,3950,0.367807,29780256,['Jane Eyre']
383,383,0.346994,2785278,['Wuthering Heights']
875,875,0.340467,26165692,['Hamlet']
37,37,0.339933,10846264,['The Adventures of Tom Sawyer']
761,761,0.337097,15802925,['Macbeth']
2707,2707,0.335061,144994,['The Velveteen Rabbit 8X8']
78,78,0.333632,22070742,['Los Miserables']
96,96,0.328377,3209316,['Emma']
1132,1132,0.324215,18306730,"['Moby-Dick; or, The Whale']"
3567,3567,0.315505,1415644,['A Little Princess']


###### Evaluation

In [9]:
# Hold out some of the user's ratings: manipulate() zeroes num_books (default 3)
# randomly chosen books that the user rated >= max_rating (default 3), and
# modifies dense_matrix in place.
# NOTE(review): no random seed is set in the visible cells, so the selected
# books differ between runs.
selected_books, dense_matrix = manipulate(user_id, dense_matrix)
print(selected_books)

[475, 756, 669]


In [10]:
# Recompute the book similarities now that the held-out ratings have been zeroed.
sim_matrix = similarity(dense_matrix)

In [11]:
# Re-run the recommender; the held-out books in selected_books should rank
# highly if the model recovers them.
predict(dense_matrix, sim_matrix, user_id, num_recommendations)

Unnamed: 0,matrix_index,predicted_rating,good_bid,title
756,756,0.362883,8514833,['Romeo and Juliet']
669,669,0.324551,6497728,['Priide and Prejudice']
37,37,0.30085,10846264,['The Adventures of Tom Sawyer']
2707,2707,0.298615,144994,['The Velveteen Rabbit 8X8']
3950,3950,0.298429,29780256,['Jane Eyre']
875,875,0.290467,26165692,['Hamlet']
761,761,0.288109,15802925,['Macbeth']
383,383,0.287018,2785278,['Wuthering Heights']
1132,1132,0.284218,18306730,"['Moby-Dick; or, The Whale']"
78,78,0.283256,22070742,['Los Miserables']
