In [1]:
#Importing Libraries
import ast
import random
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from scipy.sparse import isspmatrix
from scipy import sparse
from sklearn.feature_extraction import DictVectorizer
from sklearn.neighbors import NearestNeighbors


#Function which imports dataset and transforms into matrix of items and users
def load_data_into_matrix(sparse=True):
    """Read the ratings CSV and vectorise its rating dictionaries.

    Every entry of the 'averaged_num_ratings' column in
    'csv/averaged_num_ratings.csv' is a string holding a dict literal
    ({user: rating}); each one is parsed with ast.literal_eval and the
    resulting list of dicts is handed to DictVectorizer.

    Parameters
    ----------
    sparse : bool, default True
        Forwarded unchanged to DictVectorizer(sparse=...).
        (Inside this function the name shadows the scipy `sparse` module.)

    Returns
    -------
    The result of DictVectorizer.fit_transform: one row per CSV row
    (books, with users as columns, per the original author's notes).
    """
    #Importing dataset into dataframe
    ratings_frame = pd.read_csv('csv/averaged_num_ratings.csv')

    #Parsing each string cell into the dictionary it spells out
    rating_dicts = [
        ast.literal_eval(raw_entry)
        for raw_entry in ratings_frame['averaged_num_ratings']
    ]

    #Vectorising the list of dictionaries in a single step
    return DictVectorizer(sparse=sparse).fit_transform(rating_dicts)

#Function which removes self index from given lists
def remove_self(similarities, indices, user_id, n_neighbors):
    """Drop the query user from a k-nearest-neighbours result.

    Parameters
    ----------
    similarities : array-like
        Similarity scores of the neighbours, aligned with `indices`
        (typically shape (1, k) as produced from kneighbors()).
    indices : array-like
        Row indices of the neighbours, same shape as `similarities`.
    user_id : int
        Index of the query user; every occurrence is removed.
    n_neighbors : int
        Number of neighbours that were requested, including the user itself.

    Returns
    -------
    (similarities, indices) : tuple of np.ndarray
        Both of shape (1, m) with m <= n_neighbors - 1, in the original order.
    """
    flat_indices = np.asarray(indices).ravel()
    flat_similarities = np.asarray(similarities).ravel()

    # A boolean mask removes exactly the self entries. Feeding the tuple from
    # np.where() on a 2-D array to np.delete() also deleted position 0, which
    # dropped a genuine neighbour whenever the user was not ranked first.
    keep = flat_indices != user_id

    # Truncate along the neighbour axis; slicing the leading axis of a (1, k)
    # array never shortened the list, so a query missing from its own result
    # kept all n_neighbors entries instead of n_neighbors - 1.
    limit = max(n_neighbors - 1, 0)
    similarities = flat_similarities[keep][:limit].reshape(1, -1)
    indices = flat_indices[keep][:limit].reshape(1, -1)
    return similarities, indices

#Function sorts top recommendations and indices along with original book ids and titles
def top_recommendations(pred_rating_list, index_list, num_recommendations):
    """Rank predicted ratings and attach Goodreads ids and titles.

    Parameters
    ----------
    pred_rating_list : list of float
        Predicted ratings, aligned element-wise with `index_list`.
    index_list : list of int
        Matrix column index of the book each prediction belongs to.
    num_recommendations : int
        Maximum number of rows to return.

    Returns
    -------
    pandas.DataFrame
        Columns 'predicted_rating', 'matrix_index', 'good_bid' and 'title',
        sorted by descending predicted rating with one row per matrix index.
        The row labels are the positions the entries had in the input lists.
    """
    ranked = pd.DataFrame(list(zip(pred_rating_list, index_list)),
                          columns=['predicted_rating', 'matrix_index'])
    ranked = ranked.sort_values(by=['predicted_rating'], ascending=False)
    # After sorting, the first occurrence of an index is its best prediction.
    ranked = ranked.drop_duplicates(subset=['matrix_index'])
    # Take an explicit copy of the top-N slice: adding columns to a bare slice
    # is the pattern that triggers pandas' SettingWithCopyWarning.
    ranked = ranked.iloc[:num_recommendations].copy()

    #calling a function to get book title and ids from goodreads
    good_bid_list, title_list = get_books_details(list(ranked['matrix_index']))
    ranked['good_bid'] = good_bid_list
    ranked['title'] = title_list
    return ranked

#Function to get book meta data(id and title) from goodreads dataset
def get_books_details(index_list):
    """Look up the Goodreads id and title(s) for each matrix index.

    Reads 'csv/Final_Dataset.csv' (columns 'good_bid', 'good_title') and
    'csv/good_bid_index_bid.csv' (column 'good_bid') on every call.

    Parameters
    ----------
    index_list : iterable
        Row labels into 'csv/good_bid_index_bid.csv'.

    Returns
    -------
    (good_bid_list, title_list) : tuple of lists
        For each requested index, its Goodreads book id and the string form of
        the list of all titles recorded for that id ('[]' when none match).
    """
    catalogue = pd.read_csv('csv/Final_Dataset.csv')
    index_to_bid = pd.read_csv('csv/good_bid_index_bid.csv')['good_bid']

    book_ids = []
    titles = []
    for position in index_list:
        # .loc is a label lookup on the index pandas assigned when reading the file
        book_id = index_to_bid.loc[position]
        matching_titles = catalogue['good_title'].loc[catalogue['good_bid'] == book_id]
        book_ids.append(book_id)
        # An id may appear on several rows, so the titles stay a stringified list
        titles.append(str(list(matching_titles)))
    return book_ids, titles


#Function to predict the recommendations
def predict(matrix, user_id, num_recommendations=10, n_neighbors=10):
    """Recommend unrated books to one user with user-based nearest neighbours.

    The user's nearest neighbours (cosine metric, brute force) are found among
    the rows of `matrix`. For every book the user has not rated (value 0) but
    at least one neighbour has, the predicted rating is the similarity-weighted
    sum of the neighbours' ratings divided by the sum of the similarities.

    Parameters
    ----------
    matrix : scipy sparse matrix or np.ndarray
        Rating matrix with one row per user and one column per book;
        0 means "not rated".
    user_id : int
        Row index of the user to recommend for.
    num_recommendations : int, default 10
        Maximum number of recommendations to return.
    n_neighbors : int, default 10
        Neighbours requested from the search. The user itself normally comes
        back as one of them and is removed, leaving n_neighbors - 1.

    Returns
    -------
    pandas.DataFrame
        The frame produced by top_recommendations().

    Progress is printed every 1000 books.
    """
    knn = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=n_neighbors)
    knn.fit(matrix)

    #calculate nearest neighbors of user (cosine similarity = 1 - cosine distance)
    distances, indices = knn.kneighbors(matrix[user_id].reshape(1, -1))
    similarities = 1 - distances

    #remove self index
    similarities, indices = remove_self(similarities, indices, user_id,
                                        n_neighbors=knn.get_params()['n_neighbors'])
    neighbor_weights = np.asarray(similarities).ravel()
    neighbor_ids = np.asarray(indices).ravel()
    total_weight = neighbor_weights.sum()

    # Without neighbours, or with all-zero similarities, the weighted average
    # is undefined (0/0), so no rating is predicted at all.
    can_predict = neighbor_ids.size > 0 and total_weight != 0

    # Only the user's row and the neighbours' rows are ever read, so densify
    # just those instead of converting the entire sparse matrix to an array.
    user_ratings = matrix[user_id]
    if isspmatrix(matrix):
        user_ratings = user_ratings.toarray()
    user_ratings = np.asarray(user_ratings).ravel()

    neighbor_ratings = None
    if can_predict:
        neighbor_ratings = matrix[neighbor_ids]
        if isspmatrix(matrix):
            neighbor_ratings = neighbor_ratings.toarray()
        neighbor_ratings = np.asarray(neighbor_ratings).reshape(neighbor_ids.size, -1)

    pred_rating_list = []
    index_list = []
    n_books = user_ratings.shape[0]

    for index, value in enumerate(user_ratings):
        #for each non-rated book that at least one neighbour has rated
        if can_predict and value == 0:
            rating = neighbor_ratings[:, index]
            if rating.any():
                # 1-D dot product yields a scalar, so float() needs no
                # (deprecated) array-to-scalar conversion.
                predicted_rating = np.dot(neighbor_weights, rating) / total_weight
                pred_rating_list.append(float(predicted_rating))
                index_list.append(index)
        if index % 1000 == 0:
            # Report against the real number of books rather than a fixed total
            print(index, '/' + str(n_books))

    df = top_recommendations(pred_rating_list, index_list, num_recommendations)
    return df

#Function to randomly manipulate ratings of highly rated books to evaluate the model
def manipulate(user_id, matrix, max_rating=3, num_books=3):
    """Hide a random sample of one user's highly rated books.

    Used to evaluate the recommender: the hidden ratings should come back as
    recommendations when predict() is run on the returned matrix.

    Parameters
    ----------
    user_id : int
        Row index of the user whose ratings are hidden.
    matrix : scipy sparse matrix
        Rating matrix (rows = users, columns = books). It is not modified.
    max_rating : number, default 3
        Despite the name this is a lower bound: only books the user rated
        >= max_rating are candidates for hiding.
    num_books : int, default 3
        How many candidate books to hide.

    Returns
    -------
    (selected_books, sparse_matrix) : tuple
        The column indices that were hidden, and a new CSR matrix equal to
        `matrix` with those ratings of `user_id` set to 0.

    Raises
    ------
    ValueError
        If the user has fewer than `num_books` qualifying ratings.
    IndexError
        If `user_id` is not a valid row of `matrix`.
    """
    # Edit a CSR copy directly; densifying the whole matrix just to zero a
    # handful of entries is needlessly slow and memory hungry.
    result = matrix.tocsr(copy=True)

    n_users = result.shape[0]
    row = int(user_id)
    if row < 0:
        row += n_users  # same meaning as negative indexing on an array
    if not 0 <= row < n_users:
        raise IndexError('user_id %r is out of bounds for %d rows' % (user_id, n_users))

    # Only this one row needs a dense view to apply the threshold
    user_ratings = result[row].toarray().ravel()
    rated_books = list(np.where(user_ratings >= max_rating)[0])
    if num_books > len(rated_books):
        raise ValueError('cannot hide %d books: user %r has only %d ratings >= %r'
                         % (num_books, user_id, len(rated_books), max_rating))
    selected_books = random.sample(rated_books, num_books)

    # Zero the stored entries of the selected columns within the user's row.
    start, stop = result.indptr[row], result.indptr[row + 1]
    hidden = np.isin(result.indices[start:stop], selected_books)
    row_data = result.data[start:stop]  # a view, so the assignment edits `result`
    row_data[hidden] = 0
    # Drop the explicit zeros so the hidden ratings are truly absent
    result.eliminate_zeros()
    return selected_books, result

In [2]:
# Build the sparse book-by-user matrix, then transpose it so that each row is
# a user and each column a book -- the orientation predict() and manipulate()
# index by (matrix[user_id]).
sparse_matrix = load_data_into_matrix(sparse = True)
sparse_matrix = sparse_matrix.T
# Shown as the cell result: (number of users, number of books).
sparse_matrix.shape

(433249, 5993)

In [3]:
# Row index (in sparse_matrix) of the user the cells below generate recommendations for.
user_id = 0

In [4]:
%%time
# Baseline: top recommendations for the user on the unmodified rating matrix.
predict(sparse_matrix, user_id)

0 /5993
1000 /5993
2000 /5993
3000 /5993
4000 /5993
5000 /5993
Wall time: 1.25 s


Unnamed: 0,predicted_rating,matrix_index,good_bid,title
12,1.318678,3950,29780256,['Jane Eyre']
2,0.571872,109,1221719,['The Picture of Dorian Gray']
6,0.567776,469,9030474,['The Adventures of Robin Hood']
7,0.555014,631,9512348,['The Three Musketeers']
10,0.454221,2441,105018,['Robinson Crusoe']
9,0.44823,2372,1912434,['Gullivers Travels']
11,0.444011,3566,13034188,['A Journey to the Center of the Earth']
5,0.333008,351,922737,['The Light Princess']
0,0.32998,18,288840,['The Strange Case of Dr Jekyll and Mr Hyde']
8,0.322794,1132,18306730,"['Moby-Dick; or, The Whale']"


In [5]:
%%time
# Hide three of the user's ratings that are >= 3 (manipulate()'s defaults) and
# keep the hidden column indices for comparison with the next prediction.
# NOTE(review): this rebinds sparse_matrix to the modified matrix, so re-running
# the cell hides further ratings; re-run the loading cell first to start over.
deleted_ratings, sparse_matrix = manipulate(user_id, sparse_matrix)
print(deleted_ratings)

[696, 1990, 2805]
Wall time: 1min 6s


In [6]:
%%time
# Predict again on the modified matrix; the model is doing its job when the
# indices printed as deleted_ratings show up in the 'matrix_index' column.
predict(sparse_matrix, user_id)

0 /5993
1000 /5993
2000 /5993
3000 /5993
4000 /5993
5000 /5993
Wall time: 3.37 s


Unnamed: 0,predicted_rating,matrix_index,good_bid,title
11,0.779423,1990,8479703,['Frankenstein']
2,0.55954,109,1221719,['The Picture of Dorian Gray']
0,0.55787,7,1386631,['Peter Pan']
1,0.55787,37,10846264,['The Adventures of Tom Sawyer']
7,0.552986,631,9512348,['The Three Musketeers']
4,0.454131,181,2150464,['Dracula']
13,0.447632,3950,29780256,['Jane Eyre']
8,0.444243,696,9867049,['Great Expectations']
9,0.443237,875,26165692,['Hamlet']
12,0.442389,3566,13034188,['A Journey to the Center of the Earth']
