In [24]:
#Importing all the required libraries
import numpy as np
import pandas as pd
import os
from math import sqrt
import matplotlib.pyplot as plt
%matplotlib inline

In [25]:
#Retrieving Dataset
#Storing the Book information
books_df = pd.read_csv('books.csv')
#Storing the user information
ratings_df = pd.read_csv('ratings.csv')

In [26]:
#Books Data
books_df.head(2)

Unnamed: 0,booksId,title,genres
0,1,Toy Story (1995),Adventure|Cartoon|Children|Fictional|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy


In [27]:
#Ratings Data
ratings_df.head(5)

Unnamed: 0,userId,booksId,rating,ISBN
0,83438,497,5.0,1081992433
1,83438,500,3.5,1081992600
2,83438,534,4.5,1081992229
3,83438,543,5.0,1081992067
4,83438,587,3.5,1081992611


In [28]:
#pre-processing Data
books_df['year'] = books_df.title.str.extract('(\(\d\d\d\d\))',expand=False)
books_df['year'] = books_df.year.str.extract('(\d\d\d\d)',expand=False)
books_df['title'] = books_df.title.str.replace('(\(\d\d\d\d\))', '')
books_df['title'] = books_df['title'].apply(lambda x: x.strip())

  books_df['title'] = books_df.title.str.replace('(\(\d\d\d\d\))', '')


In [29]:
books_df.head(2)

Unnamed: 0,booksId,title,genres,year
0,1,Toy Story,Adventure|Cartoon|Children|Fictional|Fantasy,1995
1,2,Jumanji,Adventure|Children|Fantasy,1995


In [30]:
#Dropping genres Column
books_df = books_df.drop('genres', 1)
books_df.head(2)

  books_df = books_df.drop('genres', 1)


Unnamed: 0,booksId,title,year
0,1,Toy Story,1995
1,2,Jumanji,1995


In [31]:
ratings_df = ratings_df.drop('ISBN', 1)
ratings_df.head()

  ratings_df = ratings_df.drop('ISBN', 1)


Unnamed: 0,userId,booksId,rating
0,83438,497,5.0
1,83438,500,3.5
2,83438,534,4.5
3,83438,543,5.0
4,83438,587,3.5


In [32]:
#Collaborative Filtering Process
input_user = [
            {'title':'Heat', 'rating':5},
            {'title':'GoldenEye', 'rating':3.5},
            {'title':'Jumanji', 'rating':2},
            {'title':"Sabrina", 'rating':5},
            {'title':'Sudden Death', 'rating':4.5}
         ] 
input_books = pd.DataFrame(input_user)
input_books

Unnamed: 0,title,rating
0,Heat,5.0
1,GoldenEye,3.5
2,Jumanji,2.0
3,Sabrina,5.0
4,Sudden Death,4.5


In [33]:
#Adding BookID
inputId = books_df[books_df['title'].isin(input_books['title'].tolist())]
input_books = pd.merge(inputId, input_books)
input_books = input_books.drop('year', 1)
input_books

  input_books = input_books.drop('year', 1)


Unnamed: 0,booksId,title,rating
0,2,Jumanji,2.0
1,6,Heat,5.0
2,73608,Heat,5.0
3,131274,Heat,5.0
4,176577,Heat,5.0
5,7,Sabrina,5.0
6,915,Sabrina,5.0
7,9,Sudden Death,4.5
8,170419,Sudden Death,4.5
9,10,GoldenEye,3.5


In [34]:
#Getting similar users from dataset
similar_users = ratings_df[ratings_df['booksId'].isin(input_books['booksId'].tolist())]
similar_users.head()

Unnamed: 0,userId,booksId,rating
61,83439,2,3.5
287,83441,10,4.0
609,83446,2,3.0
1340,83449,2,5.0
1343,83449,7,3.0


In [35]:
#grouping the rows based on User ID since each user will have multiple books rated.
grouped_users = similar_users.groupby(['userId'])
grouped_users.get_group(83449)

Unnamed: 0,userId,booksId,rating
1340,83449,2,5.0
1343,83449,7,3.0


In [36]:
#For better recommendation, Sorting the above group based on users 
grouped_users = sorted(grouped_users,  key=lambda x: len(x[1]), reverse=True)
grouped_users[:2]

[(83579,
         userId  booksId  rating
  18887   83579        2     4.0
  18889   83579        6     3.5
  18890   83579        7     3.5
  18892   83579        9     3.0
  18893   83579       10     3.5
  19271   83579      915     3.5),
 (93512,
          userId  booksId  rating
  949631   93512        2     5.0
  949635   93512        6     4.0
  949636   93512        7     4.0
  949637   93512        9     3.0
  949638   93512       10     4.0
  949999   93512      915     4.0)]

In [37]:
grouped_users = grouped_users[0:100]

In [38]:
#Calculating Pearson Correlation Coefficient
correlateDict = {}
for name, group in grouped_users:
    group = group.sort_values(by='booksId')
    inputBooks = input_books.sort_values(by='booksId')
    nRatings = len(group)
    temp_df = input_books[input_books['booksId'].isin(group['booksId'].tolist())]
    tempRatingList = temp_df['rating'].tolist()
    tempGroupList = group['rating'].tolist()
    Sxx = sum([i**2 for i in tempRatingList]) - pow(sum(tempRatingList),2)/float(nRatings)
    Syy = sum([i**2 for i in tempGroupList]) - pow(sum(tempGroupList),2)/float(nRatings)
    Sxy = sum( i*j for i, j in zip(tempRatingList, tempGroupList)) - sum(tempRatingList)*sum(tempGroupList)/float(nRatings)
    if Sxx != 0 and Syy != 0:
        correlateDict[name] = Sxy/sqrt(Sxx*Syy)
    else:
        correlateDict[name] = 0

In [39]:
#Converting the Correlation Coefficients to a Dataframe for a better view.
correlateDF = pd.DataFrame.from_dict(correlateDict, orient='index')
correlateDF.columns = ['similarityIndex']
correlateDF['userId'] = correlateDF.index
correlateDF.index = range(len(correlateDF))
correlateDF.head()

Unnamed: 0,similarityIndex,userId
0,-0.783349,83579
1,-0.783349,93512
2,0.470882,93773
3,0.077357,83629
4,0.353553,84875


In [40]:
#Getting Top 20 similiar users
similarusers=correlateDF.sort_values(by='similarityIndex', ascending=False)[0:50]
similarusers.head()

Unnamed: 0,similarityIndex,userId
42,1.0,85037
18,0.912871,91298
73,0.904534,90116
65,0.904534,89452
55,0.904534,87333


In [41]:
#Recommendation Process
similarusrsrating=similarusers.merge(ratings_df, left_on='userId', right_on='userId', how='inner')
similarusrsrating.head()

Unnamed: 0,similarityIndex,userId,booksId,rating
0,1.0,85037,2,2.0
1,1.0,85037,6,4.0
2,1.0,85037,7,4.0
3,1.0,85037,10,3.0
4,1.0,85037,16,5.0


In [42]:
#multiplying the Similarity score column and the rating column to get the weighted rating values and adding it as a new column.
similarusrsrating['weightedRating'] = similarusrsrating['similarityIndex']*similarusrsrating['rating']
similarusrsrating.head()

Unnamed: 0,similarityIndex,userId,booksId,rating,weightedRating
0,1.0,85037,2,2.0,2.0
1,1.0,85037,6,4.0,4.0
2,1.0,85037,7,4.0,4.0
3,1.0,85037,10,3.0,3.0
4,1.0,85037,16,5.0,5.0


In [43]:
#grouping the data based on User ID and getting a sum of the similarity scores and weighted rating columns.
tmpsimilarusrsrating = similarusrsrating.groupby('booksId').sum()[['similarityIndex','weightedRating']]
tmpsimilarusrsrating.columns = ['sum_similarityIndex','sum_weightedRating']
tmpsimilarusrsrating.head()

Unnamed: 0_level_0,sum_similarityIndex,sum_weightedRating
booksId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,16.688989,63.309208
2,24.263673,64.141682
3,10.657998,33.269259
4,4.842138,13.263731
5,14.031265,40.965243


In [44]:
#Providing Recommendation
recommend_books = pd.DataFrame()
recommend_books['weighted recom score'] = tmpsimilarusrsrating['sum_weightedRating']/tmpsimilarusrsrating['sum_similarityIndex']
recommend_books['booksId'] = tmpsimilarusrsrating.index
recommend_books.head()

Unnamed: 0_level_0,weighted recom score,booksId
booksId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,3.793472,1
2,2.643527,2
3,3.12153,3
4,2.73923,4
5,2.919569,5


In [45]:
recommend_books = recommend_books.sort_values(by='weighted recom score', ascending=False)
recommend_books.head()

Unnamed: 0_level_0,weighted recom score,booksId
booksId,Unnamed: 1_level_1,Unnamed: 2_level_1
134849,5.0,134849
60818,5.0,60818
42548,5.0,42548
42217,5.0,42217
171271,5.0,171271


In [46]:
#Top Five Recommended Books
#Match the Books IDs with the original Book data frame to get the Book names too.
books_df.loc[books_df['booksId'].isin(recommend_books.head()['booksId'].tolist())]

Unnamed: 0,booksId,title,year
10729,42217,Late Spring (Banshun),1949
10745,42548,Whisky Galore,1949
12914,60818,Hogfather (Terry Pratchett's Hogfather),2006
32291,134849,Duck Amuck,1953
48059,171271,"Lewis Black: Red, White & Screwed",2006
