In [1]:
import numpy as np
import pandas as pd

## Dataset

In [2]:
books = pd.read_csv('collaborative/Books.csv')
ratings = pd.read_csv('collaborative/Ratings.csv')
users = pd.read_csv('collaborative/Users.csv')

  books = pd.read_csv('collaborative/Books.csv')


In [3]:
books.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 271360 entries, 0 to 271359
Data columns (total 8 columns):
 #   Column               Non-Null Count   Dtype 
---  ------               --------------   ----- 
 0   ISBN                 271360 non-null  object
 1   Book-Title           271360 non-null  object
 2   Book-Author          271359 non-null  object
 3   Year-Of-Publication  271360 non-null  object
 4   Publisher            271358 non-null  object
 5   Image-URL-S          271360 non-null  object
 6   Image-URL-M          271360 non-null  object
 7   Image-URL-L          271357 non-null  object
dtypes: object(8)
memory usage: 16.6+ MB


In [4]:
books.drop(['Year-Of-Publication', 'Publisher','Image-URL-S', 'Image-URL-M', 'Image-URL-L'], axis=1, inplace=True)

In [5]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1149780 entries, 0 to 1149779
Data columns (total 3 columns):
 #   Column       Non-Null Count    Dtype 
---  ------       --------------    ----- 
 0   User-ID      1149780 non-null  int64 
 1   ISBN         1149780 non-null  object
 2   Book-Rating  1149780 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 26.3+ MB


In [6]:
users.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 278858 entries, 0 to 278857
Data columns (total 3 columns):
 #   Column    Non-Null Count   Dtype  
---  ------    --------------   -----  
 0   User-ID   278858 non-null  int64  
 1   Location  278858 non-null  object 
 2   Age       168096 non-null  float64
dtypes: float64(1), int64(1), object(1)
memory usage: 6.4+ MB


In [7]:
users.drop(['Age', 'Location'], axis=1, inplace=True)

In [8]:
book_rating = pd.merge(books, ratings, on='ISBN')
user_rating = pd.merge(users, ratings, on='User-ID')

In [9]:
book_rating.head()

Unnamed: 0,ISBN,Book-Title,Book-Author,User-ID,Book-Rating
0,195153448,Classical Mythology,Mark P. O. Morford,2,0
1,2005018,Clara Callan,Richard Bruce Wright,8,5
2,2005018,Clara Callan,Richard Bruce Wright,11400,0
3,2005018,Clara Callan,Richard Bruce Wright,11676,8
4,2005018,Clara Callan,Richard Bruce Wright,41385,0


In [10]:
user_rating.head()

Unnamed: 0,User-ID,ISBN,Book-Rating
0,2,195153448,0
1,7,34542252,0
2,8,2005018,5
3,8,60973129,0
4,8,374157065,0


In [11]:
len(book_rating), len(user_rating)

(1031136, 1149780)

In [12]:
num_rating_per_book = book_rating.groupby('Book-Title')['Book-Rating'].count().reset_index()
avg_rating_per_book = book_rating.groupby('Book-Title')['Book-Rating'].mean().reset_index()

In [13]:
final_rating_info = pd.merge(num_rating_per_book, avg_rating_per_book,on='Book-Title')
final_rating_info = final_rating_info.rename(columns={'Book-Rating_x': 'Total-Ratings', 'Book-Rating_y': 'Avg-Ratings'})

In [14]:
final_rating_info

Unnamed: 0,Book-Title,Total-Ratings,Avg-Ratings
0,A Light in the Storm: The Civil War Diary of ...,4,2.250000
1,Always Have Popsicles,1,0.000000
2,Apple Magic (The Collector's series),1,0.000000
3,"Ask Lily (Young Women of Faith: Lily Series, ...",1,8.000000
4,Beyond IBM: Leadership Marketing and Finance ...,1,0.000000
...,...,...,...
241066,Ã?Â?lpiraten.,2,0.000000
241067,Ã?Â?rger mit Produkt X. Roman.,4,5.250000
241068,Ã?Â?sterlich leben.,1,7.000000
241069,Ã?Â?stlich der Berge.,3,2.666667


## To reduce complexity and improve performance
- Selecting book that have been rated by more than 50 people
- Selection of users who have rated more than 200 books

In [15]:
x = book_rating.groupby('User-ID')['Book-Rating'].count() > 200
educated_users  = x[x].index

book_rating = book_rating[book_rating['User-ID'].isin(educated_users)]

y  = book_rating.groupby('Book-Title')['Book-Rating'].count() >= 50
famous_books = y[y].index

final = book_rating[book_rating['Book-Title'].isin(famous_books)]

In [16]:
final.head()

Unnamed: 0,ISBN,Book-Title,Book-Author,User-ID,Book-Rating
31,399135782,The Kitchen God's Wife,Amy Tan,11676,9
33,399135782,The Kitchen God's Wife,Amy Tan,36836,0
34,399135782,The Kitchen God's Wife,Amy Tan,46398,9
38,399135782,The Kitchen God's Wife,Amy Tan,113270,0
39,399135782,The Kitchen God's Wife,Amy Tan,113519,0


In [17]:
pt = final.pivot_table(index='User-ID', columns='Book-Title', values='Book-Rating').fillna(0)
pt = pt.reset_index()
pt.head()

Book-Title,User-ID,1984,1st to Die: A Novel,2nd Chance,4 Blondes,A Bend in the Road,A Case of Need,"A Child Called \It\"": One Child's Courage to Survive""",A Civil Action,A Day Late and a Dollar Short,...,Winter Solstice,Wish You Well,Without Remorse,"Wizard and Glass (The Dark Tower, Book 4)",Wuthering Heights,Year of Wonders,You Belong To Me,Zen and the Art of Motorcycle Maintenance: An Inquiry into Values,Zoya,"\O\"" Is for Outlaw"""
0,254,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2276,0.0,0.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2766,0.0,0.0,0.0,0.0,7.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2977,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,7.0,0.0,0.0,0.0,0.0
4,3363,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [18]:
new_pt = pt.iloc[:, 1:]

In [19]:
from sklearn.metrics.pairwise import cosine_similarity
similarity_scores = cosine_similarity(new_pt)

In [20]:
similarity_scores.shape

(810, 810)

In [21]:
def recommend(userid):
    # Find idx of books for userid that is not yet rated, i,e rating=0
    rating_pt = pt[pt['User-ID']==userid].values.reshape(-1)[1:]
    idx_No_rating = np.where(rating_pt==0)[0]

    # Keep track of user index - to remove for weighted calculation
    user_idx = pt[pt['User-ID']==userid].index[0]

    # find similarity score vector for userid with other users
    similarity_score = cosine_similarity(new_pt)
    similarity_score_vector = similarity_score[user_idx]

    # remove userid row from new_pt
    temp_pt = new_pt[new_pt.index != user_idx]

    # remove userid's similarity value from similarity vector
    similarity_score_vector = np.delete(similarity_score_vector, user_idx)

    # multiply similiarty vector to each columns in new_pt and sum along the rows and divide by sum of similarity - stores as series
    sum_of_product = np.sum(temp_pt * similarity_score_vector.reshape(-1,1), axis=0)
    final_ratings = (sum_of_product/np.sum(similarity_score_vector))

    # reset index for series 
    final_ratings = final_ratings.reset_index()

    # obtain only the idx zero rated books
    final_ratings_nonwatched = final_ratings.iloc[idx_No_rating]
    final_ratings_nonwatched = final_ratings_nonwatched.iloc[:,-1]

    # Find top 5
    idx = np.argsort(final_ratings_nonwatched)[::-1].values
    top_5 = idx[:5]

    recommend = new_pt.columns[top_5]
    for book in recommend:
        print(book)

In [35]:
for i in range(5):
    random_user = np.random.choice(final['User-ID'].values)
    print(f'Recommendation for user: {random_user}')
    recommend(userid=random_user)
    print()

Recommendation for user: 11601
The Last Time They Met : A Novel
The Robber Bride
The Clinic (Alex Delaware Novels (Paperback))
Bridget Jones : The Edge of Reason
Like Water for Chocolate: A Novel in Monthly Installments, With Recipes, Romances, and Home Remedies

Recommendation for user: 95359
Shattered
Birthright
The Catcher in the Rye
Fingersmith
Family Album

Recommendation for user: 178181
The Poisonwood Bible
Bastard Out of Carolina
House of Sand and Fog
ANGELA'S ASHES
Girl, Interrupted

Recommendation for user: 16634
The Honk and Holler Opening Soon
The Tale of the Body Thief (Vampire Chronicles (Paperback))
The Boy Next Door
H Is for Homicide (Kinsey Millhone Mysteries (Paperback))
Dragonfly in Amber

Recommendation for user: 88733
Breath, Eyes, Memory
The Guardian
The Surgeon
A Time to Kill
The Hunt for Red October

