In [1]:
pip install numpy

Note: you may need to restart the kernel to use updated packages.


In [2]:
pip install pandas

Note: you may need to restart the kernel to use updated packages.


In [3]:
import numpy as np
import pandas as pd

In [5]:
books = pd.read_csv('https://yudiantosujana.com/files/books/books.csv', low_memory=False)
users = pd.read_csv('https://yudiantosujana.com/files/books/users.csv')
ratings = pd.read_csv('https://yudiantosujana.com/files/books/ratings.csv')

In [6]:
books['Image-URL-M'][1]

'http://images.amazon.com/images/P/0002005018.01.MZZZZZZZ.jpg'

In [7]:
users.head()

Unnamed: 0.1,Unnamed: 0,User-ID,Location,Age
0,0,1,"nyc, new york, usa",
1,1,2,"stockton, california, usa",18.0
2,2,3,"moscow, yukon territory, russia",
3,3,4,"porto, v.n.gaia, portugal",17.0
4,4,5,"farnborough, hants, united kingdom",


In [8]:
ratings.head()

Unnamed: 0.1,Unnamed: 0,User-ID,ISBN,Book-Rating
0,0,276725,034545104X,0
1,1,276726,0155061224,5
2,2,276727,0446520802,0
3,3,276729,052165615X,3
4,4,276729,0521795028,6


In [9]:
print(books.shape)
print(ratings.shape)
print(users.shape)

(271360, 9)
(1149780, 4)
(278858, 4)


In [10]:
books.isnull().sum()

Unnamed: 0             0
ISBN                   0
Book-Title             0
Book-Author            2
Year-Of-Publication    0
Publisher              2
Image-URL-S            0
Image-URL-M            0
Image-URL-L            3
dtype: int64

In [11]:
users.isnull().sum()

Unnamed: 0         0
User-ID            0
Location           0
Age           110762
dtype: int64

In [12]:
ratings.isnull().sum()

Unnamed: 0     0
User-ID        0
ISBN           0
Book-Rating    0
dtype: int64

In [13]:
books.duplicated().sum()

0

In [14]:
ratings.duplicated().sum()

0

In [15]:
users.duplicated().sum()

0

## Popularity Based Recommender System

In [16]:
ratings_with_name = ratings.merge(books,on='ISBN')

In [17]:
num_rating_df = ratings_with_name.groupby('Book-Title').count()['Book-Rating'].reset_index()
num_rating_df.rename(columns={'Book-Rating':'num_ratings'},inplace=True)
num_rating_df

Unnamed: 0,Book-Title,num_ratings
0,A Light in the Storm: The Civil War Diary of ...,4
1,Always Have Popsicles,1
2,Apple Magic (The Collector's series),1
3,"Ask Lily (Young Women of Faith: Lily Series, ...",1
4,Beyond IBM: Leadership Marketing and Finance ...,1
...,...,...
241066,Ã?Â?lpiraten.,2
241067,Ã?Â?rger mit Produkt X. Roman.,4
241068,Ã?Â?sterlich leben.,1
241069,Ã?Â?stlich der Berge.,3


In [22]:
avg_rating_df = ratings_with_name.groupby('Book-Title').how()['Book-Rating'].reset_index()
avg_rating_df.rename(columns={'Book-Rating':'avg_ratings'},inplace=True)
avg_rating_df

AttributeError: 'DataFrameGroupBy' object has no attribute 'how'

In [None]:
popular_df = num_rating_df.merge(avg_rating_df,on='Book-Title')
popular_df

In [None]:
popular_df = popular_df[popular_df['num_ratings']>=250].sort_values('avg_rating',ascending=False).head(50)

In [None]:
popular_df = popular_df.merge(books,on='Book-Title').drop_duplicates('Book-Title')[['Book-Title','Book-Author','Image-URL-M','num_ratings','avg_rating']]

In [None]:
popular_df['Image-URL-M'][0]

## Collaborative Filtering Based Recommender System

In [None]:
x = ratings_with_name.groupby('User-ID').count()['Book-Rating'] > 200
padhe_likhe_users = x[x].index

In [None]:
filtered_rating = ratings_with_name[ratings_with_name['User-ID'].isin(padhe_likhe_users)]

In [None]:
y = filtered_rating.groupby('Book-Title').count()['Book-Rating']>=50
famous_books = y[y].index

In [None]:
final_ratings = filtered_rating[filtered_rating['Book-Title'].isin(famous_books)]

In [None]:
pt = final_ratings.pivot_table(index='Book-Title',columns='User-ID',values='Book-Rating')

In [None]:
pt.fillna(0,inplace=True)

In [None]:
pt

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
similarity_scores = cosine_similarity(pt)

In [None]:
similarity_scores.shape

In [None]:
def recommend(book_name):
    # index fetch
    index = np.where(pt.index==book_name)[0][0]
    similar_items = sorted(list(enumerate(similarity_scores[index])),key=lambda x:x[1],reverse=True)[1:5]
    
    data = []
    for i in similar_items:
        item = []
        temp_df = books[books['Book-Title'] == pt.index[i[0]]]
        item.extend(list(temp_df.drop_duplicates('Book-Title')['Book-Title'].values))
        item.extend(list(temp_df.drop_duplicates('Book-Title')['Book-Author'].values))
        item.extend(list(temp_df.drop_duplicates('Book-Title')['Image-URL-M'].values))
        
        data.append(item)
    
    return data

In [None]:
recommend('1984')

In [None]:
pt.index[545]

In [None]:
import pickle
pickle.dump(popular_df,open('popular.pkl','wb'))

In [None]:
books.drop_duplicates('Book-Title')

In [None]:
pickle.dump(pt,open('pt.pkl','wb'))
pickle.dump(books,open('books.pkl','wb'))
pickle.dump(similarity_scores,open('similarity_scores.pkl','wb'))