# Book Recommender System
<hr>

In [46]:
import numpy as np
import pandas as pd

from sklearn.metrics.pairwise import cosine_similarity

import pickle

## EDA

In [47]:
# Load the three CSV tables used throughout the notebook.
# low_memory=False makes pandas infer each column's dtype from the whole file
# rather than chunk by chunk, so a column cannot end up with mixed Python types.
# NOTE(review): the original run echoed a warning for the Books.csv read
# (presumably pandas' DtypeWarning about mixed types) — confirm it is gone.
books = pd.read_csv('data/Books.csv', low_memory=False)
ratings = pd.read_csv('data/Ratings.csv')
users = pd.read_csv('data/Users.csv')

  books = pd.read_csv('data/Books.csv')


In [48]:
# Report the (rows, columns) shape of each loaded table, in load order.
for table in (books, ratings, users):
    print(table.shape)

(271360, 8)
(1149780, 3)
(278858, 3)


In [49]:
books.head()

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...
4,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...


In [50]:
ratings.head()

Unnamed: 0,User-ID,ISBN,Book-Rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276729,052165615X,3
4,276729,0521795028,6


In [51]:
users.head()

Unnamed: 0,User-ID,Location,Age
0,1,"nyc, new york, usa",
1,2,"stockton, california, usa",18.0
2,3,"moscow, yukon territory, russia",
3,4,"porto, v.n.gaia, portugal",17.0
4,5,"farnborough, hants, united kingdom",


In [52]:
books.isnull().sum()

ISBN                   0
Book-Title             0
Book-Author            2
Year-Of-Publication    0
Publisher              2
Image-URL-S            0
Image-URL-M            0
Image-URL-L            3
dtype: int64

In [53]:
users.isnull().sum()

User-ID          0
Location         0
Age         110762
dtype: int64

In [54]:
ratings.isnull().sum()

User-ID        0
ISBN           0
Book-Rating    0
dtype: int64

In [55]:
books.duplicated().sum()

0

In [56]:
ratings.duplicated().sum()

0

In [57]:
users.duplicated().sum()

0

## Popularity Based

Recommend the top 50 books with the highest average rating among those that have more than 300 ratings.

In [58]:
# Join every rating with the metadata of the book it refers to, matching on ISBN.
books_with_ratings = pd.merge(ratings, books, on='ISBN')
books_with_ratings.shape

(1031136, 10)

The number of rows in `ratings` and `books_with_ratings` don't match.

This is because `merge` performs an inner join by default, so ratings whose ISBN is not present in `books` are dropped.

In [59]:
books_with_ratings.head()

Unnamed: 0,User-ID,ISBN,Book-Rating,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
0,276725,034545104X,0,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,http://images.amazon.com/images/P/034545104X.0...,http://images.amazon.com/images/P/034545104X.0...,http://images.amazon.com/images/P/034545104X.0...
1,276726,0155061224,5,Rites of Passage,Judith Rae,2001,Heinle,http://images.amazon.com/images/P/0155061224.0...,http://images.amazon.com/images/P/0155061224.0...,http://images.amazon.com/images/P/0155061224.0...
2,276727,0446520802,0,The Notebook,Nicholas Sparks,1996,Warner Books,http://images.amazon.com/images/P/0446520802.0...,http://images.amazon.com/images/P/0446520802.0...,http://images.amazon.com/images/P/0446520802.0...
3,276729,052165615X,3,Help!: Level 1,Philip Prowse,1999,Cambridge University Press,http://images.amazon.com/images/P/052165615X.0...,http://images.amazon.com/images/P/052165615X.0...,http://images.amazon.com/images/P/052165615X.0...
4,276729,0521795028,6,The Amsterdam Connection : Level 4 (Cambridge ...,Sue Leather,2001,Cambridge University Press,http://images.amazon.com/images/P/0521795028.0...,http://images.amazon.com/images/P/0521795028.0...,http://images.amazon.com/images/P/0521795028.0...


In [60]:
# Number of ratings each title received, as a two-column frame
# (Book-Title, Rating-Count).
count_ratings_df = (
    books_with_ratings
    .groupby("Book-Title")['Book-Rating']
    .count()
    .reset_index(name='Rating-Count')
)
count_ratings_df.head()

Unnamed: 0,Book-Title,Rating-Count
0,A Light in the Storm: The Civil War Diary of ...,4
1,Always Have Popsicles,1
2,Apple Magic (The Collector's series),1
3,"Ask Lily (Young Women of Faith: Lily Series, ...",1
4,Beyond IBM: Leadership Marketing and Finance ...,1


In [61]:
# Average rating per title, as a two-column frame (Book-Title, Mean-Rating).
# NOTE(review): Book-Rating contains 0 values and they are included in this
# mean; if 0 stands for "no explicit rating" in this dataset the averages are
# pulled down — confirm against the dataset description.
mean_ratings_df = (
    books_with_ratings
    .groupby("Book-Title")['Book-Rating']
    .mean()
    .rename('Mean-Rating')
    .reset_index()
)
mean_ratings_df.head()

Unnamed: 0,Book-Title,Mean-Rating
0,A Light in the Storm: The Civil War Diary of ...,2.25
1,Always Have Popsicles,0.0
2,Apple Magic (The Collector's series),0.0
3,"Ask Lily (Young Women of Faith: Lily Series, ...",8.0
4,Beyond IBM: Leadership Marketing and Finance ...,0.0


In [62]:
# Combine the per-title rating counts and mean ratings into a single table.
popularity_df = pd.merge(count_ratings_df, mean_ratings_df, on='Book-Title')
popularity_df.head()

Unnamed: 0,Book-Title,Rating-Count,Mean-Rating
0,A Light in the Storm: The Civil War Diary of ...,4,2.25
1,Always Have Popsicles,1,0.0
2,Apple Magic (The Collector's series),1,0.0
3,"Ask Lily (Young Women of Faith: Lily Series, ...",1,8.0
4,Beyond IBM: Leadership Marketing and Finance ...,1,0.0


In [63]:
# Keep only titles that received more than 300 ratings.
has_enough_ratings = popularity_df['Rating-Count'] > 300
popularity_df = popularity_df[has_enough_ratings]
popularity_df.head()

Unnamed: 0,Book-Title,Rating-Count,Mean-Rating
818,1st to Die: A Novel,509,3.575639
1048,2nd Chance,356,3.269663
1760,A Bend in the Road,346,3.364162
3988,A Heartbreaking Work of Staggering Genius,302,3.423841
4808,A Map of the World,327,2.492355


In [64]:
# Rank titles by mean rating, best first, and keep the first 50.
popularity_df = popularity_df.sort_values('Mean-Rating', ascending=False).iloc[:50]
popularity_df.head()

Unnamed: 0,Book-Title,Rating-Count,Mean-Rating
80434,Harry Potter and the Prisoner of Azkaban (Book 3),428,5.852804
80422,Harry Potter and the Goblet of Fire (Book 4),387,5.824289
80426,Harry Potter and the Order of the Phoenix (Boo...,347,5.501441
80414,Harry Potter and the Chamber of Secrets (Book 2),556,5.183453
187377,The Fellowship of the Ring (The Lord of the Ri...,368,4.94837


We need the name of the author and publisher.

In [65]:
# Attach author, publisher and cover URL from `books`. A title can match more
# than one row of `books`, so only the first match per title is kept.
display_columns = ['Book-Title', 'Book-Author', 'Publisher', 'Mean-Rating', 'Image-URL-L']
with_book_details = pd.merge(popularity_df, books, on='Book-Title')
popularity_df = with_book_details.drop_duplicates(subset='Book-Title')[display_columns]
popularity_df.head()

Unnamed: 0,Book-Title,Book-Author,Publisher,Mean-Rating,Image-URL-L
0,Harry Potter and the Prisoner of Azkaban (Book 3),J. K. Rowling,Scholastic,5.852804,http://images.amazon.com/images/P/0439136350.0...
3,Harry Potter and the Goblet of Fire (Book 4),J. K. Rowling,Scholastic,5.824289,http://images.amazon.com/images/P/0439139597.0...
5,Harry Potter and the Order of the Phoenix (Boo...,J. K. Rowling,Scholastic,5.501441,http://images.amazon.com/images/P/043935806X.0...
9,Harry Potter and the Chamber of Secrets (Book 2),J. K. Rowling,Scholastic,5.183453,http://images.amazon.com/images/P/0439064872.0...
12,The Fellowship of the Ring (The Lord of the Ri...,J.R.R. TOLKIEN,Del Rey,4.94837,http://images.amazon.com/images/P/0345339703.0...


## Collaborative Based Filtering

We will create a grid which stores the rating of every book given by every user.

We will consider only those users who have given more than 200 ratings and only those books with at least 50 ratings.

In [66]:
# Join each rating with its book's metadata on ISBN (merge defaults to an
# inner join, so ratings without a matching book row are dropped).
books_with_ratings = ratings.merge(books, on='ISBN')
books_with_ratings.head()

Unnamed: 0,User-ID,ISBN,Book-Rating,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
0,276725,034545104X,0,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,http://images.amazon.com/images/P/034545104X.0...,http://images.amazon.com/images/P/034545104X.0...,http://images.amazon.com/images/P/034545104X.0...
1,276726,0155061224,5,Rites of Passage,Judith Rae,2001,Heinle,http://images.amazon.com/images/P/0155061224.0...,http://images.amazon.com/images/P/0155061224.0...,http://images.amazon.com/images/P/0155061224.0...
2,276727,0446520802,0,The Notebook,Nicholas Sparks,1996,Warner Books,http://images.amazon.com/images/P/0446520802.0...,http://images.amazon.com/images/P/0446520802.0...,http://images.amazon.com/images/P/0446520802.0...
3,276729,052165615X,3,Help!: Level 1,Philip Prowse,1999,Cambridge University Press,http://images.amazon.com/images/P/052165615X.0...,http://images.amazon.com/images/P/052165615X.0...,http://images.amazon.com/images/P/052165615X.0...
4,276729,0521795028,6,The Amsterdam Connection : Level 4 (Cambridge ...,Sue Leather,2001,Cambridge University Press,http://images.amazon.com/images/P/0521795028.0...,http://images.amazon.com/images/P/0521795028.0...,http://images.amazon.com/images/P/0521795028.0...


Keep only users who have rated more than 200 books (everyone else is filtered out)

In [67]:
# Per-user flag: True when the user has more than 200 ratings in the joined table.
x = books_with_ratings.groupby('User-ID')['Book-Rating'].count().gt(200)
x

User-ID
2         False
8         False
9         False
10        False
12        False
          ...  
278846    False
278849    False
278851    False
278852    False
278854    False
Name: Book-Rating, Length: 92106, dtype: bool

In [68]:
# Keep only the User-IDs whose flag is True.
# NOTE: this rebinds `x` from a boolean Series to an Index, so this cell is not
# idempotent — re-run the previous cell before running it again.
x = x[x].index
x

Index([   254,   2276,   2766,   2977,   3363,   4017,   4385,   6251,   6323,
         6543,
       ...
       271705, 273979, 274004, 274061, 274301, 274308, 275970, 277427, 277639,
       278418],
      dtype='int64', name='User-ID', length=811)

In [69]:
# Restrict the joined ratings to rows written by the users selected in `x`.
is_selected_user = books_with_ratings['User-ID'].isin(x)
filtered_rating = books_with_ratings[is_selected_user]

Filtering out books with fewer than 50 ratings (counted among the remaining users)

In [70]:
# Titles that have at least 50 ratings within `filtered_rating`.
title_counts = filtered_rating.groupby('Book-Title')['Book-Rating'].count()
y = title_counts[title_counts >= 50].index

In [71]:
# Keep only the rows whose title passed the rating-count filter stored in `y`.
has_selected_title = filtered_rating['Book-Title'].isin(y)
filtered_rating = filtered_rating[has_selected_title]

In [72]:
# Display the rows with exact duplicates removed.
# NOTE: the result is only shown, not assigned, so `filtered_rating` itself is
# left unchanged by this cell.
filtered_rating.drop_duplicates()

Unnamed: 0,User-ID,ISBN,Book-Rating,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
1150,277427,002542730X,10,Politically Correct Bedtime Stories: Modern Ta...,James Finn Garner,1994,John Wiley &amp; Sons Inc,http://images.amazon.com/images/P/002542730X.0...,http://images.amazon.com/images/P/002542730X.0...,http://images.amazon.com/images/P/002542730X.0...
1163,277427,0060930535,0,The Poisonwood Bible: A Novel,Barbara Kingsolver,1999,Perennial,http://images.amazon.com/images/P/0060930535.0...,http://images.amazon.com/images/P/0060930535.0...,http://images.amazon.com/images/P/0060930535.0...
1165,277427,0060934417,0,Bel Canto: A Novel,Ann Patchett,2002,Perennial,http://images.amazon.com/images/P/0060934417.0...,http://images.amazon.com/images/P/0060934417.0...,http://images.amazon.com/images/P/0060934417.0...
1168,277427,0061009059,9,One for the Money (Stephanie Plum Novels (Pape...,Janet Evanovich,1995,HarperTorch,http://images.amazon.com/images/P/0061009059.0...,http://images.amazon.com/images/P/0061009059.0...,http://images.amazon.com/images/P/0061009059.0...
1174,277427,006440188X,0,The Secret Garden,Frances Hodgson Burnett,1998,HarperTrophy,http://images.amazon.com/images/P/006440188X.0...,http://images.amazon.com/images/P/006440188X.0...,http://images.amazon.com/images/P/006440188X.0...
...,...,...,...,...,...,...,...,...,...,...
1029196,275970,1400031354,0,Tears of the Giraffe (No.1 Ladies Detective Ag...,Alexander McCall Smith,2002,Anchor,http://images.amazon.com/images/P/1400031354.0...,http://images.amazon.com/images/P/1400031354.0...,http://images.amazon.com/images/P/1400031354.0...
1029197,275970,1400031362,0,Morality for Beautiful Girls (No.1 Ladies Dete...,Alexander McCall Smith,2002,Anchor,http://images.amazon.com/images/P/1400031362.0...,http://images.amazon.com/images/P/1400031362.0...,http://images.amazon.com/images/P/1400031362.0...
1029270,275970,1573229725,0,Fingersmith,Sarah Waters,2002,Riverhead Books,http://images.amazon.com/images/P/1573229725.0...,http://images.amazon.com/images/P/1573229725.0...,http://images.amazon.com/images/P/1573229725.0...
1029309,275970,1586210661,9,Me Talk Pretty One Day,David Sedaris,2001,Time Warner Audio Major,http://images.amazon.com/images/P/1586210661.0...,http://images.amazon.com/images/P/1586210661.0...,http://images.amazon.com/images/P/1586210661.0...


Convert df to a pivot table

In [73]:
# Build the title-by-user rating grid: one row per book title, one column per
# user, cells hold the rating. pivot_table aggregates with the mean by default,
# so several ratings by one user for the same title collapse into their
# average; (title, user) pairs without a rating are NaN.
pt = filtered_rating.pivot_table(index='Book-Title', columns='User-ID', values='Book-Rating')

In [74]:
pt.shape

(706, 810)

In [75]:
# Replace missing (title, user) pairs with 0 so every row is a dense numeric vector.
pt = pt.fillna(0)

Finding cosine similarity

In [76]:
# Pairwise cosine similarity between the rows of `pt` (one row per book title),
# giving a square title-by-title matrix in the same order as `pt.index`.
sim_score = cosine_similarity(pt)
sim_score

array([[1.        , 0.10255025, 0.01220856, ..., 0.12110367, 0.07347567,
        0.04316046],
       [0.10255025, 1.        , 0.2364573 , ..., 0.07446129, 0.16773875,
        0.14263397],
       [0.01220856, 0.2364573 , 1.        , ..., 0.04558758, 0.04938579,
        0.10796119],
       ...,
       [0.12110367, 0.07446129, 0.04558758, ..., 1.        , 0.07085128,
        0.0196177 ],
       [0.07347567, 0.16773875, 0.04938579, ..., 0.07085128, 1.        ,
        0.10602962],
       [0.04316046, 0.14263397, 0.10796119, ..., 0.0196177 , 0.10602962,
        1.        ]])

In [77]:
sim_score.shape

(706, 706)

In [78]:
def recommend(book_title, top_n=5):
    """Return the titles most similar to ``book_title``.

    Looks up the position of ``book_title`` in the global pivot table ``pt``
    and ranks every *other* title by the matching row of the global
    ``sim_score`` matrix, highest score first.

    Args:
        book_title: Title to look up; must equal an entry of ``pt.index``.
        top_n: Maximum number of titles to return. Defaults to 5, the size
            that was previously hard-coded.

    Returns:
        list: Up to ``top_n`` titles taken from ``pt.index``, ordered by
        descending similarity. Equal scores keep their ``pt.index`` order.

    Raises:
        IndexError: If ``book_title`` is not present in ``pt.index``.
        ValueError: If ``top_n`` is negative.
    """
    if top_n < 0:
        raise ValueError(f"top_n must be non-negative, got {top_n}")

    matches = np.flatnonzero(pt.index == book_title)
    if matches.size == 0:
        # Same exception type as the bare [0][0] lookup raised, with a usable message.
        raise IndexError(f"book title {book_title!r} not found in the pivot table")
    book_idx = matches[0]

    scores = sim_score[book_idx]
    # Stable sort on the negated scores: descending similarity, ties in index order.
    ranked = np.argsort(-scores, kind='stable')
    # Drop the queried book by position instead of assuming it sorts first; that
    # assumption breaks when its self-similarity is 0 (all-zero row) or when an
    # earlier row ties with it.
    ranked = ranked[ranked != book_idx][:top_n]
    return [pt.index[i] for i in ranked]

In [79]:
recommend("1984")

['Animal Farm',
 "The Handmaid's Tale",
 'Brave New World',
 'The Vampire Lestat (Vampire Chronicles, Book II)',
 'The Hours : A Novel']

In [80]:
books['Image-URL-L'].head()

0    http://images.amazon.com/images/P/0195153448.0...
1    http://images.amazon.com/images/P/0002005018.0...
2    http://images.amazon.com/images/P/0060973129.0...
3    http://images.amazon.com/images/P/0374157065.0...
4    http://images.amazon.com/images/P/0393045218.0...
Name: Image-URL-L, dtype: object

In [81]:
ratings['Book-Rating'].max()

10

Exporting popularity_df

In [82]:
# Serialize the popularity table; the context manager closes the file handle
# (and flushes it to disk) as soon as the dump finishes.
with open('data/popular.pkl', 'wb') as popular_file:
    pickle.dump(popularity_df, popular_file)

In [83]:
popularity_df.columns

Index(['Book-Title', 'Book-Author', 'Publisher', 'Mean-Rating', 'Image-URL-L'], dtype='object')

In [84]:
print(popularity_df.head())

                                           Book-Title     Book-Author  \
0   Harry Potter and the Prisoner of Azkaban (Book 3)   J. K. Rowling   
3        Harry Potter and the Goblet of Fire (Book 4)   J. K. Rowling   
5   Harry Potter and the Order of the Phoenix (Boo...   J. K. Rowling   
9    Harry Potter and the Chamber of Secrets (Book 2)   J. K. Rowling   
12  The Fellowship of the Ring (The Lord of the Ri...  J.R.R. TOLKIEN   

     Publisher  Mean-Rating                                        Image-URL-L  
0   Scholastic     5.852804  http://images.amazon.com/images/P/0439136350.0...  
3   Scholastic     5.824289  http://images.amazon.com/images/P/0439139597.0...  
5   Scholastic     5.501441  http://images.amazon.com/images/P/043935806X.0...  
9   Scholastic     5.183453  http://images.amazon.com/images/P/0439064872.0...  
12     Del Rey     4.948370  http://images.amazon.com/images/P/0345339703.0...  


In [85]:
popularity_df['Mean-Rating'].max()

5.852803738317757

In [86]:
# rounding off the mean rating to 2 decimal places
# Round the mean rating to 2 decimal places.
popularity_df = popularity_df.assign(**{'Mean-Rating': popularity_df['Mean-Rating'].round(2)})

In [87]:
# Re-export the popularity table (overwrites data/popular.pkl); the context
# manager closes the file handle as soon as the dump finishes.
with open('data/popular.pkl', 'wb') as popular_file:
    pickle.dump(popularity_df, popular_file)

Exporting other data

In [92]:
# Serialize the pivot table, the similarity matrix and the books table.
# Each file is opened in a context manager so its handle is closed (and the
# data flushed to disk) right after the dump instead of being left open.
for export_obj, export_path in (
    (pt, 'data/pivot_table.pkl'),
    (sim_score, 'data/sim_score.pkl'),
    (books, 'data/books.pkl'),
):
    with open(export_path, 'wb') as export_file:
        pickle.dump(export_obj, export_file)

In [89]:
books.nunique()

ISBN                   271360
Book-Title             242135
Book-Author            102022
Year-Of-Publication       202
Publisher               16807
Image-URL-S            271044
Image-URL-M            271044
Image-URL-L            271041
dtype: int64

In [91]:
books['Book-Title'].drop_duplicates()

0                                       Classical Mythology
1                                              Clara Callan
2                                      Decision in Normandy
3         Flu: The Story of the Great Influenza Pandemic...
4                                    The Mummies of Urumchi
                                ...                        
271354        Flashpoints: Promise and Peril in a New World
271356                              From One to One Hundred
271357    Lily Dale : The True Story of the Town that Ta...
271358                          Republic (World's Classics)
271359    A Guided Tour of Rene Descartes' Meditations o...
Name: Book-Title, Length: 242135, dtype: object