# <Center> User Based Recommender System

## Load Libraries

In [1]:
# Basic libraries
import numpy as np
import pandas as pd

# matplotlib
import matplotlib.pyplot as plt

# Scipy
from scipy.sparse import csr_matrix

# sklearn
from sklearn.neighbors import NearestNeighbors



## Load Data

In [2]:
books_dataset   = pd.read_csv("BX-Books.csv", sep=';', encoding='latin-1', on_bad_lines='skip', low_memory=False)
ratings_dataset = pd.read_csv("BX-Book-Ratings.csv", sep=';', encoding='latin-1', on_bad_lines='skip', low_memory=False)
users_dataset = pd.read_csv("BX-Users.csv", sep=';', encoding='latin-1', on_bad_lines='skip', low_memory=False)

In [3]:
books_dataset.head(5)

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...
4,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...


In [4]:
ratings_dataset.head(5)

Unnamed: 0,User-ID,ISBN,Book-Rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276729,052165615X,3
4,276729,0521795028,6


In [5]:
users_dataset.head(5)

Unnamed: 0,User-ID,Location,Age
0,1,"nyc, new york, usa",
1,2,"stockton, california, usa",18.0
2,3,"moscow, yukon territory, russia",
3,4,"porto, v.n.gaia, portugal",17.0
4,5,"farnborough, hants, united kingdom",


## Preprocessing books

In [6]:
books = books_dataset[["ISBN", "Book-Title", "Book-Author", "Year-Of-Publication", "Publisher"]]
books = books.rename(columns={"ISBN": "isbn", "Book-Title": "title", "Book-Author": "author", "Year-Of-Publication": "year", "Publisher": "publisher"})
books.head(5)

Unnamed: 0,isbn,title,author,year,publisher
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux
4,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company


## Preprocessing ratings

In [7]:
ratings = ratings_dataset.rename(columns={"User-ID": "user_id", "ISBN": "isbn", "Book-Rating": "rating"})
ratings.head(5)

Unnamed: 0,user_id,isbn,rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276729,052165615X,3
4,276729,0521795028,6


## Preprocessing users

In [8]:
users = users_dataset.rename(columns={"User-ID": "user_id", "Location": "location", "Age": "age"})
users.shape

(278858, 3)

## Filtering ratings where a user gave 50+ ratings

In [9]:
user_freq = ratings["user_id"].value_counts() >= 50
user_ids = user_freq[user_freq].index

rating_over_50 = ratings[ratings["user_id"].isin(user_ids)]
rating_over_50.head(10)

Unnamed: 0,user_id,isbn,rating
173,276847,446364193,0
174,276847,3257200552,5
175,276847,3379015180,0
176,276847,3404145909,8
177,276847,3404148576,8
178,276847,3404921178,7
179,276847,3423071516,10
180,276847,3423204885,0
181,276847,3423205806,0
182,276847,3426029553,8


In [10]:
rating_over_50.shape

(768472, 3)

## Merging Datasets

### Merging books with rating subset with books over 50 ratings

In [11]:
book_rating_data = rating_over_50.merge(books, on="isbn")
book_rating_data.shape

(703434, 7)

In [12]:
book_rating_data.head()

Unnamed: 0,user_id,isbn,rating,title,author,year,publisher
0,276847,446364193,0,Along Came a Spider (Alex Cross Novels),James Patterson,1993,Warner Books
1,278418,446364193,0,Along Came a Spider (Alex Cross Novels),James Patterson,1993,Warner Books
2,5483,446364193,0,Along Came a Spider (Alex Cross Novels),James Patterson,1993,Warner Books
3,7346,446364193,0,Along Came a Spider (Alex Cross Novels),James Patterson,1993,Warner Books
4,8362,446364193,0,Along Came a Spider (Alex Cross Novels),James Patterson,1993,Warner Books


### Find the rating count of each book

In [13]:
rating_numbers = book_rating_data.groupby("isbn")["rating"].count().reset_index()
rating_numbers.rename(columns= {'rating':'number_of_ratings'}, inplace=True)
rating_numbers.head()

Unnamed: 0,isbn,number_of_ratings
0,0000913154,1
1,0001010565,2
2,0001046438,1
3,000104687X,1
4,0001047213,1


### Merge rating counts found above with already merged Books and ratings dataset

In [14]:
rating_count_dataset = book_rating_data.merge(rating_numbers, on="isbn")
rating_count_dataset.shape

(703434, 8)

### Getting the ratings with count over 50

In [15]:
ratings_with_count_df = rating_count_dataset[rating_count_dataset["number_of_ratings"] >= 50]
ratings_with_count_df.shape

(101510, 8)

In [16]:
ratings_with_count_df = ratings_with_count_df.drop_duplicates(["user_id", "title"])
ratings_with_count_df

Unnamed: 0,user_id,isbn,rating,title,author,year,publisher,number_of_ratings
0,276847,0446364193,0,Along Came a Spider (Alex Cross Novels),James Patterson,1993,Warner Books,193
1,278418,0446364193,0,Along Came a Spider (Alex Cross Novels),James Patterson,1993,Warner Books,193
2,5483,0446364193,0,Along Came a Spider (Alex Cross Novels),James Patterson,1993,Warner Books,193
3,7346,0446364193,0,Along Came a Spider (Alex Cross Novels),James Patterson,1993,Warner Books,193
4,8362,0446364193,0,Along Came a Spider (Alex Cross Novels),James Patterson,1993,Warner Books,193
...,...,...,...,...,...,...,...,...
385885,245963,0451166582,0,The Eyes of the Dragon,Stephen King,2001,Signet Book,56
385886,259901,0451166582,10,The Eyes of the Dragon,Stephen King,2001,Signet Book,56
385887,260944,0451166582,10,The Eyes of the Dragon,Stephen King,2001,Signet Book,56
385888,264317,0451166582,0,The Eyes of the Dragon,Stephen King,2001,Signet Book,56


### Get those books with rating count of over 100

In [17]:
ratings_over_100 = ratings_with_count_df[ratings_with_count_df["number_of_ratings"] > 100]

In [18]:
ratings_pivot_table = ratings_over_100.pivot_table(columns='user_id', index='title', values="rating")
ratings_pivot_table.fillna(0, inplace=True)
ratings_pivot_table

user_id,243,254,507,626,638,741,882,929,1025,1211,...,277928,277965,278026,278137,278144,278188,278418,278582,278633,278843
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1984,0.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1st to Die: A Novel,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2nd Chance,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4 Blondes,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A Bend in the Road,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
White Oleander : A Novel (Oprah's Book Club),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
White Teeth: A Novel,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Wicked: The Life and Times of the Wicked Witch of the West,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Wild Animus,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [19]:
rating_sparse = csr_matrix(ratings_pivot_table)
rating_sparse

<289x3055 sparse matrix of type '<class 'numpy.float64'>'
	with 14697 stored elements in Compressed Sparse Row format>

## Fitting the model

In [20]:
knn_model = NearestNeighbors()
knn_model.fit(rating_sparse)

## Get the top 10 recommendations

In [22]:
distances, suggestions = knn_model.kneighbors(ratings_pivot_table.iloc[200, :].values.reshape(1, -1), n_neighbors=10)

for i in range(10):
  print(ratings_pivot_table.index[suggestions[0][i]])

The Da Vinci Code
The Sum of All Fears (Jack Ryan Novels)
The Burden of Proof
Slow Waltz in Cedar Bend
Under the Tuscan Sun
4 Blondes
Three Junes
The Loop
Songs in Ordinary Time (Oprah's Book Club (Paperback))
I'll Be Seeing You
