# Top N Recommendation via Nearest Neighbor Model

## 1. prepare dataset

In [1]:
import pandas as pd
# NOTE: the CSV files must be converted to UTF-8 explicitly before loading, e.g. by re-saving them as UTF-8 from the notebook editor
# dataset source: http://www2.informatik.uni-freiburg.de/~cziegler/BX/

In [12]:
# import book data: one row per book, indexed by isbn, keeping only title and author
bookFile='./data/BX-Books.csv'
books=pd.read_csv(
    bookFile,
    sep=";",
    header=0,                         # the file's own header row is replaced by `names`
    names=["isbn","title","author"],
    usecols=[0,1,2],                  # only the first three columns are loaded
    index_col=0,                      # isbn becomes the index
    # rows that cannot be parsed are skipped instead of raising.
    # NOTE(review): error_bad_lines was deprecated in pandas 1.3 and removed in 2.0;
    # on newer pandas the equivalent is on_bad_lines="skip".
    error_bad_lines=False,
)
books.head()

Unnamed: 0_level_0,title,author
isbn,Unnamed: 1_level_1,Unnamed: 2_level_1
195153448,Classical Mythology,Mark P. O. Morford
2005018,Clara Callan,Richard Bruce Wright
60973129,Decision in Normandy,Carlo D'Este
374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata
393045218,The Mummies of Urumchi,E. J. W. Barber


In [13]:
# import rating data: one row per (user, isbn, rating)
dataFile='./data/BX-Book-Ratings.csv'
data=pd.read_csv(
    dataFile,
    sep=";",
    header=0,                         # the file's own header row is replaced by `names`
    names=["user","isbn","rating"],
)
data.head()

Unnamed: 0,user,isbn,rating
0,276725,034545104X,0.0
1,276726,0155061224,5.0
2,276727,0446520802,0.0
3,276729,052165615X,3.0
4,276729,0521795028,6.0


### Ensure every rated ISBN exists in the books table

In [16]:
# keep only the ratings whose isbn has an entry in the books table
# (books is indexed by isbn, so membership in books.index is the validity check)
data = data.loc[
    data["isbn"].isin(books.index)
]

### Setup a function to find book metadata

In [14]:
def bookMeta(isbn):
    """Look up a book in the global ``books`` frame.

    isbn: index label of the book in ``books``.
    Returns the tuple ``(title, author)`` read from that row.
    Raises KeyError when the isbn is not in ``books``.
    """
    return tuple(books.at[isbn, field] for field in ("title", "author"))
# testing: look up the metadata of a single isbn
bookMeta("0671027360")

('Angels &amp; Demons', 'Dan Brown')

### Set up a function to find a user's favorite books

In [17]:
def favBooks(user,N):
    """Return the N highest-rated books of a user, annotated with book metadata.

    user: user id as stored in the ``user`` column of the global ``data`` frame.
    N: maximum number of rows to return.

    Returns a new DataFrame holding the user's rows of ``data`` sorted by
    ``rating`` (highest first) and cut to N rows, plus a ``title`` column that
    contains the ``(title, author)`` tuple returned by ``bookMeta`` for each
    isbn. The frame is empty when the user has no rows in ``data``.
    """
    userRatings = data[data["user"]==user]
    topRatings = userRatings.sort_values("rating", ascending=False).head(N)
    # assign() builds a new frame instead of writing a column into a slice of
    # `data`, which is what made the in-place version emit SettingWithCopyWarning.
    return topRatings.assign(title=topRatings["isbn"].apply(bookMeta))
# testing: the five highest-rated books of one user
favBooks(204622,5)

Unnamed: 0,user,isbn,rating,title
844955,204622,0967560500,10.0,"(Natural Hormonal Enhancement, Rob Faigin)"
844935,204622,0671027360,10.0,"(Angels &amp; Demons, Dan Brown)"
844926,204622,0385504209,10.0,"(The Da Vinci Code, Dan Brown)"
844958,204622,097173660X,9.0,"(Life After School Explained, Cap &amp; Compass)"
844920,204622,0060935464,9.0,"(To Kill a Mockingbird, Harper Lee)"


## 2. Creating the Rating Matrix

In [18]:
# (rows, columns) of the ratings table at this point in the notebook
data.shape

(1031175, 3)

### deciding the size of the matrix

#### isbn size

In [19]:
# number of rating rows per isbn; value_counts() lists the most-rated isbn first
usersPerISBN = data["isbn"].value_counts()
usersPerISBN.head(10)

0971880107    2502
0316666343    1295
0385504209     883
0060928336     732
0312195516     723
044023722X     647
0142001740     615
067976402X     614
0671027360     586
0446672211     585
Name: isbn, dtype: int64

In [20]:
# number of distinct isbn values present in the ratings
usersPerISBN.shape

(270170,)

#### user size

In [21]:
# number of rating rows per user; value_counts() lists the most active user first
ISBNsPerUser = data["user"].value_counts()
ISBNsPerUser.head(10)

11676     11144
198711     6456
153662     5814
98391      5779
35859      5646
212898     4290
278418     3996
76352      3329
110973     2971
235105     2943
Name: user, dtype: int64

In [22]:
# number of distinct users present in the ratings
ISBNsPerUser.shape

(92107,)

### Reduce sparsity: keep only books and users with more than 10 ratings

In [23]:
# keep a rating only when its isbn has more than 10 ratings AND its user has more
# than 10 ratings; both counts were taken before this filter, so the two
# conditions are independent and can be applied as one mask
data = data[
    data["isbn"].isin(usersPerISBN[usersPerISBN>10].index)    # isbn is the index of usersPerISBN
    & data["user"].isin(ISBNsPerUser[ISBNsPerUser>10].index)  # user is the index of ISBNsPerUser
]

### create the rating matrix

In [24]:
# one row per user, one column per isbn; cells without a rating are NaN
# (pivot_table averages the ratings if a user/isbn pair occurs more than once)
userItemRatingMatrix=data.pivot_table(
    values="rating",
    index="user",
    columns="isbn",
)
userItemRatingMatrix.head()

isbn,0002005018,0002251760,0002259834,0002558122,0006480764,000648302X,0006485200,000649840X,000651202X,0006512062,...,8845906884,8845915611,8878188212,8885989403,9074336329,9074336469,950491036X,9681500830,9681500954,9871138016
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
8,5.0,,,,,,,,,,...,,,,,,,,,,
99,,,,,,,,,,,...,,,,,,,,,,
242,,,,,,,,,,,...,,,,,,,,,,
243,,,,,,,,,,,...,,,,,,,,,,
254,,,,,,,,,,,...,,,,,,,,,,


In [25]:
# (number of users, number of isbn columns) in the rating matrix
userItemRatingMatrix.shape

(10706, 15451)

## 3. Find the K Nearest Neighbors

### create distance calculation function

In [35]:
# ratings are discrete values, so two users can be compared with the Hamming distance
# sample pair of user ids used to try out distance() below
user1 = 204622
user2 = 255489

In [45]:
import numpy as np
from scipy.spatial.distance import hamming
def distance(user1,user2):
    """Hamming distance between the rating rows of two users.

    user1, user2: index labels (user ids) of the global ``userItemRatingMatrix``.

    Returns the fraction of book columns in which the two rows differ, as
    computed by ``scipy.spatial.distance.hamming``. Missing ratings are NaN and
    NaN never compares equal (not even to NaN), so a column counts as a match
    only when both users gave that book the same rating.
    Returns NaN when either user is not a row of the matrix.
    """
    try:
        # .loc reads the row directly; transposing the whole matrix for every
        # lookup was equivalent but far more expensive.
        user1Ratings = userItemRatingMatrix.loc[user1]
        user2Ratings = userItemRatingMatrix.loc[user2]
    except KeyError:
        # Unknown user. Only this case is mapped to NaN: a bare ``except`` would
        # also swallow KeyboardInterrupt while scanning all users.
        # np.nan (lower case) is used because the np.NaN alias no longer exists
        # in NumPy 2.0.
        return np.nan
    return hamming(user1Ratings,user2Ratings)
# testing: distance between the two sample users defined earlier
distance(user1,user2)

0.9999352792699502

### find the k nearest neighbors

In [57]:
# active user for the nearest-neighbour test below
user = 204622

In [58]:
def nearestNeighbors(user,K=10):
    """Return the K users closest to ``user`` according to ``distance``.

    user: id of the active user; it is excluded from the candidates.
    K: number of neighbours to return (default 10).

    Returns a Series named ``user`` with at most K user ids, closest first.
    Its index holds each user's position in ``userItemRatingMatrix.index``.
    Users whose distance is NaN sort last.
    """
    # one row per user of the rating matrix, numbered by position
    allUsers = pd.DataFrame({"user": userItemRatingMatrix.index.values})
    others = allUsers.loc[allUsers["user"] != user]  # remove the active user
    # distance between the active user and every other user
    distances = others["user"].apply(lambda other: distance(user, other))
    # assign() returns a new frame, so no column is written into the filtered
    # slice (the in-place assignment emitted SettingWithCopyWarning).
    ranked = others.assign(distance=distances).sort_values("distance", ascending=True)
    return ranked["user"].head(K)  # pick the top K users
# testing: nearest neighbours of the active user, using the default K
KnearestUsers = nearestNeighbors(user)
KnearestUsers

3201     82893
3368     87555
2624     68555
1813     48046
5401    140036
7584    198711
565      16795
8866    232131
239       7346
9693    251422
Name: user, dtype: int64

## 4. Find the top N recommendations

In [66]:
def topN(user,N=3,K=10):
    """Recommend the N books the user's nearest neighbours rated highest.

    user: id of the active user; must be a row of ``userItemRatingMatrix``.
    N: number of recommendations to return (default 3).
    K: number of nearest neighbours whose ratings are averaged (default 10,
       the value previously implied by ``nearestNeighbors``).

    Returns a Series of ``(title, author)`` tuples from ``bookMeta``, ordered
    by the neighbours' average rating, highest first. Books the user has
    already rated and books none of the neighbours rated are excluded.
    Raises KeyError when the user is not in the rating matrix.
    """
    # Books the active user already rated (isbn labels only). Looked up first so
    # an unknown user fails immediately instead of after the neighbour search.
    booksAlreadyRead = userItemRatingMatrix.loc[user].dropna().index
    # the K users closest to the active user
    KnearestUsers = nearestNeighbors(user, K=K)
    # rating rows of those neighbours only
    NNRatings = userItemRatingMatrix[userItemRatingMatrix.index.isin(KnearestUsers)]
    # Column-wise average that ignores NaN; books without any neighbour rating
    # average to NaN and are dropped. mean() replaces apply(np.nanmean), which
    # called numpy once per column and warned on every all-NaN column.
    avgRating = NNRatings.mean().dropna()
    # remove the books the user has already rated
    avgRating = avgRating[~avgRating.index.isin(booksAlreadyRead)]
    # isbn of the N best remaining averages
    topNISBNs = avgRating.sort_values(ascending=False).index[:N]
    return pd.Series(topNISBNs).apply(bookMeta)
# for comparison with the recommendations: the user's own 10 highest-rated books
favBooks(204813,10)

Unnamed: 0,user,isbn,rating,title
845417,204813,399149848,10.0,"(Birthright, Nora Roberts)"
845407,204813,385504209,10.0,"(The Da Vinci Code, Dan Brown)"
845382,204813,373218036,10.0,"(Truly, Madly Manhattan, Nora Roberts)"
845359,204813,142001805,10.0,"(The Eyre Affair: A Novel, Jasper Fforde)"
845431,204813,446527793,10.0,"(The Guardian, Nicholas Sparks)"
845416,204813,399149392,10.0,"(Chesapeake Blue (Quinn Brothers (Hardcover)),..."
845432,204813,446531332,9.0,"(Nights in Rodanthe, Nicholas Sparks)"
845434,204813,446606243,9.0,"(The Tenth Justice, Brad Meltzer)"
845451,204813,671027360,9.0,"(Angels &amp; Demons, Dan Brown)"
845433,204813,446532452,9.0,"(The Wedding, Nicholas Sparks)"


In [67]:
# NOTE: topN reads the global userItemRatingMatrix, so the rating matrix must be
# built before this cell is run
topN(204813,10)

  labels=labels)


0    (Waiting For Nick (Silhouette Special Edition)...
1           (Wringer (Trophy Newbery), Jerry Spinelli)
2    (The Star Wars Trilogy: Star Wars, the Empire ...
3          (One, Two, Buckle My Shoe, Agatha Christie)
4                          (On the Road, Jack Kerouac)
5                 (Dead Poets Society, N.H. Kleinbaum)
6     (Go Ask Alice (Avon/Flare Book), James Jennings)
7                        (Carolina Moon, Nora Roberts)
8    (Illusions: The Adventures of a Reluctant Mess...
9    (You Just Don't Duct Tape a Baby!: True Tales ...
Name: isbn, dtype: object