In [None]:
'''
Collaborative filtering: Choose top N books to recommend to a user


Process:
    1. Find K nearest neighbors of a user
    2. Fill in unrated products by taking a weighted average of nearest neighbors that have rated the product (more similar = more weight)
    3. Sort unrated products by their estimated ratings in descending order
    4. Pick top N

Workflow:
    1. Set up the data
    2. Construct a rating matrix
    3. Find K nearest neighbors
    4. Find top N recommendations

Distance metrics
    - Euclidean distance
    - Correlation distance = 1 - correlation
    - Hamming distance = fraction of positions at which two users' ratings disagree (0 = all match, 1 = none match)


Data source: http://www2.informatik.uni-freiburg.de/~cziegler/BX/
'''

In [34]:
import pandas as pd
# Load the ratings file: semicolon-separated, with the header row replaced by
# the short column names used throughout the notebook.
dataFile = 'data/BX-Book-Ratings.csv'
data = pd.read_csv(
    dataFile,
    sep=';',
    header=0,
    names=["user", "isbn", "rating"],
    # 'ANSI' is only accepted as a codec name on Windows (alias of mbcs) and
    # raises LookupError elsewhere. latin-1 is portable and maps every byte
    # to a character, so decoding cannot fail.
    encoding='latin-1',
)
data.head() # view top 5 rows

Unnamed: 0,user,isbn,rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276729,052165615X,3
4,276729,0521795028,6


In [29]:
# Load the book catalogue, keeping only the first three columns
# (ISBN, title, author) and indexing the table by ISBN.
bookFile = 'data/BX-Books.csv'
books = pd.read_csv(
    bookFile,
    sep=';',
    header=0,
    names=["isbn", "title", "author"],
    usecols=[0, 1, 2],
    index_col=0,
    # 'ANSI' is only accepted as a codec name on Windows (alias of mbcs);
    # latin-1 is portable and maps every byte to a character.
    encoding='latin-1',
    # error_bad_lines=False was removed in pandas 2.0. on_bad_lines='warn'
    # (pandas >= 1.3) keeps the same behaviour: malformed rows are skipped
    # and reported instead of aborting the load.
    on_bad_lines='warn',
)
books.head()

Unnamed: 0_level_0,title,author
isbn,Unnamed: 1_level_1,Unnamed: 2_level_1
195153448,Classical Mythology,Mark P. O. Morford
2005018,Clara Callan,Richard Bruce Wright
60973129,Decision in Normandy,Carlo D'Este
374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata
393045218,The Mummies of Urumchi,E. J. W. Barber


In [35]:
# Keep only the ratings whose ISBN is present in the index of `books`;
# the shapes before and after are reported below.
original_size = data.shape
data = data.loc[data["isbn"].isin(books.index)]
new_size = data.shape

print(
    'Original data shape: {}'.format(original_size),
    'Data shape after removing inconsistencies: {}'.format(new_size),
    sep='\n',
)

Original data shape: (1149780, 3)
Data shape after removing inconsistencies: (1031175, 3)


In [14]:
def bookMeta(isbn):
    '''
    Look up a book in the `books` table by its ISBN.

    Returns a (title, author) tuple read from the row of `books`
    whose index label equals `isbn`.
    '''
    return tuple(books.at[isbn, field] for field in ("title", "author"))

# Test call: look up a known ISBN; the expected result is
# ('Classical Mythology', 'Mark P. O. Morford')
bookMeta('0195153448')

('Classical Mythology', 'Mark P. O. Morford')

In [27]:
def faveBooks(user,N):
    '''
    Get top N favorite books for user

        1. Get data for specified user
        2. Sort data by rating (highest first) and keep the first N rows
        3. Add "title" column by applying bookMeta() to the ISBN column

    Parameters
    ----------
    user : user id, compared against data["user"]
    N    : maximum number of rows to return

    Returns
    -------
    A new DataFrame with the columns of `data` plus "title". Each "title"
    cell holds the value bookMeta() returns for that row's ISBN. The frame
    is empty when the user has no rows in `data`.
    '''
    userRatings = data[data["user"] == user]
    topRatings = userRatings.sort_values("rating", ascending=False).iloc[:N]
    # assign() builds a new frame rather than writing a column into a slice,
    # which is what triggers pandas' SettingWithCopyWarning.
    return topRatings.assign(title=topRatings["isbn"].apply(bookMeta))

# Test call: up to 5 highest-rated books for user 276729
faveBooks(276729, 5).head()

Unnamed: 0,user,isbn,rating,title
4,276729,0521795028,6,(The Amsterdam Connection : Level 4 (Cambridge...
3,276729,052165615X,3,"(Help!: Level 1, Philip Prowse)"


In [38]:
# Count the rating rows for each ISBN (most frequent first)
usersPerISBN = data["isbn"].value_counts()
usersPerISBN.head(5)

0971880107    2502
0316666343    1295
0385504209     883
0060928336     732
0312195516     723
Name: isbn, dtype: int64

In [39]:
usersPerISBN.shape # one entry per distinct ISBN, so this gives the number of unique ISBNs

(270170,)

In [40]:
# Count the rating rows for each user (most frequent first)
ISBNsPerUser = data["user"].value_counts()
ISBNsPerUser.head(5)

11676     11144
198711     6456
153662     5814
98391      5779
35859      5646
Name: user, dtype: int64

In [42]:
ISBNsPerUser.shape # one entry per distinct user, so this gives the number of unique users

(92107,)

In [45]:
# Reduce sparsity of the data matrix by keeping only rows where:
#   - the ISBN has more than 10 ratings (ISBNs with 10 or fewer are dropped)
#   - the user has more than 10 ratings (users with 10 or fewer are dropped)
# The counts come from usersPerISBN / ISBNsPerUser, not from the filtered data.
data = data.loc[
    data["isbn"].isin(usersPerISBN.loc[usersPerISBN.gt(10)].index)
    & data["user"].isin(ISBNsPerUser.loc[ISBNsPerUser.gt(10)].index)
]
print(data.shape)

(405709, 3)


In [46]:
# Build the user x ISBN rating matrix: one row per user, one column per ISBN.
# Cells hold the rating (pivot_table's default mean aggregation) and are NaN
# where the user has no rating for that ISBN.
userItemRatingMatrix = data.pivot_table(index='user', columns='isbn', values='rating')
userItemRatingMatrix.head()

isbn,0002005018,0002251760,0002259834,0002558122,0006480764,000648302X,0006485200,000649840X,000651202X,0006512062,...,8845906884,8845915611,8878188212,8885989403,9074336329,9074336469,950491036X,9681500830,9681500954,9871138016
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
8,5.0,,,,,,,,,,...,,,,,,,,,,
99,,,,,,,,,,,...,,,,,,,,,,
242,,,,,,,,,,,...,,,,,,,,,,
243,,,,,,,,,,,...,,,,,,,,,,
254,,,,,,,,,,,...,,,,,,,,,,


In [56]:
from scipy.spatial.distance import hamming
import numpy as np
def distance(user1, user2):
    '''
    Compute hamming distance between two users

    Both users' rows are read from userItemRatingMatrix and compared
    position by position; the result is the fraction of positions that
    differ. NaN never equals NaN, so a book that neither user rated counts
    as a disagreement.

    Parameters
    ----------
    user1, user2 : row labels (user ids) of userItemRatingMatrix

    Returns
    -------
    float in [0, 1], or NaN when either user is missing from the matrix
    or the rows cannot be compared.
    '''
    try:
        # .loc reads the two rows directly; transposing the whole matrix
        # on every call copied the entire table just to pick one column.
        user1Ratings = userItemRatingMatrix.loc[user1]
        user2Ratings = userItemRatingMatrix.loc[user2]
        return hamming(user1Ratings, user2Ratings)
    except (KeyError, ValueError):
        # Unknown user id, or rows that are not comparable 1-D vectors.
        # np.nan replaces np.NaN, which was removed in NumPy 2.0.
        return np.nan

In [59]:
# Test distance() method on two user ids
# (a NaN result would mean distance() hit an exception internally)
user1 = 276762
user2 = 276798
print('Hamming distance between user1 and user2: {}'.format(distance(user1,user2)))

Hamming distance between user1 and user2: 1.0


In [70]:
def nearestNeighbors(user,K=10):
    '''
    Find the K users closest to `user` according to distance().

    Parameters
    ----------
    user : id of the active user (a row label of userItemRatingMatrix)
    K    : number of neighbours to return (default 10)

    Returns
    -------
    Series named "user" holding up to K user ids, nearest first. Users whose
    distance is NaN sort last. The Series index is each user's position in
    userItemRatingMatrix.index.
    '''
    # Candidate neighbours: every user in the rating matrix except the active one
    candidates = pd.DataFrame(userItemRatingMatrix.index)
    candidates = candidates[candidates["user"] != user]

    # Distance from the active user to every candidate. assign() returns a new
    # frame, so no column is written into the filtered slice above (which is
    # what triggers pandas' SettingWithCopyWarning).
    candidates = candidates.assign(
        distance=candidates["user"].apply(lambda other: distance(user, other))
    )

    # Shortest distance first, then keep the first K user ids
    return candidates.sort_values("distance", ascending=True)["user"][:K]



3201     82893
3368     87555
2624     68555
1813     48046
5401    140036
Name: user, dtype: int64

In [88]:
def topN(user,N=3,K=10):
    '''
    Recommend up to N books the user has not rated yet.

    Each book's estimated rating is the plain (unweighted) mean of the
    ratings given by the user's K nearest neighbours; books none of them
    rated are ignored.

    Parameters
    ----------
    user : id of the active user (a row label of userItemRatingMatrix)
    N    : maximum number of recommendations (default 3)
    K    : number of neighbours passed to nearestNeighbors() (default 10,
           the value nearestNeighbors() used when K was not passed)

    Returns
    -------
    Series with one bookMeta() result per recommended ISBN, highest
    estimated rating first.
    '''
    KNearestUsers = nearestNeighbors(user, K=K)

    # Only take rows where user is in KNearestUsers
    NNRatings = userItemRatingMatrix[userItemRatingMatrix.index.isin(KNearestUsers)]

    # Mean rating per ISBN over the neighbours. DataFrame.mean() skips NaN and
    # yields NaN for columns nobody rated, without the "Mean of empty slice"
    # RuntimeWarning that apply(np.nanmean) raises for those columns.
    avgRating = NNRatings.mean().dropna()

    # ISBNs the user has already rated. Read the user's row directly instead
    # of transposing the whole matrix to pick out one column.
    booksAlreadyRead = userItemRatingMatrix.loc[user].dropna().index

    # Remove the books from the avg ratings that have already been read by the user
    avgRating = avgRating[~avgRating.index.isin(booksAlreadyRead)]

    # Sort average ratings in descending order and get the top N results
    topNISBNs = avgRating.sort_values(ascending=False).index[:N]

    # Get book data from ISBNs and return the data
    return pd.Series(topNISBNs).apply(bookMeta)
    
    
# Test call
# user = 204622 # Active user
# print(topN(user,3))

In [84]:
# Show up to 10 of the highest-rated books for user 204813
faveBooks(204813,10)

Unnamed: 0,user,isbn,rating,title
845417,204813,399149848,10,"(Birthright, Nora Roberts)"
845407,204813,385504209,10,"(The Da Vinci Code, Dan Brown)"
845382,204813,373218036,10,"(Truly, Madly Manhattan, Nora Roberts)"
845359,204813,142001805,10,"(The Eyre Affair: A Novel, Jasper Fforde)"
845431,204813,446527793,10,"(The Guardian, Nicholas Sparks)"
845416,204813,399149392,10,"(Chesapeake Blue (Quinn Brothers (Hardcover)),..."
845432,204813,446531332,9,"(Nights in Rodanthe, Nicholas Sparks)"
845434,204813,446606243,9,"(The Tenth Justice, Brad Meltzer)"
845451,204813,671027360,9,"(Angels &amp; Demons, Dan Brown)"
845433,204813,446532452,9,"(The Wedding, Nicholas Sparks)"


In [89]:
# Recommend up to 10 not-yet-rated books for user 204813
topN(204813,10)

  labels=labels)


0    (Waiting For Nick (Silhouette Special Edition)...
1           (Wringer (Trophy Newbery), Jerry Spinelli)
2    (The Star Wars Trilogy: Star Wars, the Empire ...
3          (One, Two, Buckle My Shoe, Agatha Christie)
4                          (On the Road, Jack Kerouac)
5                 (Dead Poets Society, N.H. Kleinbaum)
6     (Go Ask Alice (Avon/Flare Book), James Jennings)
7                        (Carolina Moon, Nora Roberts)
8    (Illusions: The Adventures of a Reluctant Mess...
9    (You Just Don't Duct Tape a Baby!: True Tales ...
Name: isbn, dtype: object