Here we will do work with Latent Factor Analysis to create a Recommendation system. 

In [1]:
# read the BX-Book-Ratings file
import pandas as pd
dataFile='./data/BX-CSV-Dump/BX-Book-Ratings.csv'
data=pd.read_csv(dataFile,sep=";",header=0,encoding = "ISO-8859-1", names=["user","isbn","rating"])

In [2]:
data.head()

Unnamed: 0,user,isbn,rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276729,052165615X,3
4,276729,0521795028,6


In [3]:
# read the BX-Books file to get Books meta data
bookFile='./data/BX-CSV-Dump/BX-Books.csv'
books=pd.read_csv(bookFile,sep=";",header=0,encoding = "ISO-8859-1", error_bad_lines=False, usecols=[0,1,2],index_col=0,names=['isbn',"title","author"])

In [4]:
books.head()

Unnamed: 0_level_0,title,author
isbn,Unnamed: 1_level_1,Unnamed: 2_level_1
195153448,Classical Mythology,Mark P. O. Morford
2005018,Clara Callan,Richard Bruce Wright
60973129,Decision in Normandy,Carlo D'Este
374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata
393045218,The Mummies of Urumchi,E. J. W. Barber


In [5]:
# Data Cleaning - only retain values from books-rating that actually have books meta data
data = data[data["isbn"].isin(books.index)]

In [6]:
# More Data Cleaning
# Drop books that dont have too many ratings
# Drop users that have not rated many books

# Find the number of books each user has rated
ISBNsPerUser = data.user.value_counts()
# Find the number of users who have rated a particular book. 
usersPerISBN = data.isbn.value_counts()

# Now lets only keep users who have rated more than 10 book
# lets only keep books which have been rated by more than 10 users
data = data[data["isbn"].isin(usersPerISBN[usersPerISBN>10].index)]
data = data[data["user"].isin(ISBNsPerUser[ISBNsPerUser>10].index)]

In [7]:
data.head()

Unnamed: 0,user,isbn,rating
31,276762,034544003X,0
33,276762,0380711524,5
34,276762,0451167317,0
89,276798,3423084049,0
97,276798,3548603203,6


In [8]:
# print the shape of rating matrix, number of users, number of books
print(data.shape)
print(len(data.user.value_counts()))
print(len(data.isbn.value_counts()))

(405709, 3)
10706
15451


In [9]:
# Now create a user item rating matrix. Use coo matrix
# coo_matrix((ratingcolumn, (usercolumn, itemcolumn))
from scipy.sparse import coo_matrix
data['user'] = data['user'].astype("category")
data['isbn'] = data['isbn'].astype("category")
R = coo_matrix((data['rating'].astype(float),
                       (data['user'].cat.codes,
                        data['isbn'].cat.codes)))

In [10]:
print(R)

  (10633, 3053)	0.0
  (10633, 4025)	5.0
  (10633, 7873)	0.0
  (10634, 15131)	0.0
  (10634, 15311)	6.0
  (10635, 125)	10.0
  (10635, 3876)	9.0
  (10635, 12950)	10.0
  (10636, 15017)	0.0
  (10636, 15032)	7.0
  (10637, 6941)	0.0
  (10637, 15099)	0.0
  (10637, 15118)	8.0
  (10637, 15128)	10.0
  (10637, 15166)	10.0
  (10637, 15184)	7.0
  (10637, 15192)	8.0
  (10637, 15195)	10.0
  (10637, 15199)	7.0
  (10637, 15312)	10.0
  (10637, 15313)	10.0
  (10637, 15314)	10.0
  (10637, 15315)	10.0
  (10638, 15009)	0.0
  (10639, 1369)	0.0
  :	:
  (10631, 11776)	0.0
  (10631, 11778)	0.0
  (10631, 11882)	0.0
  (10631, 11910)	0.0
  (10631, 12014)	10.0
  (10631, 12041)	10.0
  (10631, 12099)	0.0
  (10631, 12245)	0.0
  (10631, 12254)	0.0
  (10631, 12268)	8.0
  (10631, 12365)	10.0
  (10631, 13711)	10.0
  (10631, 13732)	10.0
  (10631, 14068)	10.0
  (10631, 14490)	6.0
  (10632, 1933)	0.0
  (10632, 2801)	6.0
  (10632, 5182)	0.0
  (10632, 5451)	0.0
  (10632, 6213)	0.0
  (10632, 6749)	0.0
  (10632, 6885)	0.0
  (1063

In [11]:
# understanding coo matrix
print(R.shape)
print(R.data[1])
print(R.row[1])
print(R.col[1])
print(data[data.user.cat.codes==10633].sort_values(by=['rating'],ascending=False)[0:5])
print(data[data.isbn.cat.codes==4025].sort_values(by=['rating'],ascending=False)[0:5])
print("----------------------------------")
user = 276762
print(data[data.user.cat.codes==10633])
print(R.data[R.row==10633])
len(R.data)

(10706, 15451)
5.0
10633
4025
      user        isbn  rating
33  276762  0380711524       5
31  276762  034544003X       0
34  276762  0451167317       0
          user        isbn  rating
348566   83287  0380711524      10
551584  132375  0380711524      10
710657  172370  0380711524       9
668446  162639  0380711524       9
122280   28204  0380711524       9
----------------------------------
      user        isbn  rating
31  276762  034544003X       0
33  276762  0380711524       5
34  276762  0451167317       0
[0. 5. 0.]


405709

In [12]:
dict( enumerate(data['user'].cat.categories ) )

{0: 8,
 1: 99,
 2: 242,
 3: 243,
 4: 254,
 5: 383,
 6: 388,
 7: 408,
 8: 424,
 9: 446,
 10: 487,
 11: 503,
 12: 505,
 13: 507,
 14: 625,
 15: 626,
 16: 638,
 17: 643,
 18: 651,
 19: 709,
 20: 728,
 21: 735,
 22: 741,
 23: 805,
 24: 850,
 25: 876,
 26: 882,
 27: 899,
 28: 900,
 29: 901,
 30: 914,
 31: 929,
 32: 1025,
 33: 1075,
 34: 1129,
 35: 1131,
 36: 1155,
 37: 1167,
 38: 1184,
 39: 1211,
 40: 1248,
 41: 1424,
 42: 1435,
 43: 1485,
 44: 1548,
 45: 1585,
 46: 1608,
 47: 1674,
 48: 1725,
 49: 1733,
 50: 1812,
 51: 1848,
 52: 1903,
 53: 1928,
 54: 2010,
 55: 2012,
 56: 2030,
 57: 2033,
 58: 2041,
 59: 2046,
 60: 2090,
 61: 2103,
 62: 2110,
 63: 2132,
 64: 2134,
 65: 2179,
 66: 2197,
 67: 2276,
 68: 2287,
 69: 2288,
 70: 2313,
 71: 2326,
 72: 2337,
 73: 2358,
 74: 2363,
 75: 2437,
 76: 2442,
 77: 2589,
 78: 2630,
 79: 2719,
 80: 2766,
 81: 2790,
 82: 2793,
 83: 2799,
 84: 2853,
 85: 2855,
 86: 2891,
 87: 2954,
 88: 2977,
 89: 3017,
 90: 3066,
 91: 3145,
 92: 3167,
 93: 3266,
 94: 3282,


In [13]:
len(dict( enumerate(data['user'].cat.categories ) ))

10706

In [14]:
dict(enumerate([1,2,3]))

{0: 1, 1: 2, 2: 3}

In [15]:
# this function will return the error between the actual rating and the ratings calculated from P and Q matrices
# predicted rating is calculated using the formula 
# pow(rui-np.dot(P[u,:],Q[:,i]),2)+ lamda*(pow(norm(P[u,:]),2)+pow(norm(Q[:,i]),2
from numpy.linalg import norm
import numpy as np

def error(R,P,Q,reg=0.02):
    ratings = R.data
    rows = R.row
    cols = R.col
    e = 0 
    # go through every rating 
    for ui in range(len(ratings)):
        # Save the rating, user code and book code
        rui=ratings[ui]
        u = rows[ui]
        i = cols[ui]
        if rui>0:
            # Find the sum of errors
            e = e + pow(rui-np.dot(P[u,:],Q[:,i]),2)+\
                reg*(pow(norm(P[u,:]),2)+pow(norm(Q[:,i]),2))
    return e

# error (difference) between actual & predict

In [16]:
# Function that will calculate P and Q matrices using stochastic gradient descent
def SGD(R, K, reg=0.02,steps=10, lrate=0.001):
    # Initialise the P and Q Factor Matrices with random numbers
    # Setup the dimensions 
    # M - No. of users
    # N - No. of items
    # K - No. of latent features (no. of cols)
    M,N = R.shape
    P = np.random.rand(M,K)
    Q = np.random.rand(K,N)
    
    # calculate the initial RMSE
    rmse = np.sqrt(error(R,P,Q,reg)/len(R.data))
    print("Initial RMSE: "+str(rmse))
    
    # complete the specified number of steps for gradient descent
    for step in range(steps):
        # complete the specified number of steps for gradient descent 
        for ui in range(len(R.data)):
            rui=R.data[ui]
            u = R.row[ui]
            i = R.col[ui]
            if rui>0:
                # update P, Q in the direction of local minima
                eui=rui-np.dot(P[u,:],Q[:,i])
                P[u,:]=P[u,:]+lrate*2*(eui*Q[:,i]-reg*P[u,:])
                Q[:,i]=Q[:,i]+lrate*2*(eui*P[u,:]-reg*Q[:,i])
        rmse = np.sqrt(error(R,P,Q,reg)/len(R.data))
        if rmse<0.5:
            break
    print("Final RMSE: "+ str(rmse))
    return P,Q

In [17]:
# call the SGD to calculate P,Q
(P,Q) = SGD(R,K=2,lrate=0.0007,reg=0.01, steps=100)

Initial RMSE: 4.330093148684902
Final RMSE: 0.7893190298541115


In [19]:
len(R.data)

405709

In [20]:
P,Q

(array([[0.89135534, 1.02486598],
        [1.29207278, 2.5374247 ],
        [2.30155159, 2.37540783],
        ...,
        [0.00414449, 0.19553207],
        [2.32050659, 2.05782198],
        [1.65273991, 1.47938466]]),
 array([[1.86754285, 1.85478622, 1.78989077, ..., 2.23573156, 2.1380698 ,
         0.99378304],
        [1.76395009, 2.30733062, 1.92356553, ..., 2.35215803, 2.0999918 ,
         0.23562639]]))

In [21]:
P.shape

(10706, 2)

In [22]:
Q.shape

(2, 15451)

In [23]:
dot_product = np.dot(P,Q)
dot_product

array([[ 3.47245671,  4.01797824,  3.56682555, ...,  4.40347799,
         4.05799007,  1.12729929],
       [ 6.88889179,  8.25119649,  7.19357182, ...,  8.85715177,
         8.09111284,  1.88192423],
       [ 8.48834706,  9.7497374 ,  8.68877857, ..., 10.73298614,
         9.90921491,  2.8469517 ],
       ...,
       [ 0.35264883,  0.45884428,  0.38353694, ...,  0.46918831,
         0.41947696,  0.05019124],
       [ 7.96354076,  9.05211932,  8.11180876, ..., 10.02835233,
         9.28281435,  2.79095726],
       [ 5.6961233 ,  6.47890873,  5.80391725, ...,  7.17482929,
         6.64036895,  1.99104696]])

In [24]:
dot_product.shape

(10706, 15451)

In [31]:
data['user'].cat.categories

Int64Index([     8,     99,    242,    243,    254,    383,    388,    408,
               424,    446,
            ...
            278522, 278535, 278554, 278563, 278582, 278633, 278637, 278771,
            278843, 278851],
           dtype='int64', length=10706)

In [32]:
len(data['user'].cat.categories)

10706

In [33]:
new_matrix = pd.DataFrame(dot_product, index=list(data['user'].cat.categories), columns=list(data['isbn'].cat.categories))
new_matrix.head()

Unnamed: 0,0002005018,0002251760,0002259834,0002558122,0006480764,000648302X,0006485200,000649840X,000651202X,0006512062,...,8845906884,8845915611,8878188212,8885989403,9074336329,9074336469,950491036X,9681500830,9681500954,9871138016
8,3.472457,4.017978,3.566826,3.72675,2.442734,3.713968,3.534192,3.675699,3.344812,2.952861,...,0.166,3.173362,3.562024,2.903911,3.073397,1.830889,4.282521,4.403478,4.05799,1.127299
99,6.888892,8.251196,7.193572,7.133566,5.08251,7.307689,7.07129,7.811692,6.936041,5.858134,...,0.275625,6.679993,6.956589,5.765048,5.926216,3.631086,8.499599,8.857152,8.091113,1.881924
242,8.488347,9.749737,8.688779,9.17687,5.910317,9.094253,8.623825,8.851354,8.098964,7.218193,...,0.419611,7.658204,8.73562,7.097501,7.55689,4.475868,10.467592,10.732986,9.909215,2.846952
243,7.802724,9.06113,8.028448,8.343885,5.516428,8.338386,7.948421,8.3199,7.550868,6.635182,...,0.366756,7.175403,7.991181,6.525659,6.886118,4.113934,9.623389,9.908873,9.123163,2.491814
254,7.727149,8.742609,7.853901,8.477138,5.268173,8.30733,7.82206,7.811195,7.230235,6.570873,...,0.407446,6.789125,8.00441,6.459096,6.960312,4.075039,9.527174,9.713004,9.001346,2.759758


In [34]:
# select one of the user
user8 = new_matrix[new_matrix.index == 8]
user8

Unnamed: 0,0002005018,0002251760,0002259834,0002558122,0006480764,000648302X,0006485200,000649840X,000651202X,0006512062,...,8845906884,8845915611,8878188212,8885989403,9074336329,9074336469,950491036X,9681500830,9681500954,9871138016
8,3.472457,4.017978,3.566826,3.72675,2.442734,3.713968,3.534192,3.675699,3.344812,2.952861,...,0.166,3.173362,3.562024,2.903911,3.073397,1.830889,4.282521,4.403478,4.05799,1.127299


In [38]:
new_matrix_8 = user8.T.sort_values(by=[8], ascending=False)
new_matrix_8.head()

Unnamed: 0,8
3453146972,5.154137
3499224615,4.867898
1853262390,4.849585
843760494X,4.811025
3499224623,4.746067


In [35]:
# books user_id 8 had rated
books_rated = data[data.user == 8]
books_rated

Unnamed: 0,user,isbn,rating
9563,8,2005018,5
9565,8,374157065,0
9567,8,399135782,0


In [42]:
new_matrix_8_excluded = new_matrix_8[~new_matrix_8.index.isin(books_rated.isbn)]
new_matrix_8_excluded.head()
# Top 5 recommendation

Unnamed: 0,8
3453146972,5.154137
3499224615,4.867898
1853262390,4.849585
843760494X,4.811025
3499224623,4.746067


In [43]:
new_matrix_8[new_matrix_8.index.isin(books_rated.isbn)]

Unnamed: 0,8
399135782,3.946814
374157065,3.593718
2005018,3.472457


In [None]:
# change isbn into book name & author
# compare Top N books of two models (previous example(notesbook))