In [87]:
import pandas as pd
import numpy as np
import io
import os
import re
import tensorflow as tf
from pandasql import sqldf
from sklearn.model_selection import train_test_split
from scipy.sparse import coo_matrix
root_path = 'C:\\Users\\luoyan011\\Desktop\\PersonalLearning\\GitHub\\NLP_data\\goodbooks'

print(tf.__version__)
pysqldf = lambda q: sqldf(q, globals())

2.5.0


**Reference**

[Learn about collaborative filtering and weighted alternatng least square with tensorflow](https://fairyonice.github.io/Learn-about-collaborative-filtering-and-weighted-alternating-least-square-with-tensorflow.html)

In [53]:
book_tags = pd.read_csv(os.path.join(root_path, "book_tags.csv"))
books = pd.read_csv(os.path.join(root_path, "books.csv"))
ratings = pd.read_csv(os.path.join(root_path, "ratings.csv"))
tags = pd.read_csv(os.path.join(root_path, "tags.csv"))
to_read = pd.read_csv(os.path.join(root_path, "to_read.csv"))

In [54]:
def change_id_from_0(data, column):
    data_copy = data.copy()
    for i in column:
        data_copy[i] = data_copy[i] - 1
    return data_copy

book_tags = change_id_from_0(book_tags, ['goodreads_book_id'])
books = change_id_from_0(books, ['id'])
ratings = change_id_from_0(ratings, ['book_id','user_id'])
to_read = change_id_from_0(to_read, ['book_id','user_id'])
        

In [55]:
print('--------------book_tags--------------')
display(book_tags.head(3))
print('--------------books--------------')
display(books.head(3))
print('--------------ratings--------------')
display(ratings.head(3))
print('--------------tags--------------')
display(tags.head(3))
print('--------------to_read--------------')
display(to_read.head(3))

--------------book_tags--------------


Unnamed: 0,goodreads_book_id,tag_id,count
0,0,30574,167697
1,0,11305,37174
2,0,11557,34173


--------------books--------------


Unnamed: 0,id,book_id,best_book_id,work_id,books_count,isbn,isbn13,authors,original_publication_year,original_title,...,ratings_count,work_ratings_count,work_text_reviews_count,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5,image_url,small_image_url
0,0,2767052,2767052,2792775,272,439023483,9780439000000.0,Suzanne Collins,2008.0,The Hunger Games,...,4780653,4942365,155254,66715,127936,560092,1481305,2706317,https://images.gr-assets.com/books/1447303603m...,https://images.gr-assets.com/books/1447303603s...
1,1,3,3,4640799,491,439554934,9780440000000.0,"J.K. Rowling, Mary GrandPré",1997.0,Harry Potter and the Philosopher's Stone,...,4602479,4800065,75867,75504,101676,455024,1156318,3011543,https://images.gr-assets.com/books/1474154022m...,https://images.gr-assets.com/books/1474154022s...
2,2,41865,41865,3212258,226,316015849,9780316000000.0,Stephenie Meyer,2005.0,Twilight,...,3866839,3916824,95009,456191,436802,793319,875073,1355439,https://images.gr-assets.com/books/1361039443m...,https://images.gr-assets.com/books/1361039443s...


--------------ratings--------------


Unnamed: 0,book_id,user_id,rating
0,0,313,5
1,0,438,3
2,0,587,5


--------------tags--------------


Unnamed: 0,tag_id,tag_name
0,0,-
1,1,--1-
2,2,--10-


--------------to_read--------------


Unnamed: 0,user_id,book_id
0,0,111
1,0,234
2,0,532


In [17]:
query = """
select A.*
    , B.original_title
    , B.authors

from ratings A
left join books B on A.book_id = B.book_id
"""
test = pysqldf(query)
test.head(3)

Unnamed: 0,book_id,user_id,rating,original_title,authors
0,1,314,5,Harry Potter and the Half-Blood Prince,"J.K. Rowling, Mary GrandPré"
1,1,439,3,Harry Potter and the Half-Blood Prince,"J.K. Rowling, Mary GrandPré"
2,1,588,5,Harry Potter and the Half-Blood Prince,"J.K. Rowling, Mary GrandPré"


## Collaborative Filtering
### Split Training/Testing

In [18]:
stats = ratings.describe()
stats

Unnamed: 0,book_id,user_id,rating
count,981756.0,981756.0,981756.0
mean,4943.275636,25616.759933,3.856534
std,2873.207415,15228.338826,0.983941
min,1.0,1.0,1.0
25%,2457.0,12372.0,3.0
50%,4921.0,25077.0,4.0
75%,7414.0,38572.0,5.0
max,10000.0,53424.0,5.0


In [88]:
def create_sparse_train_test(dataset, n_users, n_items, test_set_ratio = 0.1, seed=1):
    train_set, test_set = train_test_split(dataset, test_size = test_set_ratio, random_state = seed)
    test_set = test_set.values
    train_set = train_set.values
    
    i_tr, u_tr, r_tr = zip(*train_set)
    tr_sparse = coo_matrix((r_tr, (u_tr, i_tr)), shape = (n_users, n_items))
    
    i_ts, u_ts, r_ts = zip(*test_set)
    ts_sparse = coo_matrix((r_ts, (u_ts, i_ts)), shape = (n_users, n_items))
    
    return tr_sparse, ts_sparse

In [89]:
tr_sparse, ts_sparse = create_sparse_train_test(ratings, len(ratings.user_id.unique()), len(ratings.book_id.unique()))

In [91]:
print("sparse matrix dimension:",tr_sparse.shape,tr_sparse.row.shape,tr_sparse.col.shape,tr_sparse.data.min())
print("sparse matrix dimension:",ts_sparse.shape,ts_sparse.row.shape,ts_sparse.col.shape)

sparse matrix dimension: (53424, 10000) (883580,) (883580,) 1
sparse matrix dimension: (53424, 10000) (98176,) (98176,)


In [23]:
NITEMS = np.max(ratings.book_id)
NUSERS = np.max(ratings.user_id)
print('{} items, {} users, {} interactions'.format(NITEMS, NUSERS, len(ratings)))

10000 items, 53424 users, 981756 interactions


### Weighted Alternating Least Square Model
#### Weight

In [None]:
def make_weights(data, wt_type, obs_wt, feature_wt_exp, axis):
    """
    data: coo_matrix of ratings data
    wt_type: weight type, LOG_RATINGS or LINEAR_RATINGS
    obs_wt: linear weight factor
    feature_wt_exp: logarithmic weight factor
    axis: axis to make weights for, 1 = rows/users, 0 = cols/items
    """
    assert wt_type in ['LOG_RATINGS','LINEAR_RATINGS']
    frac = np.array(1.0 / (data > 0.0).sum(axis))
    frac[np.ma.masked_invalid(frac).mask] = 0
    
    if wt_type == 'LOG_RATINGS':
        wts = np.array(np.power(frac, feature_wt_exp)).flatten()
    elif wt_type == 'LINEAR_RATINGS':
        wts = np.array(obs_wt * frac).flatten()
    
    assert np.isfinite(wts).sum() == wts.shape[0]
    return wts

# Visualize Weight


LINEAR RATINGS seems to put more weights on those items that are rated only a few times in comparisons to LOG RATINGS.

#### Training