In [1]:
!wget https://raw.githubusercontent.com/katarinagresova/MLprojects/main/BookRecommendations/data/preprocessed_books.csv
!wget https://raw.githubusercontent.com/katarinagresova/MLprojects/main/BookRecommendations/data/preprocessed_users.csv
!wget https://raw.githubusercontent.com/katarinagresova/MLprojects/main/BookRecommendations/data/preprocessed_ratings.csv

--2021-11-08 14:18:21--  https://raw.githubusercontent.com/katarinagresova/MLprojects/main/BookRecommendations/data/preprocessed_books.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 23601906 (23M) [text/plain]
Saving to: ‘preprocessed_books.csv’


2021-11-08 14:18:22 (124 MB/s) - ‘preprocessed_books.csv’ saved [23601906/23601906]

--2021-11-08 14:18:22--  https://raw.githubusercontent.com/katarinagresova/MLprojects/main/BookRecommendations/data/preprocessed_users.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1840909 (1.

In [2]:
import pandas as pd
import numpy as np    
from scipy.sparse import csr_matrix
from pandas.api.types import CategoricalDtype

In [3]:
books = pd.read_csv('preprocessed_books.csv')
users = pd.read_csv('preprocessed_users.csv')
ratings = pd.read_csv('preprocessed_ratings.csv')

In [4]:
ratings.head()

Unnamed: 0,User-ID,ISBN,Book-Rating
0,276725,034545104X,0.0
1,276727,0446520802,0.0
2,276744,038550120X,0.0
3,276746,0425115801,-0.367617
4,276746,0449006522,-1.342271


In [5]:
books['Title'] = books.apply(lambda x: x['Book-Title'] +' | ' + x['Book-Author'] , axis=1)

In [6]:
books.head()

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Title
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,Classical Mythology | Mark P. O. Morford
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,Clara Callan | Richard Bruce Wright
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,Decision in Normandy | Carlo D'Este
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,Flu: The Story of the Great Influenza Pandemic...
4,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company,The Mummies of Urumchi | E. J. W. Barber


In [7]:
books['Title'].value_counts()

Little Women | Louisa May Alcott                                                                                            21
Wuthering Heights | Emily Bronte                                                                                            20
Adventures of Huckleberry Finn | Mark Twain                                                                                 20
Pride and Prejudice | Jane Austen                                                                                           18
Great Expectations | Charles Dickens                                                                                        16
                                                                                                                            ..
A Visit to the Sesame Street Library: Featuring Jim Henson's Sesame Street Muppets (Please Read to Me) | Deborah Hautzig     1
It'S Back To School We Go! | Ellen Jackson                                                                     

Just to know, that there are many books with multiple ISBN numbers, but with the same name and the same author. We will treat those books as one. So we will use column `Title` in user-item table, instead of column `ISBN`.

In [8]:
ratings = pd.merge(ratings, books[['ISBN', 'Title']])

In [None]:
sparse = ratings.pivot_table(columns='Title', values='Book-Rating', index='User-ID')

In [9]:
# inspired by https://stackoverflow.com/a/53235048
def create_matrix(frame, user_col, item_col, rating_col):
    """
    creates the sparse user-item interaction matrix

    Parameters
    ----------
    frame : DataFrame
        implicit rating data

    user_col : str
        user column name

    item_col : str
        item column name

    ratings_col : str
        implicit rating column name
    """

    user_c = CategoricalDtype(sorted(frame[user_col].unique()), ordered=True)
    item_c = CategoricalDtype(sorted(frame[item_col].unique()), ordered=True)

    row = frame[user_col].astype(user_c).cat.codes
    col = frame[item_col].astype(item_c).cat.codes
    return csr_matrix((frame[rating_col], (row, col)), \
                              shape=(user_c.categories.size, item_c.categories.size))

In [10]:
sparse_matrix = create_matrix(ratings, 'User-ID', 'Title', 'Book-Rating')

In [11]:
sparse_matrix.shape

(65305, 15897)

In [16]:
# copied from https://stackoverflow.com/a/38727681
# modifications:
# - return np.array instead of np.matrix
def sparse_corr(A):
    N = A.shape[0]
    C=((A.T*A -(sum(A).T*sum(A)/N))/(N-1)).todense()
    V=np.sqrt(np.mat(np.diag(C)).T*np.mat(np.diag(C)))
    COR = np.divide(C,V+1e-119)
    return np.array(COR)

In [17]:
corr = sparse_corr(sparse_matrix)

In [18]:
corr.shape

(15897, 15897)

In [19]:
corr

array([[ 1.00000000e+00,  6.90959872e-06,  7.49555361e-06, ...,
         2.85179676e-06,  4.05214020e-06,  9.51360353e-07],
       [ 6.90959872e-06,  1.00000000e+00, -1.32470323e-05, ...,
        -5.04003382e-06, -7.16142327e-06, -1.68135697e-06],
       [ 7.49555361e-06, -1.32470323e-05,  1.00000000e+00, ...,
        -5.46744395e-06, -4.47363914e-02, -1.82394113e-06],
       ...,
       [ 2.85179676e-06, -5.04003382e-06, -5.46744395e-06, ...,
         1.00000000e+00,  1.47961525e-01, -6.93945995e-07],
       [ 4.05214020e-06, -7.16142327e-06, -4.47363914e-02, ...,
         1.47961525e-01,  1.00000000e+00, -9.86033263e-07],
       [ 9.51360353e-07, -1.68135697e-06, -1.82394113e-06, ...,
        -6.93945995e-07, -9.86033263e-07,  1.00000000e+00]])

In [26]:
titles = sorted(ratings['Title'].unique())

In [27]:
print(titles[0])
print(titles[1])

 Earth Prayers From around the World: 365 Prayers, Poems, and Invocations for Honoring the Earth | Elizabeth Roberts
 Good Wives: Image and Reality in the Lives of Women in Northern New England, 1650-1750 | Laurel Thatcher Ulrich
