# Similarity Computation
## Preprocessing 
1. Map userIds and movieIds to range(1,n) and range(0,m), where n and m are the number of unique users and items respectively.
2. Create a COO a user-user and item-item sparse matrices


## Load Data

In [1]:
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix

from recsys.io.file import IOService
from recsys.data.sparse import df_to_sparse_tensor
from recsys.data.rating import RatingsDataset

In [2]:
FILEPATH = "data/dev/ratings_0.5_pct.pkl"
USER_ID_MAP_FILE = "data/admin/user_id_map.pkl" 
ITEM_ID_MAP_FILE = "data/admin/item_id_map.pkl" 

In [3]:
ratings = IOService.read(FILEPATH)

ratings.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 125000 entries, 19265544 to 3362652
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     125000 non-null  int64  
 1   movieId    125000 non-null  int64  
 2   rating     125000 non-null  float64
 3   timestamp  125000 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 4.8 MB


## Remap Identifiers

In [4]:
# Create User Map
userId = np.sort(ratings['userId'].unique())
useridx = range(len(userId))
u = {'userId': userId, 'useridx': useridx}
u = pd.DataFrame(data=u)

# Create Item Map
movieId = np.sort(ratings['movieId'].unique())
itemidx = range(len(movieId))
i = {'movieId': movieId, 'itemidx': itemidx}
i = pd.DataFrame(data=i)

# Save Id Maps
IOService.write(filepath=USER_ID_MAP_FILE, data=u)
IOService.write(filepath=ITEM_ID_MAP_FILE, data=i)
# Install New Indices
ratings = ratings.merge(u, on='userId', how='left')
ratings = ratings.merge(i, on='movieId', how='left')
# Dump the timestamp
ratings = ratings[['useridx', 'itemidx', 'rating']]
ratings.head()

Unnamed: 0,useridx,itemidx,rating
0,48232,9821,3.0
1,48702,990,4.0
2,35262,2684,4.5
3,16166,641,5.0
4,14133,3351,4.0


## Create Sparse Matrix

In [5]:
row = ratings.useridx.values
col = ratings.itemidx.values
data = ratings.rating.values
csr = csr_matrix(
    (data, (row, col)), shape=(ratings.useridx.nunique(), ratings.itemidx.nunique())
)

In [18]:
dv = np.sqrt(csr.power(2).sum(axis=1))
len(dv)
matrix = csr.dot(csr.transpose()) / dv

    


62691

In [19]:
matrix

matrix([[5.70087713, 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.        , 4.5       , 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.        , 0.        , 2.        , ..., 0.        , 0.        ,
         0.        ],
        ...,
        [0.        , 0.        , 0.        , ..., 6.10327781, 0.        ,
         0.        ],
        [0.        , 0.        , 0.        , ..., 0.        , 6.36396103,
         0.        ],
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         5.        ]])