# Similarity Matrix

In [1]:
import numpy as np
from tqdm import tqdm
from itertools import combinations
import pandas as pd
from recsys.data.rating import RatingsDataset
from recsys.io.file import IOService

In [2]:
# Path of the ratings file that the loading cells below read.
# NOTE(review): the file name suggests a 1% development sample stored as a pickle — confirm.
FILEPATH = "data/dev/ratings_1_pct.pkl"

In [47]:
# Load the same ratings file twice: once wrapped in a RatingsDataset and once
# through IOService.read.
# NOTE(review): `data` is used with DataFrame operations in later cells, so
# IOService.read presumably returns a pandas DataFrame — confirm.
# NOTE(review): the path ends in .pkl; if it is unpickled, only load trusted files.
ratings = RatingsDataset(filepath=FILEPATH)
data = IOService.read(FILEPATH)

In [4]:
# Report how many distinct users and items the dataset exposes.
print(
    f"Number of users: {ratings.n_users}",
    f"Number of items: {ratings.n_items}",
    sep="\n",
)

Number of users: 17438
Number of items: 8643


In [5]:
def to_key(uv: tuple) -> str:
    """Join the first two elements of ``uv`` into a ``"<first>_<second>"`` string.

    Each element is converted with ``str``; any further elements are ignored.
    """
    first, second = uv[0], uv[1]
    return "_".join((str(first), str(second)))
def from_key(uv: str) -> tuple:
    """Split ``uv`` on underscores and return the pieces as a tuple of ints.

    Raises ``ValueError`` (from ``int``) when a piece is not an integer literal.
    """
    return tuple(int(piece) for piece in uv.split("_"))


In [6]:
# Try the key helpers on one pair: encode (2, 3), then decode "2_3" back into u and v.
to_key((2,3))
u,v = from_key("2_3")
u
v

'2_3'

2

3

In [18]:
# Rating count and rating sum for one item (the two ingredients of its average rating)
i = 3
r = data.loc[data['movieId'] == i]
l, s = len(r), r['rating'].sum()
l, s

(3, 4.5)

In [8]:
# Build I: for every pair of users that rated at least one common item, the
# rating rows of both users for each such item.
# Keys are "u_v" strings produced by to_key; values are DataFrames made of the
# per-item slices of item_ratings, stacked in the order the items are visited.
uv_frames = {}
for item in tqdm(ratings.items):
    item_ratings = ratings.get_item_ratings(item=item)
    # Sort the user ids before pairing so that a given pair always produces the
    # same key (smaller id first). Without this the key depends on the row order
    # of item_ratings, and one pair could be split across "u_v" and "v_u".
    user_ids = np.sort(item_ratings['userId'].values)
    for uv_pair in combinations(user_ids, 2):
        uv_ratings = item_ratings[item_ratings['userId'].isin(uv_pair)]
        # Queue the slice instead of concatenating here: pd.concat inside the
        # loop re-copies the accumulated frame each time a pair shares an item.
        uv_frames.setdefault(to_key(uv_pair), []).append(uv_ratings)

# One concatenation per pair, after all items have been visited.
I = {uv_key: pd.concat(frames, axis=0) for uv_key, frames in uv_frames.items()}
# Release the per-pair lists; only the concatenated frames are needed from here on.
del uv_frames

100%|██████████| 8643/8643 [06:43<00:00, 21.40it/s] 


In [10]:
# Per-user value returned by ratings.get_user_ratings_norm, looked up by user id.
N = {
    user: ratings.get_user_ratings_norm(user=user)
    for user in tqdm(ratings.users)
}

100%|██████████| 17438/17438 [00:11<00:00, 1565.29it/s]


In [11]:
# Similarity per user pair: dot product of the two users' ratings held in I,
# divided by the product of their entries in N.
S = {}
for uv, uv_ratings in tqdm(I.items()):
    u, v = from_key(uv)
    # Order by movieId once so both rating vectors line up position by position.
    by_movie = uv_ratings.sort_values(by='movieId')
    ru = by_movie.loc[by_movie['userId'] == u, 'rating'].values
    rv = by_movie.loc[by_movie['userId'] == v, 'rating'].values
    numerator = ru.dot(rv)
    denominator = N[u] * N[v]
    S[uv] = numerator / denominator

100%|██████████| 827937/827937 [26:52<00:00, 513.37it/s]  


In [12]:
# Tabulate the similarities: one row per "u_v" key, a single column named 'cos'.
sim = pd.DataFrame(list(S.values()), index=list(S.keys()), columns=['cos'])
sim.head()

Unnamed: 0,cos
5051_11916,0.58
5051_11932,0.56
5051_13166,0.62
5051_15113,0.42
5051_16231,1.0


In [55]:
# Re-read the ratings file into `data`.
# NOTE(review): presumably done so the centring cells that follow start from a
# frame without previously derived columns — confirm.
data = IOService.read(FILEPATH)

In [56]:
# Mean rating per user, one row per userId.
rbar = (
    data.groupby("userId")["rating"]
    .mean()
    .rename("rbar")
    .reset_index()
)
rbar
# Attach each row's user mean, store rating minus that mean as 'rating_nbu',
# then discard the helper column.
data = (
    data.merge(rbar, on='userId', how='left')
    .assign(rating_nbu=lambda frame: frame['rating'] - frame['rbar'])
    .drop(columns=['rbar'])
)

Unnamed: 0,userId,rbar
0,3,4.67
1,4,3.17
2,35,4.00
3,63,4.00
4,68,3.50
...,...,...
17433,162457,3.38
17434,162469,3.50
17435,162502,4.33
17436,162516,2.75


In [57]:
# Mean rating per item, one row per movieId.
rbar = (
    data.groupby("movieId")["rating"]
    .mean()
    .rename("rbar")
    .reset_index()
)
rbar
# Attach each row's item mean and show the merged frame while 'rbar' is still present.
data = data.merge(rbar, on='movieId', how='left')
data
# Store rating minus the item mean as 'rating_nbi', then discard the helper column.
data = data.assign(rating_nbi=data['rating'] - data['rbar']).drop(columns=['rbar'])

Unnamed: 0,movieId,rbar
0,1,3.86
1,2,3.36
2,3,1.50
3,5,2.83
4,6,4.00
...,...,...
8638,207830,4.00
8639,208453,3.00
8640,208579,1.00
8641,208889,3.50


Unnamed: 0,userId,movieId,rating,timestamp,rating_nbu,rbar
0,125423,104211,5.00,1531296071,0.50,3.61
1,128243,8633,3.50,1494793313,0.58,3.50
2,85525,88810,3.50,1536420368,0.56,3.94
3,48653,648,4.00,1521355492,0.10,3.35
4,25494,44191,3.50,1559677514,-0.25,3.99
...,...,...,...,...,...,...
49995,58599,6936,3.00,1482783590,0.25,3.10
49996,143568,78105,3.00,1498503307,-0.45,3.21
49997,156480,170751,3.50,1566616756,0.23,3.50
49998,57545,3081,4.00,1547511124,0.96,3.47


In [58]:
# Sum of the 'rating_nbu' values over the rows of one item.
i = 3
data.loc[data['movieId'] == i, 'rating_nbu'].sum()

0.08995836754032371