# Similarity 
Create test and validation data for cosine, adjusted cosine, and pearson similarity matrices.

In [6]:
import pandas as pd
import numpy as np
from recsys import IOService
from recsys.services.sparse import get_element
from recsys.data.dataset import Dataset

In [7]:
FILEPATH = "tests/testdata/operators/data_operators/ratings_user_random_sample_1pct.pkl"
data = IOService.read(filepath=FILEPATH)
ds = Dataset(name="test", description='test', data=data)
ds.user_rating_frequency
ds.item_rating_frequency


Unnamed: 0,useridx,n_ratings
0,873,4689
1,13624,3935
2,3990,3866
3,8666,3286
4,14527,3273
...,...,...
16272,11280,20
16273,14957,20
16274,13830,20
16275,4120,20


Unnamed: 0,itemidx,n_ratings
0,349,8150
1,312,8139
2,290,7999
3,580,7437
4,2442,7337
...,...,...
30175,13754,1
30176,21807,1
30177,25101,1
30178,25116,1


## Cosine Similarity
### Cosine User Similarity

In [8]:
u = 873
v = 13624
ru = ds.get_user_ratings(u)[['useridx','itemidx', 'rating']]
rv = ds.get_user_ratings(v)[['useridx','itemidx', 'rating']]
Iuv = pd.merge(left=ru,right=rv, on='itemidx', how='inner')
Iuv

Unnamed: 0,useridx_x,itemidx,rating_x,useridx_y,rating_y
0,873,0,4.50,13624,3.50
1,873,1,2.50,13624,3.00
2,873,5,4.50,13624,4.00
3,873,9,3.00,13624,3.50
4,873,13,3.50,13624,3.50
...,...,...,...,...,...
1593,873,13348,2.50,13624,3.00
1594,873,13469,4.00,13624,4.00
1595,873,13679,4.00,13624,3.50
1596,873,13705,4.00,13624,3.00


In [9]:
ru = Iuv['rating_x']
rv = Iuv['rating_y']
dp = np.dot(ru,rv)
norm = np.sqrt(np.square(np.sum(ru))*np.square(np.sum(rv)))
cs = dp/norm
cs

0.0006348987976720046

In [10]:
csr = ds.to_csr()
get_element(csr, row=u,col=v)

        row    col  data
132282  873      0  4.50
132283  873      1  2.50
132284  873      2  2.50
132285  873      3  2.00
132286  873      4  2.00
...     ...    ...   ...
136966  873  13867  4.00
136967  873  13883  4.00
136968  873  13887  4.00
136969  873  13888  3.50
136970  873  13901  4.50

[4689 rows x 3 columns]
         row    col  data
658832  4310  13624  4.00


IndexError: index 0 is out of bounds for axis 0 with size 0

In [None]:
csr.getformat()

## Cosine Item Similarity

In [None]:
i = 654
j = 2221
ri = ds.get_item_ratings(i)[['itemidx', 'useridx','rating', 'rating_cu']]
rj = ds.get_item_ratings(j)[['itemidx', 'useridx','rating', 'rating_cu']]
Uij = pd.merge(left=ri,right=rj, on='useridx', how='inner')
Uij

In [None]:
ri = Uij['rating_x']
rj = Uij['rating_y']
dp = np.dot(ri,rj)
norm = np.sqrt(np.square(np.sum(ri))*np.square(np.sum(rj)))
cs = dp/norm
cs