# Test Ratings Workbook

In [1]:
import numpy as np
from tqdm import tqdm
from itertools import combinations
import pandas as pd
from recsys.data.rating import RatingsDataset
from recsys.io.file import IOService

## Data

In [2]:
# Path of the ratings file handed to IOService.read in the next cell.
# NOTE(review): the file name suggests a 0.5% development sample - confirm.
FILEPATH = "data/dev/ratings_0.5_pct.pkl"

In [3]:
# Read the ratings file and keep only the three columns used in this workbook.
data = IOService.read(FILEPATH).loc[:, ['userId', 'movieId', 'rating']]

In [4]:
# Print the frame summary: index range, per-column non-null counts, dtypes and memory usage.
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 125000 entries, 19265544 to 3362652
Data columns (total 3 columns):
 #   Column   Non-Null Count   Dtype  
---  ------   --------------   -----  
 0   userId   125000 non-null  int64  
 1   movieId  125000 non-null  int64  
 2   rating   125000 non-null  float64
dtypes: float64(1), int64(2)
memory usage: 3.8 MB


In [5]:
# Number of ratings per user, as a two-column frame (userId, n_ratings).
user_counts = (
    data['userId']
    .value_counts()
    .rename_axis('userId')
    .reset_index(name='n_ratings')
)
# Ids of the users who have between 3 and 5 ratings (both bounds inclusive).
users = user_counts.loc[user_counts['n_ratings'].between(3, 5), 'userId']

In [6]:
# Number of ratings per movie, as a two-column frame (movieId, n_ratings).
item_counts = (
    data['movieId']
    .value_counts()
    .rename_axis('movieId')
    .reset_index(name='n_ratings')
)
# Ids of the movies that have between 3 and 5 ratings (both bounds inclusive).
items = item_counts.loc[item_counts['n_ratings'].between(3, 5), 'movieId']

In [7]:
# Ratings whose movie is in `items` and whose user is in `users`.
data.loc[
    data['movieId'].isin(items)
    & data['userId'].isin(users)
]

Unnamed: 0,userId,movieId,rating
9823302,63771,1791,4.00
9692020,62928,4664,3.00
22853040,148489,2130,4.00
1261673,8528,76111,3.50
6021221,38999,5843,4.50
...,...,...,...
12541405,81096,3392,2.50
8703960,56709,90057,1.50
21591850,140402,8576,4.00
23392471,151876,1099,3.00


## User Ratings

In [8]:
# Select every rating in `data` made by one example user.
user = 97615
user_ratings = data.loc[data['userId'].eq(user)]
user_ratings


Unnamed: 0,userId,movieId,rating
15067476,97615,5444,5.0
15067661,97615,54004,3.5
15067238,97615,1207,5.0
15067676,97615,56152,4.5
15067795,97615,76251,4.5
15067380,97615,3507,5.0
15067611,97615,44840,4.0
15067438,97615,4447,4.5
15067678,97615,56367,5.0


### User Ratings Centered by Average User Rating

In [9]:
# Center each rating on the mean rating of the user who gave it.
#   rubar      - the user's mean rating, repeated on each of that user's rows
#   rating_cbu - rating minus rubar ("centered by user")
# The index is reset to 0..n-1. The columns are assigned by name, so
# re-running this cell overwrites them in place instead of failing.
user_ratings = user_ratings.reset_index(drop=True)
user_ratings['rubar'] = user_ratings.groupby('userId')['rating'].transform('mean')
user_ratings['rating_cbu'] = user_ratings['rating'] - user_ratings['rubar']
user_ratings

Unnamed: 0,userId,movieId,rating,rubar,rating_cbu
0,97615,5444,5.0,4.56,0.44
1,97615,54004,3.5,4.56,-1.06
2,97615,1207,5.0,4.56,0.44
3,97615,56152,4.5,4.56,-0.06
4,97615,76251,4.5,4.56,-0.06
5,97615,3507,5.0,4.56,0.44
6,97615,44840,4.0,4.56,-0.56
7,97615,4447,4.5,4.56,-0.06
8,97615,56367,5.0,4.56,0.44


### User Ratings Centered by Average Item Rating

In [10]:
# Mean rating (ribar) over all of `data` for each movie this user rated.
# One filtered groupby replaces a full scan of `data` per movie.
# `reindex` restores the order in which the movies first appear in
# `user_ratings`, because groupby sorts by movieId.
user_items = user_ratings['movieId'].unique()
ribar = (
    data.loc[data['movieId'].isin(user_items)]
    .groupby('movieId')['rating']
    .mean()
    .reindex(user_items)
    .rename_axis('movieId')
    .reset_index(name='ribar')
)
ribar


Unnamed: 0,movieId,ribar
0,5444,3.51
1,54004,3.0
2,1207,4.22
3,56152,3.61
4,76251,3.63
5,3507,4.19
6,44840,3.75
7,4447,3.34
8,56367,3.8


In [11]:
# Attach each movie's mean rating (ribar) to the user's rows, then center the
# ratings on it: rating_cbi = rating - ribar ("centered by item").
user_ratings = pd.merge(user_ratings, ribar, how='left', on='movieId')
user_ratings['rating_cbi'] = user_ratings['rating'].sub(user_ratings['ribar'])
user_ratings
# Column totals of the numeric columns.
user_ratings.sum(axis=0, numeric_only=True)

Unnamed: 0,userId,movieId,rating,rubar,rating_cbu,ribar,rating_cbi
0,97615,5444,5.0,4.56,0.44,3.51,1.49
1,97615,54004,3.5,4.56,-1.06,3.0,0.5
2,97615,1207,5.0,4.56,0.44,4.22,0.78
3,97615,56152,4.5,4.56,-0.06,3.61,0.89
4,97615,76251,4.5,4.56,-0.06,3.63,0.87
5,97615,3507,5.0,4.56,0.44,4.19,0.81
6,97615,44840,4.0,4.56,-0.56,3.75,0.25
7,97615,4447,4.5,4.56,-0.06,3.34,1.16
8,97615,56367,5.0,4.56,0.44,3.8,1.2


userId       878,535.00
movieId      302,219.00
rating            41.00
rubar             41.00
rating_cbu         0.00
ribar             33.06
rating_cbi         7.94
dtype: float64

### User Rating Norms

In [12]:
# Column-wise Euclidean (L2) norm of the raw, user-centered and item-centered ratings.
rating_data = user_ratings.loc[:, ['rating', 'rating_cbu', 'rating_cbi']]
np.sqrt(rating_data.pow(2).sum(axis=0))


rating       13.75
rating_cbu    1.49
rating_cbi    2.85
dtype: float64

## Item Ratings

In [13]:
# Select every rating in `data` given to one example movie.
item = 58047
item_ratings = data.loc[data['movieId'].eq(item)]
item_ratings

Unnamed: 0,userId,movieId,rating
11904020,77221,58047,4.0
23514361,152676,58047,5.0
2987308,19718,58047,3.5
22721932,147727,58047,4.0
13700475,88758,58047,4.0
18623435,120677,58047,5.0
24951853,162271,58047,2.5
10403412,67488,58047,4.0
19786412,128566,58047,4.5
13007876,84232,58047,3.5


### Item Ratings Centered by Average User Rating

In [14]:
# Mean rating (rubar) over all of `data` for each user who rated this movie.
# One filtered groupby replaces a full scan of `data` per user, and there is
# no loop variable left to overwrite the notebook-level `user`.
# `reindex` restores the order in which the users first appear in
# `item_ratings`, because groupby sorts by userId.
item_users = item_ratings['userId'].unique()
rubar = (
    data.loc[data['userId'].isin(item_users)]
    .groupby('userId')['rating']
    .mean()
    .reindex(item_users)
    .rename_axis('userId')
    .reset_index(name='rubar')
)
rubar

Unnamed: 0,userId,rubar
0,77221,4.0
1,152676,5.0
2,19718,4.2
3,147727,3.62
4,88758,3.5
5,120677,5.0
6,162271,2.64
7,67488,3.45
8,128566,4.5
9,84232,3.67


In [15]:
# Attach each user's mean rating (rubar) to the movie's rows, then center the
# ratings on it: rating_cbu = rating - rubar ("centered by user").
item_ratings = pd.merge(item_ratings, rubar, how='left', on='userId')
item_ratings['rating_cbu'] = item_ratings['rating'].sub(item_ratings['rubar'])
item_ratings

Unnamed: 0,userId,movieId,rating,rubar,rating_cbu
0,77221,58047,4.0,4.0,0.0
1,152676,58047,5.0,5.0,0.0
2,19718,58047,3.5,4.2,-0.7
3,147727,58047,4.0,3.62,0.38
4,88758,58047,4.0,3.5,0.5
5,120677,58047,5.0,5.0,0.0
6,162271,58047,2.5,2.64,-0.14
7,67488,58047,4.0,3.45,0.55
8,128566,58047,4.5,4.5,0.0
9,84232,58047,3.5,3.67,-0.17


### Item Ratings Centered by Average Item Rating

In [16]:
# Mean rating of the movie (ribar), computed from the rows of item_ratings.
ribar = (
    item_ratings
    .groupby('movieId', as_index=False)['rating']
    .mean()
    .rename(columns={'rating': 'ribar'})
)
# Attach it to every row and center: rating_cbi = rating - ribar ("centered by item").
item_ratings = pd.merge(item_ratings, ribar, how='left', on='movieId')
item_ratings['rating_cbi'] = item_ratings['rating'].sub(item_ratings['ribar'])
item_ratings
# Column totals of the numeric columns.
item_ratings.sum(axis=0, numeric_only=True)

Unnamed: 0,userId,movieId,rating,rubar,rating_cbu,ribar,rating_cbi
0,77221,58047,4.0,4.0,0.0,4.07,-0.07
1,152676,58047,5.0,5.0,0.0,4.07,0.93
2,19718,58047,3.5,4.2,-0.7,4.07,-0.57
3,147727,58047,4.0,3.62,0.38,4.07,-0.07
4,88758,58047,4.0,3.5,0.5,4.07,-0.07
5,120677,58047,5.0,5.0,0.0,4.07,0.93
6,162271,58047,2.5,2.64,-0.14,4.07,-1.57
7,67488,58047,4.0,3.45,0.55,4.07,-0.07
8,128566,58047,4.5,4.5,0.0,4.07,0.43
9,84232,58047,3.5,3.67,-0.17,4.07,-0.57


userId       1,315,357.00
movieId        812,658.00
rating              57.00
rubar               55.04
rating_cbu           1.96
ribar               57.00
rating_cbi           0.00
dtype: float64

### Item Rating Norms

In [17]:
# Column-wise Euclidean (L2) norm of the raw, user-centered and item-centered ratings.
rating_data = item_ratings.loc[:, ['rating', 'rating_cbu', 'rating_cbi']]
np.sqrt(rating_data.pow(2).sum(axis=0))

rating       15.41
rating_cbu    2.05
rating_cbi    2.33
dtype: float64

## Cosine Similarity