# Test Ratings Workbook

In [1]:
import numpy as np
from tqdm import tqdm
from itertools import combinations
import pandas as pd
from recsys.data.rating import RatingsDataset
from recsys.io.file import IOService

# Data

In [2]:
# Location of the pickled ratings frame that the next cell loads through IOService.
# NOTE(review): the file name suggests a 0.5% development sample — confirm.
FILEPATH = "data/dev/ratings_0.5_pct.pkl"

In [3]:
# Read the ratings file and keep only the three columns this workbook uses.
data = IOService.read(FILEPATH)[['userId', 'movieId', 'rating']]

In [4]:
# Print the frame's row count, column dtypes, and memory footprint.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25000 entries, 0 to 24999
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype   
---  ------   --------------  -----   
 0   userId   25000 non-null  category
 1   movieId  25000 non-null  category
 2   rating   25000 non-null  float64 
dtypes: category(2), float64(1)
memory usage: 2.1 MB


In [5]:
# Count ratings per user, then keep the ids of users with 3 to 5 ratings
# (both bounds inclusive).
user_counts = (
    data['userId']
    .value_counts()
    .to_frame()
    .reset_index()
)
user_counts.columns = ['userId', 'n_ratings']
users = user_counts.loc[user_counts['n_ratings'].between(3, 5), 'userId']

In [6]:
# Count ratings per movie, then keep the ids of movies with 3 to 5 ratings
# (both bounds inclusive).
item_counts = (
    data['movieId']
    .value_counts()
    .to_frame()
    .reset_index()
)
item_counts.columns = ['movieId', 'n_ratings']
items = item_counts.loc[item_counts['n_ratings'].between(3, 5), 'movieId']

In [7]:
# Ratings where both the movie and the user passed the 3-to-5 ratings filter.
data.loc[data['movieId'].isin(items) & data['userId'].isin(users)]

Unnamed: 0,userId,movieId,rating
10,97615,58047,5.00
47,37244,109848,2.50
75,158258,7669,4.50
115,68122,1223,3.50
122,103769,41997,3.50
...,...,...,...
24876,82918,55250,3.00
24926,89025,201646,3.00
24931,45165,969,4.50
24933,91560,85881,4.00


## User Ratings

In [8]:
# Pick one user and pull every rating they gave.
user = 97615
user_ratings = data.loc[data['userId'] == user]
user_ratings


Unnamed: 0,userId,movieId,rating
10,97615,58047,5.0
12207,97615,95309,5.0
13022,97615,113453,4.0
17451,97615,152081,4.5
24633,97615,4816,4.0


### User Ratings Centered by Average User Rating

In [9]:
# Centre each rating on the rating user's own mean rating.
#   rubar      : mean rating of the user, broadcast to every one of their rows
#   rating_cbu : rating - rubar
#
# transform('mean') returns a value per row, so no merge (and no rating_x /
# rating_y suffix juggling or positional column renaming) is needed. Because
# the columns are assigned by name, the cell can be re-run safely: a second run
# simply overwrites rubar / rating_cbu with the same values.
# observed=True restricts the categorical groupby to users actually present.
user_mean = user_ratings.groupby('userId', observed=True)['rating'].transform('mean')
user_ratings = (
    user_ratings
    .assign(rubar=user_mean, rating_cbu=user_ratings['rating'] - user_mean)
    .reset_index(drop=True)  # same 0..n-1 index the previous merge produced
)
user_ratings

Unnamed: 0,userId,movieId,rating,rubar,rating_cbu
0,97615,58047,5.0,4.5,0.5
1,97615,95309,5.0,4.5,0.5
2,97615,113453,4.0,4.5,-0.5
3,97615,152081,4.5,4.5,0.0
4,97615,4816,4.0,4.5,-0.5


### User Ratings Centered by Average Item Rating

In [10]:
# Mean rating of each movie this user rated, computed over every rating of that
# movie in `data` (not just this user's rows).
#
# A comprehension keeps the loop variable out of the notebook namespace; the
# previous `for item in ...` loop left a global `item` behind, a name a later
# cell also assigns. The intermediate dict gets its own name so `ribar` is only
# ever the resulting DataFrame.
item_means = {
    movie_id: data.loc[data['movieId'] == movie_id, 'rating'].mean()
    for movie_id in user_ratings['movieId'].values
}
ribar = pd.DataFrame.from_dict(data=item_means, orient='index').reset_index()
ribar.columns = ['movieId', 'ribar']
ribar


Unnamed: 0,movieId,ribar
0,58047,4.0
1,95309,3.5
2,113453,4.0
3,152081,3.85
4,4816,3.17


In [11]:
# Attach each movie's mean rating (ribar) and centre the user's ratings on it:
#   rating_cbi = rating - ribar
#
# Columns left over from an earlier run of this cell are dropped first;
# otherwise a re-run would merge into a frame that already has `ribar`,
# pandas would emit ribar_x / ribar_y, and the subtraction below would raise
# KeyError. errors='ignore' makes the drop a no-op on the first run.
user_ratings = (
    user_ratings
    .drop(columns=['ribar', 'rating_cbi'], errors='ignore')
    .merge(ribar, on='movieId', how='left')
)
user_ratings['rating_cbi'] = user_ratings['rating'] - user_ratings['ribar']
user_ratings
# Column totals of the numeric columns.
user_ratings.sum(axis=0, numeric_only=True)

Unnamed: 0,userId,movieId,rating,rubar,rating_cbu,ribar,rating_cbi
0,97615,58047,5.0,4.5,0.5,4.0,1.0
1,97615,95309,5.0,4.5,0.5,3.5,1.5
2,97615,113453,4.0,4.5,-0.5,4.0,0.0
3,97615,152081,4.5,4.5,0.0,3.85,0.65
4,97615,4816,4.0,4.5,-0.5,3.17,0.83


movieId      423,706.00
rating            22.50
rubar             22.50
rating_cbu         0.00
ribar             18.52
rating_cbi         3.98
dtype: float64

### User Rating Norms

In [12]:
# Euclidean (L2) norm of the raw, user-centred and item-centred rating columns.
rating_data = user_ratings[['rating', 'rating_cbu', 'rating_cbi']]
np.sqrt((rating_data ** 2).sum(axis=0))


rating       10.11
rating_cbu    1.00
rating_cbi    2.09
dtype: float64

## Item Ratings

In [13]:
# Pick one movie and pull every rating it received.
item = 58047
item_ratings = data.loc[data['movieId'] == item]
item_ratings

Unnamed: 0,userId,movieId,rating
10,97615,58047,5.0
2130,117657,58047,3.0
4821,148771,58047,4.0


### Item Ratings Centered by Average User Rating

In [14]:
# Mean rating of each user who rated this movie, computed over every rating
# that user gave in `data` (not just their rating of this movie).
#
# A comprehension is used so the loop variable stays local: the previous
# `for user in ...` loop overwrote the notebook-level `user` chosen in the
# "User Ratings" section with the last user id iterated here. The intermediate
# dict gets its own name so `rubar` is only ever the resulting DataFrame.
user_means = {
    user_id: data.loc[data['userId'] == user_id, 'rating'].mean()
    for user_id in item_ratings['userId'].values
}
rubar = pd.DataFrame.from_dict(data=user_means, orient='index').reset_index()
rubar.columns = ['userId', 'rubar']
rubar

Unnamed: 0,userId,rubar
0,97615,4.5
1,117657,2.81
2,148771,4.0


In [15]:
# Attach each rater's mean rating (rubar) and centre the item's ratings on it:
#   rating_cbu = rating - rubar
#
# Columns left over from an earlier run of this cell are dropped first;
# otherwise a re-run would merge into a frame that already has `rubar`,
# pandas would emit rubar_x / rubar_y, and the subtraction below would raise
# KeyError. errors='ignore' makes the drop a no-op on the first run.
item_ratings = (
    item_ratings
    .drop(columns=['rubar', 'rating_cbu'], errors='ignore')
    .merge(rubar, on='userId', how='left')
)
item_ratings['rating_cbu'] = item_ratings['rating'] - item_ratings['rubar']
item_ratings

Unnamed: 0,userId,movieId,rating,rubar,rating_cbu
0,97615,58047,5.0,4.5,0.5
1,117657,58047,3.0,2.81,0.19
2,148771,58047,4.0,4.0,0.0


### Item Ratings Centered by Average Item Rating

In [16]:
# Centre the item's ratings on the item's own mean rating.
#   ribar      : mean of the ratings in item_ratings for that movie
#   rating_cbi : rating - ribar
#
# transform('mean') yields one value per row, so the columns are assigned by
# name instead of merged in. That keeps the cell re-runnable: merging a second
# time into a frame that already had `ribar` produced ribar_x / ribar_y and a
# KeyError. observed=True restricts the categorical groupby to movies actually
# present in item_ratings.
item_ratings = item_ratings.assign(
    ribar=lambda frame: frame.groupby('movieId', observed=True)['rating'].transform('mean'),
    rating_cbi=lambda frame: frame['rating'] - frame['ribar'],
)
# Keep `ribar` available as a movieId -> mean lookup table, as before.
ribar = item_ratings[['movieId', 'ribar']].drop_duplicates().reset_index(drop=True)
item_ratings
# Column totals of the numeric columns.
item_ratings.sum(axis=0, numeric_only=True)

Unnamed: 0,userId,movieId,rating,rubar,rating_cbu,ribar,rating_cbi
0,97615,58047,5.0,4.5,0.5,4.0,1.0
1,117657,58047,3.0,2.81,0.19,4.0,-1.0
2,148771,58047,4.0,4.0,0.0,4.0,0.0


userId       364,043.00
rating            12.00
rubar             11.31
rating_cbu         0.69
ribar             12.00
rating_cbi         0.00
dtype: float64

### Item Rating Norms

In [17]:
# Euclidean (L2) norm of the raw, user-centred and item-centred rating columns.
rating_data = item_ratings[['rating', 'rating_cbu', 'rating_cbi']]
np.sqrt((rating_data ** 2).sum(axis=0))

rating       7.07
rating_cbu   0.53
rating_cbi   1.41
dtype: float64