# Scoring
## Compute Weights

In [1]:
from recsys.core.services.io import IOService
from itertools import combinations
import pandas as pd
import numpy as np

In [2]:
FILEPATH = "data/raw/rating.pkl"
USERID = 118205
MOVIEID = 296

In [3]:

ratings = IOService.read(FILEPATH)
ratings.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000263 entries, 0 to 20000262
Data columns (total 3 columns):
 #   Column   Dtype  
---  ------   -----  
 0   userId   int64  
 1   movieId  int64  
 2   rating   float64
dtypes: float64(1), int64(2)
memory usage: 457.8 MB


In [4]:
omega_j = ratings.loc[(ratings["movieId"] == MOVIEID) & (ratings["userId"] != USERID) ]["userId"].values
len(omega_j)
omega_j[0:10]

67309

array([ 1,  8, 11, 13, 15, 18, 21, 22, 23, 24])

In [5]:
def score(i,j):
    omega_j = ratings.loc[(ratings["movieId"] == j) & (ratings["userId"] != i) ]["userId"].values
    

In [6]:
ratings_user_i = ratings.loc[(ratings["userId"] == USERID) & ((ratings["userId"] != MOVIEID))]


In [7]:
ratings_user_i.info()
ratings_user_i.head()
ratings_user_i.shape

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9254 entries, 17085594 to 17094847
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   userId   9254 non-null   int64  
 1   movieId  9254 non-null   int64  
 2   rating   9254 non-null   float64
dtypes: float64(1), int64(2)
memory usage: 289.2 KB


Unnamed: 0,userId,movieId,rating
17085594,118205,1,4.0
17085595,118205,2,4.0
17085596,118205,3,3.0
17085597,118205,4,3.0
17085598,118205,5,3.0


(9254, 3)

In [8]:
neighbors = ratings.loc[ratings["userId"].isin(omega_j)]
neighbors.columns = ["neighborId", "movieId", "neighbor_rating"]

In [9]:
neighbors.info()
neighbors.head()
neighbors.shape


<class 'pandas.core.frame.DataFrame'>
Int64Index: 14465215 entries, 0 to 20000262
Data columns (total 3 columns):
 #   Column           Dtype  
---  ------           -----  
 0   neighborId       int64  
 1   movieId          int64  
 2   neighbor_rating  float64
dtypes: float64(1), int64(2)
memory usage: 441.4 MB


Unnamed: 0,neighborId,movieId,neighbor_rating
0,1,2,3.5
1,1,29,3.5
2,1,32,3.5
3,1,47,3.5
4,1,50,3.5


(14465215, 3)

In [10]:
user_neighbor = ratings_user_i.merge(right=neighbors, how="inner", on="movieId").sort_values(by=["neighborId","movieId"])

In [11]:
user_neighbor.info()
user_neighbor.head()
user_neighbor.shape
user_neighbor["neighborId"].nunique()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 13202455 entries, 30784 to 13052274
Data columns (total 5 columns):
 #   Column           Dtype  
---  ------           -----  
 0   userId           int64  
 1   movieId          int64  
 2   rating           float64
 3   neighborId       int64  
 4   neighbor_rating  float64
dtypes: float64(2), int64(3)
memory usage: 604.4 MB


Unnamed: 0,userId,movieId,rating,neighborId,neighbor_rating
30784,118205,2,4.0,1,3.5
230984,118205,29,4.0,1,3.5
244796,118205,32,4.0,1,3.5
359266,118205,47,4.0,1,3.5
403468,118205,50,4.5,1,3.5


(13202455, 5)

67309

In [12]:
weights = user_neighbor.groupby(by=["userId","neighborId"], group_keys=False)[["rating", "neighbor_rating"]].apply(lambda x: (x["rating"].dot(x["neighbor_rating"])/(np.linalg.norm(x["rating"]) * np.linalg.norm(x["neighbor_rating"])))).to_frame().reset_index()
weights.columns = ["userId", "neighborId", "weight"]
weights = weights.sort_values(by="weight", ascending=False)
weights["weight_abs"] = np.abs(user_neighbor["weight"])

In [13]:
weights.info()
weights.head()
weights.shape

<class 'pandas.core.frame.DataFrame'>
Int64Index: 67309 entries, 3175 to 66423
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   userId      67309 non-null  int64  
 1   neighborId  67309 non-null  int64  
 2   weight      67309 non-null  float64
dtypes: float64(1), int64(2)
memory usage: 2.1 MB


Unnamed: 0,userId,neighborId,weight
3175,118205,6441,0.9970361798508864
35798,118205,73927,0.9964263778815756
58138,118205,119462,0.9959476313225942
24908,118205,51442,0.995689251555982
43458,118205,89360,0.9955907945411626


(67309, 3)

In [14]:
user_neighbor = user_neighbor.merge(weights[["neighborId", "weight"]], how="inner", on="neighborId")


In [15]:
user_neighbor.info()
user_neighbor.head()
user_neighbor.shape

<class 'pandas.core.frame.DataFrame'>
Int64Index: 13202455 entries, 0 to 13202454
Data columns (total 7 columns):
 #   Column           Dtype  
---  ------           -----  
 0   userId           int64  
 1   movieId          int64  
 2   rating           float64
 3   neighborId       int64  
 4   neighbor_rating  float64
 5   weight           float64
 6   weight_abs       float64
dtypes: float64(4), int64(3)
memory usage: 805.8 MB


Unnamed: 0,userId,movieId,rating,neighborId,neighbor_rating,weight,weight_abs
0,118205,2,4.0,1,3.5,0.986789474771418,0.986789474771418
1,118205,29,4.0,1,3.5,0.986789474771418,0.986789474771418
2,118205,32,4.0,1,3.5,0.986789474771418,0.986789474771418
3,118205,47,4.0,1,3.5,0.986789474771418,0.986789474771418
4,118205,50,4.5,1,3.5,0.986789474771418,0.986789474771418


(13202455, 7)