# 간단한 추천시스템 만들기

1. MovieLens 데이터셋 불러오기
2. MovieLens 데이터셋 중 학습셋과 평가셋 나누기
3. 간단한 추천 알고리즘 만들기(평점을 예측하고 평가는 RMSE로 한다)

## 필요한 라이브러리 정의 (Configuration)

In [1]:
import os
from tqdm import tqdm
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from scipy.sparse import csr_matrix

import warnings
warnings.filterwarnings("ignore")

## MovieLens 데이터셋 불러오기

* ratings.csv, movies.csv, tags.csv

In [2]:
# 데이터 Path 설정
path = '../data/movielens/'

In [3]:
ratings_df = pd.read_csv(os.path.join(path, 'ratings.csv'), encoding='utf-8')
movies_df = pd.read_csv(os.path.join(path, 'movies.csv'), index_col='movieId', encoding='utf-8')
tags_df = pd.read_csv(os.path.join(path, 'tags.csv'), encoding='utf-8')

In [4]:
print(ratings_df.shape)
print(ratings_df.head())

(100836, 4)
   userId  movieId  rating  timestamp
0       1        1     4.0  964982703
1       1        3     4.0  964981247
2       1        6     4.0  964982224
3       1       47     5.0  964983815
4       1       50     5.0  964982931


### ratings 데이터 정보 확인하기

* 몇 명의 유저가 몇 개의 영화에 평점을 줬는지
* 각 유저가 어떤 영화에 평점을 줬는지 sparse matrix

In [5]:
num_users = ratings_df['userId'].unique()
num_movies = ratings_df['movieId'].unique()

print("총 유저 수: ", len(num_users))
print("총 영화 수: ", len(num_movies))

총 유저 수:  610
총 영화 수:  9724


* pandas dataframe의 pivot: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.pivot.html

In [6]:
# pivot ratings into movie features
user_movie_matrix = ratings_df.pivot(
    index='movieId',
    columns='userId',
    values='rating'
).fillna(0)
 
# convert dataframe of movie features to scipy sparse matrix
sparse_mat = csr_matrix(user_movie_matrix.values)

In [7]:
print(user_movie_matrix)

userId   1    2    3    4    5    6    7    8    9    10   ...  601  602  603  \
movieId                                                    ...                  
1        4.0  0.0  0.0  0.0  4.0  0.0  4.5  0.0  0.0  0.0  ...  4.0  0.0  4.0   
2        0.0  0.0  0.0  0.0  0.0  4.0  0.0  4.0  0.0  0.0  ...  0.0  4.0  0.0   
3        4.0  0.0  0.0  0.0  0.0  5.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   
4        0.0  0.0  0.0  0.0  0.0  3.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   
5        0.0  0.0  0.0  0.0  0.0  5.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   
...      ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...   
193581   0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   
193583   0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   
193585   0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   
193587   0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   
193609   0.0  0.0  0.0  0.0 

In [8]:
print(sparse_mat)

  (0, 0)	4.0
  (0, 4)	4.0
  (0, 6)	4.5
  (0, 14)	2.5
  (0, 16)	4.5
  (0, 17)	3.5
  (0, 18)	4.0
  (0, 20)	3.5
  (0, 26)	3.0
  (0, 30)	5.0
  (0, 31)	3.0
  (0, 32)	3.0
  (0, 39)	5.0
  (0, 42)	5.0
  (0, 43)	3.0
  (0, 44)	4.0
  (0, 45)	5.0
  (0, 49)	3.0
  (0, 53)	3.0
  (0, 56)	5.0
  (0, 62)	5.0
  (0, 63)	4.0
  (0, 65)	4.0
  (0, 67)	2.5
  (0, 70)	5.0
  :	:
  (9700, 337)	2.5
  (9701, 337)	3.0
  (9702, 183)	4.0
  (9702, 247)	3.5
  (9703, 317)	2.5
  (9704, 209)	1.0
  (9705, 461)	2.5
  (9706, 49)	3.5
  (9707, 337)	1.5
  (9708, 337)	4.0
  (9709, 337)	1.0
  (9710, 337)	1.5
  (9711, 337)	1.0
  (9712, 337)	1.0
  (9713, 183)	4.5
  (9714, 183)	3.5
  (9715, 183)	3.0
  (9716, 183)	4.0
  (9717, 183)	4.0
  (9718, 183)	3.5
  (9719, 183)	4.0
  (9720, 183)	3.5
  (9721, 183)	3.5
  (9722, 183)	3.5
  (9723, 330)	4.0


In [9]:
user_info_df = pd.DataFrame(data = [sum(list(user_movie_matrix[int(x)].value_counts())[1:]) for x in user_movie_matrix.columns],
                           index = user_movie_matrix.columns, columns=['movies_rated'])

# sum(list(user_movie_matrix[1].value_counts())[1:])

In [10]:
user_info_df

Unnamed: 0_level_0,movies_rated
userId,Unnamed: 1_level_1
1,232
2,29
3,39
4,216
5,44
...,...
606,1115
607,187
608,831
609,37


In [11]:
movie_info_df = pd.DataFrame(data = [sum(list(user_movie_matrix.loc[int(x)].value_counts())[1:]) for x in user_movie_matrix.index],
                           index = user_movie_matrix.index, columns=['users_rated'])


In [12]:
movie_info_df

Unnamed: 0_level_0,users_rated
movieId,Unnamed: 1_level_1
1,215
2,110
3,52
4,7
5,49
...,...
193581,1
193583,1
193585,1
193587,1


## MovieLens 데이터셋 중 학습셋과 평가셋 나누기

In [13]:
train_df, test_df = train_test_split(ratings_df, test_size=0.3, random_state=1234)

In [14]:
print(train_df.shape)
print(test_df.shape)

(70585, 4)
(30251, 4)


### test set에는 존재하지만, train set에는 없는 영화 또는 사용자 비율

In [15]:
# 집합 A - 집합 B => 집합 B에는 없고 집합 A에만 있는 item 

# userId
print("사용자: ",len(list(set(test_df['userId'].unique()) - set(train_df['userId'].unique()))))         # 제외 User 수

# movieId
print("영화: ", len(list(set(test_df['movieId'].unique()) - set(train_df['movieId'].unique()))))        # 제외 영화
print("test set의 전체 영화 수: ",  len(test_df['movieId'].unique()))
print("train set의 전체 영화 수: ",  len(train_df['movieId'].unique()))
print("전체 영화 수: ", len(ratings_df['movieId'].unique()))

사용자:  0
영화:  1209
test set의 전체 영화 수:  6151
train set의 전체 영화 수:  8515
전체 영화 수:  9724


In [16]:
movies_not_included = list(set(test_df['movieId'].unique()) - set(train_df['movieId'].unique()))
print(sorted(movies_not_included)[:10])

not_included_df = test_df[test_df.movieId.isin(movies_not_included)].sort_values(by='movieId')
print(not_included_df.head(10))

print("train set에 없고, test set에만 있는 영화 데이터 수: ", not_included_df.shape)

[49, 117, 137, 178, 179, 214, 220, 241, 270, 279]
       userId  movieId  rating  timestamp
29386     202       49     3.0  974925453
97066     604      117     3.0  832080636
99501     609      137     3.0  847221054
27959     191      178     1.0  829760898
62376     414      179     2.0  961514335
632         6      179     1.0  845555362
96154     603      214     4.0  963179452
13114      84      214     4.0  858771796
97086     604      220     3.0  832080636
98493     607      241     4.0  964744490
train set에 없고, test set에만 있는 영화 데이터 수:  (1390, 4)


## 간단한 추천알고리즘 만들기

1. 랜덤으로 평점 예측하기
2. 영화 평균 평점기반 예측하기
3. 사용자 평균 평점기반 예측하기
4. Rule기반 영화 랭킹 예측하기
    
* test에 있고, train에 없는 경우

### 랜덤으로 평점 예측하기

In [17]:
# 0.5 - 5.0사이의 숫자를 예측해야할 평점 수 만큼 generate
ratings_range = np.arange(0.5, 5.5, step=0.5)
ratings_range

array([0.5, 1. , 1.5, 2. , 2.5, 3. , 3.5, 4. , 4.5, 5. ])

In [18]:
import random
pred_random = [random.choice(ratings_range) for x in range(len(test_df))]
pred_random[:10]

[2.5, 3.0, 4.0, 1.5, 1.0, 0.5, 5.0, 2.0, 4.0, 3.0]

In [19]:
test_df['pred_ratings_random'] = pred_random
test_df

Unnamed: 0,userId,movieId,rating,timestamp,pred_ratings_random
99731,610,3527,5.0,1479545223,2.5
97583,606,1250,3.5,1171376891,3.0
38197,262,213,5.0,840310907,4.0
11474,68,69406,3.0,1261622505,1.5
34105,232,4728,3.0,1218166950,1.0
...,...,...,...,...,...
24735,172,1982,3.0,1238716057,0.5
5825,41,69757,3.5,1458938869,2.5
11128,68,6893,3.5,1158534072,3.5
2231,18,117176,4.0,1473004722,2.0


In [20]:
mse = mean_squared_error(y_true=test_df['rating'].values, y_pred=test_df['pred_ratings_random'].values)
rmse = np.sqrt(mse)

print(mse, rmse)

3.6997950480975836 1.9234851307191287


###  영화 평균 평점기반 예측하기

1. train set의 모든 영화에 대해서 평균 평점 구하기
2. test set 예측할 때, train set의 영화 평균 평점 활용하기. 만약 없다면, random으로 선택하기.

In [21]:
train_movie_df = train_df.groupby('movieId').mean()

print(train_movie_df.shape)
print(train_movie_df.head())

(8515, 3)
             userId    rating     timestamp
movieId                                    
1        307.115646  3.884354  1.128729e+09
2        321.148649  3.432432  1.161156e+09
3        277.150000  3.462500  9.941003e+08
4        192.750000  2.250000  8.425133e+08
5        305.685714  3.042857  1.020735e+09


In [22]:
def avg_rating_prediction(training_set, x):                 # 영화의 평균 평점 기반으로 예상 평점 구하기
    if x in training_set.index:
        pred_rating = training_set.loc[x]['rating']
    else:
        pred_rating = random.choice(ratings_range)
    return pred_rating

In [23]:
test_df['pred_rating_movie'] = test_df['movieId'].apply(lambda x: avg_rating_prediction(train_movie_df, x))

test_df.head()

Unnamed: 0,userId,movieId,rating,timestamp,pred_ratings_random,pred_rating_movie
99731,610,3527,5.0,1479545223,2.5,3.592105
97583,606,1250,3.5,1171376891,3.0,4.140625
38197,262,213,5.0,840310907,4.0,3.75
11474,68,69406,3.0,1261622505,1.5,3.454545
34105,232,4728,3.0,1218166950,1.0,2.769231


In [24]:
mse = mean_squared_error(y_true=test_df['rating'].values, y_pred=test_df['pred_rating_movie'].values)
rmse = np.sqrt(mse)

print(mse, rmse)

1.0715493316455182 1.035156670096618


### 사용자 평균 평점기반 예측하기

1. train set의 모든 유저가 준 평균 평점
2. test set 예측할 때, 유저가 train set에서 준 평균 평점을 활용. 유저가 없을 경우 random 평점 적용

In [25]:
train_user_df = train_df.groupby('userId').mean()

print(train_user_df.shape)
print(train_user_df.head())

(610, 3)
             movieId    rating     timestamp
userId                                      
1        1876.043210  4.271605  9.649824e+08
2       71289.000000  3.954545  1.445715e+09
3       10552.681818  2.340909  1.306464e+09
4        1953.664516  3.567742  9.659816e+08
5         328.250000  3.678571  8.474351e+08


In [26]:
test_df['pred_rating_user'] = test_df['userId'].apply(lambda x: avg_rating_prediction(train_user_df, x))

test_df.head()

Unnamed: 0,userId,movieId,rating,timestamp,pred_ratings_random,pred_rating_movie,pred_rating_user
99731,610,3527,5.0,1479545223,2.5,3.592105,3.685022
97583,606,1250,3.5,1171376891,3.0,4.140625,3.662127
38197,262,213,5.0,840310907,4.0,3.75,2.945946
11474,68,69406,3.0,1261622505,1.5,3.454545,3.237079
34105,232,4728,3.0,1218166950,1.0,2.769231,3.26509


In [27]:
mse = mean_squared_error(y_true=test_df['rating'].values, y_pred=test_df['pred_rating_user'].values)
rmse = np.sqrt(mse)

print(mse, rmse)

0.8848494933593299 0.9406643893330554


### Rule 기반 영화 평점 예측하기

1. train set에 포함된 유저의 영화 평균 평점과 영화의 장르를 활용하여, 장르별 평균 평점 계산 -> test set의 영화 장르의 평균 평점으로 예측

In [28]:
# create user_movie matrix by only using train_df
train_user_movie_matrix = train_df.pivot(
    index='movieId',
    columns='userId',
    values='rating'
).fillna(0)

In [29]:
train_user_movie_matrix

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,4.0,0.0,4.5,0.0,0.0,0.0,...,4.0,0.0,4.0,0.0,4.0,2.5,0.0,2.5,0.0,5.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,...,0.0,4.0,0.0,0.0,3.5,0.0,0.0,0.0,0.0,0.0
3,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193571,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
193573,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
193579,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
193581,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [30]:
genres_df = movies_df['genres'].str.get_dummies(sep='|')
print(genres_df.shape)
genres_df = genres_df.loc[train_df.movieId.unique()]
print(genres_df.shape)
genres_df.head()

(9742, 20)
(8515, 20)


Unnamed: 0_level_0,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
4591,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0
161,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,1,0
5073,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
5128,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0
1186,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0


In [31]:
# trainset에서 영화별 유저 평점 평균
train_movie_avg_ratings_df = train_user_movie_matrix.copy()
train_movie_avg_ratings_df = train_movie_avg_ratings_df.replace(0, np.NaN)
train_movie_avg_ratings_df = train_movie_avg_ratings_df.mean(axis = 1)

train_movie_avg_ratings_df.head()

movieId
1    3.884354
2    3.432432
3    3.462500
4    2.250000
5    3.042857
dtype: float64

In [32]:
# genres_df에서 해당 장르가 포함된 모든 영화 index를 가져와서, 해당 영화의 유저 평균 평점의 평균을 구해서 장르 평균 평점으로 활용
genres_avg_ratings_df = pd.DataFrame(index=genres_df.columns, columns=['avg_ratings'])

for genre in genres_avg_ratings_df.index:
    genre_avg_rating = train_movie_avg_ratings_df.loc[genres_df[genres_df[genre].isin([1])].index].mean()
    genres_avg_ratings_df.loc[genre]['avg_ratings'] = genre_avg_rating

genres_avg_ratings_df

Unnamed: 0,avg_ratings
(no genres listed),3.46181
Action,3.1208
Adventure,3.23623
Animation,3.47415
Children,3.10769
Comedy,3.17972
Crime,3.31178
Documentary,3.77086
Drama,3.43173
Fantasy,3.24693


In [33]:
def get_genre_avg_ratings(x):
    genres_list = movies_df.loc[x]['genres'].split('|')
    rating = 0
    for genre in genres_list:
        rating += genres_avg_ratings_df.loc[genre]['avg_ratings']
    
    return rating / len(genres_list)

In [34]:
tqdm.pandas()                       # progress_apply 사용 - 진행상황 표시
test_df['pred_rating_genre'] = test_df['movieId'].progress_apply(lambda x: get_genre_avg_ratings(x))

100%|██████████| 30251/30251 [00:06<00:00, 4699.50it/s]


In [35]:
test_df

Unnamed: 0,userId,movieId,rating,timestamp,pred_ratings_random,pred_rating_movie,pred_rating_user,pred_rating_genre
99731,610,3527,5.0,1479545223,2.5,3.592105,3.685022,3.141247
97583,606,1250,3.5,1171376891,3.0,4.140625,3.662127,3.410769
38197,262,213,5.0,840310907,4.0,3.750000,2.945946,3.431731
11474,68,69406,3.0,1261622505,1.5,3.454545,3.237079,3.260991
34105,232,4728,3.0,1218166950,1.0,2.769231,3.265090,3.179721
...,...,...,...,...,...,...,...,...
24735,172,1982,3.0,1238716057,0.5,3.805556,3.900000,2.909010
5825,41,69757,3.5,1458938869,2.5,3.727273,3.175497,3.317904
11128,68,6893,3.5,1158534072,3.5,4.166667,3.237079,3.204100
2231,18,117176,4.0,1473004722,2.0,3.791667,3.736915,3.386996


In [36]:
mse = mean_squared_error(y_true=test_df['rating'].values, y_pred=test_df['pred_rating_genre'].values)
rmse = np.sqrt(mse)

print(mse, rmse)

1.1200127449284865 1.05830654582143


### Rule 기반 영화 평점 예측하기

2. user의 평균 영화 평점을 normalize해서 확인하기, 평점 측정 수, 표준편차 등 활용가능 

In [37]:
train_user_info_df = pd.DataFrame({
    'avg_ratings': train_df.groupby('userId')['rating'].mean(),
    'std_ratings': train_df.groupby('userId')['rating'].std(),
    'count_ratings': train_df.groupby('userId')['rating'].count()
})

train_user_info_df

Unnamed: 0_level_0,avg_ratings,std_ratings,count_ratings
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,4.271605,0.863509,162
2,3.954545,0.843873,22
3,2.340909,2.151336,22
4,3.567742,1.324197,155
5,3.678571,0.983327,28
...,...,...,...
606,3.662127,0.719157,771
607,3.803150,0.984316,127
608,3.158895,1.085355,579
609,3.285714,0.460044,28


In [38]:
min_count = train_user_info_df['count_ratings'].min()
max_count = train_user_info_df['count_ratings'].max()
avg_count = train_user_info_df['count_ratings'].mean()

train_user_info_df['weights'] = train_user_info_df['count_ratings'].apply(lambda x: (x-avg_count)/(max_count-min_count))

In [39]:
train_user_info_df

Unnamed: 0_level_0,avg_ratings,std_ratings,count_ratings,weights
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,4.271605,0.863509,162,0.024465
2,3.954545,0.843873,22,-0.049531
3,2.340909,2.151336,22,-0.049531
4,3.567742,1.324197,155,0.020765
5,3.678571,0.983327,28,-0.046360
...,...,...,...,...
606,3.662127,0.719157,771,0.346346
607,3.803150,0.984316,127,0.005966
608,3.158895,1.085355,579,0.244866
609,3.285714,0.460044,28,-0.046360


In [40]:
from sklearn import preprocessing

min_max_scaler = preprocessing.MinMaxScaler()
np_scaled = min_max_scaler.fit_transform(train_user_info_df)
df_normalized = pd.DataFrame(np_scaled, columns = train_user_info_df.columns, index=train_user_info_df.index)
df_normalized

Unnamed: 0_level_0,avg_ratings,std_ratings,count_ratings,weights
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,0.808704,0.401383,0.080338,0.080338
2,0.725436,0.392255,0.006342,0.006342
3,0.301653,1.000000,0.006342,0.006342
4,0.623851,0.615523,0.076638,0.076638
5,0.652958,0.457077,0.009514,0.009514
...,...,...,...,...
606,0.648639,0.334284,0.402220,0.402220
607,0.685676,0.457537,0.061839,0.061839
608,0.516477,0.504503,0.300740,0.300740
609,0.549784,0.213841,0.009514,0.009514


In [41]:
df_normalized['normalized_avg_ratings'] = df_normalized['avg_ratings'] * 5
df_normalized

Unnamed: 0_level_0,avg_ratings,std_ratings,count_ratings,weights,normalized_avg_ratings
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,0.808704,0.401383,0.080338,0.080338,4.043522
2,0.725436,0.392255,0.006342,0.006342,3.627181
3,0.301653,1.000000,0.006342,0.006342,1.508264
4,0.623851,0.615523,0.076638,0.076638,3.119257
5,0.652958,0.457077,0.009514,0.009514,3.264791
...,...,...,...,...,...
606,0.648639,0.334284,0.402220,0.402220,3.243197
607,0.685676,0.457537,0.061839,0.061839,3.428378
608,0.516477,0.504503,0.300740,0.300740,2.582387
609,0.549784,0.213841,0.009514,0.009514,2.748918


In [42]:
test_df['pred_rating_normalized'] = test_df['userId'].progress_apply(lambda x: df_normalized.loc[x]['normalized_avg_ratings'])
test_df

100%|██████████| 30251/30251 [00:02<00:00, 13853.95it/s]


Unnamed: 0,userId,movieId,rating,timestamp,pred_ratings_random,pred_rating_movie,pred_rating_user,pred_rating_genre,pred_rating_normalized
99731,610,3527,5.0,1479545223,2.5,3.592105,3.685022,3.141247,3.273261
97583,606,1250,3.5,1171376891,3.0,4.140625,3.662127,3.410769,3.243197
38197,262,213,5.0,840310907,4.0,3.750000,2.945946,3.431731,2.302757
11474,68,69406,3.0,1261622505,1.5,3.454545,3.237079,3.260991,2.685053
34105,232,4728,3.0,1218166950,1.0,2.769231,3.265090,3.179721,2.721835
...,...,...,...,...,...,...,...,...,...
24735,172,1982,3.0,1238716057,0.5,3.805556,3.900000,2.909010,3.555556
5825,41,69757,3.5,1458938869,2.5,3.727273,3.175497,3.317904,2.604188
11128,68,6893,3.5,1158534072,3.5,4.166667,3.237079,3.204100,2.685053
2231,18,117176,4.0,1473004722,2.0,3.791667,3.736915,3.386996,3.341403


In [43]:
mse = mean_squared_error(y_true=test_df['rating'].values, y_pred=test_df['pred_rating_normalized'].values)
rmse = np.sqrt(mse)

print(mse, rmse)

1.1219932449166226 1.0592418255132408
