## MovieLens 데이터를 활용한 간단한 추천시스템

In [1]:
import os
from tqdm import tqdm
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from scipy.sparse import csr_matrix

import warnings 
warnings.filterwarnings('ignore')

### 데이터 가져오기

In [2]:
# Folder that holds the MovieLens "ml-latest-small" CSV files.
data_path = './data/ml-latest-small/'

# List the folder contents. os.listdir() returns entries in arbitrary order,
# so positions in this list are not stable across machines or filesystems.
data_list = os.listdir(data_path)
data_list

['links.csv', 'tags.csv', 'ratings.csv', 'README.txt', 'movies.csv']

In [3]:
# Load each CSV by its explicit file name. os.listdir() returns entries in
# arbitrary order, so selecting files by their position in data_list can
# silently read the wrong file on another machine or filesystem.
rating_df = pd.read_csv(os.path.join(data_path, 'ratings.csv'), encoding='utf-8')
# Movies are indexed by movieId so later cells can look genres up with .loc.
movie_df = pd.read_csv(os.path.join(data_path, 'movies.csv'), index_col='movieId', encoding='utf-8')
tag_df = pd.read_csv(os.path.join(data_path, 'tags.csv'), encoding='utf-8')

# Preview the first rows of every table.
display(rating_df.head())
display(movie_df.head())
display(tag_df.head())

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


Unnamed: 0_level_0,title,genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,Jumanji (1995),Adventure|Children|Fantasy
3,Grumpier Old Men (1995),Comedy|Romance
4,Waiting to Exhale (1995),Comedy|Drama|Romance
5,Father of the Bride Part II (1995),Comedy


Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


### rating_df 데이터 확인

In [4]:
# Count the distinct users and movies that appear in the ratings table.
num_users = rating_df['userId'].unique().size
num_movies = rating_df['movieId'].unique().size

print('사용자 수 : %d' % num_users)
print('영화 수 : %d' % num_movies)

사용자 수 : 610
영화 수 : 9724


In [5]:
## Build the (mostly empty) movie x user rating matrix
# Rows are movies and columns are users; each cell holds the rating that user
# gave that movie, and (movie, user) pairs without a rating become 0.

user_movie_pivot = rating_df.pivot(index='movieId', columns='userId', values='rating')
user_movie_pivot = user_movie_pivot.fillna(0)
user_movie_pivot

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,0.0,0.0,0.0,4.0,0.0,4.5,0.0,0.0,0.0,...,4.0,0.0,4.0,3.0,4.0,2.5,4.0,2.5,3.0,5.0
2,0.0,0.0,0.0,0.0,0.0,4.0,0.0,4.0,0.0,0.0,...,0.0,4.0,0.0,5.0,3.5,0.0,0.0,2.0,0.0,0.0
3,4.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193581,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
193583,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
193585,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
193587,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
# Convert the dense pivot table into a SciPy CSR sparse matrix and print it.

user_movie_mat = csr_matrix(user_movie_pivot.to_numpy())
print(user_movie_mat)

  (0, 0)	4.0
  (0, 4)	4.0
  (0, 6)	4.5
  (0, 14)	2.5
  (0, 16)	4.5
  (0, 17)	3.5
  (0, 18)	4.0
  (0, 20)	3.5
  (0, 26)	3.0
  (0, 30)	5.0
  (0, 31)	3.0
  (0, 32)	3.0
  (0, 39)	5.0
  (0, 42)	5.0
  (0, 43)	3.0
  (0, 44)	4.0
  (0, 45)	5.0
  (0, 49)	3.0
  (0, 53)	3.0
  (0, 56)	5.0
  (0, 62)	5.0
  (0, 63)	4.0
  (0, 65)	4.0
  (0, 67)	2.5
  (0, 70)	5.0
  :	:
  (9700, 337)	2.5
  (9701, 337)	3.0
  (9702, 183)	4.0
  (9702, 247)	3.5
  (9703, 317)	2.5
  (9704, 209)	1.0
  (9705, 461)	2.5
  (9706, 49)	3.5
  (9707, 337)	1.5
  (9708, 337)	4.0
  (9709, 337)	1.0
  (9710, 337)	1.5
  (9711, 337)	1.0
  (9712, 337)	1.0
  (9713, 183)	4.5
  (9714, 183)	3.5
  (9715, 183)	3.0
  (9716, 183)	4.0
  (9717, 183)	4.0
  (9718, 183)	3.5
  (9719, 183)	4.0
  (9720, 183)	3.5
  (9721, 183)	3.5
  (9722, 183)	3.5
  (9723, 330)	4.0


In [7]:
## Number of movies each user has rated => movies_num

# Unrated cells were filled with 0 when the pivot was built, so counting the
# non-zero cells in each user column gives that user's number of ratings.
# This replaces the value_counts()[1:] trick, which is only correct while 0
# happens to be the most frequent value in the column.
user_scores_df = (user_movie_pivot != 0).sum(axis=0).to_frame(name='movies_num')

user_scores_df

Unnamed: 0_level_0,movies_num
userId,Unnamed: 1_level_1
1,232
2,29
3,39
4,216
5,44
...,...
606,1115
607,187
608,831
609,37


In [8]:
# Users ranked by how many movies they rated, most active first.
user_scores_df.sort_values(by='movies_num',ascending=False)

Unnamed: 0_level_0,movies_num
userId,Unnamed: 1_level_1
414,2698
599,2478
474,2108
448,1864
274,1346
...,...
442,20
569,20
320,20
576,20


In [9]:
# Second row of the pivot (position 1): every user's rating for that movie, with 0.0 meaning "not rated".
user_movie_pivot.iloc[1]

userId
1      0.0
2      0.0
3      0.0
4      0.0
5      0.0
      ... 
606    0.0
607    0.0
608    2.0
609    0.0
610    0.0
Name: 2, Length: 610, dtype: float64

In [10]:
# Number of users who rated the movie in row position 1: count its non-zero
# cells directly instead of assuming 0 is the first value_counts() entry.
int((user_movie_pivot.iloc[1] != 0).sum())

110

In [11]:
## Number of users who rated each movie => users_num

# Count the non-zero cells in every movie row (0 marks "not rated" in the
# pivot). This does not depend on 0 being the most frequent value, unlike the
# value_counts()[1:] approach. The result keeps a positional 0..N-1 index
# (row order of user_movie_pivot), not the movieId.
movie_scores_df = pd.DataFrame({'users_num': (user_movie_pivot != 0).sum(axis=1).to_numpy()})
movie_scores_df = movie_scores_df.astype('int')
movie_scores_df

Unnamed: 0,users_num
0,215
1,110
2,52
3,7
4,49
...,...
9719,1
9720,1
9721,1
9722,1


In [12]:
# Movies ranked by number of ratings received; the index shown is the row position in movie_scores_df, not the movieId.
movie_scores_df.sort_values(by='users_num', ascending=False)

Unnamed: 0,users_num
314,329
277,317
257,307
510,279
1938,278
...,...
3053,1
3049,1
6687,1
3045,1


### Train / Test 데이터셋 분리

In [13]:
# Hold out 20% of the rating rows as the test set; the seed is fixed at 42.
train_df, test_df = train_test_split(rating_df, test_size=0.2, random_state=42)

for split_df in (train_df, test_df):
    print(split_df.shape)

(80668, 4)
(20168, 4)


## Test data에는 존재 -> Train data에는 없는 사용자와 영화

In [14]:
# Distinct user and movie ids present in the training split.
train_user_id = train_df['userId'].unique()
train_movie_id = train_df['movieId'].unique()

print(len(train_user_id))
print(len(train_movie_id))

# Masks selecting test rows whose user / movie never occurs in the training split.
unseen_user_mask = ~test_df['userId'].isin(train_user_id)
unseen_movie_mask = ~test_df['movieId'].isin(train_movie_id)

print('Train X / Test O Users => {%d} \nTrain X / Test O Movies => {%d}' % (
    len(test_df.loc[unseen_user_mask, 'userId'].unique()),
    len(test_df.loc[unseen_movie_mask, 'movieId'].unique()),
))


610
8983
Train X / Test O Users => {0} 
Train X / Test O Movies => {741}


In [15]:
# First ten test rows whose movie does not occur in the training split.
test_df[~test_df['movieId'].isin(train_movie_id)].head(10)

Unnamed: 0,userId,movieId,rating,timestamp
40203,274,26736,4.5,1239123373
63526,414,3847,2.0,966612823
13741,89,42761,4.5,1520409141
70108,448,99992,3.0,1374953846
50168,323,111443,2.5,1422640855
92128,596,141131,2.5,1535712067
29693,202,2983,4.0,974924755
98415,606,47538,2.5,1244573590
94861,599,93723,1.5,1519145934
94459,599,31921,2.5,1519185643


### 간단한 추천시스템 만들기

- 1. 랜덤으로 평점 예측하기
- 2. 영화 평균 평점기반 예측하기
- 3. 사용자 평균 평점기반 예측하기
- 4. Rule 기반 영화 랭킹 예측하기

- [test에 있고, train에 없는 경우의 데이터셋]

In [16]:
# Test rows for movies that have no rating at all in the training split.
unseen_movie_mask = ~test_df['movieId'].isin(train_movie_id)
not_included_df = test_df[unseen_movie_mask]
not_included_df.shape

(813, 4)

### 1. 랜덤으로 평점 예측해보기

In [17]:
# Candidate rating values from 0.5 up to 5.0 in steps of 0.5 (the stop value 5.5 is excluded).
rating_ranges = np.arange(0.5, 5.5, step=0.5)
rating_ranges

array([0.5, 1. , 1.5, 2. , 2.5, 3. , 3.5, 4. , 4.5, 5. ])

In [18]:
import random

# Baseline 1: draw one random rating from rating_ranges for every test row.
# No seed is set here, so the drawn values change from run to run.
random_ratings = [random.choice(rating_ranges) for _row in range(len(test_df))]
len(random_ratings)

20168

In [19]:
# Attach the random predictions as a new column; the list holds one value per test row, in row order.
test_df['rating_pred'] = random_ratings
test_df.head()

Unnamed: 0,userId,movieId,rating,timestamp,rating_pred
67037,432,77866,4.5,1335139641,4.5
42175,288,474,3.0,978465565,1.0
93850,599,4351,3.0,1498524542,4.0
6187,42,2987,4.0,996262677,2.0
12229,75,1610,4.0,1158989841,3.5


In [20]:
# Score the random baseline: MSE between true and predicted ratings, then its square root (RMSE).
mse = mean_squared_error(y_true=test_df['rating'], y_pred=test_df['rating_pred'])
random_rmse = np.sqrt(mse)

print(f'Random Recommend MSE : {mse}\nRandom Recommend RMSE : {random_rmse}')

Random Recommend MSE : 3.7202994843316146
Random Recommend RMSE : 1.9288077883323715


### 2. 영화 평균 평점기반

- 1. train data의 모든 영화 평균 평점
- 2. test set 예측할 때 train set의 영화 평균 평점을 주고, 없는 영화의 경우 random

In [21]:
# Column means per movie over the training rows; mean_rating_pred reads the 'rating' column as the movie's mean rating.
train_movie_df = train_df.groupby('movieId').mean()
train_movie_df

Unnamed: 0_level_0,userId,rating,timestamp
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,317.741379,3.893678,1.142680e+09
2,329.538462,3.373626,1.145753e+09
3,284.900000,3.162500,1.005732e+09
4,242.500000,2.250000,9.052133e+08
5,329.205882,2.955882,1.006175e+09
...,...,...,...
193581,184.000000,4.000000,1.537109e+09
193583,184.000000,3.500000,1.537110e+09
193585,184.000000,3.500000,1.537110e+09
193587,184.000000,3.500000,1.537110e+09


In [22]:
def mean_rating_pred(tmd, x):
    """Predict a rating from a table of per-key mean ratings.

    Args:
        tmd: DataFrame indexed by the lookup key (the notebook passes tables
            indexed by movieId or by userId) with a 'rating' column holding
            the mean rating for that key.
        x: Key to look up in ``tmd.index``.

    Returns:
        The stored mean rating when ``x`` is present in ``tmd.index``;
        otherwise a value drawn at random from the 0.5-5.0 half-point
        rating scale.
    """
    if x in tmd.index:
        # Single label-based lookup instead of chained .loc[x]['rating'].
        return tmd.loc[x, 'rating']

    # Key unseen in the table: fall back to a random rating from the rating
    # scale itself, rather than sampling the unrelated list of predictions
    # produced by the random baseline.
    return random.choice(np.arange(0.5, 5.5, step=0.5))


# Baseline 2: predict every test row with its movie's mean training rating.
test_df['rating_pred2'] = test_df['movieId'].map(lambda movie_id: mean_rating_pred(train_movie_df, movie_id))
test_df.head()

Unnamed: 0,userId,movieId,rating,timestamp,rating_pred,rating_pred2
67037,432,77866,4.5,1335139641,4.5,2.9
42175,288,474,3.0,978465565,1.0,3.754386
93850,599,4351,3.0,1498524542,4.0,3.25
6187,42,2987,4.0,996262677,2.0,3.578947
12229,75,1610,4.0,1158989841,3.5,3.80137


In [23]:
# Score the movie-mean baseline (MSE and RMSE on the test rows).
mse = mean_squared_error(y_true=test_df['rating'], y_pred=test_df['rating_pred2'])
mean_rmse = np.sqrt(mse)

print(f'Mean Recommend MSE : {mse}\nMean Recommend RMSE : {mean_rmse}')

Mean Recommend MSE : 1.0511934367663462
Mean Recommend RMSE : 1.0252772487314572


### 3. 사용자 평균 평점 기반
- 1. train data의 모든 사용자가 준 평균 평점
- 2. test set 예측 -> 존재하면 평균평점, 없으면 random

In [24]:
# Column means per user over the training rows; mean_rating_pred reads the 'rating' column as the user's mean rating.
train_user_df = train_df.groupby('userId').mean()
train_user_df

Unnamed: 0_level_0,movieId,rating,timestamp
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,1810.336788,4.331606,9.649824e+08
2,67608.080000,3.920000,1.445715e+09
3,8325.516129,2.580645,1.306464e+09
4,1899.694118,3.464706,9.657212e+08
5,366.842105,3.657895,8.474351e+08
...,...,...,...
606,9606.890591,3.657002,1.179536e+09
607,1929.551724,3.744828,9.649334e+08
608,4569.170839,3.117820,1.122548e+09
609,441.483871,3.290323,8.472210e+08


In [25]:
# Baseline 3: predict every test row with its user's mean training rating.
test_df['rating_pred3'] = test_df['userId'].map(lambda user_id: mean_rating_pred(train_user_df, user_id))
test_df.head()

Unnamed: 0,userId,movieId,rating,timestamp,rating_pred,rating_pred2,rating_pred3
67037,432,77866,4.5,1335139641,4.5,2.9,3.628571
42175,288,474,3.0,978465565,1.0,3.754386,3.131737
93850,599,4351,3.0,1498524542,4.0,3.25,2.651029
6187,42,2987,4.0,996262677,2.0,3.578947,3.541547
12229,75,1610,4.0,1158989841,3.5,3.80137,3.322034


In [26]:
# Score the user-mean baseline (MSE and RMSE on the test rows).
mse = mean_squared_error(y_true=test_df['rating'], y_pred=test_df['rating_pred3'])
mean_rmse = np.sqrt(mse)

print(f'Mean Recommend MSE : {mse}\nMean Recommend RMSE(User) : {mean_rmse}')

Mean Recommend MSE : 0.9020188826241128
Mean Recommend RMSE(User) : 0.9497467465720074


### 4. Rule 기반 영화 평점 예측

- 1. train data에 포함된 유저의 영화 평균 평점과 영화의 장르를 활용 /장르별 평균을 계산하여 test set의 영화 장르의 평균 평점으로 예측

In [27]:
# Movie x user rating matrix built from the training rows only; pairs without a rating become 0.
train_user_movie_matrix = pd.pivot(train_df, index='movieId', columns='userId', values='rating').fillna(0)

train_user_movie_matrix

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,0.0,0.0,0.0,0.0,0.0,4.5,0.0,0.0,0.0,...,0.0,0.0,4.0,3.0,4.0,2.5,0.0,2.5,3.0,5.0
2,0.0,0.0,0.0,0.0,0.0,4.0,0.0,4.0,0.0,0.0,...,0.0,4.0,0.0,5.0,3.5,0.0,0.0,2.0,0.0,0.0
3,4.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193581,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
193583,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
193585,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
193587,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [28]:
# One-hot encode the pipe-separated genre string of every movie.
genre_df = movie_df['genres'].str.get_dummies(sep='|')
print(genre_df.shape)

# Keep only the movies that occur in the training split, in order of first appearance there.
train_movie_ids = train_df['movieId'].unique()
genre_df = genre_df.loc[train_movie_ids]
print(genre_df.shape)
genre_df.head()

(9742, 20)
(8983, 20)


Unnamed: 0_level_0,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
7347,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0
71462,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
2115,0,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
1127,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0
2409,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0


In [29]:
# Mean rating of each movie in the training data (one value per movieId row).

# Unrated cells were stored as 0 in the pivot; turn them back into NaN so that
# mean() averages only the ratings that actually exist. np.nan is used because
# the np.NaN alias no longer exists in NumPy 2.0, and replace() already
# returns a new frame, so no explicit copy is needed.
train_movie_mean_ratings_df = train_user_movie_matrix.replace(0, np.nan).mean(axis=1)
train_movie_mean_ratings_df.head()

movieId
1    3.893678
2    3.373626
3    3.162500
4    2.250000
5    2.955882
dtype: float64

In [30]:
# Show the one-hot genre table (restricted to movies in the training split).
genre_df

Unnamed: 0_level_0,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
7347,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0
71462,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
2115,0,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
1127,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0
2409,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3807,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
136353,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
113280,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
98607,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0


In [31]:
# For every genre, take all training movies tagged with that genre and average
# their per-movie mean ratings.

# The column is built in one pass instead of filling an empty frame through
# chained assignment (frame.loc[g]['col'] = v), which writes to a temporary
# object and leaves the frame unchanged under pandas Copy-on-Write.
genre_mean_rating_df = pd.DataFrame(
    {
        'avg_ratings': [
            train_movie_mean_ratings_df.loc[genre_df.index[genre_df[genre] == 1]].mean()
            for genre in genre_df.columns
        ]
    },
    index=genre_df.columns,
)

genre_mean_rating_df

Unnamed: 0,avg_ratings
(no genres listed),3.35679
Action,3.121847
Adventure,3.205439
Animation,3.477723
Children,3.115097
Comedy,3.170161
Crime,3.313487
Documentary,3.777727
Drama,3.426015
Fantasy,3.220115


In [32]:
def get_genre_mean_rating(x):
    """Return the average of the per-genre mean ratings for movie ``x``.

    The movie's pipe-separated genre string is read from ``movie_df``, each
    genre is looked up in ``genre_mean_rating_df['avg_ratings']``, and the
    looked-up values are averaged with equal weight.
    """
    genres = movie_df.loc[x]['genres'].split('|')
    total = sum(genre_mean_rating_df.loc[name]['avg_ratings'] for name in genres)
    return total / len(genres)

In [33]:
# Register tqdm's progress_apply on pandas objects.
tqdm.pandas()

# Rule (1): predict each test row with the genre-average rating of its movie.
test_df['rating_pred4'] = test_df['movieId'].progress_apply(get_genre_mean_rating)

100%|██████████| 20168/20168 [00:02<00:00, 9836.30it/s]


In [34]:
# Inspect the test set together with the prediction columns added so far.
test_df

Unnamed: 0,userId,movieId,rating,timestamp,rating_pred,rating_pred2,rating_pred3,rating_pred4
67037,432,77866,4.5,1335139641,4.5,2.900000,3.628571,3.335160
42175,288,474,3.0,978465565,1.0,3.754386,3.131737,3.142915
93850,599,4351,3.0,1498524542,4.0,3.250000,2.651029,3.199772
6187,42,2987,4.0,996262677,2.0,3.578947,3.541547,3.262930
12229,75,1610,4.0,1158989841,3.5,3.801370,3.322034,3.163756
...,...,...,...,...,...,...,...,...
57416,380,5048,2.0,1494268065,4.0,2.875000,3.675358,3.163565
67290,434,54272,3.5,1270606860,5.0,3.592105,3.738220,3.323942
33423,226,5989,4.5,1162428551,3.5,3.948454,3.474874,3.369751
98552,607,1320,3.0,963080497,5.0,3.183333,3.744828,3.085319


In [35]:
# Score the genre-average rule (MSE and RMSE on the test rows).
mse = mean_squared_error(y_true=test_df['rating'], y_pred=test_df['rating_pred4'])
mean_rmse = np.sqrt(mse)

print(f'Rule(1) Recommend MSE : {mse}\nRule(1) Recommend RMSE : {mean_rmse}')

Rule(1) Recommend MSE : 1.1333817433607893
Rule(1) Recommend RMSE : 1.0646040312533056


- 2. 사용자의 평균 영화 평점을 normalize해서 확인 (평점 남긴 수, 표준편차 등 활용가능)

In [36]:
# Per-user rating statistics on the training split: mean, standard deviation
# and number of ratings, computed in a single grouped aggregation.
train_user_info_df = train_df.groupby('userId')['rating'].agg(
    mean_rating='mean',
    std_rating='std',
    count_rating='count',
)

train_user_info_df.head()

Unnamed: 0_level_0,mean_rating,std_rating,count_rating
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,4.331606,0.812587,193
2,3.92,0.837655,25
3,2.580645,2.125625,31
4,3.464706,1.372356,170
5,3.657895,1.046908,38


In [37]:
# Smallest, largest and average number of training ratings per user.
rating_counts = train_user_info_df['count_rating']
min_count, max_count, mean_count = rating_counts.min(), rating_counts.max(), rating_counts.mean()

print(min_count, max_count, mean_count)

13 2122 132.24262295081968


In [38]:
# Weights: each user's rating count, centred on the mean count and divided by the count range.

rating_counts = train_user_info_df['count_rating']
train_user_info_df['weights'] = (rating_counts - mean_count) / (max_count - min_count)

In [39]:
from sklearn.preprocessing import MinMaxScaler

# Min-max scale every column of the per-user statistics, then wrap the
# resulting array back into a DataFrame with the original labels.
scaler = MinMaxScaler()
train_user_info_df_scaled = pd.DataFrame(
    scaler.fit_transform(train_user_info_df),
    index=train_user_info_df.index,
    columns=train_user_info_df.columns,
)
train_user_info_df_scaled.head()

Unnamed: 0_level_0,mean_rating,std_rating,count_rating,weights
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,0.817711,0.382282,0.085349,0.085349
2,0.705455,0.394075,0.00569,0.00569
3,0.340176,1.0,0.008535,0.008535
4,0.581283,0.645625,0.074443,0.074443
5,0.633971,0.492518,0.011854,0.011854


In [40]:
# Multiply the min-max-scaled per-user mean rating by 5 to use it as a rating prediction.
# NOTE(review): presumably this spans 0-5 while actual ratings start at 0.5 — confirm the scaling is intended.
train_user_info_df_scaled['normalized_avg_rating'] = train_user_info_df_scaled['mean_rating'] * 5
train_user_info_df_scaled.head()

Unnamed: 0_level_0,mean_rating,std_rating,count_rating,weights,normalized_avg_rating
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,0.817711,0.382282,0.085349,0.085349,4.088554
2,0.705455,0.394075,0.00569,0.00569,3.527273
3,0.340176,1.0,0.008535,0.008535,1.70088
4,0.581283,0.645625,0.074443,0.074443,2.906417
5,0.633971,0.492518,0.011854,0.011854,3.169856


In [41]:
# Rule (2): predict each test row with its user's normalized average rating.
test_df['rating_pred5'] = test_df['userId'].apply(
    lambda user_id: train_user_info_df_scaled.loc[user_id, 'normalized_avg_rating']
)

test_df

Unnamed: 0,userId,movieId,rating,timestamp,rating_pred,rating_pred2,rating_pred3,rating_pred4,rating_pred5
67037,432,77866,4.5,1335139641,4.5,2.900000,3.628571,3.335160,3.129870
42175,288,474,3.0,978465565,1.0,3.754386,3.131737,3.142915,2.452368
93850,599,4351,3.0,1498524542,4.0,3.250000,2.651029,3.199772,1.796857
6187,42,2987,4.0,996262677,2.0,3.578947,3.541547,3.262930,3.011201
12229,75,1610,4.0,1158989841,3.5,3.801370,3.322034,3.163756,2.711864
...,...,...,...,...,...,...,...,...,...
57416,380,5048,2.0,1494268065,4.0,2.875000,3.675358,3.163565,3.193670
67290,434,54272,3.5,1270606860,5.0,3.592105,3.738220,3.323942,3.279391
33423,226,5989,4.5,1162428551,3.5,3.948454,3.474874,3.369751,2.920283
98552,607,1320,3.0,963080497,5.0,3.183333,3.744828,3.085319,3.288401


In [42]:
# Score the normalized user-average rule (MSE and RMSE on the test rows).
mse = mean_squared_error(y_true=test_df['rating'], y_pred=test_df['rating_pred5'])
mean_rmse = np.sqrt(mse)

print(f'Rule(2) Recommend MSE : {mse}\nRule(2) Recommend RMSE : {mean_rmse}')

Rule(2) Recommend MSE : 1.2273721504015616
Rule(2) Recommend RMSE : 1.1078682910894966
