### MovieLens 데이터셋을 활용한 간단한 추천 알고리즘 만들기
- 평점을 예측하고 평가를 RMSE로 판단

In [2]:
import os
from tqdm import tqdm
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from scipy.sparse import csr_matrix

import warnings
warnings.filterwarnings('ignore')

In [6]:
path = '../data/movielens/'

In [187]:
ratings_df = pd.read_csv(os.path.join(path, 'ratings.csv'), encoding='utf-8')
movies_df = pd.read_csv(os.path.join(path,'movies.csv'), index_col='movieId', encoding='utf-8')
tags_df = pd.read_csv(os.path.join(path, 'tags.csv'), encoding='utf-8')

In [189]:
print(ratings_df.shape)
print(movies_df.shape)
print(tags_df.shape)

(100836, 4)
(9742, 2)
(3683, 4)


#### ratings 데이터 정보 확인
- 몇 명의 유저가 몇 개의 영화에 평점을 줬는지
- 각 유저가 어떤 영화에 평점을 줬는지 sparse matrix

In [15]:
ratings_df.head(3)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224


In [20]:
users = ratings_df['userId'].unique()
movies = ratings_df['movieId'].unique()

print('총 유저 수 : ', len(users))
print('총 영화 수 : ', len(movies))

총 유저 수 :  610
총 영화 수 :  9724


#### 유저-영화 매트릭스

In [27]:
user_movie_matrix = ratings_df.pivot(
    index='movieId',
    columns='userId',
    values='rating').fillna(0)

user_movie_matrix

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,0.0,0.0,0.0,4.0,0.0,4.5,0.0,0.0,0.0,...,4.0,0.0,4.0,3.0,4.0,2.5,4.0,2.5,3.0,5.0
2,0.0,0.0,0.0,0.0,0.0,4.0,0.0,4.0,0.0,0.0,...,0.0,4.0,0.0,5.0,3.5,0.0,0.0,2.0,0.0,0.0
3,4.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193581,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
193583,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
193585,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
193587,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [30]:
sparse_matrix = csr_matrix(user_movie_matrix.values)
print(sparse_matrix)   # 왼쪽 영화, 오른쪽 유저

  (0, 0)	4.0
  (0, 4)	4.0
  (0, 6)	4.5
  (0, 14)	2.5
  (0, 16)	4.5
  (0, 17)	3.5
  (0, 18)	4.0
  (0, 20)	3.5
  (0, 26)	3.0
  (0, 30)	5.0
  (0, 31)	3.0
  (0, 32)	3.0
  (0, 39)	5.0
  (0, 42)	5.0
  (0, 43)	3.0
  (0, 44)	4.0
  (0, 45)	5.0
  (0, 49)	3.0
  (0, 53)	3.0
  (0, 56)	5.0
  (0, 62)	5.0
  (0, 63)	4.0
  (0, 65)	4.0
  (0, 67)	2.5
  (0, 70)	5.0
  :	:
  (9700, 337)	2.5
  (9701, 337)	3.0
  (9702, 183)	4.0
  (9702, 247)	3.5
  (9703, 317)	2.5
  (9704, 209)	1.0
  (9705, 461)	2.5
  (9706, 49)	3.5
  (9707, 337)	1.5
  (9708, 337)	4.0
  (9709, 337)	1.0
  (9710, 337)	1.5
  (9711, 337)	1.0
  (9712, 337)	1.0
  (9713, 183)	4.5
  (9714, 183)	3.5
  (9715, 183)	3.0
  (9716, 183)	4.0
  (9717, 183)	4.0
  (9718, 183)	3.5
  (9719, 183)	4.0
  (9720, 183)	3.5
  (9721, 183)	3.5
  (9722, 183)	3.5
  (9723, 330)	4.0


In [61]:
user_info = pd.DataFrame(data = [sum(list(user_movie_matrix[int(i)].value_counts())[1:]) for i in user_movie_matrix.columns],
                         index = user_movie_matrix.columns, columns =['movie_rated'])

user_info

Unnamed: 0_level_0,movie_rated
userId,Unnamed: 1_level_1
1,232
2,29
3,39
4,216
5,44
...,...
606,1115
607,187
608,831
609,37


In [80]:
movie_info = pd.DataFrame([sum(list(user_movie_matrix.loc[int(i)].value_counts())[1:]) for i in user_movie_matrix.index],
                         columns= ['user_rated'], index = user_movie_matrix.index)

movie_info

Unnamed: 0_level_0,user_rated
movieId,Unnamed: 1_level_1
1,215
2,110
3,52
4,7
5,49
...,...
193581,1
193583,1
193585,1
193587,1


### MovieLens 데이터셋 중 학습셋과 평가셋 나누기

In [83]:
train_df, test_df = train_test_split(ratings_df, test_size=0.2, random_state=1234)

In [85]:
print(train_df.shape)
print(test_df.shape)

(80668, 4)
(20168, 4)


#### test set에는 존재하지만, train set에는 없는 영화 또는 사용자 비율

In [98]:
# 집합 A - 집합 B => 집합 B에는 없고 집합 A에는 있는 item


user_not_included = list(set(test_df['userId'].unique()) - set(train_df['userId'].unique()))
print(len(user_not_included))
print('test set의 전체 유저 수 :', len(test_df['userId'].unique()))

0
610


In [99]:
movie_not_included = list(set(test_df['movieId'].unique()) - set(train_df['movieId'].unique()))
print(len(movie_not_included))
print('test set의 전체 영화 수 :', len(test_df['movieId'].unique()))

786
test set의 전체 영화 수 : 5171


In [107]:
# train set에는 없지만 test set에는 존재하는 영화 데이터

test_set_not_movie = test_df[test_df['movieId'].isin(movie_not_included)].sort_values(by='movieId')
print('train set에는 없지만 test set에는 존재하는 영화 데이터 수 : ', test_set_not_movie.shape[0])

train set에는 없지만 test set에는 존재하는 영화 데이터 수 :  852


## 간단한 추천 알고리즘

- 1. 랜덤으로 평점 예측
- 2. 영화 평균 평점 기반 예측
- 3. 사용자 평균 평점 기반 예측
- 4. Rule 기반 영화 랭킹 예측

test에 있고, train에 없는 경우

#### 1. 랜덤으로 평균 평점 예측

In [109]:
# 0.5 에서 5.0 사이의 숫자 예측 

ratings_range = np.arange(0.5,5.5, step=0.5)
ratings_range

array([0.5, 1. , 1.5, 2. , 2.5, 3. , 3.5, 4. , 4.5, 5. ])

In [110]:
import random

pred_random = [random.choice(ratings_range) for x in range(len(test_df))]
pred_random

[1.0,
 1.5,
 1.0,
 2.0,
 5.0,
 2.5,
 3.0,
 4.5,
 4.0,
 3.0,
 0.5,
 3.5,
 4.5,
 3.5,
 2.5,
 0.5,
 2.5,
 0.5,
 4.0,
 4.5,
 0.5,
 4.5,
 3.5,
 4.5,
 5.0,
 5.0,
 1.0,
 2.5,
 3.0,
 0.5,
 0.5,
 2.0,
 0.5,
 3.0,
 1.0,
 1.0,
 1.0,
 3.5,
 2.5,
 4.0,
 5.0,
 2.0,
 2.5,
 5.0,
 2.0,
 1.0,
 2.0,
 4.0,
 1.5,
 2.5,
 0.5,
 2.0,
 3.0,
 3.5,
 4.5,
 3.0,
 2.5,
 3.5,
 4.5,
 4.0,
 1.5,
 5.0,
 4.0,
 1.5,
 4.5,
 5.0,
 3.5,
 1.5,
 3.5,
 4.5,
 5.0,
 5.0,
 4.5,
 4.0,
 0.5,
 1.5,
 0.5,
 4.5,
 4.0,
 3.0,
 3.5,
 2.5,
 5.0,
 3.0,
 5.0,
 2.5,
 4.5,
 2.5,
 3.0,
 3.0,
 2.0,
 3.5,
 3.0,
 3.5,
 2.5,
 4.0,
 3.5,
 0.5,
 4.0,
 3.5,
 0.5,
 0.5,
 2.5,
 1.0,
 2.5,
 2.0,
 3.5,
 0.5,
 2.5,
 3.5,
 1.0,
 2.0,
 1.0,
 1.0,
 4.0,
 4.0,
 3.5,
 5.0,
 3.5,
 4.5,
 1.5,
 1.0,
 1.5,
 3.0,
 4.0,
 3.5,
 2.5,
 0.5,
 2.5,
 1.0,
 3.0,
 4.5,
 0.5,
 3.5,
 3.5,
 0.5,
 0.5,
 1.0,
 4.0,
 4.5,
 3.0,
 2.0,
 2.0,
 4.5,
 2.0,
 3.0,
 3.0,
 4.0,
 1.5,
 4.0,
 4.5,
 0.5,
 1.5,
 2.0,
 2.0,
 1.5,
 1.0,
 1.0,
 5.0,
 3.0,
 4.5,
 2.5,
 2.0,
 4.5,
 3.0,
 1.0,
 1.5

In [115]:
test_df['pred_ratings_random'] = pred_random
test_df

Unnamed: 0,userId,movieId,rating,timestamp,pred_ratings_random
99731,610,3527,5.0,1479545223,1.0
97583,606,1250,3.5,1171376891,1.5
38197,262,213,5.0,840310907,1.0
11474,68,69406,3.0,1261622505,2.0
34105,232,4728,3.0,1218166950,5.0
...,...,...,...,...,...
41080,279,593,4.0,1506394242,4.5
4897,31,780,4.0,850466616,1.5
8023,56,410,3.0,835799188,1.5
77467,483,2291,4.0,1415579167,3.0


In [116]:
# RMSE로 랜덤 점수에 대한 평가

mse = mean_squared_error(y_true=test_df['rating'].values, y_pred=test_df['pred_ratings_random'].values)
rmse = np.sqrt(mse)

print(mse,rmse)

3.6941689805632687 1.9220221071994121


#### 2. 영화 평균 평점 기반 예측
- train set의 모든 영화에 대해서 평균 평점 구하기
- test set을 예측할 때, train_set의 영화 평균 평점 활용하기. 없다면 random으로 선택

In [123]:
movie_mean_df = train_df.groupby('movieId').mean()

movie_mean_df

Unnamed: 0_level_0,userId,rating,timestamp
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,307.473373,3.893491,1.128439e+09
2,327.475610,3.396341,1.142893e+09
3,266.386364,3.454545,9.900434e+08
4,192.750000,2.250000,8.425133e+08
5,309.526316,3.039474,1.007415e+09
...,...,...,...
193573,184.000000,4.000000,1.537100e+09
193579,184.000000,3.500000,1.537107e+09
193581,184.000000,4.000000,1.537109e+09
193587,184.000000,3.500000,1.537110e+09


In [127]:
def average_prediction(mean_df, x):
    if x in mean_df.index:
        pred_ratings = mean_df.loc[x]['rating']
    else:
        pred_ratings = random.choice(ratings_range)
    return pred_ratings

In [128]:
test_df['pred_movie_mean_rating'] = test_df['movieId'].apply(lambda x: average_prediction(movie_mean_df,x))

test_df

Unnamed: 0,userId,movieId,rating,timestamp,pred_ratings_random,pred_movie_mean_rating
99731,610,3527,5.0,1479545223,1.0,3.604167
97583,606,1250,3.5,1171376891,1.5,4.180556
38197,262,213,5.0,840310907,1.0,3.750000
11474,68,69406,3.0,1261622505,2.0,3.571429
34105,232,4728,3.0,1218166950,5.0,2.769231
...,...,...,...,...,...,...
41080,279,593,4.0,1506394242,4.5,4.127907
4897,31,780,4.0,850466616,1.5,3.470760
8023,56,410,3.0,835799188,1.5,3.131148
77467,483,2291,4.0,1415579167,3.0,3.734375


In [129]:
mse = mean_squared_error(y_true = test_df['rating'], y_pred=test_df['pred_movie_mean_rating'])
rmse = np.sqrt(mse)

print(mse, rmse)

1.0636728651042733 1.031345172628579


### 사용자 평균 평점기반 예측
- train set의 모든 유저가 준 평균 평점
- test set을 예측할 때, 유저가 train set에서 준 평균 평점을 활용. 유저가 없을 경우 random 평점 적용

In [132]:
user_mean = train_df.groupby('userId').mean()
user_mean

Unnamed: 0_level_0,movieId,rating,timestamp
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,1891.168478,4.320652,9.649865e+08
2,70402.760000,3.940000,1.445715e+09
3,8394.733333,2.516667,1.306464e+09
4,1957.923077,3.631868,9.655941e+08
5,337.606061,3.636364,8.474351e+08
...,...,...,...
606,9380.236158,3.649718,1.179733e+09
607,1906.558621,3.772414,9.649104e+08
608,4448.867669,3.145865,1.122822e+09
609,495.275862,3.275862,8.472210e+08


In [136]:
test_df['pred_user_mean_rating'] = test_df['rating'].apply(lambda x: average_prediction(user_mean, x))
test_df

Unnamed: 0,userId,movieId,rating,timestamp,pred_ratings_random,pred_movie_mean_rating,pred_user_mean_rating
99731,610,3527,5.0,1479545223,1.0,3.604167,3.636364
97583,606,1250,3.5,1171376891,1.5,4.180556,3.000000
38197,262,213,5.0,840310907,1.0,3.750000,3.636364
11474,68,69406,3.0,1261622505,2.0,3.571429,2.516667
34105,232,4728,3.0,1218166950,5.0,2.769231,2.516667
...,...,...,...,...,...,...,...
41080,279,593,4.0,1506394242,4.5,4.127907,3.631868
4897,31,780,4.0,850466616,1.5,3.470760,3.631868
8023,56,410,3.0,835799188,1.5,3.131148,2.516667
77467,483,2291,4.0,1415579167,3.0,3.734375,3.631868


In [138]:
mse = mean_squared_error(y_true=test_df['rating'].values, y_pred= test_df['pred_user_mean_rating'].values)
rmse = np.sqrt(mse)

print(mse, rmse)

1.9813192161988435 1.4075934129566121


### Rule 기반 영화 평점 예측 (1)
- train set에 포함된 유저의 영화 평균 평점과 영화 장르 활용
- 영화 장르별 평균 평점 계산 -> test set의 영화 장르의 평균 평점으로 예측

In [249]:
train_df  # train, test 데이터로 위에서 split 한 학습 데이터 

Unnamed: 0,userId,movieId,rating,timestamp
95713,600,5943,3.0,1237714356
61560,407,2571,5.0,1424349171
77204,482,8958,4.0,1105397126
93367,599,2322,2.5,1498515283
90892,590,2959,3.5,1258416553
...,...,...,...,...
89460,580,1923,4.0,1167790046
60620,391,2232,4.0,1030826940
34086,232,4344,4.0,1206995838
58067,380,166528,5.0,1493419871


In [251]:
genres_df

NameError: name 'genres' is not defined

In [250]:
train_user_movie_matrix = train_df.pivot(
    index= 'movieId',
    columns= 'userId',
    values ='rating').fillna(0)

train_user_movie_matrix

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,4.0,0.0,4.5,0.0,0.0,0.0,...,4.0,0.0,4.0,3.0,4.0,2.5,0.0,2.5,0.0,5.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,...,0.0,4.0,0.0,0.0,3.5,0.0,0.0,2.0,0.0,0.0
3,4.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193573,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
193579,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
193581,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
193587,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [220]:
genres_df = movies_df['genres'].str.get_dummies(sep="|")
genres_df

Unnamed: 0_level_0,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1,0,0,1,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0
2,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0
4,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0
5,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193581,0,1,0,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0
193583,0,0,0,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0
193585,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
193587,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [224]:
genres_df.shape

(9742, 20)

In [225]:
genres_df = genres_df.loc[train_df.movieId.unique()]
genres_df

Unnamed: 0_level_0,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
5943,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0
2571,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0
8958,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
2322,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0
2959,0,1,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45648,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0
6067,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0
26861,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0
6814,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [227]:
genres_df.shape

(8938, 20)

In [229]:
# train set에서 영화별 유저 평점 평균

train_movie_avg_rating = train_user_movie_matrix.copy()
train_movie_avg_rating

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,4.0,0.0,4.5,0.0,0.0,0.0,...,4.0,0.0,4.0,3.0,4.0,2.5,0.0,2.5,0.0,5.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,...,0.0,4.0,0.0,0.0,3.5,0.0,0.0,2.0,0.0,0.0
3,4.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193573,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
193579,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
193581,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
193587,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [230]:
train_movie_avg_rating = train_movie_avg_rating.replace(0, np.NaN)
train_movie_avg_rating = train_movie_avg_rating.mean(axis=1)
train_movie_avg_rating

movieId
1         3.893491
2         3.396341
3         3.454545
4         2.250000
5         3.039474
            ...   
193573    4.000000
193579    3.500000
193581    4.000000
193587    3.500000
193609    4.000000
Length: 8938, dtype: float64

In [231]:
# genres_df 에서 해당 장르가 포함된 모든 영화 index를 가져와서, 해당 영화의 유저 평균 평점의 평균을 구해 장르 평균 평점으로 활용

genres_avg_rating = pd.DataFrame(index= genres_df.columns, columns=['avg_ratings'])
genres_avg_rating

Unnamed: 0,avg_ratings
(no genres listed),
Action,
Adventure,
Animation,
Children,
Comedy,
Crime,
Documentary,
Drama,
Fantasy,


In [240]:
for genre in genres_avg_rating.index:
    genre_avg_rating = train_movie_avg_rating.loc[genres_df[genres_df[genre].isin([1])].index].mean()
    genres_avg_rating.loc[genre]['avg_ratings'] = genre_avg_rating
    
genres_avg_rating

Unnamed: 0,avg_ratings
(no genres listed),3.33642
Action,3.11085
Adventure,3.230721
Animation,3.492258
Children,3.101232
Comedy,3.18148
Crime,3.313588
Documentary,3.801026
Drama,3.429093
Fantasy,3.240257


In [247]:
def genre_avg_rating(x):
    genres_list = movies_df.loc[x]['genres'].split('|')
    rating = 0
    
    for genre in genres_list:
        rating += genre_avg_rating.loc[genre]['avg_ratings']
    
    return rating / len(genres_list)

In [246]:
genres_list = movies_df.loc[3]['genres'].split('|')
print(genres_list)

['Comedy', 'Romance']


In [None]:
tqdm.pandas()
test_df['pred_rating_genre'] = test_df[]

### Rule 기반 영화 평점 예측(2)

- user의 평균 영화 평점을 normalized 해서 확인하기, 평점 측정 수, 표준편차 등 활용가능

In [170]:
train_user_info_df = pd.DataFrame(
    {'avg_ratings': train_df.groupby('userId')['rating'].mean(),
    'std_ratings': train_df.groupby('userId')['rating'].std(),
    'count_ratings': train_df.groupby('userId')['rating'].count()
    })

train_user_info_df

Unnamed: 0_level_0,avg_ratings,std_ratings,count_ratings
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,4.320652,0.836600,184
2,3.940000,0.820569,25
3,2.516667,2.127340,30
4,3.631868,1.317823,182
5,3.636364,1.084498,33
...,...,...,...
606,3.649718,0.734887,885
607,3.772414,0.955574,145
608,3.145865,1.071503,665
609,3.275862,0.454859,29


In [173]:
min_count = train_user_info_df['count_ratings'].min()
max_count = train_user_info_df['count_ratings'].max()
avg_count = train_user_info_df['count_ratings'].mean()


train_user_info_df['weights'] = train_user_info_df['count_ratings'].apply(lambda x : (x-avg_count) / (max_count-min_count))
train_user_info_df

Unnamed: 0_level_0,avg_ratings,std_ratings,count_ratings,weights
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,4.320652,0.836600,184,0.023995
2,3.940000,0.820569,25,-0.049718
3,2.516667,2.127340,30,-0.047400
4,3.631868,1.317823,182,0.023068
5,3.636364,1.084498,33,-0.046010
...,...,...,...,...
606,3.649718,0.734887,885,0.348983
607,3.772414,0.955574,145,0.005914
608,3.145865,1.071503,665,0.246990
609,3.275862,0.454859,29,-0.047864


#### 정규화

In [178]:
from sklearn import preprocessing

min_max_scaler = preprocessing.MinMaxScaler()
np_scaled = min_max_scaler.fit_transform(train_user_info_df)
df_normalized = pd.DataFrame(np_scaled, columns= train_user_info_df.columns, index=train_user_info_df.index )

df_normalized

Unnamed: 0_level_0,avg_ratings,std_ratings,count_ratings,weights
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,0.822227,0.393261,0.079740,0.079740
2,0.722617,0.385725,0.006027,0.006027
3,0.350156,1.000000,0.008345,0.008345
4,0.641984,0.619470,0.078813,0.078813
5,0.643161,0.509791,0.009736,0.009736
...,...,...,...,...
606,0.646655,0.345449,0.404729,0.404729
607,0.678762,0.449188,0.061660,0.061660
608,0.514806,0.503682,0.302735,0.302735
609,0.548824,0.213816,0.007881,0.007881


In [179]:
df_normalized['nomalized_avg_ratings'] = df_normalized['avg_ratings'] * 5
df_normalized

Unnamed: 0_level_0,avg_ratings,std_ratings,count_ratings,weights,nomalized_avg_ratings
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,0.822227,0.393261,0.079740,0.079740,4.111134
2,0.722617,0.385725,0.006027,0.006027,3.613084
3,0.350156,1.000000,0.008345,0.008345,1.750779
4,0.641984,0.619470,0.078813,0.078813,3.209921
5,0.643161,0.509791,0.009736,0.009736,3.215803
...,...,...,...,...,...
606,0.646655,0.345449,0.404729,0.404729,3.233275
607,0.678762,0.449188,0.061660,0.061660,3.393812
608,0.514806,0.503682,0.302735,0.302735,2.574029
609,0.548824,0.213816,0.007881,0.007881,2.744119


In [182]:
test_df['pred_rating_normalized'] = test_df['userId'].apply(lambda x: df_normalized.loc[x]['nomalized_avg_ratings'])
test_df

Unnamed: 0,userId,movieId,rating,timestamp,pred_ratings_random,pred_movie_mean_rating,pred_user_mean_rating,pred_rating_normalized
99731,610,3527,5.0,1479545223,1.0,3.604167,3.636364,3.271208
97583,606,1250,3.5,1171376891,1.5,4.180556,3.000000,3.233275
38197,262,213,5.0,840310907,1.0,3.750000,3.636364,2.285047
11474,68,69406,3.0,1261622505,2.0,3.571429,2.516667,2.683236
34105,232,4728,3.0,1218166950,5.0,2.769231,2.516667,2.700164
...,...,...,...,...,...,...,...,...
41080,279,593,4.0,1506394242,4.5,4.127907,3.631868,3.255452
4897,31,780,4.0,850466616,1.5,3.470760,3.631868,3.576141
8023,56,410,3.0,835799188,1.5,3.131148,2.516667,3.479414
77467,483,2291,4.0,1415579167,3.0,3.734375,3.631868,3.166837


In [184]:
mse = mean_squared_error(y_true=test_df['rating'], y_pred=test_df['pred_rating_normalized'])
rmse = np.sqrt(mse)

print(mse, rmse)

1.120579096060227 1.05857408624065
