# 라이브러리 및 데이터 로드

In [2]:
import os
from tqdm import tqdm
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from scipy.sparse import csr_matrix

import warnings
warnings.filterwarnings("ignore")

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# 경로 설정
path = '/content/drive/MyDrive/data/movielens/'

In [11]:
# 데이터 로드
ratings_df = pd.read_csv(os.path.join(path, 'ratings.csv'), encoding='utf-8')
movies_df = pd.read_csv(os.path.join(path, 'movies.csv'), index_col = 'movieId' ,encoding='utf-8')
tags_df = pd.read_csv(os.path.join(path, 'tags.csv'), encoding='utf-8')

In [6]:
print(ratings_df.shape)
print(movies_df.shape)
print(tags_df.shape)

(100836, 4)
(9742, 3)
(3683, 4)


In [12]:
movies_df.index

Int64Index([     1,      2,      3,      4,      5,      6,      7,      8,
                 9,     10,
            ...
            193565, 193567, 193571, 193573, 193579, 193581, 193583, 193585,
            193587, 193609],
           dtype='int64', name='movieId', length=9742)

영화 데이터는 9742개인데 movieid는 193609까지 있다. <br/>
중간이 비어있음을 확인

# ratings 데이터 분석

In [13]:
ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [15]:
# 몇 명의 유저가 몇 개의 영화에 평점을 주었는지
# 각 유저가 어떤 영화에 평점을 주었는지 sparse matrix
num_users = ratings_df['userId'].unique()
num_movies = ratings_df['movieId'].unique()

print("총 유저 수: ", len(num_users))
print("총 영화 수: ", len(num_movies))

총 유저 수:  610
총 영화 수:  9724


In [16]:
# pivot ratings into movie features
user_movie_matrix = ratings_df.pivot(
    index='movieId',
    columns='userId',
    values='rating'
).fillna(0)

user_movie_matrix

userId,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,...,571,572,573,574,575,576,577,578,579,580,581,582,583,584,585,586,587,588,589,590,591,592,593,594,595,596,597,598,599,600,601,602,603,604,605,606,607,608,609,610
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1
1,4.0,0.0,0.0,0.0,4.0,0.0,4.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.5,0.0,4.5,3.5,4.0,0.0,3.5,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,5.0,3.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,...,0.0,4.0,5.0,0.0,0.0,0.0,0.0,0.0,4.0,3.0,0.0,0.0,0.0,5.0,0.0,0.0,5.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,4.0,4.0,0.0,3.0,2.5,4.0,0.0,4.0,3.0,4.0,2.5,4.0,2.5,3.0,5.0
2,0.0,0.0,0.0,0.0,0.0,4.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,3.0,3.0,3.5,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,4.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,2.5,0.0,4.0,0.0,4.0,0.0,0.0,0.0,0.0,2.5,4.0,0.0,4.0,0.0,5.0,3.5,0.0,0.0,2.0,0.0,0.0
3,4.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,3.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,1.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.5,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193581,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
193583,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
193585,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
193587,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [19]:
# convert dataframe of movies to scipy sparse matrix
sparse_mat = csr_matrix(user_movie_matrix.values)
print(sparse_mat)

  (0, 0)	4.0
  (0, 4)	4.0
  (0, 6)	4.5
  (0, 14)	2.5
  (0, 16)	4.5
  (0, 17)	3.5
  (0, 18)	4.0
  (0, 20)	3.5
  (0, 26)	3.0
  (0, 30)	5.0
  (0, 31)	3.0
  (0, 32)	3.0
  (0, 39)	5.0
  (0, 42)	5.0
  (0, 43)	3.0
  (0, 44)	4.0
  (0, 45)	5.0
  (0, 49)	3.0
  (0, 53)	3.0
  (0, 56)	5.0
  (0, 62)	5.0
  (0, 63)	4.0
  (0, 65)	4.0
  (0, 67)	2.5
  (0, 70)	5.0
  :	:
  (9700, 337)	2.5
  (9701, 337)	3.0
  (9702, 183)	4.0
  (9702, 247)	3.5
  (9703, 317)	2.5
  (9704, 209)	1.0
  (9705, 461)	2.5
  (9706, 49)	3.5
  (9707, 337)	1.5
  (9708, 337)	4.0
  (9709, 337)	1.0
  (9710, 337)	1.5
  (9711, 337)	1.0
  (9712, 337)	1.0
  (9713, 183)	4.5
  (9714, 183)	3.5
  (9715, 183)	3.0
  (9716, 183)	4.0
  (9717, 183)	4.0
  (9718, 183)	3.5
  (9719, 183)	4.0
  (9720, 183)	3.5
  (9721, 183)	3.5
  (9722, 183)	3.5
  (9723, 330)	4.0


In [20]:
# user 별 rating을 몇개 주었는지 
user_info_df = pd.DataFrame(data = [sum(list(user_movie_matrix[int(x)].value_counts())[1:]) for x in user_movie_matrix.columns],
                            index = user_movie_matrix.columns, columns=['movie_rated'])

In [None]:
# user_movie_matrix.columns 는 userId

In [21]:
user_info_df

Unnamed: 0_level_0,movie_rated
userId,Unnamed: 1_level_1
1,232
2,29
3,39
4,216
5,44
...,...
606,1115
607,187
608,831
609,37


In [27]:
# movie 별 몇명의 user가 평가를 내렸는지
movies_info_df = pd.DataFrame(data=[sum(list(user_movie_matrix.loc[int(x)].value_counts())[1:]) for x in user_movie_matrix.index],
                                    index = user_movie_matrix.index, columns=['user_rated'])

In [35]:
(list(user_movie_matrix.loc[1].value_counts())[1:]) #1부터 시작하는 이유 0점은 제외

[82, 47, 34, 18, 18, 8, 6, 1, 1]

In [32]:
movies_info_df

Unnamed: 0_level_0,user_rated
movieId,Unnamed: 1_level_1
1,215
2,110
3,52
4,7
5,49
...,...
193581,1
193583,1
193585,1
193587,1


정보가 bias되어있음

# MovieLens 데이터 셋 학습/평가 세트 생성

In [36]:
train_df, test_df = train_test_split(ratings_df, test_size=0.2, random_state=1234)

In [37]:
print(train_df.shape)
print(test_df.shape)

(80668, 4)
(20168, 4)


test 세트에는 존재하나 train 세트에는 없는 데이터 비율 확인

In [38]:
# userId
print('사용자: ', len(list(set(test_df['userId'].unique()) - set(train_df['userId'].unique()))))

사용자:  0


In [41]:
# movieId
print("영화: ", len(list(set(test_df['movieId'].unique()) - set(train_df['movieId'].unique()))))


영화:  786


In [42]:
print("test 세트의 전체 영화 쉬 ", len(test_df['movieId'].unique()))

test 세트의 전체 영화 쉬  5171


In [46]:
movies_not_included = list(set(test_df['movieId'].unique()) - set(train_df['movieId'].unique()))
print(sorted(movies_not_included)[:10])

not_included_df = test_df[test_df.movieId.isin(movies_not_included)].sort_values(by='movieId')
print(not_included_df.head())

[49, 117, 137, 178, 241, 320, 359, 478, 488, 495]
       userId  movieId  rating  timestamp
29386     202       49     3.0  974925453
97066     604      117     3.0  832080636
99501     609      137     3.0  847221054
27959     191      178     1.0  829760898
98493     607      241     4.0  964744490


In [47]:
print('train 세트에는 없고 test 세트에만 있는 영화 데이터 수: ', not_included_df.shape)

train 세트에는 없고 test 세트에만 있는 영화 데이터 수:  (852, 4)


학습 셋에 없는 데이터는 추천 시스텡에서 예측하기가 어려움

# 간단한 추천 알고리즘

1. 랜덤으로 평점 예측
2. 영화 평균 평점기반 예측
3. 사용자 평균 평점기반 예측
4. Rule 기반 영화 랭킹 예측

## 1.랜덤으로 평점 예측

In [49]:
# 0.5 ~ 5.0점 사이 숫자를 랜덤으로 예측
ratings_range = np.arange(0.5, 5, step=0.5)
ratings_range

array([0.5, 1. , 1.5, 2. , 2.5, 3. , 3.5, 4. , 4.5])

In [50]:
import random
pred_random = [random.choice(ratings_range) for x in range(len(test_df))]
pred_random[:10]

[3.5, 4.5, 2.0, 0.5, 3.5, 4.0, 4.5, 2.5, 2.0, 3.5]

In [52]:
test_df['pred_ratings_random'] = pred_random

In [53]:
test_df

Unnamed: 0,userId,movieId,rating,timestamp,pred_ratings_random
99731,610,3527,5.0,1479545223,3.5
97583,606,1250,3.5,1171376891,4.5
38197,262,213,5.0,840310907,2.0
11474,68,69406,3.0,1261622505,0.5
34105,232,4728,3.0,1218166950,3.5
...,...,...,...,...,...
41080,279,593,4.0,1506394242,0.5
4897,31,780,4.0,850466616,4.0
8023,56,410,3.0,835799188,4.5
77467,483,2291,4.0,1415579167,2.0


In [57]:
# 성능 평가
mse = mean_squared_error(y_true = test_df['rating'].values, 
                         y_pred = test_df['pred_ratings_random'].values)
rmse = np.sqrt(mse)

print(mse, rmse)

3.751772610075367 1.9369493049833202


## 2.영화 평균 평점기반 예측

1. train 세트의 모든 영화에 대한 평균 평점 계산
2. test 세트를 예측할 때 train 세트의 평균 평점 활용하고, 없다면 random으로 선택

In [58]:
train_movie_df = train_df.groupby('movieId').mean()

train_movie_df.head()

Unnamed: 0_level_0,userId,rating,timestamp
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,307.473373,3.893491,1128439000.0
2,327.47561,3.396341,1142893000.0
3,266.386364,3.454545,990043400.0
4,192.75,2.25,842513300.0
5,309.526316,3.039474,1007415000.0


In [62]:
def avg_rating_prediction(training_set, x):
  if x in training_set.index:
    pred_rating = training_set.loc[x]['rating']
  else:
    pred_rating = random.choice(ratings_range)
  return pred_rating

In [63]:
test_df['pred_rating_movie'] = test_df['movieId'].apply(lambda x: avg_rating_prediction(train_movie_df, x))

In [64]:
test_df.head()

Unnamed: 0,userId,movieId,rating,timestamp,pred_ratings_random,pred_rating_movie
99731,610,3527,5.0,1479545223,3.5,3.604167
97583,606,1250,3.5,1171376891,4.5,4.180556
38197,262,213,5.0,840310907,2.0,3.75
11474,68,69406,3.0,1261622505,0.5,3.571429
34105,232,4728,3.0,1218166950,3.5,2.769231


In [65]:
# 평가
mse = mean_squared_error(y_true = test_df['rating'].values, 
                         y_pred = test_df['pred_rating_movie'].values)
rmse = np.sqrt(mse)

print(mse, rmse)

1.0609953561792436 1.0300462883672965


사람들이 준 평점이 경향성이 있어 보임

## 3.사용자 평균 평점기반 예측

In [66]:
train_user_df = train_df.groupby('userId').mean()

train_user_df.head()

Unnamed: 0_level_0,movieId,rating,timestamp
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,1891.168478,4.320652,964986500.0
2,70402.76,3.94,1445715000.0
3,8394.733333,2.516667,1306464000.0
4,1957.923077,3.631868,965594100.0
5,337.606061,3.636364,847435100.0


In [67]:
test_df['pred_rating_user'] = test_df['userId'].apply(lambda x : avg_rating_prediction(train_user_df, x))

In [68]:
test_df.head()

Unnamed: 0,userId,movieId,rating,timestamp,pred_ratings_random,pred_rating_movie,pred_rating_user
99731,610,3527,5.0,1479545223,3.5,3.604167,3.678709
97583,606,1250,3.5,1171376891,4.5,4.180556,3.649718
38197,262,213,5.0,840310907,2.0,3.75,2.925
11474,68,69406,3.0,1261622505,0.5,3.571429,3.229331
34105,232,4728,3.0,1218166950,3.5,2.769231,3.242268


In [69]:
mse = mean_squared_error(y_true = test_df['rating'].values,
                         y_pred = test_df['pred_rating_user'].values)
rmse = np.sqrt(mse)

print(mse, rmse)

0.8905889036428333 0.9437101798978504


## 4.Rule 기반 영화 평점 예측 - 장르기반

train set에 포함된 **유저의 평균 평점**과 **영화의 장르**를 활용하여 장르 별 평균 평점 계산

In [70]:
train_user_movie_matrix = train_df.pivot(
    index = 'movieId',
    columns= 'userId',
    values= 'rating'
).fillna(0)

In [71]:
train_user_movie_matrix

userId,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,...,571,572,573,574,575,576,577,578,579,580,581,582,583,584,585,586,587,588,589,590,591,592,593,594,595,596,597,598,599,600,601,602,603,604,605,606,607,608,609,610
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1
1,0.0,0.0,0.0,0.0,4.0,0.0,4.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.5,0.0,0.0,3.5,4.0,0.0,3.5,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,3.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,5.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,3.0,0.0,4.0,0.0,4.0,3.0,4.0,2.5,0.0,2.5,0.0,5.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,3.0,3.0,3.5,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,4.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.5,0.0,4.0,0.0,4.0,0.0,0.0,0.0,0.0,2.5,4.0,0.0,4.0,0.0,0.0,3.5,0.0,0.0,2.0,0.0,0.0
3,4.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,3.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,1.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193573,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
193579,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
193581,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
193587,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [73]:
genres_df = movies_df['genres'].str.get_dummies(sep='|')
print(genres_df.shape)

# train  데이터로만 해야하므로
genres_df = genres_df.loc[train_df.movieId.unique()]
print(genres_df.shape)
genres_df.head()

(9742, 20)
(8938, 20)


Unnamed: 0_level_0,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
5943,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0
2571,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0
8958,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
2322,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0
2959,0,1,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,1,0,0


In [74]:
# train세트에서 영화 별 평점 평균
train_movie_avg_ratings_df = train_user_movie_matrix.copy()
train_movie_avg_ratings_df = train_movie_avg_ratings_df.replace(0, np.NaN)
train_movie_avg_ratings_df = train_movie_avg_ratings_df.mean(axis = 1)

train_movie_avg_ratings_df.head()

movieId
1    3.893491
2    3.396341
3    3.454545
4    2.250000
5    3.039474
dtype: float64

In [75]:
# genre 별 평점 평균
genres_avg_ratings_df = pd.DataFrame(index=genres_df.columns, columns=['avg_ratings'])

for genre in genres_avg_ratings_df.index:
  genre_avg_rating = train_movie_avg_ratings_df.loc[genres_df[genres_df[genre].isin([1])].index].mean()
  genres_avg_ratings_df.loc[genre]['avg_ratings'] = genre_avg_rating

genres_avg_ratings_df

Unnamed: 0,avg_ratings
(no genres listed),3.33642
Action,3.11085
Adventure,3.23072
Animation,3.49226
Children,3.10123
Comedy,3.18148
Crime,3.31359
Documentary,3.80103
Drama,3.42909
Fantasy,3.24026


In [80]:
# 특정 영화의 장르가 Sci-Fi, 스릴러, 뮤지컬이라면 그 세 장르의 평균 평점의 평균을 구함
def get_genre_avg_rating(x):
  genres_list = movies_df.loc[x]['genres'].split('|')
  rating = 0

  for genre in genres_list:
    rating += genres_avg_ratings_df.loc[genre]['avg_ratings']
  
  return rating / len(genres_list)

In [81]:
tqdm.pandas()
test_df['pred_rating_genres'] = test_df['movieId'].progress_apply(lambda x : get_genre_avg_rating(x))

100%|██████████| 20168/20168 [00:07<00:00, 2815.57it/s]


In [82]:
test_df

Unnamed: 0,userId,movieId,rating,timestamp,pred_ratings_random,pred_rating_movie,pred_rating_user,pred_rating_genres
99731,610,3527,5.0,1479545223,3.5,3.604167,3.678709,3.138325
97583,606,1250,3.5,1171376891,4.5,4.180556,3.649718,3.410377
38197,262,213,5.0,840310907,2.0,3.750000,2.925000,3.429093
11474,68,69406,3.0,1261622505,0.5,3.571429,3.229331,3.267870
34105,232,4728,3.0,1218166950,3.5,2.769231,3.242268,3.181480
...,...,...,...,...,...,...,...,...
41080,279,593,4.0,1506394242,0.5,4.127907,3.666667,3.132440
4897,31,780,4.0,850466616,4.0,3.470760,3.911765,3.161424
8023,56,410,3.0,835799188,4.5,3.131148,3.837838,3.174323
77467,483,2291,4.0,1415579167,2.0,3.734375,3.598940,3.341203


In [83]:
mse = mean_squared_error(y_true= test_df['rating'].values, y_pred = test_df['pred_rating_genres'].values)
rmse = np.sqrt(mse)

print(mse, rmse)

1.1251906030478547 1.0607500191128232


개별 유저의 성향을 파악하는 것이 더 중요함을 확인

## 4.Rule 기반 영화 평점 예측2

user 평균 영화 평점을 normalize하여 확인

In [86]:
train_user_info_df = pd.DataFrame({
    'avg_ratings' : train_df.groupby('userId')['rating'].mean(),
    'std_ratings' : train_df.groupby('userId')['rating'].std(),
    'count_ratings' : train_df.groupby('userId')['rating'].count()
})

train_user_info_df

Unnamed: 0_level_0,avg_ratings,std_ratings,count_ratings
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,4.320652,0.836600,184
2,3.940000,0.820569,25
3,2.516667,2.127340,30
4,3.631868,1.317823,182
5,3.636364,1.084498,33
...,...,...,...
606,3.649718,0.734887,885
607,3.772414,0.955574,145
608,3.145865,1.071503,665
609,3.275862,0.454859,29


In [87]:
# 가중치
min_count = train_user_info_df['count_ratings'].min()
max_count = train_user_info_df['count_ratings'].max()
avg_count = train_user_info_df['count_ratings'].mean()

train_user_info_df['weights'] = train_user_info_df['count_ratings'].apply(lambda x: (x - avg_count) / (max_count - min_count))

In [88]:
train_user_info_df

Unnamed: 0_level_0,avg_ratings,std_ratings,count_ratings,weights
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,4.320652,0.836600,184,0.023995
2,3.940000,0.820569,25,-0.049718
3,2.516667,2.127340,30,-0.047400
4,3.631868,1.317823,182,0.023068
5,3.636364,1.084498,33,-0.046010
...,...,...,...,...
606,3.649718,0.734887,885,0.348983
607,3.772414,0.955574,145,0.005914
608,3.145865,1.071503,665,0.246990
609,3.275862,0.454859,29,-0.047864


In [90]:
from sklearn import preprocessing

min_max_scaler = preprocessing.MinMaxScaler()
np_scaled = min_max_scaler.fit_transform(train_user_info_df)

df_normalized = pd.DataFrame(np_scaled, columns= train_user_info_df.columns, index = train_user_info_df.index)
df_normalized

Unnamed: 0_level_0,avg_ratings,std_ratings,count_ratings,weights
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,0.822227,0.393261,0.079740,0.079740
2,0.722617,0.385725,0.006027,0.006027
3,0.350156,1.000000,0.008345,0.008345
4,0.641984,0.619470,0.078813,0.078813
5,0.643161,0.509791,0.009736,0.009736
...,...,...,...,...
606,0.646655,0.345449,0.404729,0.404729
607,0.678762,0.449188,0.061660,0.061660
608,0.514806,0.503682,0.302735,0.302735
609,0.548824,0.213816,0.007881,0.007881


In [91]:
df_normalized['normalized_avg_ratings'] = df_normalized['avg_ratings']*5

In [94]:
test_df['pred_rating_normalized'] = test_df['userId'].apply(lambda x : df_normalized.loc[x]['normalized_avg_ratings'])
test_df

Unnamed: 0,userId,movieId,rating,timestamp,pred_ratings_random,pred_rating_movie,pred_rating_user,pred_rating_genres,pred_rating_normalzied,pred_rating_normalized
99731,610,3527,5.0,1479545223,3.5,3.604167,3.678709,3.138325,3.271208,3.271208
97583,606,1250,3.5,1171376891,4.5,4.180556,3.649718,3.410377,3.233275,3.233275
38197,262,213,5.0,840310907,2.0,3.750000,2.925000,3.429093,2.285047,2.285047
11474,68,69406,3.0,1261622505,0.5,3.571429,3.229331,3.267870,2.683236,2.683236
34105,232,4728,3.0,1218166950,3.5,2.769231,3.242268,3.181480,2.700164,2.700164
...,...,...,...,...,...,...,...,...,...,...
41080,279,593,4.0,1506394242,0.5,4.127907,3.666667,3.132440,3.255452,3.255452
4897,31,780,4.0,850466616,4.0,3.470760,3.911765,3.161424,3.576141,3.576141
8023,56,410,3.0,835799188,4.5,3.131148,3.837838,3.174323,3.479414,3.479414
77467,483,2291,4.0,1415579167,2.0,3.734375,3.598940,3.341203,3.166837,3.166837


In [95]:
mse = mean_squared_error(y_true = test_df['rating'].values, y_pred = test_df['pred_rating_normalized'].values)
rmse = np.sqrt(mse)

print(mse, rmse)

1.120579096060227 1.05857408624065
