In [1]:
import tensorflow as tf
import tensorflow.keras as keras
tf.config.run_functions_eagerly(True)

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split

# 추천 모델 예시

영화 평점 데이터를 기반으로 추천 모델을 만듭니다.

이번에는 지난번보다 규모가 더 크고, 미리 전처리를 거친 데이터를 사용합니다.

https://www.kaggle.com/datasets/rounakbanik/the-movies-dataset

평점이 3.5점 이상이면 호(1), 미만이면 불호(0)로 보고 모델을 학습시킵니다.

## 데이터 로드

평점 데이터에 등장하는 영화의 수는 45,115개, 사용자 수는 270,896명, 리뷰 수는 26,024,289개입니다. (영화 메타데이터 파일에는 45,443개의 행이 있습니다.)

In [2]:
# Load the pre-processed movie metadata, report its row count and preview it.
movies_csv = './The Movies Dataset/processed_movies.csv'
movie_df = pd.read_csv(movies_csv)
print(movie_df.shape[0])
movie_df.head()

45443


Unnamed: 0,title,genres,original_language,release_date,imdbId
0,Toy Story,"Animation, Comedy, Family",en,1995-10-30,114709
1,Jumanji,"Adventure, Fantasy, Family",en,1995-12-15,113497
2,Grumpier Old Men,"Romance, Comedy",en,1995-12-22,113228
3,Waiting to Exhale,"Comedy, Drama, Romance",en,1995-12-22,114885
4,Father of the Bride Part II,Comedy,en,1995-02-10,113041


In [3]:
# Load the ratings and derive the binary label:
# prefer = 1 when rating >= 3.5, otherwise 0.
ratings_csv = './The Movies Dataset/processed_ratings.csv'
rating_df = pd.read_csv(ratings_csv)
rating_df['prefer'] = rating_df['rating'].ge(3.5).astype(int)
print(rating_df.shape[0])
rating_df.head()

26024289


Unnamed: 0,userId,rating,timestamp,imdbId,prefer
0,1,1.0,1425941529,112573,0
1,11,3.5,1231676989,112573,1
2,22,5.0,1111937009,112573,1
3,24,5.0,979870012,112573,1
4,29,3.0,1044020005,112573,0


In [4]:
rating_df.describe()

Unnamed: 0,userId,rating,timestamp,imdbId,prefer
count,26024290.0,26024290.0,26024290.0,26024290.0,26024290.0
mean,135037.1,3.52809,1171258000.0,295395.0,0.6185742
std,78176.2,1.065443,205288900.0,506896.1,0.4857367
min,1.0,0.5,789652000.0,1.0,0.0
25%,67164.0,3.0,990754500.0,97428.0,0.0
50%,135163.0,3.5,1151716000.0,116261.0,1.0
75%,202693.0,4.0,1357578000.0,257360.0,1.0
max,270896.0,5.0,1501830000.0,7158814.0,1.0


In [5]:
# Number of distinct users and movies that appear in the ratings.
# NOTE: nunique() skips missing values; the describe() output above shows the
# id columns are fully populated, so this matches len(unique()).
num_of_user = rating_df['userId'].nunique()
num_of_item = rating_df['imdbId'].nunique()
num_of_user, num_of_item

(270896, 45115)

In [6]:
# Distribution of the number of ratings each movie received.
ratings_per_movie = rating_df.groupby('imdbId').count()
ratings_per_movie.describe()

Unnamed: 0,userId,rating,timestamp,prefer
count,45115.0,45115.0,45115.0,45115.0
mean,576.843378,576.843378,576.843378,576.843378
std,3037.380582,3037.380582,3037.380582,3037.380582
min,1.0,1.0,1.0,1.0
25%,2.0,2.0,2.0,2.0
50%,8.0,8.0,8.0,8.0
75%,69.0,69.0,69.0,69.0
max,91921.0,91921.0,91921.0,91921.0


In [7]:
# Distribution of the number of ratings each user submitted.
ratings_per_user = rating_df.groupby('userId').count()
ratings_per_user.describe()

Unnamed: 0,rating,timestamp,imdbId,prefer
count,270896.0,270896.0,270896.0,270896.0
mean,96.067454,96.067454,96.067454,96.067454
std,205.719606,205.719606,205.719606,205.719606
min,1.0,1.0,1.0,1.0
25%,15.0,15.0,15.0,15.0
50%,30.0,30.0,30.0,30.0
75%,93.0,93.0,93.0,93.0
max,18276.0,18276.0,18276.0,18276.0


## 모델 정의

벡터의 크기는 24로 정했습니다.

In [8]:
def compile_model(num_of_user, num_of_item, learning_rate=0.001, vector_dimension=24):
    """Build and compile a dot-product preference model.

    A user index and an item index are each looked up in their own embedding
    table; the model output is the sigmoid of the dot product of the two
    embedding vectors.

    Args:
        num_of_user: Number of rows in the user embedding table.
        num_of_item: Number of rows in the item embedding table.
        learning_rate: Learning rate passed to the Adam optimizer.
        vector_dimension: Width of the user and item embedding vectors.
            Defaults to 24, the value previously hard-coded here.

    Returns:
        A ``keras`` model taking ``[user_index, item_index]`` inputs of shape
        ``(batch, 1)`` each, compiled with binary cross-entropy loss and the
        ``binary_accuracy`` metric.
    """
    # `input_length` is intentionally not passed to Embedding: the sequence
    # length is already fixed to 1 by the Input layers, and the argument is
    # deprecated in newer Keras releases.
    user_input = keras.layers.Input((1,))
    user_embedding = keras.layers.Embedding(num_of_user, vector_dimension, name='user_embedding')
    user = user_embedding(user_input)
    # (batch, 1, dim) -> (batch, dim, 1) so the dot product contracts over dim.
    user = keras.layers.Reshape((vector_dimension, 1))(user)

    item_input = keras.layers.Input((1,))
    item_embedding = keras.layers.Embedding(num_of_item, vector_dimension, name='item_embedding')
    item = item_embedding(item_input)
    item = keras.layers.Reshape((vector_dimension, 1))(item)

    dot_product = keras.layers.dot([user, item], axes=1)
    output = keras.layers.Activation('sigmoid')(dot_product)

    model = keras.models.Model(inputs=[user_input, item_input], outputs=output)
    model.compile(
        loss='binary_crossentropy',
        optimizer=keras.optimizers.Adam(learning_rate),
        metrics=['binary_accuracy'],
    )
    return model


model = compile_model(num_of_user, num_of_item)
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 1)]          0           []                               
                                                                                                  
 input_2 (InputLayer)           [(None, 1)]          0           []                               
                                                                                                  
 user_embedding (Embedding)     (None, 1, 24)        6501504     ['input_1[0][0]']                
                                                                                                  
 item_embedding (Embedding)     (None, 1, 24)        1082760     ['input_2[0][0]']                
                                                                                              

2022-03-25 20:17:32.287018: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [9]:
# keras.utils.plot_model(
#     model, to_file='movie.png', show_shapes=True, show_layer_activations=True,
# )

## 학습 데이터 준비

모델은 0부터 시작하는 연속된 정수를 input으로 받기 때문에 user id, movie id를 각각 index로 바꿔줍니다.

Label은 앞서 정의한 대로, 평점이 3.5점 이상이면 1, 미만이면 0으로 둡니다.

Validation set과 test set을 각각 2만 개씩 두고, 나머지는 train set으로 사용했습니다.

In [14]:
# Map every raw id to a 0-based index, numbered in order of first appearance
# in rating_df, so the ids can be used as embedding row indices.
uid2idx = {uid: idx for idx, uid in enumerate(rating_df['userId'].unique())}
iid2idx = {iid: idx for idx, iid in enumerate(rating_df['imdbId'].unique())}

# Passing the dict itself to Series.map lets pandas do the lookup in bulk
# instead of calling dict.get once per row.
dataset = pd.DataFrame({
    'uidx': rating_df['userId'].map(uid2idx),
    'iidx': rating_df['imdbId'].map(iid2idx),
    'y': rating_df['prefer'],
})
dataset.head()

Unnamed: 0,uidx,iidx,y
0,0,0,0
1,1,0,1
2,2,0,1
3,3,0,1
4,4,0,0


In [15]:
# Fixed seed for the split. Without it every re-run of this cell draws a
# different split, so a model fitted before a re-run would be evaluated on
# rows it was trained on.
SPLIT_SEED = 42

# Features: (n, 2) array of [user index, item index]; labels: (n, 1) array.
data_x = dataset[['uidx', 'iidx']].to_numpy()
data_y = dataset[['y']].to_numpy()
print(data_x.shape, data_y.shape)

# Hold out 40,000 rows, then split them evenly into validation and test sets.
train_x, holdout_x, train_y, holdout_y = train_test_split(
    data_x, data_y, test_size=40000, random_state=SPLIT_SEED,
)
valid_x, test_x, valid_y, test_y = train_test_split(
    holdout_x, holdout_y, test_size=20000, random_state=SPLIT_SEED,
)
print(
    train_x.shape, train_y.shape,
    valid_x.shape, valid_y.shape,
    test_x.shape, test_y.shape,
)

(26024289, 2) (26024289, 1)
(25984289, 2) (25984289, 1) (20000, 2) (20000, 1) (20000, 2) (20000, 1)


## 학습

모델을 학습시킵니다.

Batch size, epoch은 제가 임의로 정한 숫자입니다.

제 환경에서는 batch size 2048 기준으로 1 epoch에 약 35분 정도가 걸렸습니다.

In [11]:
# Checkpoint callback: only the best model so far (by validation loss) is
# written, and the validation loss is embedded in the file name.
checkpoint_filepath = './The Movies Dataset/model24_{val_loss:.4f}.hdf5'
model_checkpoint_callback = keras.callbacks.ModelCheckpoint(
    checkpoint_filepath,
    monitor='val_loss',
    save_best_only=True,
)

In [10]:
# The model has two inputs, so split the (n, 2) index arrays into two (n, 1)
# column arrays: user indices first, item indices second.
train_inputs = (train_x[:, 0, None], train_x[:, 1, None])
valid_inputs = (valid_x[:, 0, None], valid_x[:, 1, None])

# Build a fresh model so training does not continue from earlier weights.
model = compile_model(num_of_user, num_of_item)
history = model.fit(
    x=train_inputs,
    y=train_y,
    batch_size=2048,
    epochs=5,
    verbose='auto',
    validation_data=(valid_inputs, valid_y),
    shuffle=True,
    callbacks=[model_checkpoint_callback],
)

## 학습 결과

Loss가 가장 낮은 모델 기준 test loss는 약 0.4174, test accuracy는 약 0.8065입니다.

In [11]:
# Load a saved checkpoint. The file name is hard-coded, so it must be updated
# by hand to match a file written by the checkpoint callback.
best_model_path = 'The Movies Dataset/model24_0.4764.hdf5'
best_model = keras.models.load_model(best_model_path)

In [17]:
# Evaluate on the held-out test rows; the inputs are split into user and item
# columns exactly as for training.
test_inputs = (test_x[:, 0, None], test_x[:, 1, None])
best_model.evaluate(x=test_inputs, y=test_y)



[0.41743502020835876, 0.8064500093460083]

## 비슷한 영화 추천

학습된 영화 벡터를 기반으로 비슷한 영화 추천 결과를 확인합니다.

이번에는 결과가 괜찮아 보입니다. 
- 'Toy Story'와 가장 유사한 영화로 'Toy Story 2', 'Toy Story 3'를 순서대로 꼽고, 이어서 픽사의 장편 영화들이 주로 나옵니다.
- 'Star Wars: Episode I' 또한 가장 유사한 영화로 'Star Wars: Episode II', 'Star Wars: Episode III'가 등장하고 SF/판타지 장르의 영화들이 이어서 나옵니다.

In [18]:
# Fetch the embedding layers by name rather than by position in
# best_model.layers, which would silently break if the architecture changed.
# Assumes the checkpoint was saved from a model built by compile_model, which
# names these layers 'user_embedding' and 'item_embedding'.
user_layer = best_model.get_layer('user_embedding')
item_layer = best_model.get_layer('item_embedding')

# The first (and only) weight of an Embedding layer is its lookup table.
user_vectors = user_layer.get_weights()[0]
item_vectors = item_layer.get_weights()[0]
user_vectors.shape, item_vectors.shape

((270896, 24), (45115, 24))

In [19]:
# Attach each movie's embedding row index (movieIdx) to the metadata.
# imdbIds that are not in iid2idx get a missing value here; presumably these
# are movies without any rating, which is why the column displays as float.
movie_idx = movie_df['imdbId'].map(iid2idx.get)
source_df = movie_df.assign(movieIdx=movie_idx)
print(len(source_df))
source_df.head()

45443


Unnamed: 0,title,genres,original_language,release_date,imdbId,movieIdx
0,Toy Story,"Animation, Comedy, Family",en,1995-10-30,114709,183.0
1,Jumanji,"Adventure, Fantasy, Family",en,1995-12-15,113497,1167.0
2,Grumpier Old Men,"Romance, Comedy",en,1995-12-22,113228,1896.0
3,Waiting to Exhale,"Comedy, Drama, Romance",en,1995-12-22,114885,3083.0
4,Father of the Bride Part II,Comedy,en,1995-02-10,113041,27.0


In [20]:
def consine_similarity(v, mat, topn=10):
    """Return the rows of ``mat`` most cosine-similar to ``v``, best match excluded.

    The single highest-scoring row is always dropped, on the assumption that
    ``v`` is itself a row of ``mat`` and would otherwise come back as its own
    nearest neighbour. If ``v`` is not a row of ``mat``, its true best match is
    therefore not returned.

    Args:
        v: 1-D query vector of length ``d``.
        mat: 2-D array of shape ``(n, d)`` holding the candidate vectors.
        topn: Maximum number of neighbours to return.

    Returns:
        A tuple ``(sim, ind)``: the similarities and the row indices of the
        neighbours, ordered from most to least similar. Fewer than ``topn``
        entries are returned when ``mat`` has too few usable rows, and empty
        arrays are returned when ``topn`` is not positive.
    """
    denom = np.linalg.norm(v) * np.linalg.norm(mat, axis=1)
    with np.errstate(divide='ignore', invalid='ignore'):
        sim = np.dot(v, mat.T) / denom
    # Cosine similarity is undefined for zero-length vectors. Left as NaN they
    # would sort as the largest values and push real neighbours out, so rank
    # them last instead and filter them from the result below.
    sim[denom == 0] = -np.inf

    # Never request more neighbours than exist once the top match is dropped.
    k = min(topn, sim.shape[0] - 1)
    if k <= 0:
        empty = np.array([], dtype=np.intp)
        return sim[empty], empty

    # Take the k + 1 best unordered, sort them ascending, then read backwards
    # from the second-best so the top match (the query itself) is skipped.
    ind = np.argpartition(sim, -(k + 1))[-(k + 1):]
    ind = ind[np.argsort(sim[ind])][k - 1::-1]
    ind = ind[np.isfinite(sim[ind])]
    return sim[ind], ind

In [21]:
# Nearest neighbours of 'Toy Story' (movieIdx 183 in source_df).
# sim / ind are ordered by similarity; the table below is in source_df order.
query_idx = 183
sim, ind = consine_similarity(item_vectors[query_idx], item_vectors)
print(sim)
print(ind)
is_neighbour = source_df['movieIdx'].isin(ind)
source_df[is_neighbour]

[0.9054582  0.85912645 0.8379016  0.8288878  0.8221444  0.7865561
 0.68195194 0.6790499  0.66605604 0.66534024]
[   90   921   151   318   156   162   347   146    71 41587]


Unnamed: 0,title,genres,original_language,release_date,imdbId,movieIdx
581,Aladdin,"Animation, Family, Comedy, Adventure, Fantasy,...",en,1992-11-25,103639,347.0
2242,A Bug's Life,"Adventure, Animation, Comedy, Family",en,1998-11-25,120623,71.0
2997,Toy Story 2,"Animation, Comedy, Family",en,1999-10-30,120363,90.0
4178,Shrek,"Adventure, Animation, Comedy, Family, Fantasy",en,2001-05-16,126029,146.0
4756,"Monsters, Inc.","Animation, Comedy, Family",en,2001-11-01,198781,318.0
6232,Finding Nemo,"Animation, Family",en,2003-05-30,266543,151.0
8234,The Incredibles,"Action, Adventure, Animation, Family",en,2004-11-05,317705,156.0
11566,Ratatouille,"Animation, Comedy, Family, Fantasy",en,2007-06-22,382932,162.0
15345,Toy Story 3,"Animation, Family, Comedy",en,2010-06-16,435761,921.0
26067,Carry On Behind,Comedy,en,1975-12-01,72764,41587.0


In [22]:
source_df[source_df['title'].str.contains('Wars')].head()

Unnamed: 0,title,genres,original_language,release_date,imdbId,movieIdx
256,Star Wars,"Adventure, Action, Science Fiction",en,1977-05-25,76759,34.0
2514,Star Wars: Episode I - The Phantom Menace,"Adventure, Action, Science Fiction",en,1999-05-19,120915,138.0
5244,Star Wars: Episode II - Attack of the Clones,"Adventure, Action, Science Fiction",en,2002-05-15,121765,325.0
5308,V.I. Warshawski,"Action, Adventure, Drama, Mystery, Thriller",en,1991-07-26,103184,3904.0
8003,To End All Wars,"Action, Comedy, Drama, History",en,2001-09-02,243609,2198.0


In [23]:
# Nearest neighbours of 'Star Wars: Episode I - The Phantom Menace'
# (movieIdx 138 in source_df).
# sim / ind are ordered by similarity; the table below is in source_df order.
query_idx = 138
sim, ind = consine_similarity(item_vectors[query_idx], item_vectors)
print(sim)
print(ind)
is_neighbour = source_df['movieIdx'].isin(ind)
source_df[is_neighbour]

[0.9674525  0.867388   0.8328334  0.82906306 0.7678357  0.72436464
 0.71210456 0.6588349  0.650056   0.64414716]
[  325  1052   338   152  1108    24   178  2542 35811 41896]


Unnamed: 0,title,genres,original_language,release_date,imdbId,movieIdx
1640,Tomorrow Never Dies,"Adventure, Action, Thriller",en,1997-12-11,120347,2542.0
5244,Star Wars: Episode II - Attack of the Clones,"Adventure, Action, Science Fiction",en,2002-05-15,121765,325.0
6221,The Matrix Reloaded,"Adventure, Action, Thriller, Science Fiction",en,2003-05-15,234215,338.0
6783,The Matrix Revolutions,"Adventure, Action, Thriller, Science Fiction",en,2003-11-05,242653,152.0
10068,Star Wars: Episode III - Revenge of the Sith,"Science Fiction, Adventure, Action",en,2005-05-17,121766,1052.0
19962,The Hobbit: An Unexpected Journey,"Adventure, Fantasy, Action",en,2012-11-26,903624,24.0
22047,The Hobbit: The Desolation of Smaug,"Adventure, Fantasy",en,2013-12-11,1170358,178.0
25376,The Hobbit: The Battle of the Five Armies,"Action, Adventure, Fantasy",en,2014-12-10,2310332,1108.0
30305,Blackie the Pirate,"Action, Adventure, Comedy",it,1971-03-12,66950,35811.0
45441,Satan Triumphant,,en,1917-10-21,8536,41896.0
