In [8]:
import os

import tensorflow as tf
import tensorflow.keras as keras
# Make every tf.function run eagerly (op by op) instead of as a traced graph.
# NOTE(review): eager execution is intended for debugging and is typically much
# slower than graph execution — confirm it is still needed before training on
# the full dataset.
tf.config.run_functions_eagerly(True)

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split

# 장르 임베딩 모델

추천 모델과 마찬가지 원리로 장르를 벡터로 임베딩합니다.

`sigmoid(사용자 벡터 · 장르 벡터) = 선호 확률`로 모델링하고, 선호 여부(prefer)는 이전 모델과 마찬가지로 평점 3.5점 이상이면 1, 미만이면 0으로 둡니다.

여기서 학습한 장르 벡터와 장르에 대한 사용자 벡터(사용자의 장르 취향 벡터)를, 기존 추천 모델로 학습한 벡터와 함께 새 모델의 입력(input)으로 사용해 학습시킬 계획입니다.

하나의 영화가 여러 장르를 갖는 경우에는, 장르마다 샘플이 하나씩 생기도록 평점 데이터를 복제했습니다.

예를 들어, 한 사용자가 Action, Adventure 영화에 5점을 준 경우, Action 장르에 대한 prefer=1인 샘플 하나, Adventure 장르에 대한 prefer=1인 샘플 하나, 총 2개 샘플이 생성됩니다.

반대로, 장르 데이터가 없는 영화도 있는데, 이런 영화들에 대한 데이터는 제외했습니다.

테스트 데이터에 대한 최종 결과는 `loss: 0.5676, accuracy: 0.6946`입니다.

In [10]:
# Load the genre lookup table and show its row count plus the first rows.
genre_df = pd.read_csv('./The Movies Dataset/genres.csv')
print(len(genre_df))
genre_df.head()

20


Unnamed: 0,genre_id,genre_name
0,0,Action
1,1,Adventure
2,2,Animation
3,3,Comedy
4,4,Crime


In [11]:
# Ratings expanded per genre: a movie with several genres contributes one row per genre.
rating_df = pd.read_csv('./The Movies Dataset/genre_ratings.csv')
# Binary target: 1 when the rating is at least 3.5, otherwise 0.
rating_df['prefer'] = rating_df['rating'].ge(3.5).astype(int)
print(rating_df.shape[0])
rating_df.head()

70504818


Unnamed: 0,userId,rating,timestamp,imdbId,genre,genre_id,prefer
0,1,1.0,1425941529,112573,Action,0,0
1,1,1.0,1425941529,112573,Drama,6,0
2,1,1.0,1425941529,112573,History,10,0
3,1,1.0,1425941529,112573,War,18,0
4,11,3.5,1231676989,112573,Action,0,1


In [12]:
# Class-balance check: the mean of the 0/1 `prefer` column is the share of positive samples.
rating_df['prefer'].describe()

count    7.050482e+07
mean     6.123738e-01
std      4.872085e-01
min      0.000000e+00
25%      0.000000e+00
50%      1.000000e+00
75%      1.000000e+00
max      1.000000e+00
Name: prefer, dtype: float64

In [13]:
# Vocabulary sizes for the two embedding tables (distinct users, distinct genres).
num_of_user = rating_df['userId'].nunique(dropna=False)
num_of_item = rating_df['genre_id'].nunique(dropna=False)
num_of_user, num_of_item

(270876, 20)

In [14]:
VECTOR_DIMENSION = 8  # default embedding width shared by the user and genre vectors


def compile_model(num_of_user, num_of_item, learning_rate=3e-4, vector_dimension=None):
    """Build and compile the dot-product preference model.

    A user index and an item (genre) index are each looked up in their own
    embedding table; the prediction is the sigmoid of the dot product of the
    two embedding vectors.

    Args:
        num_of_user: Number of rows in the user embedding table.
        num_of_item: Number of rows in the item embedding table.
        learning_rate: Learning rate passed to the Adam optimizer.
        vector_dimension: Width of both embedding vectors. ``None`` (default)
            uses the notebook-level ``VECTOR_DIMENSION`` as it is at call time,
            which is what this function always did before.

    Returns:
        A compiled ``keras.models.Model`` taking ``[user_index, item_index]``
        inputs, trained with binary cross-entropy and reporting binary accuracy.
    """
    if vector_dimension is None:
        # Resolved at call time so that re-running the constant's cell takes effect.
        vector_dimension = VECTOR_DIMENSION

    # Layer creation order is kept as before so auto-generated layer names do not change.
    user_input = keras.layers.Input((1,))
    user_embedding = keras.layers.Embedding(num_of_user, vector_dimension, input_length=1, name='user_embedding')
    user = user_embedding(user_input)
    user = keras.layers.Reshape((vector_dimension, 1))(user)

    item_input = keras.layers.Input((1,))
    item_embedding = keras.layers.Embedding(num_of_item, vector_dimension, input_length=1, name='item_embedding')
    item = item_embedding(item_input)
    item = keras.layers.Reshape((vector_dimension, 1))(item)

    # Contract over the embedding axis, then squash the score into (0, 1).
    dot_product = keras.layers.dot([user, item], axes=1)
    output = keras.layers.Activation('sigmoid')(dot_product)

    model = keras.models.Model(inputs=[user_input, item_input], outputs=output)
    model.compile(
        loss='binary_crossentropy',
        optimizer=keras.optimizers.Adam(learning_rate),
        metrics=['binary_accuracy'],
    )
    return model


# Instantiate the model once to inspect its layers and parameter counts.
model = compile_model(num_of_user, num_of_item)
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 1)]          0           []                               
                                                                                                  
 input_2 (InputLayer)           [(None, 1)]          0           []                               
                                                                                                  
 user_embedding (Embedding)     (None, 1, 8)         2167008     ['input_1[0][0]']                
                                                                                                  
 item_embedding (Embedding)     (None, 1, 8)         160         ['input_2[0][0]']                
                                                                                              

In [3]:
def build_dataset(filepath, user_col, item_col, y_col, ratings=None,
                  holdout_size=40000, test_size=20000, random_state=None):
    """Encode raw ids as contiguous indices and split into train/valid/test arrays.

    Args:
        filepath: Not used here; accepted so the signature mirrors
            ``load_or_build_dataset``.
        user_col: Name of the column holding raw user ids.
        item_col: Name of the column holding raw item (genre) ids.
        y_col: Name of the column holding the target label.
        ratings: DataFrame to encode. ``None`` (default) falls back to the
            notebook-level ``rating_df``, which this function always read before.
        holdout_size: ``test_size`` of the first split, i.e. the rows kept out
            of training (validation and test together).
        test_size: ``test_size`` of the second split, i.e. the part of the
            hold-out that becomes the test set; the remainder is validation.
        random_state: Seed forwarded to both ``train_test_split`` calls.
            ``None`` keeps the previous unseeded (non-reproducible) shuffling.

    Returns:
        dict with keys ``uid2idx`` / ``iid2idx`` (2-column arrays of
        ``(raw_id, index)`` pairs) and ``train_x``, ``train_y``, ``valid_x``,
        ``valid_y``, ``test_x``, ``test_y``. Each ``*_x`` array has the user
        index in column 0 and the item index in column 1; each ``*_y`` array
        has a single column.
    """
    if ratings is None:
        # Backward-compatible fallback to the notebook global.
        ratings = rating_df

    # Raw ids are sorted so the id -> index assignment is deterministic.
    uid2idx = {raw_id: idx for idx, raw_id in enumerate(sorted(ratings[user_col].unique()))}
    iid2idx = {raw_id: idx for idx, raw_id in enumerate(sorted(ratings[item_col].unique()))}

    # Mapping with the dict itself is vectorised by pandas; every id is a key,
    # so no NaN can be produced.
    data_x = np.column_stack([
        ratings[user_col].map(uid2idx).to_numpy(),
        ratings[item_col].map(iid2idx).to_numpy(),
    ])
    data_y = ratings[y_col].to_numpy().reshape(-1, 1)

    train_x, holdout_x, train_y, holdout_y = train_test_split(
        data_x, data_y, test_size=holdout_size, random_state=random_state,
    )
    valid_x, test_x, valid_y, test_y = train_test_split(
        holdout_x, holdout_y, test_size=test_size, random_state=random_state,
    )
    return {
        'uid2idx': np.array(list(uid2idx.items())),
        'iid2idx': np.array(list(iid2idx.items())),
        'train_x': train_x,
        'train_y': train_y,
        'valid_x': valid_x,
        'valid_y': valid_y,
        'test_x': test_x,
        'test_y': test_y,
    }


def load_or_build_dataset(filepath, user_col, item_col, y_col):
    """Return the cached dataset at ``filepath``, building and saving it first if absent.

    Args:
        filepath: Location of the compressed ``.npz`` cache.
            NOTE(review): ``np.savez_compressed`` appends ``.npz`` when the name
            lacks it, so pass a path that already ends in ``.npz``.
        user_col: Forwarded to ``build_dataset`` when the cache is missing.
        item_col: Forwarded to ``build_dataset`` when the cache is missing.
        y_col: Forwarded to ``build_dataset`` when the cache is missing.

    Returns:
        An 8-tuple ``(uid2idx, iid2idx, train_x, train_y, valid_x, valid_y,
        test_x, test_y)`` where the first two items are dicts rebuilt from the
        stored ``(raw_id, index)`` pairs and the rest are numpy arrays.
    """
    cache_missing = not os.path.exists(filepath)
    if cache_missing:
        arrays = build_dataset(filepath, user_col, item_col, y_col)
        np.savez_compressed(filepath, **arrays)

    split_keys = ('train_x', 'train_y', 'valid_x', 'valid_y', 'test_x', 'test_y')
    # Everything is read while the archive is open; the arrays stay valid after it closes.
    with np.load(filepath) as archive:
        user_index = dict(archive['uid2idx'])
        item_index = dict(archive['iid2idx'])
        splits = tuple(archive[key] for key in split_keys)
    return (user_index, item_index) + splits

    
# Cache file for the encoded, pre-split dataset; it is created on the first run.
filepath = './The Movies Dataset/genre_dataset.npz'
# Unpack the id -> index mappings followed by the train / valid / test arrays.
uid2idx, iid2idx, train_x, train_y, valid_x, valid_y, test_x, test_y =\
    load_or_build_dataset(filepath, 'userId', 'genre_id', 'prefer')

In [8]:
# Checkpoint name embeds the embedding width and the validation loss of the saved epoch.
checkpoint_filepath = './The Movies Dataset/genre' + str(VECTOR_DIMENSION) + '_{val_loss:.4f}.hdf5'
# save_best_only=True writes a checkpoint only when the monitored quantity improves
# (no `monitor` is passed, so the callback's default applies).
model_checkpoint_callback = keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_filepath,
    save_best_only=True,
)

In [None]:
# Start from a freshly initialised model so re-running this cell does not continue old training.
model = compile_model(num_of_user, num_of_item)
history = model.fit(
    # Column 0 holds user indices and column 1 item indices; `None` keeps a trailing axis of size 1.
    x=(train_x[:,0,None], train_x[:,1,None]),
    y=train_y, 
    batch_size=2048,
    epochs=6, 
    verbose=1,
    validation_data=((valid_x[:,0,None], valid_x[:,1,None]), valid_y), 
    shuffle=True,
    # Saves a checkpoint whenever the validation loss improves.
    callbacks=[model_checkpoint_callback],
)

In [6]:
# Reload a checkpoint written by the ModelCheckpoint callback.
# NOTE(review): the file name hard-codes the validation loss (0.5718) of one particular
# run — update it after retraining.
best_model = keras.models.load_model('The Movies Dataset/genre8_0.5718.hdf5')

In [7]:
# Score the reloaded model on the held-out test split.
# Presumably returns [loss, binary_accuracy], matching compile_model's configuration.
best_model.evaluate(
    x=(test_x[:,0,None], test_x[:,1,None]),
    y=test_y
)



[0.5676388740539551, 0.6946499943733215]