In [1159]:
from urllib.request import urlretrieve
import zipfile
import pandas as pd
import numpy as np
import random

# data 읽기
users_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
users = pd.read_csv(
    'data/u.user', sep='|', names=users_cols, encoding='latin-1')

watch_cols = ['user_id', 'movie_id', 'watch_hist_time']
watches = pd.read_csv(
    'data/u.watch', sep='\t', names=watch_cols, encoding='latin-1')

search_cols = ['user', 'search_hist']
searches = pd.read_csv(
    'data/u.search', sep='\t', names=search_cols, encoding='latin-1')

# The movies file contains a binary feature for each genre.
genre_cols = [
    "genre_unknown", "Action", "Adventure", "Animation", "Children", "Comedy",
    "Crime", "Documentary", "Drama", "Fantasy", "Film-Noir", "Horror",
    "Musical", "Mystery", "Romance", "Sci-Fi", "Thriller", "War", "Western",
]
movies_cols = [
    'movie_id', 'title', 'release_date', "video_release_date", "imdb_url"
] + genre_cols
movies = pd.read_csv(
    'data/u.item', sep='|', names=movies_cols, encoding='latin-1')

# Since the ids start at 1, we shift them to start at 0.
# users["user_id"] = users["user_id"].apply(lambda x: str(x-1))
# movies["movie_id"] = movies["movie_id"].apply(lambda x: str(x-1))
movies["year"] = movies['release_date'].apply(lambda x: str(x).split('-')[-1])
# watches["movie_id"] = watches["movie_id"].apply(lambda x: str(x-1))
# watches["user_id"] = watches["user_id"].apply(lambda x: str(x-1))
# searches["user_id"] = searches["user"].apply(lambda x: str(x-1))

# example_age 추가
movies['example_age'] = (pd.to_datetime("now") - pd.to_datetime(movies['release_date']))\
            /np.timedelta64(1,'D') 

# normalize
def normalize_col(df,col_name):
    df[col_name] = (df[col_name] - df[col_name].min()) / (df[col_name].max() - df[col_name].min())
    return df

movies = normalize_col(movies,'example_age')
watches = normalize_col(watches,'watch_hist_time')


# data 합치기
data = watches.merge(movies, on='movie_id').merge(users, on='user_id')
data['user_id']=data['user_id'].astype(int)
data['movie_id']=data['movie_id'].astype(int)
data = data.set_index(['user_id']).sort_index()
data = data.reset_index()
data['movie_name']=data['title'].str[:-6] # 년도 부분 자르기


# occupation 인코딩
occupations = data["occupation"].unique().tolist()
occupations_encoded = {x: i for i, x in enumerate(occupations)}
occupationsencoded2occupations = {i: x for i, x in enumerate(occupations)}

# search history 인코딩
search_hists = searches["search_hist"].unique().tolist()
search_encoded = {x: i for i, x in enumerate(search_hists)}
searchencoded2search = {i: x for i, x in enumerate(search_hists)}

# 유저 인덱스 인코딩
user_ids = data["user_id"].unique().tolist()
user2user_encoded = {x: i for i, x in enumerate(user_ids)}
userencoded2user = {i: x for i, x in enumerate(user_ids)}

# 영화 인덱스 인코딩
movie_ids = data["movie_id"].unique().tolist()
movie2movie_encoded = {x: i for i, x in enumerate(movie_ids)}
movie_encoded2movie = {i: x for i, x in enumerate(movie_ids)}

# 영화 제목 인코딩
title_ids = data["title"].unique().tolist()
title2title_encoded = {x: i for i, x in enumerate(title_ids)}
title_encoded2title = {i: x for i, x in enumerate(title_ids)}

# 인코딩으로 바꾸기
data["user"] = data["user_id"].map(user2user_encoded)
data["movie"] = data["movie_id"].map(movie2movie_encoded)
data["title_d"] = data["title"].map(title2title_encoded)
searches["search_hist"] = searches["search_hist"].map(search_encoded)
data["occupation"] = data["occupation"].map(occupations_encoded)
# searches["search_hist"] = searches["search_hist"]
searches = normalize_col(searches,'search_hist')

watch_hist = data.groupby(['user'])['movie_id'].apply(list).reset_index()
search_hist = searches.groupby(['user'])['search_hist'].apply(list).reset_index()
watch_hist_time = data.groupby(['user'])['watch_hist_time'].apply(list).reset_index()
example_age = data.groupby(['user'])['example_age'].apply(list).reset_index()

user_video_list = data.pivot(index='user_id', columns='movie_id', values='movie').reset_index()
user_video_list.fillna(data["movie_id"].max()+1, inplace=True)

sample_data=data[['user','occupation','sex']]
sample_data=sample_data.reset_index()
sample_data = sample_data.drop('index',axis=1)
sample_data = sample_data.drop_duplicates()

user_movie_list = pd.merge(sample_data,watch_hist, how= 'left')
user_movie_list = pd.merge(user_movie_list,watch_hist_time, how='left')
user_movie_list = pd.merge(user_movie_list,search_hist, how='left')
user_movie_list = pd.merge(user_movie_list,example_age, how='left')
user_movie_list['search_hist'] = user_movie_list['search_hist'].apply(lambda x: x if type(x) is list else []) # NaN 처리
user_movie_list['predict_labels'] = user_movie_list['movie_id'].apply(lambda x: int(random.uniform(0,data["movie"].max()))) #label을 마지막 값으로..



train_data = user_movie_list[(user_movie_list.user >= 1)&
                                  (user_movie_list.user <= 5)]
test_data = user_movie_list[(user_movie_list.user >= 6)&
                                  (user_title_list.user <= 10)]





In [1160]:
movies # 영화 정보 데이터

Unnamed: 0,movie_id,title,release_date,video_release_date,imdb_url,genre_unknown,Action,Adventure,Animation,Children,...,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,year,example_age
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,1995,1.0
1,2,GoldenEye (2011),01-Jan-2011,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,1,0,0,2011,0.359982
2,3,Four Rooms (2020),01-Jan-2020,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,1,0,0,2020,0.0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,1995,1.0
4,5,Copycat (1998),01-Jan-1998,,http://us.imdb.com/M/title-exact?Copycat%20(1998),0,0,0,0,0,...,0,0,0,0,0,1,0,0,1998,0.879969
5,6,Shanghai Triad (Yao a yao yao dao waipo qiao) ...,01-Jan-1995,,http://us.imdb.com/Title?Yao+a+yao+yao+dao+wai...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1995,1.0
6,7,Twelve Monkeys (2018),01-Jan-2018,,http://us.imdb.com/M/title-exact?Twelve%20Monk...,0,0,0,0,0,...,0,0,0,0,1,0,0,0,2018,0.079947
7,8,Babe (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Babe%20(1995),0,0,0,0,1,...,0,0,0,0,0,0,0,0,1995,1.0
8,9,Dead Man Walking (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Dead%20Man%20...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1995,1.0
9,10,Richard III (1995),22-Jan-1996,,http://us.imdb.com/M/title-exact?Richard%20III...,0,0,0,0,0,...,0,0,0,0,0,0,1,0,1996,0.957726


In [1161]:
user_movie_list

Unnamed: 0,user,occupation,sex,movie_id,watch_hist_time,search_hist,example_age,predict_labels
0,0,0,M,"[10, 3, 9, 5]","[0.024163568773234202, 0.0, 0.2131350681536555...",[0.0],"[0.9577264264593144, 0.0, 1.0, 0.8799693352316...",1
1,1,1,F,"[2, 7]","[0.007390688617454417, 0.1644538856434767]","[0.0, 1.0]","[0.3599824772752163, 0.07994743182564888]",7
2,2,2,M,"[3, 8]","[0.17919100725792175, 0.2091520623119136]",[0.5],"[0.0, 1.0]",3
3,3,0,M,[8],[1.0],[1.0],[1.0],6
4,4,1,F,[4],[0.017923526287838557],[],[1.0],3
5,5,3,M,[6],[0.1614887590724022],[],[1.0],3
6,6,4,M,"[2, 8]","[0.4109134360063728, 0.17919100725792175]",[],"[0.3599824772752163, 1.0]",4
7,7,4,M,"[7, 10, 1]","[0.20477075588599752, 0.1644538856434767, 0.03...",[0.0],"[0.07994743182564888, 0.9577264264593144, 1.0]",4
8,8,5,M,[1],[0.4109134360063728],[],[1.0],0
9,9,6,M,[1],[0.20069923880332802],[],[1.0],0


In [1162]:
train_data # train data

Unnamed: 0,user,occupation,sex,movie_id,watch_hist_time,search_hist,example_age,predict_labels
1,1,1,F,"[2, 7]","[0.007390688617454417, 0.1644538856434767]","[0.0, 1.0]","[0.3599824772752163, 0.07994743182564888]",7
2,2,2,M,"[3, 8]","[0.17919100725792175, 0.2091520623119136]",[0.5],"[0.0, 1.0]",3
3,3,0,M,[8],[1.0],[1.0],[1.0],6
4,4,1,F,[4],[0.017923526287838557],[],[1.0],3
5,5,3,M,[6],[0.1614887590724022],[],[1.0],3


In [1163]:
test_data # test data

Unnamed: 0,user,occupation,sex,movie_id,watch_hist_time,search_hist,example_age,predict_labels
6,6,4,M,"[2, 8]","[0.4109134360063728, 0.17919100725792175]",[],"[0.3599824772752163, 1.0]",4
7,7,4,M,"[7, 10, 1]","[0.20477075588599752, 0.1644538856434767, 0.03...",[0.0],"[0.07994743182564888, 0.9577264264593144, 1.0]",4
8,8,5,M,[1],[0.4109134360063728],[],[1.0],0
9,9,6,M,[1],[0.20069923880332802],[],[1.0],0


In [1164]:
EMBEDDING_DIMS = 16
DENSE_UNITS = 64
DROPOUT_PCT = 0.0
ALPHA = 0.0
NUM_CLASSES=data["movie"].max() + 2
LEARNING_RATE = 0.003

In [1165]:
import tensorflow as tf
class MaskedEmbeddingsAggregatorLayer(tf.keras.layers.Layer):
    def __init__(self, agg_mode='sum', **kwargs):
        super(MaskedEmbeddingsAggregatorLayer, self).__init__(**kwargs)

        if agg_mode not in ['sum', 'mean']:
            raise NotImplementedError('mode {} not implemented!'.format(agg_mode))
        self.agg_mode = agg_mode
    
    @tf.function
    def call(self, inputs, mask=None):
        masked_embeddings = tf.ragged.boolean_mask(inputs, mask)
        if self.agg_mode == 'sum':
            aggregated =  tf.reduce_sum(masked_embeddings, axis=1)
        elif self.agg_mode == 'mean':
            aggregated = tf.reduce_mean(masked_embeddings, axis=1)
        return aggregated
    
    def get_config(self):
        # this is used when loading a saved model that uses a custom layer
        return {'agg_mode': self.agg_mode}
    
class L2NormLayer(tf.keras.layers.Layer):
    def __init__(self, **kwargs):
        super(L2NormLayer, self).__init__(**kwargs)
    
    @tf.function
    def call(self, inputs, mask=None):
        if mask is not None:
            inputs = tf.ragged.boolean_mask(inputs, mask).to_tensor()
        return tf.math.l2_normalize(inputs, axis=-1)

    def compute_mask(self, inputs, mask):
        return mask

In [1166]:
#---inputs
import tensorflow as tf
import datetime
import os
input_watch_hist = tf.keras.Input(shape=(None, ), name='watch_hist')
input_watch_hist_time = tf.keras.layers.Input(shape=(None,), name='watch_hist_time')
input_search_hist = tf.keras.layers.Input(shape=(None,), name='search_hist')
input_example_age = tf.keras.Input(shape=(None, ), name='example_age')
input_occupation = tf.keras.Input(shape=(None, ), name='occupation')


#--- layers
features_embedding_layer = tf.keras.layers.Embedding(input_dim=NUM_CLASSES, output_dim=EMBEDDING_DIMS, 
                                            mask_zero=True, trainable=True, name='features_embeddings')
labels_embedding_layer = tf.keras.layers.Embedding(input_dim=NUM_CLASSES, output_dim=EMBEDDING_DIMS, 
                                            mask_zero=True, trainable=True, name='labels_embeddings')

avg_embeddings = MaskedEmbeddingsAggregatorLayer(agg_mode='mean', name='aggregate_embeddings')

dense_1 = tf.keras.layers.Dense(units=DENSE_UNITS, name='dense_1')
dense_2 = tf.keras.layers.Dense(units=DENSE_UNITS, name='dense_2')
dense_3 = tf.keras.layers.Dense(units=DENSE_UNITS, name='dense_3')
l2_norm_1 = L2NormLayer(name='l2_norm_1')

dense_output = tf.keras.layers.Dense(NUM_CLASSES, activation=tf.nn.softmax, name='dense_output')

#--- features
features_embeddings = features_embedding_layer(input_watch_hist)
l2_norm_features = l2_norm_1(features_embeddings)
avg_features = avg_embeddings(l2_norm_features)

labels_watch_embeddings = labels_embedding_layer(input_watch_hist_time)
l2_norm_watched = l2_norm_1(labels_watch_embeddings)
avg_watched = avg_embeddings(l2_norm_watched)

labels_search_embeddings = labels_embedding_layer(input_search_hist)
l2_norm_searched = l2_norm_1(labels_search_embeddings)
avg_searched = avg_embeddings(l2_norm_searched)

labels_example_age_embeddings = labels_embedding_layer(input_example_age)
l2_norm_example_age = l2_norm_1(labels_example_age_embeddings)
avg_example_age = avg_embeddings(l2_norm_example_age)

labels_occupation_embeddings = labels_embedding_layer(input_occupation)
l2_norm_occupation = l2_norm_1(labels_occupation_embeddings)
avg__occupation = avg_embeddings(l2_norm_occupation)


print(avg_features)
print(avg_watched)
print(avg_searched)
print(avg_example_age)
print(input_occupation)

# 임베딩 벡터들 연결
concat_inputs = tf.keras.layers.Concatenate(axis=1)([avg_features,
                                                     avg_watched,
                                                     avg_searched,
                                                     avg_example_age,
#                                                      avg__occupation
                                                     ])
# Dense Layers
dense_1_features = dense_1(concat_inputs)
dense_1_relu = tf.keras.layers.ReLU(name='dense_1_relu')(dense_1_features)
dense_1_batch_norm = tf.keras.layers.BatchNormalization(name='dense_1_batch_norm')(dense_1_relu)

dense_2_features = dense_2(dense_1_relu)
dense_2_relu = tf.keras.layers.ReLU(name='dense_2_relu')(dense_2_features)
# dense_2_batch_norm = tf.keras.layers.BatchNormalization(name='dense_2_batch_norm')(dense_2_relu)

dense_3_features = dense_3(dense_2_relu)
dense_3_relu = tf.keras.layers.ReLU(name='dense_3_relu')(dense_3_features)
dense_3_batch_norm = tf.keras.layers.BatchNormalization(name='dense_3_batch_norm')(dense_3_relu)
outputs = dense_output(dense_3_batch_norm)

#Optimizer
optimiser = tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE)

#--- prep model
model = tf.keras.models.Model(
    inputs=[input_watch_hist, 
            input_watch_hist_time, 
            input_search_hist,
            input_example_age,
#             input_occupation,
            ],
    outputs=[outputs]
)
logdir = os.path.join("logs", datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))
tensorboard_callback = tf.keras.callbacks.TensorBoard(logdir, histogram_freq=1)
model.compile(optimizer=optimiser, loss='sparse_categorical_crossentropy', metrics=['acc'])

model.summary()

Tensor("aggregate_embeddings/PartitionedCall_298:0", shape=(None, 16), dtype=float32)
Tensor("aggregate_embeddings/PartitionedCall_299:0", shape=(None, 16), dtype=float32)
Tensor("aggregate_embeddings/PartitionedCall_300:0", shape=(None, 16), dtype=float32)
Tensor("aggregate_embeddings/PartitionedCall_301:0", shape=(None, 16), dtype=float32)
Tensor("occupation_36:0", shape=(None, None), dtype=float32)
Model: "functional_124"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
watch_hist (InputLayer)         [(None, None)]       0                                            
__________________________________________________________________________________________________
watch_hist_time (InputLayer)    [(None, None)]       0                                            
____________________________________________________________________________

In [1167]:
history = model.fit([tf.keras.preprocessing.sequence.pad_sequences(train_data['movie_id']),
           tf.keras.preprocessing.sequence.pad_sequences(train_data['watch_hist_time'], dtype=float),
           tf.keras.preprocessing.sequence.pad_sequences(train_data['search_hist'], dtype=float) + 1e-10,
           tf.keras.preprocessing.sequence.pad_sequences(train_data['example_age'], dtype=float),
#            tf.keras.preprocessing.sequence.pad_sequences(train_data['occupation'], dtype=float),
           ],train_data['predict_labels'].values,
           steps_per_epoch=1, epochs=50)




Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [1168]:
model.save("candidate_generation.h5")

In [1169]:
pred = model.predict([tf.keras.preprocessing.sequence.pad_sequences(test_data['movie_id']),
           tf.keras.preprocessing.sequence.pad_sequences(test_data['watch_hist_time'], dtype=float),
           tf.keras.preprocessing.sequence.pad_sequences(test_data['search_hist'], dtype=float) + 1e-10,
           tf.keras.preprocessing.sequence.pad_sequences(test_data['example_age'], dtype=float)
           ])



In [1170]:
pred

array([[0.03323323, 0.04705915, 0.03846468, 0.6008853 , 0.04078529,
        0.03653717, 0.04297908, 0.03952277, 0.04089409, 0.03757692,
        0.0420624 ],
       [0.04467057, 0.07143843, 0.05111337, 0.39572582, 0.06613615,
        0.04641057, 0.03548903, 0.1128524 , 0.06010652, 0.0472837 ,
        0.0687734 ],
       [0.03265506, 0.04456531, 0.03621548, 0.6071133 , 0.04086963,
        0.03534368, 0.04314659, 0.03943169, 0.04152324, 0.03550074,
        0.04363524],
       [0.03265506, 0.04456531, 0.03621548, 0.6071133 , 0.04086963,
        0.03534368, 0.04314659, 0.03943169, 0.04152324, 0.03550074,
        0.04363524]], dtype=float32)

In [1171]:
# candidate generation: 
###### 각 user당 top-7개의 추천 데이터를 뽑아낸다.
N = 6
k = np.sort((-pred).argsort()[:,:N])
print(k)
k = k.flatten()
k[k>data["movie"].max()]=0
k = np.unique(k)


[[ 1  3  4  6  8 10]
 [ 1  3  4  7  8 10]
 [ 1  3  4  6  8 10]
 [ 1  3  4  6  8 10]]


In [1172]:
k

array([0, 1, 3, 4, 6, 7, 8])

In [1173]:
### ranking

In [1174]:
# load candidate_generation 
model = tf.keras.models.load_model(
    'candidate_generation.h5',
    custom_objects={
        'L2NormLayer':L2NormLayer,
        'MaskedEmbeddingsAggregatorLayer':MaskedEmbeddingsAggregatorLayer
    }
)

In [1175]:
movie_data = movies.set_index(['movie_id']).sort_index()
movie_data = movie_data.loc[k+1]
movie_data["title_d"] = movie_data["title"].map(title2title_encoded)

ratings_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
ratings = pd.read_csv(
    'data/u.data', sep='\t', names=ratings_cols, encoding='latin-1')

get_genres(movie_data, genre_cols)

new_data = movie_data.merge(ratings, on='movie_id') # rating 추가

genre_occurences = new_data[genre_cols].sum().to_dict()
genres_encoded = {x: i for i, x in enumerate(genre_cols)}


new_data = new_data[['movie_id', 'user_id', 'rating', 'unix_timestamp', 'all_genres', 'title_d']]
new_data['movie_type'] = np.where(new_data['rating'] >= 3, 'like', 'dislike') # 3보다 크면 like


genre_list = new_data.groupby(['user_id'])['all_genres'].unique().apply(list).reset_index()
genre_list['all_genres']=genre_list['all_genres'].apply(lambda x: list(set(','.join(x))) ) # 중복제거
genre_list['all_genres']=genre_list['all_genres'].apply(lambda x:[ x for x in x if x.isdigit() ])

new_data = normalize_col(new_data, 'unix_timestamp')
timestamp_list = new_data.groupby(['user_id'])['unix_timestamp'].unique().apply(list).reset_index()

title_list = new_data.groupby(['user_id'])['title_d'].apply(list).reset_index()
print(title_list)
dataset = movie_list.pivot(index='user_id', columns='movie_type', values='movie_id').reset_index()
dataset.fillna(new_data["movie_id"].max()+1, inplace=True)

dataset['like'] =dataset['like'].apply(lambda x: x if type(x) is list else [])
dataset['dislike'] =dataset['dislike'].apply(lambda x: x if type(x) is list else [])

dataset = pd.merge(dataset, title_list, how='left')
dataset = pd.merge(dataset, genre_list, how='left')
dataset = pd.merge(dataset, timestamp_list, how='left')

dataset['predict_labels'] = dataset['like'].apply(lambda x: int(random.uniform(1,new_data["movie_id"].max()))) #label을 마지막 값으로..

dataset['like']=dataset['like'].apply(lambda x: [new_data["movie_id"].max()+1] if x == [] else x)
dataset['dislike']=dataset['dislike'].apply(lambda x: [new_data["movie_id"].max()+1] if x == [] else x)
train_data=dataset[(dataset.user_id >= 1)&
                                  (dataset.user_id <= 5)]
test_data=dataset[(dataset.user_id >= 6)&
                                  (dataset.user_id <= 9)]

   user_id title_d
0        1  [3, 2]
1        2  [4, 5]
2        3     [6]
3        4     [6]
4        5     [7]
5        7  [4, 6]
6        8  [9, 5]
7        9     [9]
8       10     [9]


In [1176]:
dataset

Unnamed: 0,user_id,dislike,like,title_d,all_genres,unix_timestamp,predict_labels
0,1,[3],[5],"[3, 2]","[6, 1, 8]","[0.4823272155954916, 0.3272099816004719]",1
1,2,[10],"[2, 7]","[4, 5]","[5, 6, 2, 1, 8]","[1.0, 0.9943047712588169]",1
2,3,[10],[3],[6],"[4, 5, 8]",[0.6559625711672339],8
3,5,[4],[10],[7],"[1, 5, 8]",[0.2124407399567807],1
4,7,[10],[2],"[4, 6]","[4, 5, 6, 2, 1, 8]","[0.0, 0.21964208637201138]",8
5,8,[10],"[1, 7]","[9, 5]","[4, 5, 3, 1, 8]","[0.5193146561727731, 0.7694933118780622]",4
6,9,[1],[10],[9],"[4, 5, 3]",[0.6606054505013651],8
7,10,[1],[10],[9],"[4, 5, 3]",[0.29119399936282336],7


In [1177]:
train_data

Unnamed: 0,user_id,dislike,like,title_d,all_genres,unix_timestamp,predict_labels
0,1,[3],[5],"[3, 2]","[6, 1, 8]","[0.4823272155954916, 0.3272099816004719]",1
1,2,[10],"[2, 7]","[4, 5]","[5, 6, 2, 1, 8]","[1.0, 0.9943047712588169]",1
2,3,[10],[3],[6],"[4, 5, 8]",[0.6559625711672339],8
3,5,[4],[10],[7],"[1, 5, 8]",[0.2124407399567807],1


In [1178]:
test_data

Unnamed: 0,user_id,dislike,like,title_d,all_genres,unix_timestamp,predict_labels
4,7,[10],[2],"[4, 6]","[4, 5, 6, 2, 1, 8]","[0.0, 0.21964208637201138]",8
5,8,[10],"[1, 7]","[9, 5]","[4, 5, 3, 1, 8]","[0.5193146561727731, 0.7694933118780622]",4
6,9,[1],[10],[9],"[4, 5, 3]",[0.6606054505013651],8


In [1179]:
new_data["movie_id"].max() + 3

12

In [1180]:
EMBEDDING_DIMS = 16
DENSE_UNITS = 64
DROPOUT_PCT = 0.0
ALPHA = 0.0
NUM_CLASSES=new_data["movie_id"].max() + 3
LEARNING_RATE = 0.003

In [1181]:
#---inputs
import tensorflow as tf
import datetime
import os
input_title = tf.keras.Input(shape=(None, ), name='movie_name')
inp_video_liked = tf.keras.layers.Input(shape=(None,), name='like')
inp_video_disliked = tf.keras.layers.Input(shape=(None,), name='dislike')
input_genre = tf.keras.Input(shape=(None, ), name='genre')
input_timestamp = tf.keras.Input(shape=(None, ), name='timestamp')


#--- layers
features_embedding_layer = tf.keras.layers.Embedding(input_dim=NUM_CLASSES, output_dim=EMBEDDING_DIMS, 
                                            mask_zero=True, trainable=True, name='features_embeddings')
labels_embedding_layer = tf.keras.layers.Embedding(input_dim=NUM_CLASSES, output_dim=EMBEDDING_DIMS, 
                                            mask_zero=True, trainable=True, name='labels_embeddings')

avg_embeddings = MaskedEmbeddingsAggregatorLayer(agg_mode='mean', name='aggregate_embeddings')

dense_1 = tf.keras.layers.Dense(units=DENSE_UNITS, name='dense_1')
dense_2 = tf.keras.layers.Dense(units=DENSE_UNITS, name='dense_2')
dense_3 = tf.keras.layers.Dense(units=DENSE_UNITS, name='dense_3')
l2_norm_1 = L2NormLayer(name='l2_norm_1')

dense_output = tf.keras.layers.Dense(NUM_CLASSES, activation=tf.nn.softmax, name='dense_output')

#--- features
features_embeddings = features_embedding_layer(input_title)
l2_norm_features = l2_norm_1(features_embeddings)
avg_features = avg_embeddings(l2_norm_features)

labels_liked_embeddings = labels_embedding_layer(inp_video_liked)
l2_norm_liked = l2_norm_1(labels_liked_embeddings)
avg_liked = avg_embeddings(l2_norm_liked)

labels_disliked_embeddings = labels_embedding_layer(inp_video_disliked)
l2_norm_disliked = l2_norm_1(labels_disliked_embeddings)
avg_disliked = avg_embeddings(l2_norm_disliked)

labels_genre_embeddings = labels_embedding_layer(input_genre)
l2_norm_genre = l2_norm_1(labels_genre_embeddings)
avg_genre = avg_embeddings(l2_norm_genre)

labels_timestamp_embeddings = labels_embedding_layer(input_timestamp)
l2_norm_timestamp = l2_norm_1(labels_timestamp_embeddings)
avg_timestamp = avg_embeddings(l2_norm_timestamp)


# 임베딩 벡터들 연결
concat_inputs = tf.keras.layers.Concatenate(axis=1)([avg_features,
                                                     avg_liked,
                                                     avg_disliked,
                                                     avg_genre,
                                                     avg_timestamp
                                                     ])
# Dense Layers
dense_1_features = dense_1(concat_inputs)
dense_1_relu = tf.keras.layers.ReLU(name='dense_1_relu')(dense_1_features)
dense_1_batch_norm = tf.keras.layers.BatchNormalization(name='dense_1_batch_norm')(dense_1_relu)

dense_2_features = dense_2(dense_1_relu)
dense_2_relu = tf.keras.layers.ReLU(name='dense_2_relu')(dense_2_features)
# dense_2_batch_norm = tf.keras.layers.BatchNormalization(name='dense_2_batch_norm')(dense_2_relu)

dense_3_features = dense_3(dense_2_relu)
dense_3_relu = tf.keras.layers.ReLU(name='dense_3_relu')(dense_3_features)
dense_3_batch_norm = tf.keras.layers.BatchNormalization(name='dense_3_batch_norm')(dense_3_relu)
outputs = dense_output(dense_3_batch_norm)

#Optimizer
optimiser = tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE)

#--- prep model
model = tf.keras.models.Model(
    inputs=[input_title, 
            inp_video_liked, 
            inp_video_disliked,
            input_genre,
            input_timestamp,
            ],
    outputs=[outputs]
)
logdir = os.path.join("logs", datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))
tensorboard_callback = tf.keras.callbacks.TensorBoard(logdir, histogram_freq=1)
model.compile(optimizer=optimiser, loss='sparse_categorical_crossentropy', metrics=['acc'])

# model.summary()

In [1182]:
history = model.fit([tf.keras.preprocessing.sequence.pad_sequences(train_data['title_d']),
           tf.keras.preprocessing.sequence.pad_sequences(train_data['like']),
           tf.keras.preprocessing.sequence.pad_sequences(train_data['dislike']),
            tf.keras.preprocessing.sequence.pad_sequences(train_data['all_genres']),
            tf.keras.preprocessing.sequence.pad_sequences(train_data['unix_timestamp'], dtype=float) + 1e-10,
           ],train_data['predict_labels'].values,
           steps_per_epoch=1, epochs=100)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


In [1183]:
results = model.evaluate([tf.keras.preprocessing.sequence.pad_sequences(test_data['title_d']),
           tf.keras.preprocessing.sequence.pad_sequences(test_data['like']),
           tf.keras.preprocessing.sequence.pad_sequences(test_data['dislike']),
            tf.keras.preprocessing.sequence.pad_sequences(test_data['all_genres']),
            tf.keras.preprocessing.sequence.pad_sequences(test_data['unix_timestamp'], dtype=float) + 1e-10,
           ], test_data['predict_labels'].values, verbose=1
        )



In [1184]:
pred = model.predict([tf.keras.preprocessing.sequence.pad_sequences(test_data['title_d']),
           tf.keras.preprocessing.sequence.pad_sequences(test_data['like']),
           tf.keras.preprocessing.sequence.pad_sequences(test_data['dislike']),
            tf.keras.preprocessing.sequence.pad_sequences(test_data['all_genres']),
            tf.keras.preprocessing.sequence.pad_sequences(test_data['unix_timestamp'], dtype=float) + 1e-10
           ])



In [1185]:
pred

array([[0.07525565, 0.02337966, 0.07705533, 0.09420496, 0.06743241,
        0.07388986, 0.0750409 , 0.05455001, 0.2532599 , 0.0546343 ,
        0.08029357, 0.07100342],
       [0.09006105, 0.03391625, 0.07724374, 0.10065082, 0.06602372,
        0.07904252, 0.07099923, 0.05412571, 0.21223558, 0.05500762,
        0.09367954, 0.06701417],
       [0.06919657, 0.18517745, 0.07539512, 0.08832996, 0.06528492,
        0.07524402, 0.07267001, 0.05677502, 0.11355842, 0.05129936,
        0.07687893, 0.07019021]], dtype=float32)

In [1190]:
# ranking
###### 각 user당 top-3개의 추천 데이터를 뽑아낸다.
N = 3
k = np.sort((-pred).argsort()[:,:N])
k[k>new_data["movie_id"].max()]=0
print(k)

[[3 8 0]
 [3 8 0]
 [1 3 8]]
