In [103]:
import random
import math
import numpy as np
import pandas as pd
import json
import tensorflow as tf
import tensorboard
from tensorflow import keras
from tensorflow.keras import Sequential, layers, optimizers, losses
from config import Config
import re

In [2]:
path = 'data/ml-1m/'
movies_col = ['MovieID', 'Title', 'Genres']
users_col = ['UserID', 'Gender', 'Age', 'Occupation', 'Zip-code']
ratings_col = ['UserID', 'MovieID', 'Rating', 'Timestamp']
movies = pd.read_csv(f'{path}movies.dat', sep="::", names=movies_col)
users = pd.read_csv(f'{path}users.dat', sep="::", names=users_col)
ratings = pd.read_csv(f'{path}ratings.dat', sep="::", names=ratings_col)



In [28]:
movies.head()

Unnamed: 0,MovieID,Title,Genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [29]:
users.head()

Unnamed: 0,UserID,Gender,Age,Occupation,Zip-code
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455


In [98]:
set(users['Age'].to_list())

{1, 18, 25, 35, 45, 50, 56}

In [97]:
users['Age'].nunique()

7

In [30]:
ratings.head()

Unnamed: 0,UserID,MovieID,Rating,Timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [4]:
user_id_num = users['UserID'].nunique()


In [51]:
user_id_embedding = keras.Sequential([
            layers.Embedding(user_id_num, Config.user_id_embedding_output_dim,  name="embedding"),
            layers.Dense(32, kernel_initializer='he_normal', name='use_id_fc'),
        ], name='user_id_embedding')

In [40]:
user_id_embedding(tf.expand_dims(users['UserID'][:3], axis=0))

<tf.Tensor: shape=(1, 3, 32), dtype=float32, numpy=
array([[[ 2.5832716e-02,  2.8851580e-02, -3.6324035e-02, -2.7316391e-02,
         -6.4673908e-03, -2.3645306e-02,  2.3282003e-02, -6.3903257e-04,
         -4.7676839e-02,  5.4590330e-03,  2.1130171e-02, -4.7348537e-02,
          4.3188345e-02, -4.9422313e-02,  1.1717565e-03, -3.6031723e-02,
          6.0714781e-05,  1.4065515e-02, -5.9670694e-03,  2.4082575e-02,
         -1.8593132e-02, -2.1727145e-02, -1.1332046e-02,  2.4214517e-02,
         -3.1360015e-03,  2.7307991e-02, -1.6783796e-02, -3.6213614e-02,
          6.6113696e-03, -4.3266665e-02, -2.9778052e-02,  9.3796365e-03],
        [-2.6907885e-02,  3.5011936e-02,  2.5391761e-02,  1.8592153e-02,
         -2.0112384e-02,  1.8152464e-02, -1.0144889e-02,  2.2828605e-02,
         -2.6873279e-02, -3.8172018e-02,  4.7138784e-02,  4.7662329e-02,
         -3.8386177e-02,  3.4824047e-02, -2.8643167e-02,  8.6031109e-04,
         -3.5016011e-02, -4.2482805e-02,  3.2668710e-03,  8.6185932e-03

In [3]:
genres = movies['Genres'].apply(lambda x: x.split('|'))

In [12]:
genres = movies['Genres'].apply(lambda x: x.split('|')).to_list()
genres = list(set(list(np.hstack(genres))))
genres_map = dict(zip(genres, [i + 1 for i in range(len(genres))]))
genres = movies['Genres'].apply(lambda x: x.split('|'))\
                .apply(lambda x: [genres_map[i] for i in x])

In [25]:
genres_map

{'Mystery': 1,
 'Fantasy': 2,
 'Sci-Fi': 3,
 'Documentary': 4,
 'Crime': 5,
 'Thriller': 6,
 'Action': 7,
 'War': 8,
 'Western': 9,
 "Children's": 10,
 'Comedy': 11,
 'Horror': 12,
 'Romance': 13,
 'Film-Noir': 14,
 'Musical': 15,
 'Adventure': 16,
 'Drama': 17,
 'Animation': 18}

In [25]:
movie_categories_embedding = layers.Embedding(19, Config.movie_categories_output_num, mask_zero=True, name='movie_categories_embedding')

In [21]:
inputs = tf.keras.preprocessing.sequence.pad_sequences(
    genres, padding="post"
)

In [27]:
masked_output = movie_categories_embedding(inputs)

In [54]:
mask = tf.cast(masked_output._keras_mask, dtype=tf.float32)

In [55]:
mask

<tf.Tensor: shape=(3883, 6), dtype=float32, numpy=
array([[1., 1., 1., 0., 0., 0.],
       [1., 1., 1., 0., 0., 0.],
       [1., 1., 0., 0., 0., 0.],
       ...,
       [1., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0.],
       [1., 1., 0., 0., 0., 0.]], dtype=float32)>

In [59]:
masked_output = masked_output * tf.expand_dims(mask, axis=2)

In [61]:
masked_output = tf.reduce_sum(masked_output, axis=1, keepdims=True)

In [62]:
movie_categories_fc = layers.Dense(Config.embedding_dim, kernel_initializer='he_normal', name='movie_categories_fc')


In [64]:
movie_categories = movie_categories_fc(masked_output)

In [81]:
movie_id_embedding = keras.Sequential([
            layers.Embedding(3953, Config.move_id_embedding_output_dim, input_length=1, name='embedding'),
            layers.Dense(Config.embedding_dim, kernel_initializer='he_normal', name='movie_id_fc')
        ], name='movie_id_embedding')

In [84]:
movie_id = movie_id_embedding(tf.expand_dims(movies['MovieID'].to_list(), axis=1))

In [85]:
layers.concatenate([movie_id, movie_categories])

<tf.Tensor: shape=(3883, 1, 64), dtype=float32, numpy=
array([[[ 0.0201471 , -0.09114599,  0.0557896 , ...,  0.05699036,
         -0.01148556, -0.0306964 ]],

       [[-0.06619983, -0.02614153, -0.04566379, ...,  0.0535129 ,
         -0.00429837, -0.08115232]],

       [[-0.01030343, -0.00730711, -0.08777495, ...,  0.08364037,
          0.0315102 ,  0.00064117]],

       ...,

       [[ 0.04869591,  0.01327367,  0.01726161, ..., -0.00299545,
         -0.02317938,  0.01076081]],

       [[ 0.00958184, -0.07442987, -0.03849153, ..., -0.00299545,
         -0.02317938,  0.01076081]],

       [[ 0.02482926, -0.02498857, -0.00850435, ..., -0.04169276,
         -0.07790941,  0.06577192]]], dtype=float32)>

In [106]:
title = movies['Title'].apply(lambda x: re.sub(r"[-()\"/@;:<>{}`+=~|.,]", "", x).split()).to_list()
title = list(set(list(np.hstack(title))))
title_map = dict(zip(title, [i + 1 for i in range(len(title))]))
title_input = movies['Title'].apply(lambda x: re.sub(r"[-()\"/@;:<>{}`+=~|.,]", "", x).split())\
                .apply(lambda x: [title_map[i] for i in x])

In [107]:
len(title)

4749

In [99]:
from TextCNN import TextCNN

In [113]:
textCNN = TextCNN(15, 4750, 64, 32, 'relu')
model = textCNN.get_model()

In [117]:
inputs = tf.keras.preprocessing.sequence.pad_sequences(
    title_input, padding="post", maxlen=16
)

In [118]:
model(inputs)

<tf.Tensor: shape=(3883, 32), dtype=float32, numpy=
array([[0.        , 0.03447237, 0.        , ..., 0.0089156 , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.00912313, 0.00057399,
        0.00920642],
       [0.        , 0.        , 0.00089002, ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.0130721 , 0.        , ..., 0.        , 0.        ,
        0.00730382],
       [0.        , 0.        , 0.01939505, ..., 0.00673559, 0.        ,
        0.        ],
       [0.        , 0.00697018, 0.        , ..., 0.01283735, 0.        ,
        0.        ]], dtype=float32)>

In [119]:
inputs.shape

(3883, 16)