# Deep Learning for Content-Based Filtering

In [1]:
import numpy as np
import numpy.ma as ma
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
import tabulate
pd.set_option("display.precision", 1)

In [2]:
movies = pd.read_csv('movies.csv')
ratings = pd.read_csv('ratings.csv')

# get Year and One-hot encoding Genres
movies['year'] = movies['title'].str.extract(r'\((\d{4})\)', expand=False).fillna(1990).astype(int)
genre_dummies = movies['genres'].str.get_dummies(sep='|')
movie_cols = ['year'] + list(genre_dummies.columns)

# features into movies
movie_features_df = pd.concat([movies[['movieId', 'year']], genre_dummies], axis=1)

# 2. Xử lý User Features (Tính trung bình rating theo từng thể loại)
merged = pd.merge(ratings, movie_features_df, on='movieId')
genre_list = list(genre_dummies.columns)

weighted_genres = merged[genre_list].multiply(merged['rating'], axis=0)

user_sums = weighted_genres.groupby(merged['userId']).sum()
user_counts = merged.groupby('userId')[genre_list].sum()

# average
user_features_df = (user_sums / user_counts.replace(0, 1)).add_prefix('u_')

# Creare Final Train Data
final_train = ratings.merge(user_features_df, on='userId').merge(movie_features_df, on='movieId')

# 4. Tách Vector (X_u, X_m, y)
y_train = final_train['rating'].values
X_u = final_train[user_features_df.columns].values
X_m = final_train[movie_cols].values

print(f"X_u shape: {X_u.shape} | X_m shape: {X_m.shape} | y shape: {y_train.shape}")

# X_m = [year, no genres listed, Action, Adventure, Animation, Children, Comedy, Crime, Documentary, ...]
# X_u = [u_(no genres listed), u_Action, u_Adventure, u_Animation, u_Children, u_Comedy, u_Crime, u_Documentary, ...] u_Adventure: Điểm trung bình user chấm cho phim Adventure.
# y_train : column rating in file 'ratings.csv'

scaler = MinMaxScaler()
y_scaled = scaler.fit_transform(y_train.reshape(-1, 1)) 
scaler_m = StandardScaler()
X_m_scaled = scaler_m.fit_transform(X_m) # year so big

X_u_train, X_u_temp, X_m_train, X_m_temp, y_train_new, y_temp = train_test_split(
    X_u, X_m_scaled, y_scaled,
    test_size=0.2, 
    random_state=42, 
    shuffle=True
)

X_u_val, X_u_test, X_m_val, X_m_test, y_val, y_test = train_test_split(
    X_u_temp, X_m_temp, y_temp,
    test_size=0.5, 
    random_state=42, 
    shuffle=True
)


X_u shape: (100836, 20) | X_m shape: (100836, 21) | y shape: (100836,)


$u \cdot v = |u| \times |v| \times \cos(\theta)$

l2_normalize -> $|u| = 1$, $|v| = 1$

$u \cdot v = 1 \times 1 \times \cos(\theta) = \cos(\theta)$

Nếu 2 vector cùng hướng ( user thích phim ) => output = 1

Nếu 2 vector ngược hướng ( user k thích phim ) => output = -1

Nếu 2 vector vuông góc ( không liên quan) => output = 0

In [3]:
num_outputs = 32
# [0-5], [-1,1]
tf.random.set_seed(1)
user_NN = tf.keras.models.Sequential([
    tf.keras.layers.Dense(units = 256, activation='relu'),
    tf.keras.layers.Dense(units = 128, activation='relu'),
    tf.keras.layers.Dense(units = num_outputs, activation='linear')
  
])

item_NN = tf.keras.models.Sequential([
    tf.keras.layers.Dense(units = 256, activation='relu'),
    tf.keras.layers.Dense(units = 128, activation='relu'),
    tf.keras.layers.Dense(units = num_outputs, activation='linear')
  
])

# create the user input and point to the base network
input_user = tf.keras.layers.Input(shape=(X_u.shape[1], ))
vu = user_NN(input_user)
vu = tf.keras.layers.Lambda(lambda x: tf.linalg.l2_normalize(x, axis=1))(vu)

# create the item input and point to the base network
input_item = tf.keras.layers.Input(shape=(X_m.shape[1], ))
vm = item_NN(input_item)
vm = tf.keras.layers.Lambda(lambda x: tf.linalg.l2_normalize(x, axis=1))(vm)

# compute the dot product of the two vectors vu and vm
output = tf.keras.layers.Dot(axes=1)([vu, vm])

# specify the inputs and output of the model
model = tf.keras.Model([input_user, input_item], output)

model.summary()




In [4]:
tf.random.set_seed(1)
cost_fn = tf.keras.losses.MeanSquaredError()
opt = keras.optimizers.Adam(learning_rate=0.01)
model.compile(optimizer=opt,
              loss=cost_fn)

In [5]:
tf.random.set_seed(1)
model.fit([X_u_train, X_m_train], y_train_new, epochs=30)

Epoch 1/30
[1m2521/2521[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 3ms/step - loss: 0.0477
Epoch 2/30
[1m2521/2521[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 3ms/step - loss: 0.0426
Epoch 3/30
[1m2521/2521[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 3ms/step - loss: 0.0417
Epoch 4/30
[1m2521/2521[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 3ms/step - loss: 0.0412
Epoch 5/30
[1m2521/2521[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 3ms/step - loss: 0.0408
Epoch 6/30
[1m2521/2521[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 3ms/step - loss: 0.0403
Epoch 7/30
[1m2521/2521[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 3ms/step - loss: 0.0399
Epoch 8/30
[1m2521/2521[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 3ms/step - loss: 0.0395
Epoch 9/30
[1m2521/2521[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 3ms/step - loss: 0.0392
Epoch 10/30
[1m2521/2521[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m

<keras.src.callbacks.history.History at 0x190a29d4050>

In [9]:
tf.random.set_seed(1)
model.fit([X_u_val, X_m_val], y_val, epochs=30)

Epoch 1/30
[1m316/316[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 0.0388
Epoch 2/30
[1m316/316[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 0.0373  
Epoch 3/30
[1m316/316[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 0.0366  
Epoch 4/30
[1m316/316[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 0.0359  
Epoch 5/30
[1m316/316[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 0.0353
Epoch 6/30
[1m316/316[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 0.0348  
Epoch 7/30
[1m316/316[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 0.0343
Epoch 8/30
[1m316/316[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 0.0339  
Epoch 9/30
[1m316/316[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - loss: 0.0336  
Epoch 10/30
[1m316/316[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3

<keras.src.callbacks.history.History at 0x190a29807d0>

In [10]:
tf.random.set_seed(1)
model.fit([X_u_test, X_m_test], y_test, epochs=30)

Epoch 1/30
[1m316/316[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - loss: 0.0413    
Epoch 2/30
[1m316/316[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 0.0381
Epoch 3/30
[1m316/316[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 0.0370
Epoch 4/30
[1m316/316[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 0.0362  
Epoch 5/30
[1m316/316[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 0.0355
Epoch 6/30
[1m316/316[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 0.0350
Epoch 7/30
[1m316/316[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 0.0346  
Epoch 8/30
[1m316/316[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - loss: 0.0341
Epoch 9/30
[1m316/316[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 0.0337
Epoch 10/30
[1m316/316[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/s

<keras.src.callbacks.history.History at 0x19098188a50>

It is not suitable for suggesting good movies, it only suggests movies with similar content. Next,
we will take advantage of the len(32) vector obtained from the above model to improve

A similarity measure is the squared distance between the two vectors $ \mathbf{v_m^{(k)}}$ and $\mathbf{v_m^{(i)}}$ :
$$\left\Vert \mathbf{v_m^{(k)}} - \mathbf{v_m^{(i)}}  \right\Vert^2 = \sum_{l=1}^{n}(v_{m_l}^{(k)} - v_{m_l}^{(i)})^2\tag{1}$$

In [11]:
def sq_dist(a,b):
    """
    Returns the squared distance between two vectors
    Args:
      a (ndarray (n,)): vector with n features
      b (ndarray (n,)): vector with n features
    Returns:
      d (float) : distance
    """
    distance = 0.0
    for i in range(len(a)) : 
        distance += (a[i] - b[i])**2
    return distance

In [12]:
input_item_m = tf.keras.layers.Input(shape=(X_m.shape[1], ))    # input layer
vm_m = item_NN(input_item_m)                                       # use the trained item_NN
vm_m = tf.keras.layers.Lambda(lambda x: tf.linalg.l2_normalize(x, axis=1))(vm_m) # incorporate normalization as was done in the original model
model_m = tf.keras.Model(input_item_m, vm_m)                                
model_m.summary()

In [14]:
count = 5
movies_subset = movies.iloc[:count].copy()

raw_features = movie_features_df.iloc[:count].drop('movieId', axis=1).values
subset_features_scaled = scaler_m.transform(raw_features)
vms = model_m.predict(subset_features_scaled)

dim = len(vms)
dist_matrix = np.zeros((dim, dim))

for i in range(dim):
    for j in range(dim):
        dist_matrix[i, j] = sq_dist(vms[i], vms[j])

np.fill_diagonal(dist_matrix, np.inf) # mask itself

results = []
for i in range(dim):
    min_idx = np.argmin(dist_matrix[i])
    
    movie_src = movies_subset.iloc[i]
    movie_sim = movies_subset.iloc[min_idx]
    
    results.append({
        "Original Movie": movie_src['title'],
        "Genres": movie_src['genres'],
        "Recommended Movies": movie_sim['title'],
        "Genre Suggestion": movie_sim['genres'],
        "Distance": f"{dist_matrix[i, min_idx]:.4f}"
    })

# Create DataFrame
df_result = pd.DataFrame(results)

pd.set_option('display.max_colwidth', None) 
print(df_result)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 328ms/step
                       Original Movie  \
0                    Toy Story (1995)   
1                      Jumanji (1995)   
2             Grumpier Old Men (1995)   
3            Waiting to Exhale (1995)   
4  Father of the Bride Part II (1995)   

                                        Genres  \
0  Adventure|Animation|Children|Comedy|Fantasy   
1                   Adventure|Children|Fantasy   
2                               Comedy|Romance   
3                         Comedy|Drama|Romance   
4                                       Comedy   

                   Recommended Movies  \
0                      Jumanji (1995)   
1                    Toy Story (1995)   
2  Father of the Bride Part II (1995)   
3  Father of the Bride Part II (1995)   
4            Waiting to Exhale (1995)   

                              Genre Suggestion Distance  
0                   Adventure|Children|Fantasy   0.0895  
1  Adventure|Ani