In [75]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

In [76]:
movies = pd.read_csv('./data/content_item_train.csv')
users = pd.read_csv('./data/content_user_train.csv')
ratings = pd.read_csv('./data/content_y_train.csv', header=None)

movies = movies.drop(['movie id'], axis=1)
users = users.drop(['user id', 'rating count', 'rating ave'], axis=1)

In [77]:
movies.head()

Unnamed: 0,year,ave rating,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Horror,Mystery,Romance,Sci-Fi,Thriller
0,2003,3.961832,1,0,0,0,0,0,0,0,0,0,0,0,0,0
1,2003,3.961832,0,0,0,0,0,1,0,0,0,0,0,0,0,0
2,2003,3.961832,0,0,0,0,0,0,0,0,0,0,0,0,0,1
3,2004,3.761364,1,0,0,0,0,0,0,0,0,0,0,0,0,0
4,2004,3.761364,0,0,0,0,0,1,0,0,0,0,0,0,0,0


In [78]:
users.head()

Unnamed: 0,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Horror,Mystery,Romance,Sci-Fi,Thriller
0,3.9,5.0,0.0,0.0,4.0,4.2,4.0,4.0,0.0,3.0,4.0,0.0,4.25,3.875
1,3.9,5.0,0.0,0.0,4.0,4.2,4.0,4.0,0.0,3.0,4.0,0.0,4.25,3.875
2,3.9,5.0,0.0,0.0,4.0,4.2,4.0,4.0,0.0,3.0,4.0,0.0,4.25,3.875
3,3.9,5.0,0.0,0.0,4.0,4.2,4.0,4.0,0.0,3.0,4.0,0.0,4.25,3.875
4,3.9,5.0,0.0,0.0,4.0,4.2,4.0,4.0,0.0,3.0,4.0,0.0,4.25,3.875


In [79]:
ratings.head()

Unnamed: 0,0
0,4.0
1,4.0
2,4.0
3,3.5
4,3.5


In [91]:
movies = np.array(movies)
users = np.array(users)
ratings = np.array(ratings)

print('Movie:', movies.shape)
print('User:', users.shape)
print('Target:', ratings.shape)

Movie: (58187, 16)
User: (58187, 14)
Target: (58187, 1)


In [92]:
scaler_m = StandardScaler()
movie_train = scaler_m.fit_transform(movies)

scaler_u = StandardScaler()
user_train = scaler_u.fit_transform(users)

ratings_np = np.array(ratings)

In [82]:
movie_train, movie_test = train_test_split(movie_train, train_size=0.8, shuffle=True, random_state=1)
user_train, user_test = train_test_split(user_train, train_size=0.8, shuffle=True, random_state=1)
y_train, y_test = train_test_split(ratings_np, train_size=0.8, shuffle=True, random_state=1)

In [83]:
scaler_y = MinMaxScaler((-1, 1))
scaler_y.fit(y_train.reshape(-1, 1))

ynorm_train = scaler_y.transform(y_train.reshape(-1, 1))
ynorm_test = scaler_y.transform(y_test.reshape(-1, 1))

print(ynorm_train.shape, ynorm_test.shape)

(46549, 1) (11638, 1)


In [84]:
n_out = 32
tf.random.set_seed(1)

user_NN = Sequential([
    Dense(256, activation='relu'),
    Dense(128, activation='relu'),
    Dense(n_out, activation='linear'),
])

movie_NN = Sequential([
    Dense(256, activation='relu'),
    Dense(128, activation='relu'),
    Dense(n_out, activation='linear'),
])

input_u = tf.keras.layers.Input(shape=len(users.columns))
vu = user_NN(input_u)
vu = tf.linalg.l2_normalize(vu, axis = 1)

input_m = tf.keras.layers.Input(shape=len(movies.columns))
vm = movie_NN(input_m)
vm = tf.linalg.l2_normalize(vm, axis = 1)

output = tf.keras.layers.Dot(axes=1)([vu, vm])

model = tf.keras.models.Model([input_u, input_m], output)

model.summary()

Model: "model_3"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_9 (InputLayer)        [(None, 14)]                 0         []                            
                                                                                                  
 input_10 (InputLayer)       [(None, 16)]                 0         []                            
                                                                                                  
 sequential_8 (Sequential)   (None, 32)                   40864     ['input_9[0][0]']             
                                                                                                  
 sequential_9 (Sequential)   (None, 32)                   41376     ['input_10[0][0]']            
                                                                                            

In [85]:
tf.random.set_seed(1)
model.compile(
    loss=tf.keras.losses.MeanSquaredError(),
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-2)
)

In [86]:
tf.random.set_seed(1)
model.fit([user_train, movie_train], ynorm_train, epochs=20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.src.callbacks.History at 0x1e82abc7650>

In [108]:
item_vecs = pd.read_csv('./data/content_item_vecs.csv')
item_vecs_for_pred = item_vecs.drop(['movie id'], axis=1)
item_vecs_for_pred = np.array(item_vecs_for_pred)

item_vecs_for_pred.shape

(1883, 16)

In [159]:
new_user_id = 5000
new_rating_ave = 1.0
new_action = 1.0
new_adventure = 1
new_animation = 1
new_childrens = 1
new_comedy = 5
new_crime = 1
new_documentary = 1
new_drama = 1
new_fantasy = 1
new_horror = 1
new_mystery = 1
new_romance = 5
new_scifi = 5
new_thriller = 1
new_rating_count = 3

user_vec = np.array([[# new_user_id, new_rating_count, new_rating_ave,
                      new_action, new_adventure, new_animation, new_childrens,
                      new_comedy, new_crime, new_documentary,
                      new_drama, new_fantasy, new_horror, new_mystery,
                      new_romance, new_scifi, new_thriller]])

# Generate user vector to predict which movies they might like
user_vecs = np.tile(user_vec, (len(item_vecs), 1))

In [160]:
user_vecs_norm = scaler_u.transform(user_vecs)
item_vecs_norm = scaler_m.transform(item_vecs_for_pred)

preds = model.predict([user_vecs_norm, item_vecs_norm])
preds = scaler_y.inverse_transform(preds)



In [161]:
movie_list = pd.read_csv('./data/content_movie_list.csv')
movie_list['movie id'] = movie_list['movieId']

item_vecs['user rating'] = pd.DataFrame(preds)
merged_table = pd.merge(item_vecs, movie_list, on='movie id', how='inner')
merged_table.head()

merged_table_for_display = merged_table[['title', 'genres', 'user rating']]
merged_table_for_display.head()

Unnamed: 0,title,genres,user rating
0,Save the Last Dance (2001),Drama|Romance,4.201135
1,Save the Last Dance (2001),Drama|Romance,4.406857
2,"Wedding Planner, The (2001)",Comedy|Romance,4.323529
3,"Wedding Planner, The (2001)",Comedy|Romance,4.450091
4,Hannibal (2001),Horror|Thriller,3.45164


In [162]:
merged_table_for_display.sort_values(by='user rating', ascending=False)[:20]

Unnamed: 0,title,genres,user rating
1786,Focus (2015),Comedy|Crime|Drama|Romance,4.624592
1202,27 Dresses (2008),Comedy|Romance,4.624123
797,Mr. & Mrs. Smith (2005),Action|Adventure|Comedy|Romance,4.624099
1507,"Adjustment Bureau, The (2011)",Romance|Sci-Fi|Thriller,4.623348
1446,Date Night (2010),Action|Comedy|Romance,4.623312
411,Bruce Almighty (2003),Comedy|Drama|Fantasy|Romance,4.623124
633,"Terminal, The (2004)",Comedy|Drama|Romance,4.623082
518,21 Grams (2003),Crime|Drama|Mystery|Romance|Thriller,4.622993
1794,Trainwreck (2015),Comedy|Romance,4.622161
373,How to Lose a Guy in 10 Days (2003),Comedy|Romance,4.62178


We can see that the top movies are combinations of "Romance", "Comedy", and "Scifi", which are the genres this user prefers.