In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the input directory tree and print the full path of every file found
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/recsysmasterfds-2024/train.csv
/kaggle/input/recsysmasterfds-2024/kaggle_baseline.csv


# Let us first do some preprocessing

In [2]:
# Silence library warnings so they do not clutter the cell outputs

import warnings
warnings.simplefilter('ignore')


# Load the ratings. The CSV headers do not match what the columns hold
# (the 'release_date' column contains genres, and 'sex'/'age' are swapped),
# so each header is mapped to the name that fits its content.

column_fixes = {'release_date': 'genre', 'sex': 'age', 'age': 'sex'}
train = pd.read_csv('/kaggle/input/recsysmasterfds-2024/train.csv').rename(columns=column_fixes)
train.head()

Unnamed: 0,user_id,title,movie_id,rating,genre,age,sex
0,2592,Top Gun (1986),1101,4,Action|Romance,50,M
1,4318,12 Angry Men (1957),1203,4,Drama,25,M
2,2756,Robocop 2 (1990),2986,2,Action|Crime|Sci-Fi,18,M
3,1706,Modern Times (1936),3462,5,Comedy,25,M
4,4813,Milk Money (1994),276,3,Comedy|Romance,35,F


We tried two different approaches to make predictions: the first simply uses all the ratings in the DataFrame; the second filters the data to keep only the top 15% of ratings for each user. We tried both, and the approach using all ratings worked better. However, we decided to keep the top-15% filtering in the notebook for completeness, as a record of what we tried.

In [3]:
# Calculate the threshold rating for each user_id: the 85th percentile of the
# user's ratings, i.e. the cut-off at or above which roughly the top 15% lie

threshold_ratings = train.groupby('user_id')['rating'].quantile(0.85)
threshold_int = threshold_ratings.astype(int)
delta = (threshold_ratings - threshold_int) > 1e-5


# Update the threshold ratings to be integers (vectorised, no Python loop):
# round up when the interpolated quantile has a real fractional part, and snap
# to the integer itself otherwise. Snapping matters when the quantile lands
# within the 1e-5 tolerance *above* an integer: left as-is, ratings equal to
# that integer would fail the `>=` comparison and be dropped.

threshold_ratings = (threshold_int + delta.astype(int)).astype(float)
        
        
# Keep only the rows of one user's group that reach that user's threshold

def retain_top_ratings(group):
    """Return the rows of `group` rated at or above the group's threshold.

    `group.name` is used as the key into the module-level
    `threshold_ratings` series.
    """
    keep = group['rating'] >= threshold_ratings[group.name]
    return group[keep]


# Apply the filtering function to retain, for each user, only the ratings at or
# above that user's 85th-percentile threshold (i.e. roughly the top 15%)

top_ratings_df = train.groupby('user_id').apply(retain_top_ratings).reset_index(drop=True)

Now we process the columns to match the factorization machine format.

In [4]:
# Titles end with "(YYYY)": take the last six characters and strip the
# surrounding parentheses to get the release year as an integer

year_text = train['title'].str[-6:].str[1:5]
train['release_date'] = year_text.astype(int)


# Encode sex as a binary value: 'M' -> 1, 'F' -> 0

class_mapping = {'M': 1, 'F': 0}
train['sex'] = train['sex'].map(class_mapping)
train.head()

Unnamed: 0,user_id,title,movie_id,rating,genre,age,sex,release_date
0,2592,Top Gun (1986),1101,4,Action|Romance,50,1,1986
1,4318,12 Angry Men (1957),1203,4,Drama,25,1,1957
2,2756,Robocop 2 (1990),2986,2,Action|Crime|Sci-Fi,18,1,1990
3,1706,Modern Times (1936),3462,5,Comedy,25,1,1936
4,4813,Milk Money (1994),276,3,Comedy|Romance,35,0,1994


In [5]:
# Collect every individual genre that appears in the '|'-separated genre strings

unique_words = set().union(
    *(combo.split('|') for combo in train['genre'].unique())
)


# Report the genres and how many there are

print("Unique genres:", unique_words)
print("Number of genres:", len(unique_words))

Unique genres: {'Film-Noir', 'Action', 'Adventure', 'Thriller', 'Fantasy', "Children's", 'Drama', 'Comedy', 'Mystery', 'Horror', 'Animation', 'Musical', 'Documentary', 'Western', 'Crime', 'War', 'Sci-Fi', 'Romance'}
Number of genres: 18


# Factorization Machine approach

Taking the notebook given in the Virtual Campus as a baseline, we modified it to match this problem.

In [6]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

2024-03-31 16:22:36.245948: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-03-31 16:22:36.246193: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-03-31 16:22:36.480534: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [7]:
def text2seq(text, n_genre, maxlen=3):
    """Encode a multi-valued categorical feature as fixed-length integer sequences.

    A new tokenizer is fitted on `text` on every call, so the token -> integer
    mapping depends on the data passed in; two calls on different data are not
    guaranteed to produce compatible encodings.

    Args:
        text: iterable of strings whose individual values are separated by '|'.
        n_genre: forwarded to the tokenizer as `num_words`.
            NOTE(review): per the Keras docs only indices below `num_words`
            are kept, which is presumably why the caller passes the number of
            genres plus one - confirm.
        maxlen: length of every returned sequence. Defaults to 3, the value
            that was previously hard-coded and that the model's genre input
            and the `reshape(-1, 3)` calls elsewhere in the notebook expect.
            Shorter sequences are zero-padded at the end; longer ones are
            truncated by `pad_sequences` using its default truncation mode.

    Returns:
        The padded sequences produced by `pad_sequences`, one row per element
        of `text`, each of length `maxlen`.
    """
    tokenizer = Tokenizer(lower=True, split='|', filters='', num_words=n_genre)
    tokenizer.fit_on_texts(text)
    seq = tokenizer.texts_to_sequences(text)
    seq = pad_sequences(seq, maxlen=maxlen, padding='post')
    return seq

# 18 is the number of distinct genres printed earlier; text2seq receives
# n_genre + 1 as the tokenizer's vocabulary limit.
# NOTE(review): this overwrites train['genre'] with the encoded lists, so
# re-running the cell would tokenize the already-encoded lists rather than the
# original genre strings - reload `train` before re-running.
n_genre = 18
train['genre'] = text2seq(train.genre.values, n_genre=n_genre+1).tolist()
train.head()

Unnamed: 0,user_id,title,movie_id,rating,genre,age,sex,release_date
0,2592,Top Gun (1986),1101,4,"[3, 6, 0]",50,1,1986
1,4318,12 Angry Men (1957),1203,4,"[2, 0, 0]",25,1,1957
2,2756,Robocop 2 (1990),2986,2,"[3, 8, 5]",18,1,1990
3,1706,Modern Times (1936),3462,5,"[1, 0, 0]",25,1,1936
4,4813,Milk Money (1994),276,3,"[1, 6, 0]",35,0,1994


In [8]:
# Re-index users and movies with contiguous integer codes (0, 1, 2, ...) in
# order of first appearance, keeping both directions of each mapping

user_ids = train["user_id"].unique().tolist()
user2user_encoded = {raw_id: code for code, raw_id in enumerate(user_ids)}
userencoded2user = dict(enumerate(user_ids))

movie_ids = train["movie_id"].unique().tolist()
movie2movie_encoded = {raw_id: code for code, raw_id in enumerate(movie_ids)}
movie_encoded2movie = dict(enumerate(movie_ids))

train["user"] = train["user_id"].map(user2user_encoded)
train["movie"] = train["movie_id"].map(movie2movie_encoded)

num_users = len(user2user_encoded)
num_movies = len(movie_encoded2movie)

train["rating"] = train["rating"].astype(np.float32)


# Smallest and largest rating present in the data

min_rating = min(train["rating"])
max_rating = max(train["rating"])

summary = "Number of users: {}, Number of Movies: {}, Min rating: {}, Max rating: {}"
print(summary.format(num_users, num_movies, min_rating, max_rating))

Number of users: 6040, Number of Movies: 3680, Min rating: 1.0, Max rating: 5.0


In [9]:
# Show the process result

train.head()

Unnamed: 0,user_id,title,movie_id,rating,genre,age,sex,release_date,user,movie
0,2592,Top Gun (1986),1101,4.0,"[3, 6, 0]",50,1,1986,0,0
1,4318,12 Angry Men (1957),1203,4.0,"[2, 0, 0]",25,1,1957,1,1
2,2756,Robocop 2 (1990),2986,2.0,"[3, 8, 5]",18,1,1990,2,2
3,1706,Modern Times (1936),3462,5.0,"[1, 0, 0]",25,1,1936,3,3
4,4813,Milk Money (1994),276,3.0,"[1, 6, 0]",35,0,1994,4,4


We also tried several approaches to perform the train-test split, such as user or item based. Nonetheless, neither of them improved the performance, so we decided to perform the usual 80-20 split.

In [10]:
# Perform train-test split

from sklearn.model_selection import train_test_split

# 80/20 split of the rating rows: `trai` is the training part, `val` the
# held-out validation part. random_state is fixed so the split is repeatable.
trai, val = train_test_split(train, test_size=0.2, random_state=7)

In [11]:
import tensorflow.keras.backend as K
from tensorflow.keras.models import Model
from tensorflow.keras.layers import *

def define_input_layers():
    """Create the five Keras `Input` placeholders used by the FM model.

    Returns:
        list: the inputs in the order [age, user id, movie id, sex, genre].
        The first four take one value per sample; the genre input takes three.
    """
    # numerical feature
    age_input = Input((1,), name='input_age')

    # single-level categorical features
    uid_input = Input((1,), name='input_uid')
    mid_input = Input((1,), name='input_mid')
    sex_input = Input((1,), name='input_sex')

    # multi-level categorical feature: three genre slots per sample
    genre_input = Input((3,), name='input_genre')

    return [age_input, uid_input, mid_input, sex_input, genre_input]

# Module-level set of input placeholders, used by the stand-alone fm_1d / fm_2d
# calls in the following cells
inputs = define_input_layers()

In [12]:
def Tensor_Mean_Pooling(name = 'mean_pooling', keepdims = False):
    """Build a Lambda layer that averages its input tensor over axis 1.

    Args:
        name: name given to the Lambda layer.
        keepdims: forwarded to `K.mean`; when True the averaged axis is kept.

    Returns:
        A Keras `Lambda` layer.
    """
    # The `mask` callable always returns None.
    # NOTE(review): presumably this stops an incoming mask (e.g. from an
    # Embedding built with mask_zero=True) from propagating further; the mean
    # itself is a plain K.mean over every position on axis 1 - confirm that
    # including padded positions in the average is intended.
    return Lambda(lambda x: K.mean(x, axis = 1, keepdims=keepdims), name = name, mask=lambda inputs, mask: None)

def fm_1d(inputs, n_uid, n_mid, n_sex, n_genre):
    """Build the first-order (linear) term of the factorization machine.

    Each feature contributes one scalar per sample: a Dense(1) projection for
    age, a size-1 embedding for user, movie and sex, and the mean of the
    size-1 genre embeddings. The scalars are summed.

    Args:
        inputs: the five input tensors in the order
            [age, user id, movie id, sex, genre].
        n_uid, n_mid, n_sex, n_genre: vocabulary sizes; each embedding table
            is created with one extra row (`n + 1`).

    Returns:
        The output tensor of the 'fm_1d_output' Add layer.
    """
    
    # unpack: numeric age + user / movie / sex ids + genre slots
    age_input, uid_input, mid_input, sex_input, genre_input = inputs
    
    # one scalar weight per feature; every tensor ends up with shape (None, 1)
    num_dense_1d = [Dense(1, name = 'num_dense_1d_age')(age_input)]
    cat_sl_embed_1d = [Embedding(n_uid + 1, 1, name = 'cat_embed_1d_uid')(uid_input),
                       Embedding(n_mid + 1, 1, name = 'cat_embed_1d_mid')(mid_input),
                       Embedding(n_sex + 1, 1, name = 'cat_embed_1d_sex')(sex_input)]
    cat_ml_embed_1d = [Embedding(n_genre + 1, 1, mask_zero=True, name = 'cat_embed_1d_genre')(genre_input)]

    # embeddings come out as (None, 1, 1) / (None, 3, 1): flatten the single-level
    # ones and average the genre slots over axis 1
    cat_sl_embed_1d = [Reshape((1,))(i) for i in cat_sl_embed_1d]
    cat_ml_embed_1d = [Tensor_Mean_Pooling(name = 'embed_1d_mean')(i) for i in cat_ml_embed_1d]
    
    # add all tensors
    y_fm_1d = Add(name = 'fm_1d_output')(num_dense_1d + cat_sl_embed_1d + cat_ml_embed_1d)
    
    return y_fm_1d

# Stand-alone call with small placeholder vocabulary sizes (10, 10, 2, 10) on the
# module-level inputs; the real model is assembled later by fm_model
y_1d = fm_1d(inputs, 10, 10, 2, 10)

In [13]:
def fm_2d(inputs, n_uid, n_mid, n_sex, n_genre, k):
    """Build the second-order (pairwise interaction) term of the factorization machine.

    Every feature is mapped to a k-dimensional vector; the pairwise
    interactions between those vectors are then computed with the
    "square of sum minus sum of squares" identity.

    Args:
        inputs: the five input tensors in the order
            [age, user id, movie id, sex, genre].
        n_uid, n_mid, n_sex, n_genre: vocabulary sizes; each embedding table
            is created with one extra row (`n + 1`).
        k: size of each feature's latent vector.

    Returns:
        tuple: (y_fm_2d, embed_2d) - the output of the 'fm_2d_output' Reshape
        layer and the concatenated per-feature vectors.
    """
    
    age_input, uid_input, mid_input, sex_input, genre_input = inputs
    
    num_dense_2d = [Dense(k, name = 'num_dense_2d_age')(age_input)] # shape (None, k)
    num_dense_2d = [Reshape((1,k))(i) for i in num_dense_2d] # shape (None, 1, k)

    cat_sl_embed_2d = [Embedding(n_uid + 1, k, name = 'cat_embed_2d_uid')(uid_input), 
                       Embedding(n_mid + 1, k, name = 'cat_embed_2d_mid')(mid_input),
                       Embedding(n_sex + 1, k, name = 'cat_embed_2d_sex')(sex_input)] # shape (None, 1, k)
    
    cat_ml_embed_2d = [Embedding(n_genre + 1, k, name = 'cat_embed_2d_genre')(genre_input)] # shape (None, 3, k)
    cat_ml_embed_2d = [Tensor_Mean_Pooling(name = 'cat_embed_2d_genre_mean', keepdims=True)(i) for i in cat_ml_embed_2d] # shape (None, 1, k)

    # concatenate all 2d embed layers along axis 1 => (None, 5, k):
    # one vector each for age, user, movie, sex and the pooled genres
    embed_2d = Concatenate(axis=1, name = 'concat_embed_2d')(num_dense_2d + cat_sl_embed_2d + cat_ml_embed_2d)

    # calculate the pairwise interactions with the usual simplification:
    # sum_{i<j} v_i * v_j = 0.5 * [ (sum_i v_i)^2 - sum_i (v_i^2) ]
    # tensor_sum is one Lambda layer reused several times; it always sums over axis 1
    tensor_sum = Lambda(lambda x: K.sum(x, axis = 1), name = 'sum_of_tensors')
    tensor_square = Lambda(lambda x: K.square(x), name = 'square_of_tensors')

    sum_of_embed = tensor_sum(embed_2d)
    square_of_embed = tensor_square(embed_2d)

    # (sum_i v_i)^2 and sum_i (v_i^2), both element-wise over the k dimensions
    square_of_sum = Multiply()([sum_of_embed, sum_of_embed])
    sum_of_square = tensor_sum(square_of_embed)

    sub = Subtract()([square_of_sum, sum_of_square])
    sub = Lambda(lambda x: x*0.5)(sub)
    # final tensor_sum collapses the k dimensions into one scalar per sample
    y_fm_2d = Reshape((1,), name = 'fm_2d_output')(tensor_sum(sub))
    
    return y_fm_2d, embed_2d

# Stand-alone call with small placeholder vocabulary sizes and k=5 on the
# module-level inputs; the real model is assembled later by fm_model
y_fm2_d, embed_2d = fm_2d(inputs, 10, 10, 2, 10, 5)

In [14]:
def fm_model(n_uid, n_mid, n_sex, n_genre, k, dnn_dr):
    """Assemble the factorization machine as Keras models.

    Args:
        n_uid, n_mid, n_sex, n_genre: vocabulary sizes forwarded to
            fm_1d / fm_2d.
        k: latent vector size forwarded to fm_2d.
        dnn_dr: accepted but not used anywhere in this function.

    Returns:
        tuple: (fm_model_1d, fm_model_2d, fm_model) - three Models built on
        the same inputs, exposing the first-order term, the second-order
        term, and the final Dense(1) combination of both, respectively.
    """
    
    inputs = define_input_layers()
    
    y_fm_1d = fm_1d(inputs, n_uid, n_mid, n_sex, n_genre)
    # embed_2d is returned by fm_2d but not used further here
    y_fm_2d, embed_2d = fm_2d(inputs, n_uid, n_mid, n_sex, n_genre, k)
    
    # combine the first-order and second-order FM terms with a learned Dense(1)
    y = Concatenate()([y_fm_1d, y_fm_2d])
    y = Dense(1, name = 'fm_output')(y)
    
    fm_model_1d = Model(inputs, y_fm_1d)
    fm_model_2d = Model(inputs, y_fm_2d)
    fm_model = Model(inputs, y)
    
    return fm_model_1d, fm_model_2d, fm_model

In [15]:
# Hyper-parameters for the FM model: vocabulary sizes taken from the encoded
# ids, the genre count, and the latent size k.
# NOTE(review): 'dnn_dr' is required by fm_model's signature but is not used
# in its body.
params = {
    'n_uid': train.user.nunique(),
    'n_mid': train.movie.nunique(),
    'n_sex': 2,
    'n_genre': 18,
    'k': 20,
    'dnn_dr': 0.5
}

# NOTE(review): this rebinds the name `fm_model` from the builder function to
# the Model it returns, so re-running this cell calls the Model instead of the
# builder. Re-run the cell that defines fm_model before re-running this one.
fm_model_1d, fm_model_2d, fm_model = fm_model(**params)

params

{'n_uid': 6040,
 'n_mid': 3680,
 'n_sex': 2,
 'n_genre': 18,
 'k': 20,
 'dnn_dr': 0.5}

In [16]:
def df2xy(ratings):
    """Split a ratings frame into model inputs and the rating target.

    Args:
        ratings: DataFrame with columns age, user, movie, sex, genre
            (each genre entry a sequence of three ids) and rating.

    Returns:
        tuple: (x, y) where x is the list of arrays
        [age, user, movie, sex, genre] and y is the array of ratings.
    """
    # flatten the per-row genre lists and fold them back into three columns
    genre_matrix = np.concatenate(ratings.genre.values).reshape(-1, 3)
    features = [
        ratings.age.values,
        ratings.user.values,
        ratings.movie.values,
        ratings.sex.values,
        genre_matrix,
    ]
    return features, ratings.rating.values

# Model inputs and targets for the training and validation parts of the split
train_x, train_y = df2xy(trai)
valid_x, valid_y = df2xy(val)

In [17]:
from tensorflow.keras.callbacks import  EarlyStopping, ModelCheckpoint

# train model

fm_model.compile(
    loss=tf.keras.losses.MeanSquaredError(), optimizer=keras.optimizers.Adam(learning_rate=0.0001)
)
# Stop once val_loss has not improved for 3 epochs and roll the weights back to
# the best epoch. Without restore_best_weights the model keeps the weights of
# the last epoch, i.e. the ones obtained after validation loss stopped improving,
# and those are the weights the later predictions would be made with.
early_stop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
callbacks=[early_stop]
train_history = fm_model.fit(train_x, train_y, 
                                  epochs=50, batch_size=4096, 
                                  validation_data=(valid_x, valid_y),
                                  callbacks = callbacks, 
                                 verbose=1)

Epoch 1/50
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 13ms/step - loss: 7.3169 - val_loss: 2.0338
Epoch 2/50
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 12ms/step - loss: 1.7322 - val_loss: 1.3186
Epoch 3/50
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 12ms/step - loss: 1.2419 - val_loss: 1.1652
Epoch 4/50
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 11ms/step - loss: 1.1193 - val_loss: 1.1051
Epoch 5/50
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 12ms/step - loss: 1.0736 - val_loss: 1.0709
Epoch 6/50
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 12ms/step - loss: 1.0440 - val_loss: 1.0444
Epoch 7/50
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 12ms/step - loss: 1.0186 - val_loss: 1.0206
Epoch 8/50
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 11ms/step - loss: 0.9948 - val_loss: 0.9985
Epoch 9/50
[1m157/157[0m [32m

After training our neural network, we must create, for every user, a DataFrame with all (user, unseen movie) pairs, so that the model can score the movies the user has not rated yet.

In [18]:
# Take the encoded ids (the 'user' / 'movie' columns, not the raw
# 'user_id' / 'movie_id') of both users and movies without duplicates

users = train['user'].unique()
movies = train['movie'].unique()

In [19]:
# Build one lookup table per entity, indexed by encoded id: the first row seen
# for each user / movie supplies its features

users_feat = (
    train[['user', 'sex', 'age']]
    .drop_duplicates(subset='user')
    .set_index('user')
)

movie_feat = (
    train[['movie', 'title', 'genre', 'release_date']]
    .drop_duplicates(subset='movie')
    .set_index('movie')
)

In [20]:
def df2x(ratings):
    """Turn a frame of (user, movie) candidates into the list of model inputs.

    Args:
        ratings: DataFrame with columns age, user, movie, sex and genre
            (each genre entry a sequence of three ids).

    Returns:
        list: the arrays [age, user, movie, sex, genre].
    """
    # flatten the per-row genre lists and fold them back into three columns
    genre_matrix = np.concatenate(ratings.genre.values).reshape(-1, 3)
    return [
        ratings.age.values,
        ratings.user.values,
        ratings.movie.values,
        ratings.sex.values,
        genre_matrix,
    ]

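We encoded the user and movie ids before training, so the candidate (user, unseen movie) pairs must be built with the same encoded ids, and the recommended movies must be mapped back to their original ids afterwards.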

In [21]:
# Create a DataFrame with the movies each user has not seen

results = {}
cont = 0

# Loop-invariant lookups, built once instead of being recomputed for every user
all_movies = set(movies)
seen_by_user = train.groupby('user')['movie'].apply(set).to_dict()
known_movie_ids = set(train['movie_id'].unique().tolist())

for user in users:
    # Log after 500 iterations
    if cont % 500 == 0:
        print(cont)
        
    # Update the counter
    cont += 1
    
    # Unseen films for this user, frozen into a list once so that positions in
    # the prediction vector map back to movies unambiguously
    missing_films = list(all_movies - seen_by_user[user])
    n = len(missing_films)

    # A user who has rated every movie has nothing to rank (and df2x cannot
    # build inputs from an empty frame)
    if n == 0:
        results[userencoded2user[user]] = []
        continue
    
    # Take the user's features and the features of the candidate movies
    user_sex = users_feat.at[user, 'sex']
    user_age = users_feat.at[user, 'age']
    genre_list = movie_feat.loc[missing_films, 'genre'].tolist()
    release_list = movie_feat.loc[missing_films, 'release_date'].tolist()

    # Create the user DataFrame
    df = pd.DataFrame({
        'user': n * [user],
        'movie': missing_films,
        'sex': n * [user_sex],
        'age': n * [user_age],
        'genre': genre_list,
        'release_date': release_list
    })

    # Make the predictions for the current user and store the top 25 movies
    test_x = df2x(df)
    predictions = fm_model.predict(test_x,verbose=0)
    preds = np.asarray(predictions).ravel()
    sort = np.argsort(preds)[::-1][:25]
    top25 = [missing_films[i] for i in sort]
    top25_mapped = [movie_encoded2movie[item] for item in top25]
    results[userencoded2user[user]] = top25_mapped
    
    ### To check if we did the transformation properly
    # Membership is tested against the set of raw movie ids: `t in series`
    # would look at the Series index labels, not at its values
    for t in top25_mapped:
        if t not in known_movie_ids:
            print ('Not in train')

0
500
1000
1500
2000
2500
3000
3500
4000
4500
5000
5500
6000


In [22]:
# Load the baseline submission; its user_id column is used when writing the
# solution file

test_example = pd.read_csv('/kaggle/input/recsysmasterfds-2024/kaggle_baseline.csv')

Finally, we store our results in a .csv output file.

In [23]:
# Convert our results dictionary into the corresponding format

import csv

# open the file in write mode; newline='' is required by the csv module so that
# the writer controls line endings itself (otherwise extra blank rows can
# appear on platforms that translate newlines)
with open('solution.csv', 'w', encoding='UTF8', newline='') as f:
    
    # create the csv writer
    writer = csv.writer(f)
    
    # write the headers of both columns
    writer.writerow(['user_id', 'prediction'])
    
    # write a row for each user: the id, then the recommended movie ids
    # separated by single spaces
    for user_id in test_example.user_id.unique():
        relevant_items = results[user_id]
        list_relevants = ' '.join(map(str, relevant_items))
        writer.writerow([str(user_id), list_relevants])