Import libraries

In [76]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MultiLabelBinarizer
import numpy as np
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Flatten, Dense, concatenate, Dropout, TextVectorization, LSTM, BatchNormalization, Activation, Lambda, LeakyReLU
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l1_l2, l2
from tensorflow.keras.optimizers.schedules import ExponentialDecay
from transformers import DistilBertTokenizer, TFDistilBertModel

Load dataset

In [7]:
# Render every column when a DataFrame is displayed (no truncation).
pd.set_option('display.max_columns', None)

# Kaggle location of Merged_df.csv; when running locally, point this at
# 'Merged_df.csv' instead.
merged_csv_path = "/kaggle/input/merged-df/Merged_df.csv"

# The first CSV column is used as the row index.
df = pd.read_csv(merged_csv_path, index_col=0, low_memory=False)

Drop unnecessary columns

In [8]:
# Columns removed here are not referenced by any later cell of the notebook.
unused_columns = [
    'MovieID', 'Timestamp', 'Title', 'Genres', 'ZipCode', 'Movie_Title',
    'status', 'backdrop_path', 'homepage', 'imdb_id', 'poster_path',
    'production_companies', 'production_countries', 'title_year', 'year',
    'original_language', "release_date", "original_title", "spoken_languages",
]

# `drop` returns a new frame, so the raw `df` stays untouched.
df1 = df.drop(columns=unused_columns)

Drop rows with NA values

In [9]:
# Remove every row that has a missing value in any of the remaining columns.
# Done in place so `df1` keeps referring to the same object for the cells below.
df1.dropna(axis=0, how='any', inplace=True)

Converting Years into Decades (Buckets to make it a categorical variable)


In [10]:
# Bucket each release year into the first year of its decade (e.g. 1994 -> 1990)
# so the year can later be one-hot encoded as a categorical feature.
df1['Decade'] = df1['Movie_Year'].floordiv(10).mul(10)

# 'Decade' supersedes the raw year, so the original column is removed.
df1 = df1.drop(columns='Movie_Year')

Convert 'adult' column to integers

In [11]:
# Cast the 'adult' flag to integers (True -> 1, False -> 0); all other columns
# keep their dtype and position.
df1 = df1.astype({'adult': int})

Convert movieids to integers

In [12]:
# Movie ids are later fed to an Embedding layer as lookup indices, so they are
# cast to integers here.
df1 = df1.astype({'id': int})

Encode Categorical Features & Scale Numerical Variables

In [13]:
# Columns to one-hot encode and columns to standardise.
categorical_features = ['Gender', 'Occupation', 'Age', 'Decade']
numerical_features = ['vote_average', 'vote_count', 'revenue', 'runtime', 'budget', 'popularity']

# One-hot encode the categorical variables into a dense array.
# scikit-learn 1.2 renamed the `sparse` argument to `sparse_output` and 1.4
# removed the old name, so try the new keyword first and fall back to the old
# one on releases that do not know it yet.
try:
    one_hot_encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    one_hot_encoder = OneHotEncoder(sparse=False)
encoded_features = one_hot_encoder.fit_transform(df1[categorical_features])

# Standardise the numerical features (zero mean, unit variance per column).
# NOTE(review): the encoder and the scaler are both fitted on the full dataset,
# before the train/test split made later in the notebook, so test rows
# influence the fitted statistics.
scaler = StandardScaler()
scaled_features = scaler.fit_transform(df1[numerical_features])



Convert Genres into Multiple Binary Columns

In [14]:
# Each 'genres' cell is one string with entries separated by ", "; turn it
# into a list of genre names.
df1['genres'] = df1['genres'].str.split(', ')

# Multi-hot encode the genre lists: one binary column per distinct genre.
mlb = MultiLabelBinarizer()
genres_encoded = mlb.fit_transform(df1['genres'])

Before concatenating, Drop all original variables which have been transformed

In [15]:
# The raw categorical, numerical and genre columns now exist in encoded form,
# so remove them before the encoded frames are concatenated back on.
transformed_columns = [*categorical_features, *numerical_features, "genres"]
df1.drop(columns=transformed_columns, inplace=True)

Combine all processed features back into one dataframe

In [16]:
# Wrap each encoded array in a DataFrame that shares df1's index, so the pieces
# line up row-for-row when they are concatenated.
row_index = df1.index

# One-hot encoded categorical features
encoded_cats = pd.DataFrame(
    encoded_features,
    index=row_index,
    columns=one_hot_encoder.get_feature_names_out(categorical_features),
)

# Standardised numerical features
scaled_nums = pd.DataFrame(
    scaled_features,
    index=row_index,
    columns=numerical_features,
)

# Multi-hot encoded genre features
genres_df = pd.DataFrame(
    genres_encoded,
    index=row_index,
    columns=mlb.classes_,
)

# Put the remaining original columns and all encoded features side by side.
df_combined = pd.concat([df1, encoded_cats, scaled_nums, genres_df], axis=1)

Split into train and test dataset

In [17]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split reproducible across runs.
df_train_combined, df_test_combined = train_test_split(
    df_combined,
    test_size=0.2,
    random_state=42,
)

Create User embeddings

In [18]:
# Size the embedding table from the FULL dataset rather than the training split:
# a UserID that only occurs in the test split would otherwise fall outside the
# table and fail at lookup time. The +1 is needed because ids are used directly
# as indices (valid indices are 0 .. max id).
num_users = int(df_combined['UserID'].max()) + 1

# UserID embeddings
user_id_input = Input(shape=(1,), name='user_id_input')  # Entry point for user ids: one id per sample

# Each user id is mapped to a dense vector of 10 learned latent characteristics.
# input_dim is the number of rows in the lookup table (largest id + 1).
user_embedding = Embedding(input_dim=num_users, output_dim=10, name='user_embedding')(user_id_input)
user_flatten = Flatten(name='user_flatten')(user_embedding)  # (batch, 1, 10) -> (batch, 10)

User demographics

In [19]:
# Names of the one-hot encoded demographic columns, in the order they are fed
# to the model: gender, occupation codes 0-20, then the age-group codes.
demographic_features = (
    ['Gender_F', 'Gender_M']
    + [f'Occupation_{code}' for code in range(21)]
    + [f'Age_{group}' for group in (1, 18, 25, 35, 45, 50, 56)]
)

# User demographics input: one value per demographic column
user_demographics_input = Input(shape=(len(demographic_features),), name='user_demographics_input')

Create Movie Embeddings

In [20]:
# Size the embedding table from the FULL dataset rather than the training split:
# a movie id that only occurs in the test split (or among the candidates scored
# at recommendation time from df_combined) would otherwise fall outside the
# table and fail at lookup time. The +1 is needed because ids are used directly
# as indices (valid indices are 0 .. max id).
num_movies = int(df_combined['id'].max()) + 1

# MovieID embeddings
movie_id_input = Input(shape=(1,), name='movie_id_input')  # Entry point for movie ids: one id per sample

# Each movie id is mapped to a dense vector of 50 learned latent characteristics.
# input_dim is the number of rows in the lookup table (largest id + 1).
movie_embedding = Embedding(input_dim=num_movies, output_dim=50, name='movie_embedding')(movie_id_input)
movie_flatten = Flatten(name='movie_flatten')(movie_embedding)  # (batch, 1, 50) -> (batch, 50)

Movie Features

In [21]:
# Movie-level feature columns, in the order they are fed to the model:
# standardised numeric features, multi-hot genres, then one-hot decades
# (1910 through 2000).
movie_features_columns = (
    ['vote_average', 'vote_count', 'revenue', 'runtime', 'budget', 'popularity']
    + ['Action', 'Adventure', 'Animation', 'Comedy', 'Crime', 'Documentary',
       'Drama', 'Family', 'Fantasy', 'History', 'Horror', 'Music', 'Mystery',
       'Romance', 'Science Fiction', 'TV Movie', 'Thriller', 'War', 'Western']
    + [f'Decade_{decade}' for decade in range(1910, 2010, 10)]
)

num_movie_features = len(movie_features_columns)

# Movie feature input: one value per movie feature column
movie_input = Input(shape=(num_movie_features,), name='movie_input')

Utilize 'Overview' and 'tagline' columns to process text data

In [80]:
# Vocabulary sizes and padded sequence lengths for the two text columns. The
# same vocabulary size is used for the vectorizer and for the embedding table
# that consumes its indices.
OVERVIEW_VOCAB_SIZE, OVERVIEW_SEQ_LEN = 10000, 70
TAGLINE_VOCAB_SIZE, TAGLINE_SEQ_LEN = 1000, 15

# Preprocessing: turn raw strings into fixed-length sequences of token indices.
# Each vocabulary is learned from the training split only.
vectorizer_overview = TextVectorization(max_tokens=OVERVIEW_VOCAB_SIZE,
                                        output_sequence_length=OVERVIEW_SEQ_LEN)
vectorizer_overview.adapt(df_train_combined['overview'])

vectorizer_tagline = TextVectorization(max_tokens=TAGLINE_VOCAB_SIZE,
                                       output_sequence_length=TAGLINE_SEQ_LEN)
vectorizer_tagline.adapt(df_train_combined['tagline'])

# Raw-string model inputs: one string per sample
text_input_overview = Input(shape=(1,), dtype="string", name="overview_input")
text_input_tagline = Input(shape=(1,), dtype="string", name="tagline_input")

# Tokenise each input string and map the tokens to vocabulary indices
overview_vectors = vectorizer_overview(text_input_overview)
tagline_vectors = vectorizer_tagline(text_input_tagline)

# Learned 50-dimensional embedding for every token index
overview_embedding = Embedding(input_dim=OVERVIEW_VOCAB_SIZE, output_dim=50,
                               name='overview_embedding')(overview_vectors)
tagline_embedding = Embedding(input_dim=TAGLINE_VOCAB_SIZE, output_dim=50,
                              name='tagline_embedding')(tagline_vectors)

# Summarise each token sequence into a single 32-dimensional vector
# (32 = number of LSTM units).
# NOTE(review): the embeddings do not set mask_zero, so the LSTMs also consume
# the padding positions of short texts.
overview_processed = LSTM(32, name='overview_lstm')(overview_embedding)
tagline_processed = LSTM(32, name='tagline_lstm')(tagline_embedding)

TRY USING BERT INSTEAD OF VECTORIZATION AND LSTM (TOO LONG TO COMPUTE -> SCRAPPED)

In [79]:
# # Initialize BERT tokenizer and model
# # tokenizer = DistilBertTokenizer.from_pretrained('bert-base-uncased')
# # bert_model = TFDistilBertModel.from_pretrained('bert-base-uncased')

# tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
# bert_model = TFDistilBertModel.from_pretrained('distilbert-base-uncased')

# # Preprocess text data
# overview_train = df_train_combined['overview'].tolist()
# tagline_train = df_train_combined['tagline'].tolist()

# overview_test = df_test_combined['overview'].tolist()
# tagline_test = df_test_combined['tagline'].tolist()

# # Tokenize data
# tokenized_overview_train = tokenizer(overview_train, padding=True, truncation=True, return_tensors="np", max_length=70)
# tokenized_tagline_train = tokenizer(tagline_train, padding=True, truncation=True, return_tensors="np", max_length=15)

# tokenized_overview_test = tokenizer(overview_test, padding=True, truncation=True, return_tensors="np", max_length=70)
# tokenized_tagline_test = tokenizer(tagline_test, padding=True, truncation=True, return_tensors="np", max_length=15)

# # Overview data
# input_ids_overview_train = tokenized_overview_train['input_ids']
# attention_masks_overview_train = tokenized_overview_train['attention_mask']

# input_ids_overview_test = tokenized_overview_test['input_ids']
# attention_masks_overview_test = tokenized_overview_test['attention_mask']

# # Tagline data
# input_ids_tagline_train = tokenized_tagline_train['input_ids']
# attention_masks_tagline_train = tokenized_tagline_train['attention_mask']

# input_ids_tagline_test = tokenized_tagline_test['input_ids']
# attention_masks_tagline_test = tokenized_tagline_test['attention_mask']

# # Define input layers for token IDs and attention masks
# input_ids_overview_layer = Input(shape=(70,), dtype='int32', name='input_ids_overview')
# attention_masks_overview_layer = Input(shape=(70,), dtype='int32', name='attention_masks_overview')

# input_ids_tagline_layer = Input(shape=(15,), dtype='int32', name='input_ids_tagline')
# attention_masks_tagline_layer = Input(shape=(15,), dtype='int32', name='attention_masks_tagline')

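# # Get DistilBERT embeddings. NOTE: the DistilBERT TF model output exposes `last_hidden_state`
# # but no `pooler_output`, so the hidden state of the first ([CLS]) token is taken as the sentence vector.
# overview_embeddings = bert_model(input_ids_overview_layer, attention_mask=attention_masks_overview_layer).last_hidden_state[:, 0, :]
# tagline_embeddings = bert_model(input_ids_tagline_layer, attention_mask=attention_masks_tagline_layer).last_hidden_state[:, 0, :]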

Combine model inputs

In [81]:
# Every branch of the network, in the order the dense head consumes them:
# user embedding, user demographics, movie embedding, movie features and the
# two LSTM text summaries.
branch_outputs = [
    user_flatten,
    user_demographics_input,
    movie_flatten,
    movie_input,
    overview_processed,
    tagline_processed,
]
combined_inputs = concatenate(branch_outputs)

Assemble Complete Model

In [83]:
# Fully connected head: 256 -> 128 -> 64 units, with dropout after the first
# two layers.
fc1 = Dense(units=256, activation='relu')(combined_inputs)
dropout1 = Dropout(rate=0.25)(fc1)

fc2 = Dense(units=128, activation='relu')(dropout1)
dropout2 = Dropout(rate=0.25)(fc2)

fc3 = Dense(units=64, activation='relu')(dropout2)

# A sigmoid unit yields a value in (0, 1); multiplying by 5 stretches it to
# (0, 5) so predictions are on the rating scale.
# NOTE(review): presumably ratings lie in 1-5, in which case values below 1 are
# reachable but never correct -- confirm against the data.
sigmoid_output = Dense(units=1, activation='sigmoid')(fc3)
output = Lambda(lambda unit_score: unit_score * 5)(sigmoid_output)

# Final model: six inputs -> one predicted rating
model = Model(
    inputs=[user_id_input, user_demographics_input, movie_id_input,
            movie_input, text_input_overview, text_input_tagline],
    outputs=output,
)

# Learning-rate schedule: start at 0.004 and multiply by 0.9 every 10000 steps
lr_schedule = ExponentialDecay(initial_learning_rate=0.004, decay_steps=10000, decay_rate=0.9)

# Compile with Adam on the schedule above and a mean-squared-error loss
model.compile(optimizer=Adam(learning_rate=lr_schedule), loss='mean_squared_error')

# Print the architecture as a sanity check
model.summary()

Preparing Inputs for training

In [84]:
def _split_to_model_arrays(split_frame):
    """Pull the rating targets and the six model inputs out of one data split.

    Returns a tuple ``(ratings, user_ids, movie_ids, user_demographics,
    movie_features, overview, tagline)``. The two text columns are returned as
    pandas Series; everything else as NumPy arrays.
    """
    return (
        split_frame['Rating'].values,
        split_frame['UserID'].values,
        split_frame['id'].values,
        split_frame[demographic_features].values,
        split_frame[movie_features_columns].values,
        split_frame['overview'],
        split_frame['tagline'],
    )


# Targets (Y-values) and inputs for training
(ratings_train, user_ids_train, movie_ids_train, user_demographics_train,
 movie_features_train, overview_train, tagline_train) = _split_to_model_arrays(df_train_combined)

# Targets (Y-values) and inputs for testing (evaluation)
(ratings_test, user_ids_test, movie_ids_test, user_demographics_test,
 movie_features_test, overview_test, tagline_test) = _split_to_model_arrays(df_test_combined)

Train the Model

In [85]:
# Inputs are listed in the same order as the model's input layers.
train_inputs = [user_ids_train, user_demographics_train, movie_ids_train,
                movie_features_train, overview_train, tagline_train]
validation_inputs = [user_ids_test, user_demographics_test, movie_ids_test,
                     movie_features_test, overview_test, tagline_test]

# Train for 5 epochs in batches of 128; the held-out test split is used as the
# validation set reported after each epoch.
history = model.fit(
    train_inputs,
    ratings_train,
    validation_data=(validation_inputs, ratings_test),
    epochs=5,
    batch_size=128,
)

Epoch 1/5
[1m5953/5953[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m185s[0m 31ms/step - loss: 0.7190 - val_loss: 0.7476
Epoch 2/5
[1m5953/5953[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m181s[0m 30ms/step - loss: 0.6635 - val_loss: 0.7582
Epoch 3/5
[1m5953/5953[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m181s[0m 30ms/step - loss: 0.6500 - val_loss: 0.7508
Epoch 4/5
[1m5953/5953[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m181s[0m 30ms/step - loss: 0.6405 - val_loss: 0.7490
Epoch 5/5
[1m5953/5953[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m181s[0m 30ms/step - loss: 0.6325 - val_loss: 0.7441


Extract Unseen Movies

In [86]:
# Function to extract movies yet to be seen by a user
def get_unseen_movies(user_id, data=None):
    """Return the sorted ids of all movies in ``data`` the user has not rated.

    Parameters
    ----------
    user_id : value compared against the 'UserID' column.
    data : DataFrame with 'UserID' and 'id' columns. When omitted, the current
        global ``df_train_combined`` is used. It is looked up at call time, so
        re-running the split cell is picked up without redefining this function.

    Returns
    -------
    numpy.ndarray
        Unique movie ids present in ``data`` but absent from the user's rows,
        in ascending order. For a user with no rows this is every movie id.
    """
    if data is None:
        data = df_train_combined

    seen_movie_ids = data.loc[data['UserID'] == user_id, 'id'].unique()
    all_movie_ids = data['id'].unique()

    # np.setdiff1d already returns the unique values in sorted order.
    return np.setdiff1d(all_movie_ids, seen_movie_ids)

Predict Ratings with titles for a particular user

In [93]:
def predict_ratings_for_user_with_titles(user_id, num_recommendations=5, model=None, data=None):
    """Recommend the unseen movies with the highest predicted rating for a user.

    Parameters
    ----------
    user_id : value compared against the 'UserID' column of ``data``.
    num_recommendations : number of top-rated unseen movies to return.
    model : object with a ``predict`` method taking the six inputs in the order
        [user ids, user demographics, movie ids, movie features, overviews,
        taglines]. When omitted, the current global ``model`` is used.
    data : DataFrame holding the ratings plus all encoded user/movie features.
        When omitted, the current global ``df_train_combined`` is used.

    Both defaults are looked up at call time, so a re-trained model or a re-run
    split is picked up without redefining this function.

    Returns
    -------
    pandas.DataFrame
        Columns 'title' and 'predicted_ratings', sorted by predicted rating in
        descending order, at most ``num_recommendations`` rows. Empty when the
        user has already rated every movie in ``data``.

    Raises
    ------
    ValueError
        If ``user_id`` has no rows in ``data`` (no demographic features exist
        to feed the model).
    """
    if model is None:
        # The parameter shadows the notebook-level `model`, hence the explicit lookup.
        model = globals()["model"]
    if data is None:
        data = df_train_combined

    # The user's demographic features are constant, so any of their rows will do.
    user_rows = data[data['UserID'] == user_id]
    if user_rows.empty:
        raise ValueError(f"UserID {user_id!r} has no rows in the supplied data")

    # One row per unseen movie (movie features are constant per movie), sorted by id.
    unseen_ids = get_unseen_movies(user_id, data)
    unseen_movies_df = (
        data[data['id'].isin(unseen_ids)]
        .drop_duplicates(subset='id', keep='first')
        .sort_values(by='id', ascending=True)
        .copy()
    )

    # Take every model input from the same frame so the rows are guaranteed to
    # line up with each other and with the predictions written back below.
    movie_id_input = unseen_movies_df['id'].values
    num_candidates = len(movie_id_input)

    if num_candidates:
        user_id_input = np.full(num_candidates, user_id)
        user_demographics_pred = np.repeat(
            user_rows[demographic_features].values[:1], num_candidates, axis=0
        )
        movie_features_input = unseen_movies_df[movie_features_columns].values
        overview_input = unseen_movies_df["overview"].values
        tagline_input = unseen_movies_df["tagline"].values

        predicted_ratings = model.predict([user_id_input, user_demographics_pred, movie_id_input,
                                           movie_features_input, overview_input, tagline_input])
        # The model returns one column of shape (n, 1); flatten it to one value per row.
        predicted_ratings = np.asarray(predicted_ratings).reshape(-1)
    else:
        # Nothing left to recommend: skip the model call on empty inputs.
        predicted_ratings = np.empty(0, dtype=float)

    unseen_movies_df["predicted_ratings"] = predicted_ratings

    ranked_movies_df = unseen_movies_df.sort_values(by='predicted_ratings', ascending=False)
    final_movies_df = ranked_movies_df[["title", "predicted_ratings"]].head(num_recommendations)

    return final_movies_df

In [94]:
# Top-10 recommendations for user 67, with candidates drawn from the full
# dataset (train + test rows).
predict_ratings_for_user_with_titles(user_id=67, num_recommendations=10, data=df_combined)

[1m95/95[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step


Unnamed: 0,title,predicted_ratings
137676,Firelight,4.911894
12550,Shadow of a Doubt,4.883208
8702,Grand Illusion,4.87824
125933,Belly,4.865616
555413,Smashing Time,4.858304
4795,The Searchers,4.857185
97599,Seven Chances,4.856375
3764,It Happened One Night,4.853857
68414,Hell in the Pacific,4.847507
3778,Notorious,4.84641
