Import libraries

In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MultiLabelBinarizer
import numpy as np
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Flatten, Dense, concatenate, Dropout, TextVectorization, LSTM, BatchNormalization, Activation, Lambda, MultiHeadAttention, LayerNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l1_l2, l2
from tensorflow.keras.optimizers.schedules import ExponentialDecay
from tensorflow.keras import backend as K

Load dataset

In [77]:
pd.set_option('display.max_columns', None)
#df = pd.read_csv('Merged_df.csv', index_col=0, low_memory=False)
df = pd.read_csv("/kaggle/input/merged-df/Merged_df.csv", index_col=0, low_memory=False)

Drop unnecessary columns

In [78]:
df1 = df.drop(['MovieID', 'Timestamp', 'Title', 'Genres', 'ZipCode', 'Movie_Title',
               'status', 'backdrop_path', 'homepage', 'imdb_id', 'poster_path', 'production_companies',
               'production_countries', 'title_year', 'year', 'original_language', "release_date", "original_title", "spoken_languages"], axis=1)

Drop rows with NA values

In [79]:
df1.dropna(inplace=True)

Convert 'adult' column to integers

In [80]:
# Convert 'adult' column to integers (1 for True, 0 for False)
df1['adult'] = df1['adult'].astype(int)

Convert movieids to integers

In [81]:
df1['id'] = df1['id'].astype(int)

Encode Catgorical features & Scale Numerical Variables

In [82]:
# Define categorical and numerical features
categorical_features = ['Gender', 'Occupation', 'Age']
numerical_features = ['vote_average', 'vote_count', 'revenue', 'runtime', 'budget', 'popularity']

# One-Hot Encoding for categorical variables
one_hot_encoder = OneHotEncoder(sparse=False)  # Using sparse=False to get a dense array
encoded_features = one_hot_encoder.fit_transform(df1[categorical_features])

# Normalize Numerical Features
scaler = StandardScaler()
scaled_features = scaler.fit_transform(df1[numerical_features])



Convert Genres into Multiple Binary Columns

In [83]:
# Since genres are separated by commas in dataset
df1['genres'] = df1['genres'].apply(lambda x: x.split(', '))

mlb = MultiLabelBinarizer()
genres_encoded = mlb.fit_transform(df1['genres'])

Before concatenating, Drop all original variables which have been transformed

In [84]:
df1.drop(categorical_features, axis=1, inplace=True)
df1.drop(numerical_features, axis=1, inplace=True)
df1.drop("genres", axis=1, inplace=True)

Combine all processed features back into one dataframe

In [85]:
# Convert encoded categorical features, scaled numerical features, and multiple genere features back to DataFrame 
encoded_cats = pd.DataFrame(encoded_features, columns=one_hot_encoder.get_feature_names_out(categorical_features), index=df1.index)
scaled_nums = pd.DataFrame(scaled_features, columns=numerical_features, index=df1.index)
genres_df = pd.DataFrame(genres_encoded, columns=mlb.classes_, index=df1.index)

# Combine all features
df_combined = pd.concat([df1, encoded_cats, scaled_nums,genres_df], axis=1)

Split into train and test dataset

In [86]:
df_train_combined, df_test_combined = train_test_split(df_combined, test_size=0.2, random_state=42)

Create User embeddings

In [87]:
num_users = df_train_combined['UserID'].max() + 1  # Add 1 to handle index error as python indexes from 0, then index 0 will be unused/reserved

# UsedID embeddings
user_id_input = Input(shape=(1,), name='user_id_input')  # Entry point for User IDs  --> Each input is a single value

# Aim to capture 10 latent characteristics of each user (10 dimensions)
# input_dim is the total number of unique users
user_embedding = Embedding(input_dim=num_users, output_dim=10, name='user_embedding')(user_id_input) # Transforms each userID into a dense vector of size 10
user_flatten = Flatten(name='user_flatten')(user_embedding) # Flatten to a 1D tensor

User demographics

In [88]:
demographic_features = ['Gender_F', 'Gender_M', 'Occupation_0', 'Occupation_1', 'Occupation_2',
                        'Occupation_3', 'Occupation_4', 'Occupation_5', 'Occupation_6',
                        'Occupation_7', 'Occupation_8', 'Occupation_9', 'Occupation_10',
                        'Occupation_11', 'Occupation_12', 'Occupation_13', 'Occupation_14',
                        'Occupation_15', 'Occupation_16', 'Occupation_17', 'Occupation_18',
                        'Occupation_19', 'Occupation_20', 'Age_1', 'Age_18', 'Age_25', 'Age_35',
                        'Age_45', 'Age_50', 'Age_56']

# User demographics input
user_demographics_input = Input(shape=(len(demographic_features),), name='user_demographics_input')

Create Movie Embeddings

In [89]:
num_movies = df_train_combined['id'].max() + 1 # Add 1 to handle index error as python indexes from 0, then index 0 will be unused/reserved

# MovieID embeddings
movie_id_input = Input(shape=(1,), name='movie_id_input')  # Entry point for Movie IDs  --> Each input is a single value

# Aim to capture 50 latent characteristics of each movie due to large number of unique movies (50 dimensions)
# input_dim is the total number of unique movies
movie_embedding = Embedding(input_dim=num_movies, output_dim=50, name='movie_embedding')(movie_id_input)
movie_flatten = Flatten(name='movie_flatten')(movie_embedding)

Movie Features

In [90]:
movie_features_columns = ['vote_average', 'vote_count', 'revenue', 'runtime', 'budget', 'popularity',
                          'Action', 'Adventure', 'Animation', 'Comedy', 'Crime', 'Documentary', 'Drama',
                          'Family', 'Fantasy', 'History', 'Horror', 'Music', 'Mystery', 'Romance',
                          'Science Fiction', 'TV Movie', 'Thriller', 'War', 'Western']

num_movie_features = len(movie_features_columns)

movie_input = Input(shape=(num_movie_features,), name='movie_input')

Utilize 'Overview' and 'tagline' columns to process text data

In [91]:
# preprocessing step --> transforms strings into numerical representation of fixed length 
# Define TextVectorization for overview column
vectorizer_overview = TextVectorization(max_tokens=10000, output_sequence_length=70) # Vectorizer only considers top 10000 most common words in dataset
vectorizer_overview.adapt(df_train_combined['overview'])

# Define TextVectorization for tagline column
vectorizer_tagline = TextVectorization(max_tokens=1000, output_sequence_length=15)
vectorizer_tagline.adapt(df_train_combined['tagline'])

# Text inputs
text_input_overview = Input(shape=(1,), dtype="string", name="overview_input")
text_input_tagline = Input(shape=(1,), dtype="string", name="tagline_input")

# Apply vectorization
overview_vectors = vectorizer_overview(text_input_overview) # Tokenize input and convert each token into numeric indices based on its internal vocabulary
tagline_vectors = vectorizer_tagline(text_input_tagline)

# Embedding layers
overview_embedding = Embedding(input_dim=10000, output_dim=50, name='overview_embedding')(overview_vectors)
tagline_embedding = Embedding(input_dim=1000, output_dim=50, name='tagline_embedding')(tagline_vectors)

# Embedding layers followed by Multi-Head Attention
def transformer_encoder(embedding_layer):
    # Apply Multi-Head Attention using 2 heads
    attention_output = MultiHeadAttention(num_heads=2, key_dim=50)(embedding_layer, embedding_layer)
    
    # Add a feed-forward network (FFN) as commonly used in Transformers
    ffn_output = Dense(50, activation='relu')(attention_output)

    # Layer Normalization and Residual (skip) Connection
    output = LayerNormalization()(ffn_output + embedding_layer) 
    return output

# Process both text inputs with Multi-Head Attention
overview_attention = transformer_encoder(overview_embedding)
tagline_attention = transformer_encoder(tagline_embedding)

# Since attention outputs a sequence, reduce it to a single vector via global pooling or by taking the mean
overview_pooled = Lambda(lambda x: K.mean(x, axis=1))(overview_attention)
tagline_pooled = Lambda(lambda x: K.mean(x, axis=1))(tagline_attention)

Combine model inputs

In [92]:
# Combine user and movie inputs
combined_inputs = concatenate([user_flatten, user_demographics_input, movie_flatten, movie_input, overview_pooled, tagline_pooled])

Assemble Complete Model

In [93]:
# Add fully connected layers
fc1 = Dense(256, activation='relu')(combined_inputs) 
dropout1 = Dropout(0.25)(fc1)

fc2 = Dense(128, activation='relu')(dropout1)
dropout2 = Dropout(0.25)(fc2)

fc3 = Dense(64, activation='relu')(dropout2)
output = Dense(1, activation='sigmoid')(fc3) 
output =Lambda(lambda x: x * 5)(output)

# Finalizing the model
model = Model(inputs=[user_id_input, user_demographics_input, movie_id_input, movie_input, text_input_overview, text_input_tagline], outputs=output)
lr_schedule = ExponentialDecay(initial_learning_rate=0.004, decay_steps=10000, decay_rate=0.9) # Learning Rate Scheduling

# Compiling the model
model.compile(optimizer=Adam(lr_schedule), loss='mean_squared_error')

# Model summary to check the architecture
model.summary()

Preparing Inputs for training

In [94]:
# Extract ratings (Y-values)
ratings_train = df_train_combined['Rating'].values
ratings_test = df_test_combined['Rating'].values

# Prepare inputs for training
user_ids_train = df_train_combined['UserID'].values
movie_ids_train = df_train_combined['id'].values
user_demographics_train = df_train_combined[demographic_features].values
movie_features_train = df_train_combined[movie_features_columns].values
overview_train = df_train_combined['overview']  
tagline_train = df_train_combined['tagline'] 

# Prepare inputs for testing (evaluation)
user_ids_test = df_test_combined['UserID'].values
movie_ids_test = df_test_combined['id'].values
user_demographics_test = df_test_combined[demographic_features].values
movie_features_test = df_test_combined[movie_features_columns].values
overview_test = df_test_combined['overview']  
tagline_test = df_test_combined['tagline']  

Train the Model

In [95]:
history = model.fit([user_ids_train, user_demographics_train, movie_ids_train, movie_features_train, overview_train, tagline_train], ratings_train,
                    validation_data=([user_ids_test, user_demographics_test, movie_ids_test, movie_features_test, overview_test, tagline_test], ratings_test),
                    epochs=5,
                    batch_size=128)

Epoch 1/5
[1m5953/5953[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m197s[0m 32ms/step - loss: 0.9095 - val_loss: 0.8324
Epoch 2/5
[1m5953/5953[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m191s[0m 32ms/step - loss: 0.7987 - val_loss: 0.7908
Epoch 3/5
[1m5953/5953[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m191s[0m 32ms/step - loss: 0.7636 - val_loss: 0.7922
Epoch 4/5
[1m5953/5953[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m191s[0m 32ms/step - loss: 0.7398 - val_loss: 0.7763
Epoch 5/5
[1m5953/5953[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m192s[0m 32ms/step - loss: 0.7240 - val_loss: 0.7646


In [127]:
model.weights

[<KerasVariable shape=(10000, 50), dtype=float32, path=overview_embedding/embeddings>,
 <KerasVariable shape=(1000, 50), dtype=float32, path=tagline_embedding/embeddings>,
 <KerasVariable shape=(50, 2, 50), dtype=float32, path=multi_head_attention_2/query/kernel>,
 <KerasVariable shape=(2, 50), dtype=float32, path=multi_head_attention_2/query/bias>,
 <KerasVariable shape=(50, 2, 50), dtype=float32, path=multi_head_attention_2/key/kernel>,
 <KerasVariable shape=(2, 50), dtype=float32, path=multi_head_attention_2/key/bias>,
 <KerasVariable shape=(50, 2, 50), dtype=float32, path=multi_head_attention_2/value/kernel>,
 <KerasVariable shape=(2, 50), dtype=float32, path=multi_head_attention_2/value/bias>,
 <KerasVariable shape=(2, 50, 50), dtype=float32, path=multi_head_attention_2/attention_output/kernel>,
 <KerasVariable shape=(50,), dtype=float32, path=multi_head_attention_2/attention_output/bias>,
 <KerasVariable shape=(50, 2, 50), dtype=float32, path=multi_head_attention_3/query/kernel>,

In [131]:
for i in range(len(model.weights)):
    model.weights[i]._handle_name = model.weights[i].name + "_" + str(i)

In [134]:
model.save('new_model.h5')

ValueError: Unable to synchronously create dataset (name already exists)

Process TMDB Dataset

In [96]:
# Load Dataset
tmdb_df = pd.read_csv("/kaggle/input/tmdb-movies/TMDB_movie_dataset_v11.csv", index_col=0, low_memory=False)

# Cleaning up the release date column to coerce problematic values to NaT, and ensure dates are in proper format
tmdb_df['release_date'] = pd.to_datetime(tmdb_df['release_date'], errors='coerce')

# Creating new "year" (string) column and extracting year from the "release_date" field
# Changed .replace('nan', '') to .replace('<NA>', '')
tmdb_df['year'] = tmdb_df['release_date'].dt.year.astype('Int64').astype(str).replace('<NA>', '')

# Drop unnecessary columns
tmdb_df1 = tmdb_df.drop(['status', 'backdrop_path', 'homepage', 'imdb_id', 'poster_path', 'production_companies','production_countries', 'release_date', 
                         'original_language', "original_title", "spoken_languages"], axis=1)

In [97]:
# Scale numerical features
scaled_features1 = scaler.transform(tmdb_df1[numerical_features])
scaled_nums1 = pd.DataFrame(scaled_features1, columns=numerical_features, index=tmdb_df.index)

# Have multiple genres and one hot encode it
tmdb_df1['genres'] = tmdb_df1['genres'].astype(str)
tmdb_df1['genres'] = tmdb_df1['genres'].apply(lambda x: x.split(', ') if x != 'nan' else [])
genres_encoded1 = mlb.transform(tmdb_df1['genres'])
genres_df1 = pd.DataFrame(genres_encoded1, columns=mlb.classes_, index=tmdb_df1.index)

# Drop Original features
tmdb_df1.drop(numerical_features, axis=1, inplace=True)
tmdb_df1.drop("genres", axis=1, inplace=True)

# Concat all new features
tmdb_df2 = pd.concat([tmdb_df1, scaled_nums1, genres_df1], axis=1)

In [116]:
# Replace NaNs in 'overview' and 'tagline' with an empty string
tmdb_df2['overview'] = tmdb_df2['overview'].fillna('')
tmdb_df2['tagline'] = tmdb_df2['tagline'].fillna('')

Predict Ratings with titles for a particular user

In [119]:
def predict_ratings_for_user_with_titles(user_id, data2, num_recommendations=5, model = model, data1 = df_combined):
    
    # Extract user IDs
    user_id_input = np.array([user_id] * len(data2))
    
    # Extract dataset for this particular user and just keep first row as features will be constant
    user_df = data1[data1['UserID'] == user_id][:1]

    # Extract users demographic features and repeat for the length of unseen movies
    user_demographics_pred = np.repeat(user_df[demographic_features].values, len(data2), axis=0)
    
    # Extract Movie IDs
    movie_id_input = data2.index.values
    # Extract Movie features input
    movie_features_input = data2[movie_features_columns].values
    
    # Extract Movie Overview input
    overview_input = data2["overview"].values
    
    # Extract Movie Tagline input
    tagline_input = data2["tagline"].values
    
    # Batch prediction
    predictions = []
    batch_size = 512  # Adjust batch size based on your system's capability
    for start in range(0, len(data2), batch_size):
        end = min(start + batch_size, len(data2))
        batch_preds = model.predict([user_id_input[start:end], user_demographics_pred[start:end],
                                     movie_id_input[start:end], movie_features_input[start:end],
                                     overview_input[start:end], tagline_input[start:end]])
        predictions.extend(batch_preds.flatten())

    # Predict Ratings for unseen movies
    predicted_ratings = model.predict([user_id_input, user_demographics_pred, movie_id_input, 
                                       movie_features_input, overview_input, tagline_input])
    
    data2["predicted_ratings"] = predicted_ratings    
    
    data2.sort_values(by='predicted_ratings', ascending=False, inplace = True)
    
    final_movies_df = data2[["title", "Movie_Year", "predicted_ratings"]].head(num_recommendations)

    return final_movies_df

In [120]:
predict_ratings_for_user_with_titles(1,data2 = tmdb_df2)

[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0


KeyboardInterrupt

