Import libraries

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MultiLabelBinarizer
import numpy as np
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Flatten, Dense, concatenate, Dropout, TextVectorization, LSTM, BatchNormalization, Activation, Lambda, MultiHeadAttention, LayerNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l1_l2, l2
from tensorflow.keras.optimizers.schedules import ExponentialDecay
from tensorflow.keras import backend as K

Load dataset

In [2]:
pd.set_option('display.max_columns', None)
#df = pd.read_csv('Merged_df.csv', index_col=0, low_memory=False)
df = pd.read_csv("/kaggle/input/merged-df/Merged_df.csv", index_col=0, low_memory=False)

Drop unnecessary columns

In [3]:
df1 = df.drop(['MovieID', 'Timestamp', 'Title', 'Genres', 'ZipCode', 'Movie_Title',
               'status', 'backdrop_path', 'homepage', 'imdb_id', 'poster_path', 'production_companies',
               'production_countries', 'title_year', 'year', 'original_language', "release_date", "original_title", "spoken_languages"], axis=1)

Drop rows with NA values

In [4]:
df1.dropna(inplace=True)

Convert 'adult' column to integers

In [5]:
# Convert 'adult' column to integers (1 for True, 0 for False)
df1['adult'] = df1['adult'].astype(int)

Convert movieids to integers

In [6]:
df1['id'] = df1['id'].astype(int)

Encode Catgorical features & Scale Numerical Variables

In [7]:
# Define categorical and numerical features
categorical_features = ['Gender', 'Occupation', 'Age']
numerical_features = ['vote_average', 'vote_count', 'revenue', 'runtime', 'budget', 'popularity']

# One-Hot Encoding for categorical variables
one_hot_encoder = OneHotEncoder(sparse=False)  # Using sparse=False to get a dense array
encoded_features = one_hot_encoder.fit_transform(df1[categorical_features])

# Normalize Numerical Features
scaler = StandardScaler()
scaled_features = scaler.fit_transform(df1[numerical_features])



Convert Genres into Multiple Binary Columns

In [8]:
# Since genres are separated by commas in dataset
df1['genres'] = df1['genres'].apply(lambda x: x.split(', '))

mlb = MultiLabelBinarizer()
genres_encoded = mlb.fit_transform(df1['genres'])

Before concatenating, Drop all original variables which have been transformed

In [9]:
df1.drop(categorical_features, axis=1, inplace=True)
df1.drop(numerical_features, axis=1, inplace=True)
df1.drop("genres", axis=1, inplace=True)

Combine all processed features back into one dataframe

In [10]:
# Convert encoded categorical features, scaled numerical features, and multiple genere features back to DataFrame 
encoded_cats = pd.DataFrame(encoded_features, columns=one_hot_encoder.get_feature_names_out(categorical_features), index=df1.index)
scaled_nums = pd.DataFrame(scaled_features, columns=numerical_features, index=df1.index)
genres_df = pd.DataFrame(genres_encoded, columns=mlb.classes_, index=df1.index)

# Combine all features
df_combined = pd.concat([df1, encoded_cats, scaled_nums,genres_df], axis=1)

Split into train and test dataset

In [11]:
df_train_combined, df_test_combined = train_test_split(df_combined, test_size=0.2, random_state=42)

Create User embeddings

In [12]:
num_users = df_train_combined['UserID'].max() + 1  # Add 1 to handle index error as python indexes from 0, then index 0 will be unused/reserved

# UsedID embeddings
user_id_input = Input(shape=(1,), name='user_id_input')  # Entry point for User IDs  --> Each input is a single value

# Aim to capture 10 latent characteristics of each user (10 dimensions)
# input_dim is the total number of unique users
user_embedding = Embedding(input_dim=num_users, output_dim=10, name='user_embedding')(user_id_input) # Transforms each userID into a dense vector of size 10
user_flatten = Flatten(name='user_flatten')(user_embedding) # Flatten to a 1D tensor

User demographics

In [13]:
demographic_features = ['Gender_F', 'Gender_M', 'Occupation_0', 'Occupation_1', 'Occupation_2',
                        'Occupation_3', 'Occupation_4', 'Occupation_5', 'Occupation_6',
                        'Occupation_7', 'Occupation_8', 'Occupation_9', 'Occupation_10',
                        'Occupation_11', 'Occupation_12', 'Occupation_13', 'Occupation_14',
                        'Occupation_15', 'Occupation_16', 'Occupation_17', 'Occupation_18',
                        'Occupation_19', 'Occupation_20', 'Age_1', 'Age_18', 'Age_25', 'Age_35',
                        'Age_45', 'Age_50', 'Age_56']

# User demographics input
user_demographics_input = Input(shape=(len(demographic_features),), name='user_demographics_input')

Create Movie Embeddings

In [14]:
num_movies = df_train_combined['id'].max() + 1 # Add 1 to handle index error as python indexes from 0, then index 0 will be unused/reserved

# MovieID embeddings
movie_id_input = Input(shape=(1,), name='movie_id_input')  # Entry point for Movie IDs  --> Each input is a single value

# Aim to capture 50 latent characteristics of each movie due to large number of unique movies (50 dimensions)
# input_dim is the total number of unique movies
movie_embedding = Embedding(input_dim=num_movies, output_dim=50, name='movie_embedding')(movie_id_input)
movie_flatten = Flatten(name='movie_flatten')(movie_embedding)

Movie Features

In [15]:
movie_features_columns = ['vote_average', 'vote_count', 'revenue', 'runtime', 'budget', 'popularity',
                          'Action', 'Adventure', 'Animation', 'Comedy', 'Crime', 'Documentary', 'Drama',
                          'Family', 'Fantasy', 'History', 'Horror', 'Music', 'Mystery', 'Romance',
                          'Science Fiction', 'TV Movie', 'Thriller', 'War', 'Western']

num_movie_features = len(movie_features_columns)

movie_input = Input(shape=(num_movie_features,), name='movie_input')

Utilize 'Overview' and 'tagline' columns to process text data

In [16]:
# # preprocessing step --> transforms strings into numerical representation of fixed length 
# # Define TextVectorization for overview column
# vectorizer_overview = TextVectorization(max_tokens=10000, output_sequence_length=70) # Vectorizer only considers top 10000 most common words in dataset
# vectorizer_overview.adapt(df_train_combined['overview'])

# # Define TextVectorization for tagline column
# vectorizer_tagline = TextVectorization(max_tokens=1000, output_sequence_length=15)
# vectorizer_tagline.adapt(df_train_combined['tagline'])

# # Apply vectorization
# overview_vectors_train = vectorizer_overview(df_train_combined['overview']) # Tokenize and convert each token into numeric indices based on its internal vocabulary
# tagline_vectors_train = vectorizer_tagline(df_train_combined['tagline'])

# # Embedding layers
# overview_embedding = Embedding(input_dim=10000, output_dim=50, name='overview_embedding')(overview_vectors_train)
# tagline_embedding = Embedding(input_dim=1000, output_dim=50, name='tagline_embedding')(tagline_vectors_train)

# # Define Text inputs
# text_input_overview = Input(shape=(50,), dtype="string", name="overview_input")
# text_input_tagline = Input(shape=(50,), dtype="string", name="tagline_input")

# # Embedding layers followed by Multi-Head Attention
# def transformer_encoder(embedding_layer):
#     # Apply Multi-Head Attention using 2 heads
#     attention_output = MultiHeadAttention(num_heads=2, key_dim=50)(embedding_layer, embedding_layer)
    
#     # Add a feed-forward network (FFN) as commonly used in Transformers
#     ffn_output = Dense(50, activation='relu')(attention_output)

#     # Layer Normalization and Residual (skip) Connection
#     output = LayerNormalization()(ffn_output + embedding_layer) 
#     return output

# # Process both text inputs with Multi-Head Attention
# overview_attention = transformer_encoder(text_input_overview)
# tagline_attention = transformer_encoder(text_input_tagline)

# # Since attention outputs a sequence, reduce it to a single vector via global pooling or by taking the mean
# overview_pooled = Lambda(lambda x: K.mean(x, axis=1))(overview_attention)
# tagline_pooled = Lambda(lambda x: K.mean(x, axis=1))(tagline_attention)

Combine model inputs

In [17]:
# Combine user and movie inputs
#combined_inputs = concatenate([user_flatten, user_demographics_input, movie_flatten, movie_input, overview_pooled, tagline_pooled])
combined_inputs = concatenate([user_flatten, user_demographics_input, movie_flatten, movie_input])

Assemble Complete Model

In [18]:
# Add fully connected layers
fc1 = Dense(256, activation='relu')(combined_inputs) 
dropout1 = Dropout(0.25)(fc1)

fc2 = Dense(128, activation='relu')(dropout1)
dropout2 = Dropout(0.25)(fc2)

fc3 = Dense(64, activation='relu')(dropout2)
output = Dense(1, activation='sigmoid')(fc3) 
output =Lambda(lambda x: x * 5)(output)

# Finalizing the model
# model = Model(inputs=[user_id_input, user_demographics_input, movie_id_input, movie_input, text_input_overview, text_input_tagline], outputs=output)
model = Model(inputs=[user_id_input, user_demographics_input, movie_id_input, movie_input], outputs=output)
lr_schedule = ExponentialDecay(initial_learning_rate=0.004, decay_steps=10000, decay_rate=0.9) # Learning Rate Scheduling

# Compiling the model
model.compile(optimizer=Adam(lr_schedule), loss='mean_squared_error')

# Model summary to check the architecture
model.summary()

Preparing Inputs for training

In [19]:
# Extract ratings (Y-values)
ratings_train = df_train_combined['Rating'].values
ratings_test = df_test_combined['Rating'].values

# Prepare inputs for training
user_ids_train = df_train_combined['UserID'].values
movie_ids_train = df_train_combined['id'].values
user_demographics_train = df_train_combined[demographic_features].values
movie_features_train = df_train_combined[movie_features_columns].values
# overview_train = df_train_combined['overview']  
# tagline_train = df_train_combined['tagline'] 

# Prepare inputs for testing (evaluation)
user_ids_test = df_test_combined['UserID'].values
movie_ids_test = df_test_combined['id'].values
user_demographics_test = df_test_combined[demographic_features].values
movie_features_test = df_test_combined[movie_features_columns].values
# overview_test = df_test_combined['overview']  
# tagline_test = df_test_combined['tagline']  

Train the Model

In [20]:
history = model.fit([user_ids_train, user_demographics_train, movie_ids_train, movie_features_train], ratings_train,
                    validation_data=([user_ids_test, user_demographics_test, movie_ids_test, movie_features_test], ratings_test),
                    epochs=10,
                    batch_size=128)

Epoch 1/10
[1m  16/5953[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m1:03[0m 11ms/step - loss: 1.5401

I0000 00:00:1713087917.280027      87 device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m5953/5953[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m73s[0m 11ms/step - loss: 0.8981 - val_loss: 0.8172
Epoch 2/10
[1m5953/5953[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m63s[0m 11ms/step - loss: 0.7764 - val_loss: 0.7907
Epoch 3/10
[1m5953/5953[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m63s[0m 11ms/step - loss: 0.7403 - val_loss: 0.7691
Epoch 4/10
[1m5953/5953[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m63s[0m 11ms/step - loss: 0.7206 - val_loss: 0.7524
Epoch 5/10
[1m5953/5953[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m63s[0m 11ms/step - loss: 0.7041 - val_loss: 0.7605
Epoch 6/10
[1m5953/5953[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m63s[0m 11ms/step - loss: 0.6878 - val_loss: 0.7465
Epoch 7/10
[1m5953/5953[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m63s[0m 11ms/step - loss: 0.6746 - val_loss: 0.7417
Epoch 8/10
[1m5953/5953[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m63s[0m 11ms/step - loss: 0.6616 - val_loss: 0.7494
Epoch 9/10
[1m5953

Process TMDB Dataset

In [21]:
# Load Dataset
tmdb_df = pd.read_csv("/kaggle/input/tmdb-movies/TMDB_movie_dataset_v11.csv", index_col=0, low_memory=False)

# Cleaning up the release date column to coerce problematic values to NaT, and ensure dates are in proper format
tmdb_df['release_date'] = pd.to_datetime(tmdb_df['release_date'], errors='coerce')

# Creating new "year" (string) column and extracting year from the "release_date" field
# Changed .replace('nan', '') to .replace('<NA>', '')
tmdb_df['year'] = tmdb_df['release_date'].dt.year.astype('Int64').astype(str).replace('<NA>', '')

# Drop unnecessary columns
tmdb_df1 = tmdb_df.drop(['status', 'backdrop_path', 'homepage', 'imdb_id', 'poster_path', 'production_companies','production_countries', 'release_date', 
                         'original_language', "original_title", "spoken_languages"], axis=1)

In [22]:
# Scale numerical features
scaled_features1 = scaler.transform(tmdb_df1[numerical_features])
scaled_nums1 = pd.DataFrame(scaled_features1, columns=numerical_features, index=tmdb_df.index)

# Have multiple genres and one hot encode it
tmdb_df1['genres'] = tmdb_df1['genres'].astype(str)
tmdb_df1['genres'] = tmdb_df1['genres'].apply(lambda x: x.split(', ') if x != 'nan' else [])
genres_encoded1 = mlb.transform(tmdb_df1['genres'])
genres_df1 = pd.DataFrame(genres_encoded1, columns=mlb.classes_, index=tmdb_df1.index)

# Drop Original features
tmdb_df1.drop(numerical_features, axis=1, inplace=True)
tmdb_df1.drop("genres", axis=1, inplace=True)

# Concat all new features
tmdb_df2 = pd.concat([tmdb_df1, scaled_nums1, genres_df1], axis=1)

In [None]:
# Replace NaNs in 'overview' and 'tagline' with an empty string
# tmdb_df2['overview'] = tmdb_df2['overview'].fillna('')
# tmdb_df2['tagline'] = tmdb_df2['tagline'].fillna('')

Predict Ratings with titles for a particular user

In [26]:
def predict_ratings_for_user_with_titles(user_id, data2, num_recommendations=5, model = model, data1 = df_combined):
    
    # Extract user IDs
    user_id_input = np.array([user_id] * len(data2))
    
    # Extract dataset for this particular user and just keep first row as features will be constant
    user_df = data1[data1['UserID'] == user_id][:1]

    # Extract users demographic features and repeat for the length of unseen movies
    user_demographics_pred = np.repeat(user_df[demographic_features].values, len(data2), axis=0)
    
    # Extract Movie IDs
    movie_id_input = data2.index.values
    # Extract Movie features input
    movie_features_input = data2[movie_features_columns].values
    
    # Predict Ratings for unseen movies
    predicted_ratings = model.predict([user_id_input, user_demographics_pred, movie_id_input, 
                                       movie_features_input])
    
    data2["predicted_ratings"] = predicted_ratings    
    
    data2.sort_values(by='predicted_ratings', ascending=False, inplace = True)
    
    final_movies_df = data2[["title", "year", "predicted_ratings"]].head(num_recommendations)

    return final_movies_df

In [27]:
predict_ratings_for_user_with_titles(1,data2 = tmdb_df2)

[1m31321/31321[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m54s[0m 2ms/step


Unnamed: 0_level_0,title,year,predicted_ratings
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
251800,Modern Times Forever,2011,4.999936
272074,Cinématon,1978,4.998858
710874,Svalbard minutt for minutt,2020,4.99852
240815,Five-Year Diary,1997,4.983882
197299,Beijing 2003,2004,4.979376


In [43]:
# Function to get users input for new users to predict based on their demographic features
def get_user_input():
    print("Please enter your demographic information:")
    
    # Display gender choices
    print("Gender is denoted by a 'M' for male and 'F' for female.")
    gender = input("Gender (F/M): ").strip().upper()
    
    # Display age choices
    print("\nAge is chosen from the following ranges:")
    print("  1: 'Under 18'")
    print(" 18: '18-24'")
    print(" 25: '25-34'")
    print(" 35: '35-44'")
    print(" 45: '45-49'")
    print(" 50: '50-55'")
    print(" 56: '56+'")
    age = int(input("Enter the number corresponding to your age group: "))
    
    # Display occupation choices
    print("\nOccupation is chosen from the following choices:")
    print("  0: 'other' or not specified")
    print("  1: 'academic/educator'")
    print("  2: 'artist'")
    print("  3: 'clerical/admin'")
    print("  4: 'college/grad student'")
    print("  5: 'customer service'")
    print("  6: 'doctor/health care'")
    print("  7: 'executive/managerial'")
    print("  8: 'farmer'")
    print("  9: 'homemaker'")
    print(" 10: 'K-12 student'")
    print(" 11: 'lawyer'")
    print(" 12: 'programmer'")
    print(" 13: 'retired'")
    print(" 14: 'sales/marketing'")
    print(" 15: 'scientist'")
    print(" 16: 'self-employed'")
    print(" 17: 'technician/engineer'")
    print(" 18: 'tradesman/craftsman'")
    print(" 19: 'unemployed'")
    print(" 20: 'writer'")
    occupation = int(input("Enter the number corresponding to your occupation: "))
    
    # Convert inputs to model-compatible format (one-hot encoding)
    user_data = np.zeros((1, 30))  # 30 features as per your model
    if gender == 'F':
        user_data[0, 0] = 1  # Gender_F
    else:
        user_data[0, 1] = 1  # Gender_M

    # Set occupation (assuming 0-20 are correctly input)
    user_data[0, 2 + occupation] = 1

    # Set age
    age_indices = {1: 23, 18: 24, 25: 25, 35: 26, 45: 27, 50: 28, 56: 29}
    user_data[0, age_indices[age]] = 1

    return user_data


In [46]:
def recommend_movies_for_new_user(data2, num_recommendations=5, model = model, data1 = df_combined):
    
    # initialize random user id = 10000 (does not exist in data)
    user_id_input = np.array([10000] * len(data2))
    
    # Get user demographic features
    user_data = get_user_input()
    user_demographics_pred = np.repeat(user_data, len(data2), axis=0)
    
    # Extract Movie IDs
    movie_id_input = data2.index.values
    # Extract Movie features input
    movie_features_input = data2[movie_features_columns].values
    
    # Predict Ratings for unseen movies
    predicted_ratings = model.predict([user_id_input, user_demographics_pred, movie_id_input, movie_features_input])
    
    data2["predicted_ratings"] = predicted_ratings    
    
    data2.sort_values(by='predicted_ratings', ascending=False, inplace = True)
    
    final_movies_df = data2[["title", "year", "predicted_ratings"]].head(num_recommendations)

    return final_movies_df

In [47]:
recommend_movies_for_new_user(data2 = tmdb_df2)

Please enter your demographic information:
Gender is denoted by a 'M' for male and 'F' for female.


Gender (F/M):  M



Age is chosen from the following ranges:
  1: 'Under 18'
 18: '18-24'
 25: '25-34'
 35: '35-44'
 45: '45-49'
 50: '50-55'
 56: '56+'


Enter the number corresponding to your age group:  18



Occupation is chosen from the following choices:
  0: 'other' or not specified
  1: 'academic/educator'
  2: 'artist'
  3: 'clerical/admin'
  4: 'college/grad student'
  5: 'customer service'
  6: 'doctor/health care'
  7: 'executive/managerial'
  8: 'farmer'
  9: 'homemaker'
 10: 'K-12 student'
 11: 'lawyer'
 12: 'programmer'
 13: 'retired'
 14: 'sales/marketing'
 15: 'scientist'
 16: 'self-employed'
 17: 'technician/engineer'
 18: 'tradesman/craftsman'
 19: 'unemployed'
 20: 'writer'


Enter the number corresponding to your occupation:  4


[1m31321/31321[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m53s[0m 2ms/step


Unnamed: 0_level_0,title,year,predicted_ratings
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
251800,Modern Times Forever,2011,4.99967
272074,Cinématon,1978,4.999613
710874,Svalbard minutt for minutt,2020,4.990778
240815,Five-Year Diary,1997,4.945063
1157368,Château Espérance,1976,4.929083
