Import libraries

In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MultiLabelBinarizer
import numpy as np
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Flatten, Dense, concatenate, Dropout
from tensorflow.keras.optimizers import Adam

Load dataset

In [40]:
# Show every column when a DataFrame is displayed instead of eliding the middle ones
pd.options.display.max_columns = None

# Read the merged dataset. The first CSV column becomes the index, and
# low_memory=False makes pandas parse the file in one pass rather than in chunks.
df = pd.read_csv(
    'Merged_df.csv',
    index_col=0,
    low_memory=False,
)

Drop unnecessary columns 

In [41]:
# Columns that are removed before modelling. Note that 'Genres' (capital G) is
# dropped here while the separate lowercase 'genres' column is kept for later use.
columns_to_drop = [
    'MovieID', 'Timestamp', 'Title', 'Genres', 'ZipCode', 'Movie_Title',
    'status', 'backdrop_path', 'homepage', 'imdb_id', 'poster_path',
    'production_companies', 'production_countries', 'title_year', 'year',
    'original_language', 'release_date', 'original_title', 'spoken_languages',
]

# `df` itself is left untouched; `df1` is the trimmed working frame
df1 = df.drop(columns=columns_to_drop)

Drop rows with NA values

In [42]:
# Keep only the rows in which every remaining column has a value
df1 = df1.dropna(axis=0, how='any')

Converting Years into Decades (Buckets to make it a categorical variable)


In [43]:
# Bucket each release year into the first year of its decade (e.g. 1994 -> 1990)
decade_start = df1['Movie_Year'].floordiv(10).mul(10)

# Add the bucket as 'Decade' and discard the raw year, which is no longer needed
df1 = df1.assign(Decade=decade_start).drop(columns=['Movie_Year'])

Convert 'adult' column to integers 

In [44]:
# Cast the 'adult' flag to an integer column (True -> 1, False -> 0)
df1 = df1.astype({'adult': int})

Convert movie IDs to integers

In [45]:
# Cast the movie identifier column 'id' to integers (it is later fed to an Embedding layer)
df1 = df1.astype({'id': int})

Encode Categorical Features & Scale Numerical Variables

In [46]:
# Define categorical and numerical features.
# NOTE: 'Rating' (the prediction target) is listed with the numerical features, so it is
# standardised together with the inputs; the ratings read back later are the scaled values.
categorical_features = ['Gender', 'Occupation', 'Age', 'Decade']
numerical_features = ['Rating', 'vote_average', 'vote_count', 'revenue', 'runtime', 'budget', 'popularity']

# Perform a train-test split (fixed seed for reproducibility).
# train_test_split hands back slices of df1; explicit copies make both frames independent
# so that adding or replacing columns on them later does not raise SettingWithCopyWarning.
df_train, df_test = train_test_split(df1, test_size=0.2, random_state=42)
df_train = df_train.copy()
df_test = df_test.copy()

# One-Hot Encoding for categorical variables (fitted on the training split only).
# - handle_unknown='ignore': a category that appears only in the test split is encoded as
#   an all-zero row instead of making transform() raise.
# - The default sparse result is densified with .toarray(); this avoids the `sparse=` keyword,
#   which newer scikit-learn releases renamed to `sparse_output=` and then removed.
one_hot_encoder = OneHotEncoder(handle_unknown='ignore')
encoded_features_train = one_hot_encoder.fit_transform(df_train[categorical_features]).toarray()
encoded_features_test = one_hot_encoder.transform(df_test[categorical_features]).toarray()

# Normalize Numerical Features: mean/std are learned from the training split and
# re-used unchanged on the test split
scaler = StandardScaler()
scaled_features_train = scaler.fit_transform(df_train[numerical_features])
scaled_features_test = scaler.transform(df_test[numerical_features])



Convert Genres into Multiple Binary Columns

In [47]:
def _split_genres(value):
    """Return the genre names contained in ``value`` as a list.

    A string is split on ', ' (genres are comma-separated in the dataset).
    Any other value - in particular a list produced by an earlier run of this
    cell - is returned unchanged, so re-running the cell does not fail.
    """
    if isinstance(value, str):
        return value.split(', ')
    return value


# Replace the comma-separated 'genres' string with a list of genre names.
# assign() returns a new frame, so nothing is written into a slice of df1.
df_train = df_train.assign(genres=df_train['genres'].map(_split_genres))
df_test = df_test.assign(genres=df_test['genres'].map(_split_genres))

mlb = MultiLabelBinarizer()
# Fit on the training data and transform it: one binary column per genre seen in training
genres_encoded_train = mlb.fit_transform(df_train['genres'])

# Transform the test data using the same binarizer fitted on the training data
genres_encoded_test = mlb.transform(df_test['genres'])

Before concatenating, drop all original variables that have been transformed

In [48]:
# Raw columns that now exist in encoded / scaled / binarized form
transformed_columns = [*categorical_features, *numerical_features, 'genres']

# For Training Dataset
df_train = df_train.drop(columns=transformed_columns)

# For Testing Dataset
df_test = df_test.drop(columns=transformed_columns)

Combine all processed features back into one dataframe

In [49]:
# Column labels shared by the training and testing frames
one_hot_columns = one_hot_encoder.get_feature_names_out(categorical_features)
genre_columns = mlb.classes_

# FOR TRAINING DATASET
# Give df_train a fresh 0..n-1 index so it lines up row-by-row with the arrays below
df_train = df_train.reset_index(drop=True)

# Wrap the encoded categorical, scaled numerical and multi-genre arrays in DataFrames
encoded_cats_train_df = pd.DataFrame(data=encoded_features_train, columns=one_hot_columns)
scaled_nums_train_df = pd.DataFrame(data=scaled_features_train, columns=numerical_features)
genres_train_df = pd.DataFrame(data=genres_encoded_train, columns=genre_columns)

# Combine all features side by side
df_train_combined = pd.concat(
    [df_train, encoded_cats_train_df, scaled_nums_train_df, genres_train_df],
    axis='columns',
)

# FOR TESTING DATASET
# Same steps as above, applied to the testing split
df_test = df_test.reset_index(drop=True)

encoded_cats_test_df = pd.DataFrame(data=encoded_features_test, columns=one_hot_columns)
scaled_nums_test_df = pd.DataFrame(data=scaled_features_test, columns=numerical_features)
genres_test_df = pd.DataFrame(data=genres_encoded_test, columns=genre_columns)

df_test_combined = pd.concat(
    [df_test, encoded_cats_test_df, scaled_nums_test_df, genres_test_df],
    axis='columns',
)

Create User embeddings

In [50]:
# Size of the user embedding table. The largest UserID is taken over BOTH splits:
# an ID that only occurs in the test split would otherwise be out of range for the
# Embedding layer during validation. +1 because embedding indices start at 0
# (index 0 stays unused if the IDs start at 1).
num_users = int(max(df_train_combined['UserID'].max(), df_test_combined['UserID'].max())) + 1

# UserID embeddings
user_id_input = Input(shape=(1,), name='user_id_input')  # Entry point for User IDs  --> Each input is a single value
user_embedding = Embedding(input_dim=num_users, output_dim=15, name='user_embedding')(user_id_input) # Transforms each userID into a dense vector of size 15
user_embedding_flat = Flatten(name='user_flatten')(user_embedding) # Flatten to a 1D tensor

User demographics

In [51]:
# One-hot column names describing a user: gender, occupation code and age bucket.
# The names follow the '<feature>_<category>' pattern of the one-hot encoded columns.
gender_columns = ['Gender_F', 'Gender_M']
occupation_columns = ['Occupation_{}'.format(code) for code in range(21)]  # Occupation_0 .. Occupation_20
age_columns = ['Age_{}'.format(bucket) for bucket in (1, 18, 25, 35, 45, 50, 56)]

demographic_features = gender_columns + occupation_columns + age_columns

# User demographics input: one value per demographic one-hot column
num_demographic_features = len(demographic_features)
user_demographics_input = Input(shape=(num_demographic_features,), name='user_demographics_input')

Combine User embeddings and User demographics

In [52]:
# Join the flattened User ID embedding and the demographic features into one user tensor
user_branch_tensors = [user_embedding_flat, user_demographics_input]
combined_user_input = concatenate(user_branch_tensors)

Create Movie Embeddings

In [53]:
# Size of the movie embedding table. The largest movie 'id' is taken over BOTH splits:
# an id that only occurs in the test split would otherwise be out of range for the
# Embedding layer during validation. +1 because embedding indices start at 0
# (index 0 stays unused if the ids start at 1).
num_movies = int(max(df_train_combined['id'].max(), df_test_combined['id'].max())) + 1

# Movie ID embeddings
movie_id_input = Input(shape=(1,), name='movie_id_input')  # Entry point for Movie IDs  --> Each input is a single value
movie_embedding = Embedding(input_dim=num_movies, output_dim=15, name='movie_embedding')(movie_id_input)  # Each movie id becomes a dense vector of size 15
movie_flatten = Flatten(name='movie_flatten')(movie_embedding)  # Flatten to a 1D tensor

Movie Features

In [54]:
# Scaled numeric movie attributes
movie_numeric_columns = ['vote_average', 'vote_count', 'revenue', 'runtime', 'budget', 'popularity']

# Binary genre indicator columns
movie_genre_columns = [
    'Action', 'Adventure', 'Animation', 'Comedy', 'Crime', 'Documentary', 'Drama',
    'Family', 'Fantasy', 'History', 'Horror', 'Music', 'Mystery', 'Romance',
    'Science Fiction', 'TV Movie', 'Thriller', 'War', 'Western',
]

# One-hot decade columns: Decade_1910, Decade_1920, ..., Decade_2000
movie_decade_columns = ['Decade_{}'.format(decade) for decade in range(1910, 2001, 10)]

movie_features_columns = movie_numeric_columns + movie_genre_columns + movie_decade_columns

# Movie feature input: one value per column listed in movie_features_columns
num_movie_features = len(movie_features_columns)
movie_input_shape = (num_movie_features,)
movie_input = Input(shape=movie_input_shape, name='movie_input')

Assemble Complete Model

In [55]:
# Merge the user branch (ID embedding + demographics) with the movie branch
# (ID embedding + movie features) into a single feature vector
merged_branches = [combined_user_input, movie_flatten, movie_input]
combined_inputs = concatenate(merged_branches)

# Fully connected head: 256 -> 128 units, each followed by 20% dropout
fc1 = Dense(units=256, activation='relu')(combined_inputs)
dropout1 = Dropout(rate=0.2)(fc1)
fc2 = Dense(units=128, activation='relu')(dropout1)
dropout2 = Dropout(rate=0.2)(fc2)

# Single linear unit: the model regresses one value per (user, movie) pair
output = Dense(units=1, activation='linear')(dropout2)

# Finalizing the model; the order of the inputs here is the order in which
# arrays must be supplied to fit()
model_inputs = [user_id_input, user_demographics_input, movie_id_input, movie_input]
model = Model(inputs=model_inputs, outputs=output)

# Compiling the model with Adam (learning rate 0.001) and mean squared error loss
learning_rate = 0.001
model.compile(loss='mean_squared_error', optimizer=Adam(learning_rate))

# Model summary to check the architecture
#model.summary()

Preparing Inputs for training

In [56]:
# Targets (Y-values). 'Rating' in the combined frames is the standardised column,
# because the raw one was dropped and the scaled one concatenated back in.
ratings_train = df_train_combined['Rating'].to_numpy()
ratings_test = df_test_combined['Rating'].to_numpy()

# Inputs for training: IDs for the embeddings plus the two dense feature matrices
user_ids_train = df_train_combined['UserID'].to_numpy()
movie_ids_train = df_train_combined['id'].to_numpy()
user_demographics_train = df_train_combined[demographic_features].to_numpy()
movie_features_train = df_train_combined[movie_features_columns].to_numpy()

# Inputs for testing (evaluation), built the same way from the test frame
user_ids_test = df_test_combined['UserID'].to_numpy()
movie_ids_test = df_test_combined['id'].to_numpy()
user_demographics_test = df_test_combined[demographic_features].to_numpy()
movie_features_test = df_test_combined[movie_features_columns].to_numpy()

Train the Model

In [58]:
# Arrays are listed in the same order as the model's inputs:
# user id, user demographics, movie id, movie features
train_inputs = [user_ids_train, user_demographics_train, movie_ids_train, movie_features_train]
validation_inputs = [user_ids_test, user_demographics_test, movie_ids_test, movie_features_test]

# Train for 10 epochs in mini-batches of 32; the test split is used as validation data
model.fit(
    x=train_inputs,
    y=ratings_train,
    validation_data=(validation_inputs, ratings_test),
    batch_size=32,
    epochs=10,
)

Epoch 1/10

KeyboardInterrupt: 