# Imports and Preprocessing

In [26]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
# Load only the three columns this notebook uses from the TMDB 5000 movies
# file: the movie id, its title and its plot overview.
movies_df = pd.read_csv(
    '../../datasets/tmdb_5000_movies.csv',
    usecols=['id', 'title', 'overview'],
)
movies_df

Unnamed: 0,id,overview,title
0,19995,"In the 22nd century, a paraplegic Marine is di...",Avatar
1,285,"Captain Barbossa, long believed to be dead, ha...",Pirates of the Caribbean: At World's End
2,206647,A cryptic message from Bond’s past sends him o...,Spectre
3,49026,Following the death of District Attorney Harve...,The Dark Knight Rises
4,49529,"John Carter is a war-weary, former military ca...",John Carter
...,...,...,...
4798,9367,El Mariachi just wants to play his guitar and ...,El Mariachi
4799,72766,A newlywed couple's honeymoon is upended by th...,Newlyweds
4800,231617,"""Signed, Sealed, Delivered"" introduces a dedic...","Signed, Sealed, Delivered"
4801,126186,When ambitious New York attorney Sam is sent t...,Shanghai Calling


## Generate Random Ratings

In [14]:
# Number of synthetic users to simulate.
num_users = 50

# Dedicated, seeded generator: without a fixed seed every "Restart & Run All"
# would produce a different ratings table (and therefore different splits,
# losses and recommendations further down the notebook).
ratings_rng = np.random.default_rng(42)

# Rows of (user_id, movie_id, rating), collected in a list and converted to a
# DataFrame once at the end.
random_ratings = []

catalog_movie_ids = movies_df['id'].to_numpy()

# Generate random ratings for each user
for user_id in range(1, num_users + 1):
    # Each user rates between 20 and 49 distinct movies (upper bound exclusive).
    n_rated = ratings_rng.integers(20, 50)
    movie_ids = ratings_rng.choice(catalog_movie_ids, size=n_rated, replace=False)

    # One integer rating in 1..5 per selected movie (upper bound exclusive).
    user_ratings = ratings_rng.integers(1, 6, size=n_rated)

    random_ratings.extend(
        [user_id, movie_id, rating]
        for movie_id, rating in zip(movie_ids, user_ratings)
    )

# Convert to DataFrame
ratings_df = pd.DataFrame(random_ratings, columns=['user_id', 'movie_id', 'rating'])
ratings_df

Unnamed: 0,user_id,movie_id,rating
0,1,838,1
1,1,13920,4
2,1,82700,3
3,1,46849,5
4,1,4657,3
...,...,...,...
1677,50,120,3
1678,50,1858,4
1679,50,36669,5
1680,50,2142,5


In [15]:
# Sanity check: how many distinct movies and users appear in the ratings table.
print(
    f"Number of movies: {ratings_df['movie_id'].nunique()}",
    f"Number of users: {ratings_df['user_id'].nunique()}",
    sep="\n",
)

Number of movies: 1418
Number of users: 50


In [16]:
# Hold out 20% of the rating rows for evaluation. The fixed random_state makes
# the split repeatable for a given ratings_df.
train_df, test_df = train_test_split(
    ratings_df,
    test_size=0.2,
    random_state=42,
)

# Method 1: FNN (Feedforward Neural Network)

In [21]:
import tensorflow as tf
from tensorflow.keras import layers, models
from sklearn.metrics import mean_squared_error

## Training

In [19]:
# Seed Python, NumPy and the Keras backend so that weight initialisation (and
# the batch shuffling done later by model.fit) is repeatable across runs.
tf.keras.utils.set_random_seed(42)

# The embedding tables below are indexed directly by the raw user/movie IDs,
# so each input_dim must be strictly greater than the largest ID that will
# ever be looked up. Size them from the full ratings table rather than from
# train_df: with a random 80/20 split the largest movie_id (or an entire user)
# can land only in the test split, which would index past the table when the
# model is evaluated.
num_users = int(ratings_df['user_id'].max()) + 1
max_movies = int(ratings_df['movie_id'].max()) + 1

# Define the model: one scalar ID per example for each of the two inputs
user_input = layers.Input(shape=(1,), name='user')
movie_input = layers.Input(shape=(1,), name='movie')

# Embedding layers for users and movies (8-dimensional vectors each)
user_embedding = layers.Embedding(input_dim=num_users, output_dim=8)(user_input)
movie_embedding = layers.Embedding(input_dim=max_movies, output_dim=8)(movie_input)

# Flatten the embeddings
user_flat = layers.Flatten()(user_embedding)
movie_flat = layers.Flatten()(movie_embedding)

# Concatenate user and movie embeddings
concat = layers.Concatenate()([user_flat, movie_flat])

# Add a fully connected layer
dense = layers.Dense(64, activation='relu')(concat)
# Single linear output unit: the predicted rating is unbounded, which is why
# later cells clip it to the 1-5 scale.
output = layers.Dense(1)(dense)

# Create the model
model = models.Model(inputs=[user_input, movie_input], outputs=output)

# Compile the model: regress the rating with mean squared error
model.compile(optimizer='adam', loss='mean_squared_error')

model.summary()

In [20]:
# Model inputs (user IDs, movie IDs) and the regression target (ratings),
# pulled out of the training split as NumPy arrays.
user_input_data = train_df['user_id'].to_numpy()
movie_input_data = train_df['movie_id'].to_numpy()
ratings_data = train_df['rating'].to_numpy()

# Fit for 15 epochs in mini-batches of 32. The call is the cell's last
# expression, so the returned History object is what the cell displays.
model.fit(
    x=[user_input_data, movie_input_data],
    y=ratings_data,
    batch_size=32,
    epochs=15,
)

Epoch 1/15
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 20ms/step - loss: 11.0584
Epoch 2/15
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 30ms/step - loss: 7.7485
Epoch 3/15
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 33ms/step - loss: 3.3232
Epoch 4/15
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 33ms/step - loss: 1.8817
Epoch 5/15
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 32ms/step - loss: 1.5666
Epoch 6/15
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 32ms/step - loss: 1.2817
Epoch 7/15
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 32ms/step - loss: 0.8208
Epoch 8/15
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 33ms/step - loss: 0.5299
Epoch 9/15
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 31ms/step - loss: 0.4038
Epoch 10/15
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 35ms/step - loss: 0.361

<keras.src.callbacks.history.History at 0x2085de0f3d0>

## Evaluation

In [22]:
# Held-out user IDs, movie IDs and true ratings from the test split.
user_test, movie_test, actual_ratings = (
    test_df[column].to_numpy() for column in ('user_id', 'movie_id', 'rating')
)

# evaluate() returns the compiled loss (mean squared error) on the test split,
# not the individual predictions; verbose=0 suppresses the progress bar.
test_loss = model.evaluate(x=[user_test, movie_test], y=actual_ratings, verbose=0)
print(f'Test Loss (Mean Squared Error): {test_loss}')

Test Loss (Mean Squared Error): 2.3623435497283936


In [23]:
# Score one (user, movie) pair with the trained network.
user_id, movie_id = 1, 3

# The model takes one array per input, so each ID is wrapped in a length-1
# array; the result is indexed as [0][0] to pull out the single prediction.
single_pair = [np.array([user_id]), np.array([movie_id])]
predicted_rating = model.predict(single_pair)
print(f'Predicted rating for user {user_id} and movie {movie_id}: {predicted_rating[0][0]}')

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 147ms/step
Predicted rating for user 1 and movie 3: 3.2203845977783203


In [24]:
# User ID for whom we want to recommend movies
user_id = 1

# Candidate pool: every movie that has at least one rating in ratings_df
all_movie_ids = ratings_df['movie_id'].unique()

# Movies the user has already rated
rated_movie_ids = ratings_df.loc[ratings_df['user_id'] == user_id, 'movie_id'].values

# Movies the user has not rated (sorted, unique)
unrated_movie_ids = np.setdiff1d(all_movie_ids, rated_movie_ids)

# Pair the same user with every candidate movie. The array gets its own name
# on purpose: `user_input` / `movie_input` hold the Keras Input tensors of the
# model and should not be overwritten with data arrays.
candidate_user_ids = np.full(len(unrated_movie_ids), user_id)

# Predict ratings for all unrated movies
predicted_ratings = model.predict([candidate_user_ids, unrated_movie_ids])
raw_predictions = predicted_ratings.flatten()

# Keep both the raw network output and the value clipped to the 1-5 scale.
# Ranking has to use the raw output: the network's output is unbounded, so
# after clipping every over-range prediction ties at 5.0 and the order among
# them would be arbitrary.
predicted_ratings_df = pd.DataFrame({
    'movie_id': unrated_movie_ids,
    'predicted_rating': np.clip(raw_predictions, 1, 5),
    'raw_prediction': raw_predictions,
})

# Attach title/overview; the inner join drops IDs missing from movies_df
merged_df = pd.merge(predicted_ratings_df, movies_df, how='inner', left_on='movie_id', right_on='id')
merged_df = merged_df.drop(columns=['id'])

# Rank by the raw prediction (descending), then hide the helper column so the
# displayed table keeps its original columns.
top_10_movies = (
    merged_df
    .sort_values(by='raw_prediction', ascending=False)
    .head(10)
    .drop(columns=['raw_prediction'])
)

# Display the top 10 recommended movies
print("Top 10 recommended movies for User", user_id)
top_10_movies

[1m44/44[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step
Top 10 recommended movies for User 1


Unnamed: 0,movie_id,predicted_rating,overview,title
688,14120,5.0,"""End of the Spear"" is the story of Mincayani, ...",End of the Spear
324,8457,5.0,Three kids hire a low-budget bodyguard to prot...,Drillbit Taylor
1044,55903,5.0,Grieving after the death of her young son Jose...,Betty Fisher and Other Stories
745,16471,5.0,"Jealous, harried air traffic controller Max Fi...",Modern Problems
723,15208,5.0,Bathory is based on the legends surrounding th...,Bathory: Countess of Blood
238,4133,5.0,A boy named George Jung grows up in a struggli...,Blow
960,39833,5.0,Elvis Prestley's first film is a Civil War drama.,Love Me Tender
314,8202,5.0,"400 years into the future, disease has wiped o...",Æon Flux
899,31867,5.0,"In the future, medical technology has advanced...",Repo Men
1211,157547,5.0,A woman tries to exonerate her brother's murde...,Oculus


# Method 2: SVD (Singular Value Decomposition)

In [27]:
from surprise import SVD, Dataset, Reader
from surprise.model_selection import train_test_split
from surprise import accuracy

In [28]:
# Load data into surprise dataset format; the Reader declares the 1-5 rating scale
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(ratings_df[['user_id', 'movie_id', 'rating']], reader)

# Split the data into training and test sets.
# NOTE: `train_test_split` here is the Surprise function imported in the cell
# above; it shadows the scikit-learn function of the same name imported at the
# top of the notebook. A fixed random_state keeps the split (and therefore the
# reported MSE) repeatable across runs.
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)

In [33]:
# Train the SVD model. The factors are initialised randomly and the training
# data is processed with SGD, so fix random_state to make the fitted model
# and the MSE below reproducible.
svd = SVD(random_state=42)
svd.fit(trainset)

# Test the model on the test set
predictions = svd.test(testset)

# Calculate MSE (accuracy.mse also prints the value)
mse = accuracy.mse(predictions)

MSE: 2.0080


In [34]:
# Ask the fitted SVD model for a single (user, movie) estimate.
user_id, movie_id = 1, 3

# predict() returns a prediction record; its `est` field is the estimated rating.
predicted_rating = svd.predict(uid=user_id, iid=movie_id)
print(f"Predicted rating for user {user_id} and movie {movie_id}: {predicted_rating.est}")

Predicted rating for user 1 and movie 3: 3.1608976598783105


In [35]:
# Target user for the recommendations.
user_id = 1

# Candidate pool: every movie that appears at least once in ratings_df.
all_movie_ids = ratings_df['movie_id'].unique()

# Movies this user has already rated ...
rated_movie_ids = ratings_df.loc[ratings_df['user_id'] == user_id, 'movie_id'].to_numpy()

# ... and the remaining candidates (setdiff1d returns them sorted and unique).
unrated_movie_ids = np.setdiff1d(all_movie_ids, rated_movie_ids)

# Estimate a rating for every candidate movie, one predict() call per movie.
predicted_ratings = []
for candidate_id in unrated_movie_ids:
    predicted_ratings.append(svd.predict(user_id, candidate_id).est)

# Pair each candidate with its estimate, clipped to the 1-5 rating scale.
predicted_ratings_df = pd.DataFrame({
    'movie_id': unrated_movie_ids,
    'predicted_rating': np.clip(predicted_ratings, 1, 5),
})

# Attach title/overview from the catalog, then drop the duplicate key column.
merged_df = predicted_ratings_df.merge(movies_df, how='inner', left_on='movie_id', right_on='id')
merged_df = merged_df.drop(columns=['id'])

# Highest estimates first; keep the best ten.
ranked_df = merged_df.sort_values(by='predicted_rating', ascending=False)
top_10_movies = ranked_df.head(10)

# Display the top 10 recommended movies
print("Top 10 recommended movies for User", user_id)
top_10_movies

Top 10 recommended movies for User 1


Unnamed: 0,movie_id,predicted_rating,overview,title
56,348,3.754627,"During its return to the earth, commercial spa...",Alien
754,17113,3.687763,Jack Slavin is an environmentalist with a hear...,The Ballad of Jack and Rose
423,9672,3.653246,"While researching his book In Cold Blood, writ...",Infamous
775,18713,3.648717,When two unemployed telephone pranksters decid...,The Jerky Boys
53,331,3.630204,"In need of funds for research, Dr. Alan Grant ...",Jurassic Park III
690,14177,3.627733,You thought you'd heard it all in the barbersh...,Beauty Shop
911,34417,3.599203,Meet Etta Milford. Loving Wife. Doting Mother....,Good Intentions
1352,327833,3.596378,After his family is killed by a Serbian gangst...,Skin Trade
1134,85446,3.577366,Emily arrives in Miami with aspirations to bec...,Step Up Revolution
836,24206,3.575295,Waxman is a former Special Forces soldier who ...,Silent Trigger
