In [1]:
#!wget -O ./data/moviedataset.zip http://files.grouplens.org/datasets/movielens/ml-1m.zip
#!unzip -o ./data/moviedataset.zip -d ./data

In [2]:
#Tensorflow library. Used to implement machine learning models
import tensorflow as tf
#Numpy contains helpful functions for efficient mathematical calculations
import numpy as np
#Dataframe manipulation library
import pandas as pd
#Graph plotting library
import matplotlib.pyplot as plt

In [3]:
#Loading in the movies dataset
movies_df = pd.read_csv('https://raw.githubusercontent.com/Gurubux/CognitiveClass-DL/master/2_Deep_Learning_with_TensorFlow/DL_CC_2_4_RBM/ml-1m/movies.dat', sep='::', header=None, engine='python',encoding = "ISO-8859-1")
movies_df

Unnamed: 0,0,1,2
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
3878,3948,Meet the Parents (2000),Comedy
3879,3949,Requiem for a Dream (2000),Drama
3880,3950,Tigerland (2000),Drama
3881,3951,Two Family House (2000),Drama


In [4]:
#Loading in the ratings dataset
ratings_df = pd.read_csv('https://raw.githubusercontent.com/Gurubux/CognitiveClass-DL/master/2_Deep_Learning_with_TensorFlow/DL_CC_2_4_RBM/ml-1m/ratings.dat', sep='::', header=None, engine='python',encoding = "ISO-8859-1")
ratings_df.head()

Unnamed: 0,0,1,2,3
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [5]:
movies_df.columns = ['MovieID', 'Title', 'Genres']
movies_df.head()

Unnamed: 0,MovieID,Title,Genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [6]:
ratings_df.columns = ['UserID', 'MovieID', 'Rating', 'Timestamp']
ratings_df.head()

Unnamed: 0,UserID,MovieID,Rating,Timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [7]:
len(movies_df)

3883

In [8]:
user_rating_df = ratings_df.pivot(index='UserID', columns='MovieID', values='Rating')
# index: Which column's values do you want to become the new rows?
#columns: Which column's values do you want to become the new columns?
#values: Which column's values should fill in the cells of the grid?
user_rating_df.head()

MovieID,1,2,3,4,5,6,7,8,9,10,...,3943,3944,3945,3946,3947,3948,3949,3950,3951,3952
UserID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,2.0,,,,,...,,,,,,,,,,


In [9]:
norm_user_rating_df = user_rating_df.fillna(0) / 5.0
trX = norm_user_rating_df.values
trX[0:5]

array([[1., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [10]:
#1. Define the RBM Model Parameters and Hyperparameters

# Hyperparameters
hiddenUnits = 50  # Number of hidden units (features)
visibleUnits = len(user_rating_df.columns) # Number of visible units (number of movies)
learning_rate = 0.01
epochs = 20
batch_size = 128

In [11]:
# RBM Model Class using TensorFlow 2.x
class RBM(tf.keras.Model):
    def __init__(self, visibleUnits, hiddenUnits):
        super(RBM, self).__init__()
        self.visible_units = visibleUnits
        self.hidden_units = hiddenUnits

        # Initialize weights and biases as TensorFlow Variables
        # W: Weight matrix connecting visible and hidden units
        self.W = tf.Variable(tf.random.normal([self.visible_units, self.hidden_units], 0.01), name="W")
        # vb: Bias for the visible units
        self.vb = tf.Variable(tf.zeros([self.visible_units]), name="visible_bias")
        # hb: Bias for the hidden units
        self.hb = tf.Variable(tf.zeros([self.hidden_units]), name="hidden_bias")

    # Forward pass: Calculate hidden layer probabilities given visible layer
    def prob_h_given_v(self, v):
        return tf.nn.sigmoid(tf.matmul(v, self.W) + self.hb)

    # Sample hidden layer activations (0 or 1) from probabilities
    def sample_h_given_v(self, v):
        ph_v = self.prob_h_given_v(v)
        return tf.nn.relu(tf.sign(ph_v - tf.random.uniform(tf.shape(ph_v))))

    # Backward pass: Calculate visible layer probabilities given hidden layer
    def prob_v_given_h(self, h):
        return tf.nn.sigmoid(tf.matmul(h, tf.transpose(self.W)) + self.vb)

    # Sample visible layer activations (0 or 1) from probabilities
    def sample_v_given_h(self, h):
        pv_h = self.prob_v_given_h(h)
        return tf.nn.relu(tf.sign(pv_h - tf.random.uniform(tf.shape(pv_h))))

    # Training step using Contrastive Divergence (CD-1)
    def train_step(self, v0):
        # Gibbs Sampling Step 1
        h0 = self.sample_h_given_v(v0)
        v1 = self.sample_v_given_h(h0)
        h1 = self.prob_h_given_v(v1) # Use probabilities for the final update

        # Calculate gradients
        w_positive_grad = tf.matmul(tf.transpose(v0), h0)
        w_negative_grad = tf.matmul(tf.transpose(v1), h1)

        # Update rule for weights and biases
        update_w = self.W.assign_add(learning_rate * (w_positive_grad - w_negative_grad) / tf.cast(tf.shape(v0)[0], tf.float32))
        update_vb = self.vb.assign_add(learning_rate * tf.reduce_mean(v0 - v1, 0))
        update_hb = self.hb.assign_add(learning_rate * tf.reduce_mean(h0 - h1, 0))

        # Return reconstruction error
        error = tf.reduce_mean(tf.square(v0 - v1))
        return error

In [12]:
# 2. Train the RBM Model

# Instantiate the RBM model
rbm_model = RBM(visibleUnits, hiddenUnits)

# Create a TensorFlow dataset for batching
train_ds = tf.data.Dataset.from_tensor_slices(trX.astype(np.float32)).shuffle(len(trX)).batch(batch_size)

print("Starting RBM Training...")
for epoch in range(epochs):
    epoch_error = 0.0
    for i, batch in enumerate(train_ds):
        batch_error = rbm_model.train_step(batch)
        epoch_error += batch_error
    print(f"Epoch {epoch+1}/{epochs} - Reconstruction Error: {epoch_error / (i+1):.4f}")
print("Training Complete.")

Starting RBM Training...
Epoch 1/20 - Reconstruction Error: 0.4228
Epoch 2/20 - Reconstruction Error: 0.3583
Epoch 3/20 - Reconstruction Error: 0.3298
Epoch 4/20 - Reconstruction Error: 0.3077
Epoch 5/20 - Reconstruction Error: 0.2891
Epoch 6/20 - Reconstruction Error: 0.2720
Epoch 7/20 - Reconstruction Error: 0.2566
Epoch 8/20 - Reconstruction Error: 0.2429
Epoch 9/20 - Reconstruction Error: 0.2297
Epoch 10/20 - Reconstruction Error: 0.2184
Epoch 11/20 - Reconstruction Error: 0.2077
Epoch 12/20 - Reconstruction Error: 0.1982
Epoch 13/20 - Reconstruction Error: 0.1893
Epoch 14/20 - Reconstruction Error: 0.1809
Epoch 15/20 - Reconstruction Error: 0.1739
Epoch 16/20 - Reconstruction Error: 0.1671
Epoch 17/20 - Reconstruction Error: 0.1613
Epoch 18/20 - Reconstruction Error: 0.1554
Epoch 19/20 - Reconstruction Error: 0.1500
Epoch 20/20 - Reconstruction Error: 0.1454
Training Complete.


In [13]:
# Check a user's rating habits
user_id_to_check = 78

# Get the ratings for this user from the normalized dataframe
user_ratings = norm_user_rating_df.iloc[user_id_to_check - 1]

# Count how many movies have a rating of 0 (meaning they were unrated)
unrated_movies_count = (user_ratings == 0).sum()

print(f"\nVerifying User ID {user_id_to_check}:")
print(f"Number of movies NOT rated by this user: {unrated_movies_count}")
print("--------------------------------------------------\n")


Verifying User ID 78:
Number of movies NOT rated by this user: 3566
--------------------------------------------------



In [16]:
# 3. Make Recommendations for a User

# Select a mock user to recommend movies for (e.g., user ID 25)
mock_user_id = 85

# Get the ratings of the mock user, ensuring NaN is replaced with 0

mock_user_ratings = user_rating_df.iloc[mock_user_id - 1].fillna(0).values.reshape(1, -1)

# Prepare the input for the RBM model (normalize and convert to float32)
input_v = mock_user_ratings / 5.0
input_v = tf.cast(input_v, tf.float32)

# Run the RBM model to get reconstructed ratings
# Pass the input through the RBM (v -> h -> v')
prob_h = rbm_model.prob_h_given_v(input_v)
reconstructed_v = rbm_model.prob_v_given_h(prob_h)

# Convert the output back to a NumPy array
reconstructed_ratings = reconstructed_v.numpy()

# Create a DataFrame for recommendations
recommendation_df = pd.DataFrame({
    'MovieID': user_rating_df.columns,
    'OriginalRating': mock_user_ratings.flatten(),
    'PredictedRating': reconstructed_ratings.flatten()
})

# Merge with movie titles
recommendation_df = pd.merge(recommendation_df, movies_df, on='MovieID', how='left')

# Filter out movies the user has already watched
unwatched_movies = recommendation_df[recommendation_df['OriginalRating'] == 0]

# Sort by predicted rating to get the top recommendations
top_recommendations = unwatched_movies.sort_values(by='PredictedRating', ascending=False)

# Display the top 10 movie recommendations for the user
print("\n==============================================")
print(f"Top 10 Movie Recommendations for User ID {mock_user_id}")
print("==============================================")
top_recommendations[['Title', 'Genres', 'PredictedRating']].head(10)


Top 10 Movie Recommendations for User ID 85


Unnamed: 0,Title,Genres,PredictedRating
962,Robin Hood: Prince of Thieves (1991),Drama,0.85196
33,Babe (1995),Children's|Comedy|Drama,0.806367
916,Moonlight Murder (1936),Mystery,0.794837
2652,Stop Making Sense (1984),Documentary,0.772963
2765,Bringing Out the Dead (1999),Drama|Horror,0.756585
843,Rear Window (1954),Mystery|Thriller,0.738416
1108,Raiders of the Lost Ark (1981),Action|Adventure,0.733105
1212,Butch Cassidy and the Sundance Kid (1969),Action|Comedy|Western,0.722845
2926,U2: Rattle and Hum (1988),Documentary|Musical,0.718081
665,"Substitute, The (1996)",Action,0.714437
