In [18]:
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader


import torch.nn as nn
import torch.optim as optim

import numpy as np

1. Load the MovieLens 100k dataset

u.data     -- The full u data set, 100000 ratings by 943 users on 1682 items.
              Each user has rated at least 20 movies.  Users and items are
              numbered consecutively from 1.  The data is randomly
              ordered. This is a tab separated list of 
	         user id | item id | rating | timestamp. 
              The time stamps are unix seconds since 1/1/1970 UTC  

In [5]:
# Names for the four tab-separated fields of each row in u.data.
columns = ['user_id', 'movie_id', 'rating', 'timestamp']

# Read the ratings and discard the timestamp column, which is not needed.
df = pd.read_csv('ml-100k/u.data', sep='\t', names=columns).drop(columns=['timestamp'])

In [None]:
# 2. Encode User IDs and Movie IDs to start from zero (for PyTorch embeddings)
# Subtract each column's current minimum instead of a hard-coded 1: on the
# first run this shifts the 1-based IDs down to 0-based, and on any re-run of
# this cell the minimum is already 0, so the IDs are left untouched instead of
# being shifted again into negative (invalid) embedding indices.
df['user_id'] = df['user_id'] - df['user_id'].min()
df['movie_id'] = df['movie_id'] - df['movie_id'].min()

# 3. Split into train (80%), validation (10%), and test (10%) sets
# First hold out 20% of the rows, then cut that hold-out in half.
train_data, holdout_data = train_test_split(df, test_size=0.2, random_state=42)
val_data, test_data = train_test_split(holdout_data, test_size=0.5, random_state=42)

print(f"Train size: {len(train_data)}, Validation size: {len(val_data)}, Test size: {len(test_data)}")

In [7]:
# 4. Convert to PyTorch tensors
# Each split becomes one integer tensor whose columns follow the frame's
# column order; the three splits are converted in a single pass.
train_tensor, val_tensor, test_tensor = (
    torch.tensor(split.to_numpy(), dtype=torch.long)
    for split in (train_data, val_data, test_data)
)

In [8]:
# 5. Create PyTorch Dataset class
class MovieLensDataset(Dataset):
    """Dataset whose items are (user_id, movie_id, rating) triples.

    `data` is kept as given; each indexed row must unpack into exactly three
    values, the last of which is converted with `.float()` when served.
    """

    def __init__(self, data):
        self.data = data

    def __len__(self):
        """Return the number of rows in the wrapped data."""
        return len(self.data)

    def __getitem__(self, index):
        """Return the row at `index` as (user_id, movie_id, float rating)."""
        row = self.data[index]
        user_id, movie_id, rating = row
        # The rating is cast to float here; the two IDs are passed through unchanged.
        return (user_id, movie_id, rating.float())

In [9]:
# 6. Create DataLoaders
batch_size = 64

# Only the training loader shuffles its rows; the validation and test loaders
# are built with shuffle=False.
train_loader = DataLoader(
    MovieLensDataset(train_tensor),
    batch_size=batch_size,
    shuffle=True,
)
val_loader = DataLoader(
    MovieLensDataset(val_tensor),
    batch_size=batch_size,
    shuffle=False,
)
test_loader = DataLoader(
    MovieLensDataset(test_tensor),
    batch_size=batch_size,
    shuffle=False,
)

Step 2: Implementing Matrix Factorization Model

We’ll use Embedding layers in PyTorch to learn representations for users and movies.
Approach:

    Each user and movie will have an embedding (vector representation).
    The dot product of these embeddings will predict the rating.
    We’ll train the model using Mean Squared Error (MSE) Loss.

In [12]:

# Define the Matrix Factorization Model
class MatrixFactorization(nn.Module):
    """Predict a rating as the dot product of a user and a movie embedding.

    Args:
        num_users: number of rows in the user embedding table.
        num_movies: number of rows in the movie embedding table.
        emb_size: length of each embedding vector.
    """

    def __init__(self, num_users, num_movies, emb_size=50):
        super().__init__()
        self.user_emb = nn.Embedding(num_users, emb_size)
        self.movie_emb = nn.Embedding(num_movies, emb_size)

    def forward(self, user_id, movie_id):
        """Return one predicted rating per (user_id, movie_id) pair in the batch."""
        user_vector = self.user_emb(user_id)     # (batch_size, emb_size)
        movie_vector = self.movie_emb(movie_id)  # (batch_size, emb_size)
        # Element-wise product summed over the embedding axis == row-wise dot product.
        return torch.sum(user_vector * movie_vector, dim=1)


In [14]:

# Get the number of users and movies
# The embedding tables are indexed directly by ID, so they need (largest ID + 1)
# rows. Counting distinct IDs only gives that number when the IDs have no gaps;
# max() + 1 is a valid table size either way and is the same value when the
# zero-based IDs are contiguous.
num_users = int(df['user_id'].max()) + 1
num_movies = int(df['movie_id'].max()) + 1


print(f"Number of users: {num_users}, Number of movies: {num_movies}")

Number of users: 943, Number of movies: 1682


In [15]:
# Seed torch's global RNG so the embedding initialisation and the training
# loader's shuffle order are repeatable (same seed value as the data split).
torch.manual_seed(42)

# Pick the device first so the model can be moved before the optimizer is built.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Instantiate the model and move it to the GPU when one is available
model = MatrixFactorization(num_users, num_movies, emb_size=50).to(device)

# Loss function and optimizer
# The optimizer is constructed only after the model lives on its final device.
criterion = nn.MSELoss() # Mean Squared Error
optimizer = optim.Adam(model.parameters(), lr=0.01) # Learning rate

# Display the model summary as the cell output.
model


MatrixFactorization(
  (user_emb): Embedding(943, 50)
  (movie_emb): Embedding(1682, 50)
)

In [16]:
# Training function
def train_model(model, train_loader, val_loader, epochs=10):
    """Train `model` for `epochs` passes and print per-epoch losses.

    Uses the notebook-level `optimizer`, `criterion` and `device` variables,
    which must be defined before this function is called.

    Args:
        model: module called as `model(user_id, movie_id)`.
        train_loader: iterable of (user_id, movie_id, rating) training batches.
        val_loader: iterable of (user_id, movie_id, rating) validation batches.
        epochs: number of passes over `train_loader`.

    Returns:
        None. One line per epoch is printed with the mean batch loss on the
        training and validation loaders.
    """
    for epoch in range(epochs):
        # Switch back to training mode at the start of every epoch: the
        # validation pass below puts the model in eval mode, and without this
        # every epoch after the first would train in eval mode.
        model.train()
        total_loss = 0
        for user_id, movie_id, rating in train_loader:
            user_id, movie_id, rating = user_id.to(device), movie_id.to(device), rating.to(device)

            optimizer.zero_grad()
            predictions = model(user_id, movie_id)
            loss = criterion(predictions, rating)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        # Validation loss (no gradient tracking, eval mode)
        model.eval()
        val_loss = 0
        with torch.no_grad():
            for user_id, movie_id, rating in val_loader:
                user_id, movie_id, rating = user_id.to(device), movie_id.to(device), rating.to(device)
                predictions = model(user_id, movie_id)
                val_loss += criterion(predictions, rating).item()

        # Losses are averaged over the number of batches in each loader.
        print(f"Epoch {epoch+1}/{epochs}: Train Loss = {total_loss/len(train_loader):.4f}, Validation Loss = {val_loss/len(val_loader):.4f}")


In [17]:
# Train the model
# Ten passes over the training loader, with a validation pass after each one.
train_model(
    model=model,
    train_loader=train_loader,
    val_loader=val_loader,
    epochs=10,
)

Epoch 1/10: Train Loss = 39.1095, Validation Loss = 20.4537
Epoch 2/10: Train Loss = 6.2560, Validation Loss = 8.7115
Epoch 3/10: Train Loss = 1.9911, Validation Loss = 6.0283
Epoch 4/10: Train Loss = 1.3419, Validation Loss = 4.7765
Epoch 5/10: Train Loss = 1.5432, Validation Loss = 3.9948
Epoch 6/10: Train Loss = 1.8801, Validation Loss = 3.2595
Epoch 7/10: Train Loss = 1.6230, Validation Loss = 2.6696
Epoch 8/10: Train Loss = 1.2726, Validation Loss = 2.3639
Epoch 9/10: Train Loss = 1.1334, Validation Loss = 2.1777
Epoch 10/10: Train Loss = 1.0840, Validation Loss = 2.0379



Step 3: Model Evaluation

We’ll evaluate the model using the Root Mean Squared Error (RMSE) on the test set.
Why RMSE?

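    RMSE is the square root of the mean squared difference between predicted and actual ratings, so it is expressed in the same units as the ratings themselves.
    A lower RMSE means the predictions are, on average, closer to the actual ratings.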

In [19]:
# Function to evaluate the model
def evaluate_model(model, test_loader):
    """Return the RMSE of `model`'s predictions over every batch in `test_loader`.

    Uses the notebook-level `device` variable, which must be defined before
    this function is called.

    Args:
        model: module called as `model(user_id, movie_id)`.
        test_loader: iterable of (user_id, movie_id, rating) batches.

    Returns:
        The root mean squared error over all rows, or NaN when `test_loader`
        yields no batches.
    """
    model.eval()  # Set to evaluation mode
    batch_preds = []
    batch_actuals = []

    with torch.no_grad():
        for user_id, movie_id, rating in test_loader:
            user_id, movie_id, rating = user_id.to(device), movie_id.to(device), rating.to(device)
            predictions = model(user_id, movie_id)

            # Keep one array per batch; they are joined once after the loop.
            batch_preds.append(predictions.cpu().numpy())
            batch_actuals.append(rating.cpu().numpy())

    # Nothing to score: report NaN explicitly instead of averaging an empty array.
    if not batch_preds:
        return float("nan")

    # Compute RMSE
    all_preds = np.concatenate(batch_preds)
    all_actuals = np.concatenate(batch_actuals)
    rmse = np.sqrt(np.mean((all_preds - all_actuals) ** 2))
    return rmse

In [20]:
# Compute RMSE
# Score the trained model on the held-out test loader and report the result.
test_rmse = evaluate_model(model=model, test_loader=test_loader)
print(f"Test RMSE: {test_rmse:.4f}")

Test RMSE: 1.4166


In [21]:
class MFWithBias(nn.Module):
    """Matrix factorization with per-user, per-movie and global bias terms.

    The prediction is the dot product of the user and movie embeddings plus
    the user's bias, the movie's bias and a single learned global bias.

    Args:
        num_users: number of rows in the user embedding and user bias tables.
        num_movies: number of rows in the movie embedding and movie bias tables.
        emb_size: length of each embedding vector.
    """

    def __init__(self, num_users, num_movies, emb_size=50):
        super().__init__()
        # Latent factors
        self.user_emb = nn.Embedding(num_users, emb_size)
        self.movie_emb = nn.Embedding(num_movies, emb_size)

        # One scalar bias per user and per movie
        self.user_bias = nn.Embedding(num_users, 1)
        self.movie_bias = nn.Embedding(num_movies, 1)

        # Global bias (single scalar value), initialised to zero
        self.global_bias = nn.Parameter(torch.zeros(1))

    def forward(self, user_id, movie_id):
        """Return one predicted rating per (user_id, movie_id) pair in the batch."""
        # Row-wise dot product of the two embedding batches
        interaction = torch.sum(self.user_emb(user_id) * self.movie_emb(movie_id), dim=1)

        # Bias lookups come back with a trailing size-1 axis; squeeze removes it.
        user_offset = self.user_bias(user_id).squeeze()
        movie_offset = self.movie_bias(movie_id).squeeze()
        bias_total = user_offset + movie_offset + self.global_bias

        return interaction + bias_total

# Re-train with new model
# `model` and `optimizer` are rebound here, so the training function picks up
# the bias model and an optimizer over its parameters.
model = MFWithBias(num_users, num_movies, emb_size=50).to(device)
optimizer = optim.Adam(model.parameters(), lr=0.01)
train_model(
    model=model,
    train_loader=train_loader,
    val_loader=val_loader,
    epochs=10,
)

# Evaluate new model
test_rmse = evaluate_model(model=model, test_loader=test_loader)
print(f"Improved Test RMSE: {test_rmse:.4f}")


Epoch 1/10: Train Loss = 29.2383, Validation Loss = 12.4745
Epoch 2/10: Train Loss = 4.1276, Validation Loss = 6.7240
Epoch 3/10: Train Loss = 1.6254, Validation Loss = 4.7539
Epoch 4/10: Train Loss = 1.2292, Validation Loss = 3.9485
Epoch 5/10: Train Loss = 1.5000, Validation Loss = 3.2844
Epoch 6/10: Train Loss = 1.6830, Validation Loss = 2.5851
Epoch 7/10: Train Loss = 1.3138, Validation Loss = 2.0844
Epoch 8/10: Train Loss = 0.9974, Validation Loss = 1.9135
Epoch 9/10: Train Loss = 0.9016, Validation Loss = 1.7621
Epoch 10/10: Train Loss = 0.8755, Validation Loss = 1.6492
Improved Test RMSE: 1.2934
