In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime
from sklearn.utils import shuffle

In [2]:
# Download the MovieLens 20M archive; -nc (no-clobber) skips the download when the zip is already present
!wget -nc https://files.grouplens.org/datasets/movielens/ml-20m.zip

--2023-02-21 08:59:39--  https://files.grouplens.org/datasets/movielens/ml-20m.zip
Resolving files.grouplens.org (files.grouplens.org)... 128.101.65.152
Connecting to files.grouplens.org (files.grouplens.org)|128.101.65.152|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 198702078 (189M) [application/zip]
Saving to: ‘ml-20m.zip’


2023-02-21 08:59:41 (90.4 MB/s) - ‘ml-20m.zip’ saved [198702078/198702078]



In [3]:
# Extract the archive; -n never overwrites files that already exist, so re-running the cell is harmless
!unzip -n ml-20m.zip

Archive:  ml-20m.zip
   creating: ml-20m/
  inflating: ml-20m/genome-scores.csv  
  inflating: ml-20m/genome-tags.csv  
  inflating: ml-20m/links.csv        
  inflating: ml-20m/movies.csv       
  inflating: ml-20m/ratings.csv      
  inflating: ml-20m/README.txt       
  inflating: ml-20m/tags.csv         


In [4]:
# List the working directory to confirm the archive and the extracted folder are present
!ls

ml-20m	ml-20m.zip  sample_data


In [5]:
# Read the MovieLens ratings table (userId, movieId, rating, timestamp) into a DataFrame
ratings_csv = "ml-20m/ratings.csv"
df = pd.read_csv(ratings_csv)

In [6]:
# Preview the first five rows of the ratings table
df.head(n=5)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,1112486027
1,1,29,3.5,1112484676
2,1,32,3.5,1112484819
3,1,47,3.5,1112484727
4,1,50,3.5,1112484580


In [7]:
# Turn the raw user IDs into a categorical column and store its integer codes
# alongside it, so each user gets a compact index.
df["userId"] = pd.Categorical(df["userId"])
df["new_user_id"] = df["userId"].cat.codes

In [8]:
# Turn the raw movie IDs into a categorical column and store its integer codes
# alongside it, so each movie gets a compact index.
df["movieId"] = pd.Categorical(df["movieId"])
df["new_movie_id"] = df["movieId"].cat.codes

In [9]:
# Pull the encoded user IDs, encoded movie IDs and the ratings out as NumPy arrays
user_ids = df["new_user_id"].to_numpy()
movie_ids = df["new_movie_id"].to_numpy()
# The regression targets are the raw ratings shifted down by a constant 2.5
ratings = df["rating"].to_numpy() - 2.5

In [10]:
# Count the distinct users and distinct movies present in the data
N = np.unique(user_ids).size
M = np.unique(movie_ids).size

# Width of every embedding vector
D = 10

In [11]:
# Make a neural network
class Model(nn.Module):
  """Feed-forward rating predictor over learned user and movie embeddings.

  Each (user, movie) index pair is embedded, the two vectors are concatenated,
  and a one-hidden-layer ReLU network maps the result to a single score.

  Args:
    n_users: number of rows in the user embedding table.
    n_items: number of rows in the movie embedding table.
    embed_dim: width of each embedding vector.
    n_hidden: width of the hidden layer (default 1024).
  """

  def __init__(self, n_users, n_items, embed_dim, n_hidden=1024):
    super(Model, self).__init__()
    self.N = n_users
    self.M = n_items
    self.D = embed_dim

    self.u_embed = nn.Embedding(self.N, self.D)
    self.m_embed = nn.Embedding(self.M, self.D)
    self.fc1 = nn.Linear(2 * self.D, n_hidden)
    self.fc2 = nn.Linear(n_hidden, 1)

    # The default N(0, 1) embedding init leads to poor results, so redraw both
    # tables from N(0, 0.01^2). Initialising in place keeps each table at the
    # shape nn.Embedding allocated: (N, D) for users and (M, D) for movies.
    nn.init.normal_(self.u_embed.weight, mean=0.0, std=0.01)
    nn.init.normal_(self.m_embed.weight, mean=0.0, std=0.01)

  def forward(self, user, movie):
    """Score a batch of (user, movie) index pairs.

    Args:
      user: integer tensor of user indices, one per sample.
      movie: integer tensor of movie indices, one per sample.

    Returns:
      Tensor of shape (num_samples, 1) holding the predicted scores.
    """
    user = self.u_embed(user) # output is num_samples x D
    movie = self.m_embed(movie) # output is num_samples x D

    # merge the two embeddings feature-wise
    out = torch.cat((user, movie), 1) # output is num_samples x 2D

    # ANN: hidden layer with ReLU, then a single linear output unit
    out = self.fc1(out)
    out = F.relu(out)
    out = self.fc2(out)
    return out

In [12]:
# Prefer the first CUDA GPU when one is visible; otherwise fall back to the CPU
use_gpu = torch.cuda.is_available()
device = torch.device("cuda:0") if use_gpu else torch.device("cpu")
print(device)

cuda:0


In [13]:
# Build the network from the user count, movie count and embedding width,
# then move its parameters to the selected device (the call's result is the cell output)
model = Model(n_users=N, n_items=M, embed_dim=D)
model.to(device)

Model(
  (u_embed): Embedding(138493, 10)
  (m_embed): Embedding(26744, 10)
  (fc1): Linear(in_features=20, out_features=1024, bias=True)
  (fc2): Linear(in_features=1024, out_features=1, bias=True)
)

In [14]:
# Mean-squared-error objective, optimised with Adam at its default settings
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(params=model.parameters())
# Alternative optimizer (disabled):
# optimizer = torch.optim.SGD(model.parameters(), lr=0.08, momentum=0.9)

In [15]:
# Shuffle the three arrays with one shared permutation so rows stay aligned.
# A fixed random_state makes the ordering (and therefore the train/test split
# taken from it) reproducible after a kernel restart.
user_ids, movie_ids, ratings = shuffle(user_ids, movie_ids, ratings, random_state=0)

In [16]:
# Convert to tensors
# IDs become int64 because they are used as embedding indices.
user_ids_t = torch.from_numpy(user_ids).long()
movie_ids_t = torch.from_numpy(movie_ids).long()
# Ratings must stay floating point: an integer cast truncates toward zero and
# would collapse the half-star targets (e.g. 1.5 -> 1, -0.5 -> 0).
ratings_t = torch.from_numpy(ratings).float()

In [17]:
# Split the tensors 80/20 into a training and a test dataset
Ntrain = 0.8 * len(ratings)
n_train_rows = int(Ntrain)  # the product is a float; truncate it to a usable slice bound
all_tensors = (user_ids_t, movie_ids_t, ratings_t)

# Leading 80% of the rows
train_dataset = torch.utils.data.TensorDataset(*[t[:n_train_rows] for t in all_tensors])

# Remaining rows
test_dataset = torch.utils.data.TensorDataset(*[t[n_train_rows:] for t in all_tensors])

In [18]:
# Mini-batch iterators over the two datasets
batch_size = 512

# Training batches are reshuffled on every pass
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# Evaluation batches keep the stored order
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [21]:
def batch_gd(model, criterion, optimizer, train_loader, test_loader, epochs):
  """Train `model` with mini-batch gradient descent, evaluating after every epoch.

  Args:
    model: module called as `model(users, movies)`.
    criterion: loss callable applied as `criterion(outputs, targets)`.
    optimizer: optimizer that updates the model's parameters.
    train_loader: iterable of `(users, movies, targets)` batches used for fitting.
    test_loader: iterable of `(users, movies, targets)` batches used for evaluation.
    epochs: number of passes over `train_loader`.

  Returns:
    `(train_losses, test_losses)`: two NumPy arrays of length `epochs`, each
    holding the unweighted mean of the per-batch losses for that epoch.
  """
  # Run on whatever device the model already lives on instead of depending on a
  # notebook-level global; fall back to the CPU for a parameter-free model.
  first_param = next(model.parameters(), None)
  run_device = first_param.device if first_param is not None else torch.device("cpu")

  train_losses = np.zeros(epochs)
  test_losses = np.zeros(epochs)

  for i in range(epochs):
    model.train()
    t0 = datetime.now()
    train_loss = []
    for users, movies, targets in train_loader:
      # reshape the target to (batch, 1) so it matches the model output
      targets = targets.view(-1, 1).float()

      # move data to the model's device
      users, movies, targets = users.to(run_device), movies.to(run_device), targets.to(run_device)

      # zero the parameter gradients
      optimizer.zero_grad()

      # Forward pass
      outputs = model(users, movies)
      loss = criterion(outputs, targets)

      # Backward and Optimize
      loss.backward()
      optimizer.step()

      train_loss.append(loss.item())

    train_losses[i] = np.mean(train_loss)

    model.eval()
    test_loss = []
    # Evaluation needs no gradients; skipping autograd bookkeeping saves memory and time
    with torch.no_grad():
      for users, movies, targets in test_loader:
        # reshape the target to (batch, 1) so it matches the model output
        targets = targets.view(-1, 1).float()
        # move data to the model's device
        users, movies, targets = users.to(run_device), movies.to(run_device), targets.to(run_device)
        outputs = model(users, movies)
        loss = criterion(outputs, targets)
        test_loss.append(loss.item())

    test_losses[i] = np.mean(test_loss)

    # Duration covers both the training and the evaluation pass of this epoch
    dt = datetime.now() - t0
    print(f"Epoch: {i+1} / {epochs}, Train Loss: {train_losses[i]:.4f}, Test Loss: {test_losses[i]:.4f}, Duration: {dt}")

  return train_losses, test_losses

In [None]:
# Run 25 epochs of training under IPython's %prun profiler; the profiled
# statement assigns the per-epoch loss arrays to train_losses and test_losses.
%prun train_losses, test_losses = batch_gd(model, criterion, optimizer, train_loader, test_loader, epochs=25)

In [None]:
# Overlay the per-epoch train and test loss curves on a single set of axes
fig, ax = plt.subplots()
ax.plot(train_losses, label='Train Loss')
ax.plot(test_losses, label='Test Loss')
ax.legend();