<a href="https://colab.research.google.com/github/kallepalomaki/MovieLens-recommender/blob/main/Recommender.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime
from sklearn.utils import shuffle
from torch.utils.data import TensorDataset, DataLoader

In [2]:
!wget -nc https://files.grouplens.org/datasets/movielens/ml-25m.zip
!unzip ml-25m.zip

--2022-11-27 19:14:35--  https://files.grouplens.org/datasets/movielens/ml-25m.zip
Resolving files.grouplens.org (files.grouplens.org)... 128.101.65.152
Connecting to files.grouplens.org (files.grouplens.org)|128.101.65.152|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 261978986 (250M) [application/zip]
Saving to: ‘ml-25m.zip’


2022-11-27 19:14:57 (12.1 MB/s) - ‘ml-25m.zip’ saved [261978986/261978986]

Archive:  ml-25m.zip
   creating: ml-25m/
  inflating: ml-25m/tags.csv         
  inflating: ml-25m/links.csv        
  inflating: ml-25m/README.txt       
  inflating: ml-25m/ratings.csv      
  inflating: ml-25m/genome-tags.csv  
  inflating: ml-25m/genome-scores.csv  
  inflating: ml-25m/movies.csv       


In [3]:
# Read the ratings table extracted from the archive and preview the first rows.
df = pd.read_csv(filepath_or_buffer='ml-25m/ratings.csv')
df.head(n=5)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510


In [4]:
# Replace the userId column with its categorical codes (integers numbered from 0),
# then keep the encoded column as a NumPy array.
df["userId"] = pd.Categorical(df["userId"])
df["userId"] = df["userId"].cat.codes
user_ids = df["userId"].values

In [5]:
# Replace the movieId column with its categorical codes (integers numbered from 0),
# then keep the encoded column as a NumPy array.
df["movieId"] = pd.Categorical(df["movieId"])
df["movieId"] = df["movieId"].cat.codes
movie_ids = df["movieId"].values

In [6]:
# Keep the RMSE scaling but center the targets on zero.
# The centered values go into a NEW array: subtracting in place on the array
# returned by `.values` would write through to `df.rating`, and fails outright
# when pandas hands back a read-only array (Copy-on-Write).
rating_mean = df.rating.values.mean()
ratings = df.rating.values - rating_mean
# Sanity check: the displayed mean should be ~0 up to floating-point error.
np.mean(ratings)


-3.427872505066647e-17

In [7]:
# Sizes of the two embedding tables: the number of distinct ids in each array.
num_users, num_movies = (len(set(ids)) for ids in (user_ids, movie_ids))

In [8]:
# Width of each user / movie embedding vector.
embedding_dim = 10

In [9]:
class Model(nn.Module):
  """Rating predictor built from user/movie embeddings and a one-hidden-layer MLP.

  The user and movie embedding vectors are concatenated and passed through
  Linear -> ReLU -> Linear, producing one value per (user, movie) pair.
  """

  def __init__(self, num_users, num_movies, embedding_dim, num_hidden=512):
    """Create the embedding tables and the MLP head.

    Args:
      num_users: number of rows in the user embedding table.
      num_movies: number of rows in the movie embedding table.
      embedding_dim: width of each embedding vector.
      num_hidden: width of the hidden layer.
    """
    super().__init__()
    self.num_users, self.num_movies = num_users, num_movies
    self.embedding_dim = embedding_dim

    # One lookup table per entity type.
    self.gen_user_embedding = nn.Embedding(num_users, embedding_dim)
    self.gen_movie_embedding = nn.Embedding(num_movies, embedding_dim)

    # The head consumes both embeddings side by side, hence 2 * embedding_dim inputs.
    hidden_layer = nn.Linear(2 * embedding_dim, num_hidden)
    output_layer = nn.Linear(num_hidden, 1)
    self.linear_stack = nn.Sequential(hidden_layer, nn.ReLU(), output_layer)

  def forward(self, user_ids, movie_ids):
    """Return a (batch, 1) tensor of predictions for index tensors of equal length."""
    pair_features = torch.cat(
        [self.gen_user_embedding(user_ids), self.gen_movie_embedding(movie_ids)],
        dim=1,
    )
    return self.linear_stack(pair_features)

In [10]:
# Run on the first CUDA device when one is available, otherwise on the CPU.
if torch.cuda.is_available():
  device = torch.device("cuda:0")
else:
  device = torch.device("cpu")
print(device)

cuda:0


In [11]:
# Build the network with a 100-unit hidden layer and move its parameters to `device`.
# `.to` returns the module, so the cell output is the model summary.
model = Model(num_users, num_movies, embedding_dim, num_hidden=100)
model.to(device)

Model(
  (gen_user_embedding): Embedding(162541, 10)
  (gen_movie_embedding): Embedding(59047, 10)
  (linear_stack): Sequential(
    (0): Linear(in_features=20, out_features=100, bias=True)
    (1): ReLU()
    (2): Linear(in_features=100, out_features=1, bias=True)
  )
)

In [12]:
# Mean-squared-error objective and Adam with its default learning rate, both spelled out.
criterion = nn.MSELoss(reduction="mean")
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

In [13]:
# Shuffle the three arrays with the same permutation so every rating stays aligned
# with its user and movie. A fixed random_state makes the shuffle, and therefore
# the train/test split taken from it, identical on every run.
user_ids, movie_ids, ratings = shuffle(user_ids, movie_ids, ratings, random_state=42)
# Indices become int64 tensors (nn.Embedding lookups); targets become float32.
user_ids = torch.from_numpy(user_ids).long()
movie_ids = torch.from_numpy(movie_ids).long()
ratings = torch.from_numpy(ratings).float()

In [14]:
# The three tensors must have the same length.
print(*(tensor.shape for tensor in (user_ids, movie_ids, ratings)))


torch.Size([25000095]) torch.Size([25000095]) torch.Size([25000095])


In [15]:
# The first 80% of the samples go to training; the remainder is held out.
train_len = int(len(ratings) * 0.8)

In [16]:
# Slice all three tensors at the same boundary so (user, movie, rating) rows stay aligned.
train_data = TensorDataset(*(t[:train_len] for t in (user_ids, movie_ids, ratings)))
test_data = TensorDataset(*(t[train_len:] for t in (user_ids, movie_ids, ratings)))

In [17]:
batch_size = 512
# Training batches are reshuffled every epoch.
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
# Evaluation batches keep a fixed order.
test_loader = DataLoader(test_data, batch_size=batch_size, shuffle=False)

In [18]:
def train(model, criterion, optimizer, train_loader, test_loader, epochs):
  """Train ``model`` for ``epochs`` passes, evaluating on ``test_loader`` after each pass.

  Args:
    model: module called as ``model(users, movies)``; its output is compared
      with the targets reshaped to ``(-1, 1)``.
    criterion: loss callable ``criterion(predictions, targets)`` returning a scalar tensor.
    optimizer: optimizer whose ``step()`` updates the model after each training batch.
    train_loader: iterable of ``(users, movies, targets)`` tensor batches used for optimisation.
    test_loader: iterable of the same kind of batches, used for evaluation only.
    epochs: number of passes over ``train_loader``.

  Returns:
    ``(train_losses, test_losses)``: two NumPy arrays of length ``epochs`` holding,
    per epoch, the plain mean of the per-batch losses (NaN if a loader yields no batches).

  Note:
    Each epoch ends with an evaluation pass, so the model is left in eval mode
    when ``epochs > 0``.
  """
  # Follow the device the model lives on rather than a notebook-level global.
  first_param = next(model.parameters(), None)
  run_device = first_param.device if first_param is not None else torch.device("cpu")

  train_losses=np.zeros(epochs)
  test_losses=np.zeros(epochs)

  for it in range(epochs):
    # Optimisation pass.
    model.train()
    train_loss=[]
    for users, movies, targets in train_loader:
      users, movies, targets=users.to(run_device), movies.to(run_device), targets.to(run_device)
      model.zero_grad()
      pred_ratings=model(users, movies)
      # The model emits one column per sample, so the targets are reshaped to match.
      loss=criterion(pred_ratings, targets.reshape(-1,1))
      loss.backward()
      optimizer.step()
      train_loss.append(loss.item())
    train_losses[it]=np.mean(train_loss) if train_loss else float("nan")

    # Evaluation pass: no parameter updates, so autograd bookkeeping is switched off.
    model.eval()
    test_loss=[]
    with torch.no_grad():
      for users, movies, targets in test_loader:
        users, movies, targets=users.to(run_device), movies.to(run_device), targets.to(run_device)
        pred_ratings=model(users, movies)
        loss=criterion(pred_ratings, targets.reshape(-1,1))
        test_loss.append(loss.item())
    test_losses[it]=np.mean(test_loss) if test_loss else float("nan")
    print("train loss: ", train_losses[it], "test loss: ", test_losses[it])

  return train_losses, test_losses



In [None]:
# Train for 25 epochs; the returned (train_losses, test_losses) pair is shown as the cell output.
train(model, criterion, optimizer, train_loader, test_loader, epochs=25)

train loss:  0.8172426153504317 test loss:  0.7527773327669257
train loss:  0.7355174279733575 test loss:  0.727672666421123
train loss:  0.7124829875923315 test loss:  0.7132137145892093
train loss:  0.6993894929057384 test loss:  0.7074423608124659
train loss:  0.6916943183891406 test loss:  0.7020368509707138
train loss:  0.6864858571926064 test loss:  0.6983642781364265
train loss:  0.6824935499544478 test loss:  0.6954339829510214
train loss:  0.6793007299514195 test loss:  0.694845746554924
train loss:  0.6766167054057192 test loss:  0.6933754195499049
train loss:  0.6743612533436513 test loss:  0.6925880393116732
train loss:  0.6723101818922878 test loss:  0.6921084678529497


In [None]:
# The targets are stored in `ratings`; no variable named `rating` is defined in the notebook.
type(ratings)