https://betterprogramming.pub/building-a-recommendation-engine-with-pytorch-d64be4856fe7

# Embeddings for Recommendations

Here I use a move rating data set from Kaggle.
In this notebook I will derive embeddings for users and movies
and these will be the ingredients of a matrix factorization
of the full rating matrix.

We will train a simple Neural Net using Pytorch.

In [76]:
import os
import datetime
import pandas as pd
import numpy as np
import torch.cuda

In [77]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

In [78]:
!export CUDA_LAUNCH_BLOCKING=1

### Note: I am going to use my GPU card ... or try to anyway.

In [4]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("Device in use:", device)

Device in use: cuda


In [5]:
## Read in the data and show shape and head

In [79]:
df = pd.read_csv("ratings.csv")
print(df.shape)
df.head(2)

(26024289, 4)


Unnamed: 0,userId,movieId,rating,timestamp
0,1,110,1.0,1425941529
1,1,147,4.5,1425942435


In [80]:
print(f"max userId {df.userId.max()} num users {df.userId.nunique()}")
print(f"max movieId {df.movieId.max()} num movies {df.movieId.nunique()}")

max userId 270896 num users 270896
max movieId 176275 num movies 45115


In [81]:
subsample = True
if subsample:
    max_user = 10000
    max_movie = 5000
    df = df.loc[df.movieId < max_movie]
    df = df.loc[df.userId < max_user]
    print(f"max userId {df.userId.max()} num users {df.userId.nunique()}")
    print(f"max movieId {df.movieId.max()} num movies {df.movieId.nunique()}")    
    print(df.shape)    


max userId 9999 num users 9930
max movieId 4999 num movies 4783
(676352, 4)


## Partition the dataset
into train, val and test.
We can use val to tune the hyper parameters.

Test will be held out until all training is done.

We could use the sklearn function but I find this just as easy.

In [83]:
def partition(df, pct=0.1):
    size = int(np.floor(df.shape[0])*0.1)
    idx = list(np.random.choice(df.index, size, replace=False))
    subset = df.filter(items=idx, axis=0)
    rest = df.drop(index = idx)
    return subset, rest

testdf, val_train = partition(df, 0.1)
valdf, traindf = partition(val_train, 0.2)
traindf.shape[0] / 1024
print(valdf.shape, traindf.shape)
traindf.reset_index(inplace=True)
valdf.reset_index(inplace=True)

(60871, 4) (547846, 4)


## the Model
The model is fairly simple: 2 embedding layers, 
    one each for users and movies.
    
At the end of "froward" we simply do the dot product.

In [86]:
class MF(nn.Module):
    def __init__(self, n_users, n_movies, emb_size=100):
        super(MF, self).__init__()
        self.n_users = n_users
        self.n_movies = n_movies
        print(f" n_users: {n_users}  n_movies: {n_movies}")
        self.user_emb = nn.Embedding(n_users, emb_size)
        self.movie_emb = nn.Embedding(n_movies, emb_size)
        
        # initializing our matrices with a positive number generally will yield better results
        self.user_emb.weight.data.uniform_(0, 0.5)
        self.movie_emb.weight.data.uniform_(0, 0.5)
    
    def forward(self, users, movies):
        m = self.movie_emb(movies)
        u = self.user_emb(users)
        return (u * m).sum(1)  # taking the dot product


## instantiate the Model
and push it to the gpu

In [87]:
USE_CUDA = False
n_users = df.userId.max()+1
n_movies = df.movieId.max()+1
emb_size = 64
print(n_users, n_movies)
model = MF(n_users=n_users, n_movies=n_movies, emb_size=emb_size)
use_cuda = False
if USE_CUDA:
    if torch.cuda.is_available():
        print("using cuda")
        model = model.to(device)
print(next(model.parameters()).is_cuda)

10000 5000
 n_users: 10000  n_movies: 5000
False


## Dataset and dataloader
I want to use mini-batch training so we need a dataset
and a dataloader.

I adapted some code for converting a pandas dataframe into a dataloader

In [88]:
class CustomDataset(Dataset):
    def __init__(self, dataframe):
        self.dataframe = dataframe
        print(f" len {dataframe.shape[0]}")

    def __getitem__(self, index):
        #print(index)
        #row = self.dataframe.iloc[index].to_numpy()
        userid = self.dataframe.userId[index]
        movieid = self.dataframe.movieId[index]   #int(row[1])
        rating = np.float32(self.dataframe.rating[index])    #np.float32(row[2])
        return userid, movieid, rating

    def __len__(self):
        length = self.dataframe.shape[0]
        return length

traindata = CustomDataset(dataframe=traindf)
train_dataloader = DataLoader(traindata, batch_size=256)

valdata = CustomDataset(dataframe=valdf)
val_dataloader = DataLoader(valdata, batch_size=256) # pin_memory=True)

 len 547846
 len 60871


In [75]:
# training
epochs=4
lr=0.01
wd=0.0
optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=wd)
#model.to(device)
#optimizer.to(device)
#train_dataloader.to(device)
model.train()
for ei in range(epochs):
    print(f" epoch {ei}  {datetime.datetime.now()}")
    train_loss = 0.0
    val_loss = 0.0
    if False:
        #print(df.userId.nunique(), df.movieId.nunique())
        
        userIds = torch.LongTensor(traindf.userId.values).cuda()
        movieIds = torch.LongTensor(traindf.movieId.values).cuda()
        ratings = torch.FloatTensor(traindf.rating.values).cuda()
        print(userIds.max(), userIds.shape, movieIds.max())
        y_hat = model(userIds, movieIds)
        loss = F.mse_loss(y_hat, ratings)
        train_loss += np.round(loss.item(), 4)/ userIds.size[0]
        optimizer.zero_grad()  # reset gradient
        loss.backward()
        optimizer.step() 
        
        # validation
        userIds = torch.LongTensor(valdf.userId.values).cuda()
        movieIds = torch.LongTensor(valdf.movieId.values).cuda()
        ratings = torch.FloatTensor(valdf.rating.values).cuda()
        y_hat = model(users, movies)
        loss = F.mse_loss(y_hat, ratings)
        val_loss += np.round(loss.item(), 4)/ userIds.size()[0]        

        print(f" train loss {train_loss}  val loss {val_loss}") 
    if True:
        for tbi, data in enumerate(val_dataloader):
            if tbi % 20 == 0:
                pass
                print(f" val batch {tbi} {val_loss}")
            for i, data in enumerate(val_dataloader):
                users, movies, ratings = data
                users = users#.cuda()
                movies = movies#.cuda()
                ratings = ratings#.cuda()
                y_hat = model(users, movies)
                loss = F.mse_loss(y_hat, ratings)
                val_loss += loss.item()/ users.size()[0]
        print(f"val_loss {np.round(val_loss, 4)}  {datetime.datetime.now()}")          
    if True:
        for tbi, data in enumerate(train_dataloader):
            if tbi % 150 == 0:
                pass
                print(f" batch {tbi} {train_loss}")
            users, movies, ratings = data
            users = users#.cuda()
            movies = movies#.cuda()
            ratings = ratings#.cuda()
            y_hat = model(users, movies)
            loss = F.mse_loss(y_hat, ratings)
            train_loss += loss.item()/ users.size()[0]

            optimizer.zero_grad()  # reset gradient
            loss.backward()
            optimizer.step() 
        print(f"train_loss {np.round(train_loss, 4)} {datetime.datetime.now()}")        
    if True:
        for tbi, data in enumerate(val_dataloader):
            if tbi % 20 == 0:
                pass
                print(f" val batch {tbi} {val_loss}")
            for i, data in enumerate(val_dataloader):
                users, movies, ratings = data
                users = users#.cuda()
                movies = movies#.cuda()
                ratings = ratings#.cuda()
                y_hat = model(users, movies)
                loss = F.mse_loss(y_hat, ratings)
                val_loss += loss.item()/ users.size()[0]
        print(f"val_loss {np.round(val_loss, 4)}  {datetime.datetime.now()}")          


 epoch 0  2022-10-04 16:42:48.485239
 val batch 0 0.0
 val batch 20 187.71937192740606
 val batch 40 375.4387438548118
 val batch 60 563.1581157822172
 val batch 80 750.8774877096228
 val batch 100 938.5968596370283
 val batch 120 1126.3162315644338
 val batch 140 1314.0356034918393
 val batch 160 1501.7549754192448
 val batch 180 1689.4743473466503
 val batch 200 1877.1937192740559
 val batch 220 2064.9130912014616
val_loss 2233.8605  2022-10-04 16:47:30.599059
 batch 0 0.0
 batch 150 1.31846070359461
 batch 300 2.370537465903908
 batch 450 3.3947003493085504
 batch 600 4.506368232192472
 batch 750 6.0191912818700075
 batch 900 7.619078364688903
 batch 1050 9.041840915568173
 batch 1200 10.571063495939597
 batch 1350 11.945449589984491
 batch 1500 13.334848600905389
 batch 1650 14.685705098789185
 batch 1800 16.15197489899583
 batch 1950 18.501841221703216
 batch 2100 20.556309435749426
train_loss 21.1106 2022-10-04 16:47:54.881264
 val batch 0 2233.8605259361266
 val batch 20 2397.80

In [71]:
def test_model(model, df):
    model.eval()
    users = torch.LongTensor(df.userId.values)
    movies = torch.LongTensor(df.movieId.values)
    ratings = torch.FloatTensor(df.rating.values)
    y_hat = model(users, movies)
    loss = F.mse_loss(y_hat, ratings)
    return  loss.item()
val_err = test_model(model, val)
test_err = test_model(model, test)    
print(val_err, test_err)

NameError: name 'val' is not defined

In [None]:
user = torch.tensor([10])
games = torch.tensor(game_ratings['movieId'].unique().tolist())
predictions = model(user, games).tolist()
print(predictions)

In [None]:
normalized_predictions = [i/max(predictions)*10 for i in predictions]
print(normalized_predictions)

In [None]:
sortedIndices = predictions.argsort()
recommendations = dataset['Title'].unique()[sortedIndices][:30]  # taking top 30
print(recommendations)