https://betterprogramming.pub/building-a-recommendation-engine-with-pytorch-d64be4856fe7

# Embeddings for Recommendations

Here I use a move rating data set from Kaggle.
In this notebook I will derive embeddings for users and movies
and these will be the ingredients of a matrix factorization
of the full rating matrix.

We will train a simple Neural Net using Pytorch.

In [1]:
import os
import datetime
import pandas as pd
import numpy as np
import torch.cuda

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

In [3]:
!export CUDA_LAUNCH_BLOCKING=1

### Note: I am going to use my GPU card ... or try to anyway.

In [4]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("Device in use:", device)

Device in use: cuda


In [5]:
## Read in the data and show shape and head

In [6]:
df = pd.read_csv("ratings.csv")
print(df.shape)
df.head(2)

(26024289, 4)


Unnamed: 0,userId,movieId,rating,timestamp
0,1,110,1.0,1425941529
1,1,147,4.5,1425942435


In [8]:
print(f"max userId {df.userId.max()} num users {df.userId.nunique()}")
print(f"max movieId {df.movieId.max()} num movies {df.movieId.nunique()}")

max userId 270896 num users 270896
max movieId 176275 num movies 45115


In [49]:
subsample = True
if subsample:
    max_user = 50000
    max_movie = 5000
    df = df.loc[df.movieId < max_movie]
    df = df.loc[df.userId < max_user]
    print(f"max userId {df.userId.max()} num users {df.userId.nunique()}")
    print(f"max movieId {df.movieId.max()} num movies {df.movieId.nunique()}")    
    print(df.shape)    


max userId 49999 num users 49601
max movieId 4999 num movies 4896
(3370149, 4)


## Partition the dataset
into train, val and test.
We can use val to tune the hyper parameters.

Test will be held out until all training is done.

We could use the sklearn function but I find this just as easy.

In [50]:
def partition(df, pct=0.1):
    size = int(np.floor(df.shape[0])*0.1)
    idx = list(np.random.choice(df.index, size, replace=False))
    subset = df.filter(items=idx, axis=0)
    rest = df.drop(index = idx)
    return subset, rest

testdf, val_train = partition(df, 0.1)
valdf, traindf = partition(val_train, 0.2)
traindf.shape[0] / 1024

2665.841796875

## Dataset and dataloader
I want to use mini-batch training so we need a dataset
and a dataloader.

I adapted some code for converting a pandas dataframe into a dataloader

In [51]:
class CustomDataset(Dataset):
    def __init__(self, dataframe):
        self.dataframe = dataframe

    def __getitem__(self, index):
        row = self.dataframe.iloc[index].to_numpy()
        userid = int(row[0])
        movieid = int(row[1])
        rating = np.float32(row[2])
        return userid, movieid, rating

    def __len__(self):
        return len(self.dataframe)

traindata = CustomDataset(dataframe=traindf)
train_dataloader = DataLoader(traindata, batch_size=1024, pin_memory=True)

valdata = CustomDataset(dataframe=valdf)
val_dataloader = DataLoader(valdata, batch_size=512, pin_memory=True)

In [43]:
train_dataloader

<torch.utils.data.dataloader.DataLoader at 0x7f61b3aa2f50>

## the Model
The model is fairly simple: 2 embedding layers, 
    one each for users and movies.
    
At the end of "froward" we simply do the dot product.

In [36]:
class MF(nn.Module):
    def __init__(self, n_users, n_movies, emb_size=100):
        super(MF, self).__init__()
        self.n_users = n_users
        self.n_movies = n_movies
        self.user_emb = nn.Embedding(n_users, emb_size)
        self.movie_emb = nn.Embedding(n_movies, emb_size)
        # initializing our matrices with a positive number generally will yield better results
        self.user_emb.weight.data.uniform_(0, 0.5)
        self.movie_emb.weight.data.uniform_(0, 0.5)
    def forward(self, users, movies):
        #print("in forward")
        #print(users.max(), users.shape, movies.max(), movies.shape)
        #print(n_users, n_movies)
        m = self.movie_emb(movies)
        u = self.user_emb(users)
        return (u * m).sum(1)  # taking the dot product


## instantiate the Model
and push it to the gpu

In [37]:
USE_CUDA = True

In [38]:
n_users = df.userId.max()+1
n_movies = df.movieId.max()+1
print(n_users, n_movies)
model = MF(n_users, n_movies, emb_size=100)
use_cuda = False
if USE_CUDA:
    if torch.cuda.is_available():
        print("using cuda")
        model = model.to(device)
print(next(model.parameters()).is_cuda)

50000 9020
using cuda
True


In [39]:
model

MF(
  (user_emb): Embedding(50000, 100)
  (movie_emb): Embedding(9020, 100)
)

In [41]:
# training
epochs=4
lr=0.01
wd=0.0
optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=wd)
#model.to(device)
#optimizer.to(device)
#train_dataloader.to(device)
model.train()
for ei in range(epochs):
    print(f" epoch {ei}")
    train_loss = 0.0
    val_loss = 0.0
    if False:
        #print(df.userId.nunique(), df.movieId.nunique())
        
        userIds = torch.LongTensor(traindf.userId.values).cuda()
        movieIds = torch.LongTensor(traindf.movieId.values).cuda()
        ratings = torch.FloatTensor(traindf.rating.values).cuda()
        print(userIds.max(), userIds.shape, movieIds.max())
        y_hat = model(userIds, movieIds)
        loss = F.mse_loss(y_hat, ratings)
        train_loss += np.round(loss.item(), 4)/ userIds.size[0]
        optimizer.zero_grad()  # reset gradient
        loss.backward()
        optimizer.step() 
        
        # validation
        userIds = torch.LongTensor(valdf.userId.values).cuda()
        movieIds = torch.LongTensor(valdf.movieId.values).cuda()
        ratings = torch.FloatTensor(valdf.rating.values).cuda()
        y_hat = model(users, movies)
        loss = F.mse_loss(y_hat, ratings)
        val_loss += np.round(loss.item(), 4)/ userIds.size()[0]        

        print(f" train loss {train_loss}  val loss {val_loss}")        
    if True:
        for tbi, data in enumerate(train_dataloader):
            if tbi % 10 == 0:
                print(f" batch {tbi} {train_loss}")
            users, movies, ratings = data
            users = users.cuda()
            movies = movies.cuda()
            ratings = ratings.cuda()
            y_hat = model(users, movies)
            loss = F.mse_loss(y_hat, ratings)
            train_loss += np.round(loss.item(), 4)/ users.size()[0]

            optimizer.zero_grad()  # reset gradient
            loss.backward()
            optimizer.step() 

        print(f"train_loss {train_loss}")
        for tbi, data in enumerate(val_dataloader):
            if tbi+1 % 128 == 0:
                print(f" val batch {tbi} {val_loss}")
            for i, data in enumerate(val_dataloader):
                users, movies, ratings = data
                users = users.cuda()
                movies = movies.cuda()
                ratings = ratings.cuda()
                y_hat = model(users, movies)
                loss = F.mse_loss(y_hat, ratings)
                val_loss += np.round(loss.item(), 4)/ users.size()[0]
        print("val_loss {val_loss}")        




 epoch 0
 batch 0 0.0
 batch 10 0.03091328125
 batch 20 0.05743164062499999
 batch 30 0.08576503906249999
 batch 40 0.11526445312499999
 batch 50 0.15070878906249996
 batch 60 0.17968417968749997
 batch 70 0.22663320312499996
 batch 80 0.273773046875
 batch 90 0.31269394531249994
 batch 100 0.3627755859374999
 batch 110 0.40634375
 batch 120 0.4614003906249999
 batch 130 0.520915234375
 batch 140 0.5756519531250001
 batch 150 0.6341634765625002
 batch 160 0.7001457031250002
 batch 170 0.7647980468750003
 batch 180 0.8303585937500003
 batch 190 0.8916527343750005
 batch 200 0.9623736328125004
 batch 210 1.0459083984375006
 batch 220 1.116830468750001
 batch 230 1.2067304687500011
 batch 240 1.2991796875000008
 batch 250 1.368230273437501
 batch 260 1.436738281250001
 batch 270 1.516301757812501
 batch 280 1.5970746093750012
 batch 290 1.6763718750000012
 batch 300 1.7560130859375012
 batch 310 1.8412138671875007
 batch 320 1.9420212890625008
 batch 330 2.0597363281250014
 batch 340 2.14

 batch 2750 35.702688085937474
 batch 2760 35.89021542968747
 batch 2770 36.03893867187497
 batch 2780 36.16528300781246
 batch 2790 36.30068554687497
 batch 2800 36.45427207031249
 batch 2810 36.585259570312495
 batch 2820 36.741649023437496
 batch 2830 36.9541619140625
 batch 2840 37.1263392578125
 batch 2850 37.273319140625006
 batch 2860 37.42782519531251
 batch 2870 37.5863970703125
 batch 2880 37.733478125
 batch 2890 37.82072246093749
 batch 2900 37.99586835937498
 batch 2910 38.17503183593748
 batch 2920 38.36104589843749
 batch 2930 38.504339648437494
 batch 2940 38.68731621093749
 batch 2950 38.857885156249985
 batch 2960 39.03303300781248
 batch 2970 39.209511914062475
 batch 2980 39.35282617187497
 batch 2990 39.56351562499997
 batch 3000 39.72957148437498
 batch 3010 39.88578652343749
 batch 3020 40.04232499999999
 batch 3030 40.210052539062495
 batch 3040 40.338597656249995
 batch 3050 40.4884521484375
 batch 3060 40.6512140625
 batch 3070 40.792176757812506
 batch 3080 4

KeyboardInterrupt: 

In [18]:
type(users)

torch.Tensor

In [None]:
type(users)
print(users)

In [None]:
def test_model(model, df):
    model.eval()
    users = torch.LongTensor(df.userId.values)
    movies = torch.LongTensor(df.movieId.values)
    ratings = torch.FloatTensor(df.rating.values)
    y_hat = model(users, movies)
    loss = F.mse_loss(y_hat, ratings)
    return  loss.item()
val_err = test_model(model, val)
test_err = test_model(model, test)    
print(val_err, test_err)

In [None]:
user = torch.tensor([10])
games = torch.tensor(game_ratings['movieId'].unique().tolist())
predictions = model(user, games).tolist()
print(predictions)

In [None]:
normalized_predictions = [i/max(predictions)*10 for i in predictions]
print(normalized_predictions)

In [None]:
sortedIndices = predictions.argsort()
recommendations = dataset['Title'].unique()[sortedIndices][:30]  # taking top 30
print(recommendations)