In [None]:
# Explicit feedback implementation of Recommender Systems based on MovieLens 1M

The objective of this notebook (project) is to test multiple recommendation systems for the movielens dataset.  I wanted to learn more on the following subjects:
- Pytorch: Learn more about this framework and implement some deep learning models
- Skorch: Get rid of the infamous Pytorch training loop.
- scikit-surprise: Learn how the framework works to obtain some benchmarks to compare my implementations

The dataset that was used is MovieLens 1m: http://files.grouplens.org/datasets/movielens/ml-1m-README.txt . The main reason we used this dataset is because it's the biggest that contained side information.

I tried to implement a simple version of the following papers:  
1- **Matrix Factorization techniques with SGD learning** - https://datajobs.com/data-science-repo/Recommender-Systems-[Netflix].pdf

2- **Wide & Deep for recommender systems** - https://arxiv.org/pdf/1606.07792.pdf:
The wide part of the model are manual interactions. Those interactions have been built using the patsy package.
More information on feature crosses: https://datascience.stackexchange.com/questions/57435/how-is-the-cross-product-transformation-defined-for-binary-features

3- **Neural Collaborative Filtering** - https://arxiv.org/pdf/1708.05031.pdf

In [1]:
### Notes to properly run the notebook
# At the time of developping this notebook, tensorboard was not fully integrated in skorch
# so it has to be installed from the sources
# git clone https://github.com/skorch-dev/skorch.git && cd skorch && python setup.py install

# The ipywidgets package needs to be installed to see the progressbar checkpoint
# It also needs to be activated like this: jupyter nbextension enable --py widgetsnbextension

In [2]:
### Important notes for JSGL
# If doing gridsearch, don't activate the function from dataloaders that spawn multiprocesses, memory will be hogged

In [3]:
import datetime
import itertools
import numpy as np
import os
import pandas as pd
import patsy
import time

import sklearn
from sklearn import preprocessing
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, mean_squared_error

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.utils.tensorboard import SummaryWriter
from torch import optim
from torch.autograd import Variable

from skorch import NeuralNet
from skorch.helper import predefined_split, SliceDataset
from skorch.callbacks import BatchScoring, Checkpoint, EarlyStopping, EpochScoring, LRScheduler, TensorBoard, ProgressBar

# Install latest Tensorflow build
#!pip install -q tf-nightly-2.0-preview
import tensorflow as tf
from tensorflow import summary
#%load_ext tensorboard



In [4]:
# Torch parameters
identifier = 'cuda:0' if torch.cuda.is_available() else 'cpu'
device = torch.device(identifier)
device = 'cpu'
print('Using device ', device)

print('Using torch version ', torch.__version__)

torch.set_printoptions(precision=7)

Using device  cpu
Using torch version  1.2.0


In [5]:
if not os.path.exists('ml-1m'):
    !wget http://files.grouplens.org/datasets/movielens/ml-1m.zi
    !unzip -o ml-1m.zip

### Dataset

In [19]:
class rsdataset(Dataset):
    def __init__(self, usersfile, moviesfile, ratingsfile, nrows=None):
        
        # Read files
        self.movies = pd.read_csv(moviesfile, sep='::', names=['MovieID', 'Title', 'Genres'], engine='python')
        self.users = pd.read_csv(usersfile, sep='::', names=['UserID', 'Gender', 'Age', 'Occupation', 'Zipcode'], engine='python')
        self.ratings = pd.read_csv(ratingsfile, sep='::', names=['UserID', 'MovieID', 'Rating', 'Timestamp'], engine='python', nrows=nrows)
        
        assert self.users['UserID'].nunique() >= self.ratings['UserID'].nunique(), 'UserID with unknown information'
        assert self.movies['MovieID'].nunique() >= self.ratings['MovieID'].nunique(), 'Movies with unknown information'

        self.users_emb_columns = []
        self.users_ohe_columns = []
        self.movies_emb_columns = []
        self.movies_ohe_columns = []
        self.interact_columns = []

        self.nusers = self.ratings['UserID'].nunique()
        self.nmovies = self.ratings['MovieID'].nunique()

        self.y_range = (self.ratings['Rating'].min(), self.ratings['Rating'].max())

    def __len__(self):
        return len(self.y)
        
    def __getitem__(self, idx):
        """
        What have we learned regarding tensors and GPU memory
        -----------------------------------------------------
        - For every type of data, use the smalleset memory size required. For example, don't use int64 for ohe.
        - pinned_memory=True didn't help my speed problems when I tried.
        - As a consequence, everything was put in memory, and I __getitem__ was used to slice data.
        - num_workers helped improving the speed. 
        """
        return (((self.users_emb[idx])),
                ((self.users_ohe[idx])),
                ((self.movies_emb[idx])),
                ((self.movies_ohe[idx])),
                ((self.interact[idx]))), (self.y[idx])

    def to_tensor(self):
        self.users_emb = torch.from_numpy(self.ratings[self.users_emb_columns].values)
        self.users_ohe = torch.tensor(self.ratings[self.users_ohe_columns].values, dtype=torch.float)
        self.movies_emb = torch.from_numpy(self.ratings[self.movies_emb_columns].values)
        self.movies_ohe = torch.tensor(self.ratings[self.movies_ohe_columns].values, dtype=torch.float)
        self.interact = torch.from_numpy(self.ratings[self.interact_columns].values)
        self.y = torch.tensor(self.y.values, dtype=torch.float)

In [20]:
train = rsdataset('ml-1m/users.dat', 'ml-1m/movies.dat', 'ml-1m/ratings.dat', nrows=10000)

### Preprocessing of dataset

In [21]:
train.ratings = train.ratings.merge(train.movies, left_on='MovieID', right_on='MovieID')
train.movies = train.ratings[train.movies.columns]

train.ratings = train.ratings.merge(train.users, left_on='UserID', right_on='UserID')
train.users = train.ratings[train.users.columns]

train.y = train.ratings['Rating']

In [22]:
# Label Encode users
columns = ['UserID', 'Gender', 'Age', 'Occupation']
train.ratings[columns] = train.ratings[columns].apply(preprocessing.LabelEncoder().fit_transform)
train.users_emb_columns = train.users_emb_columns + columns

In [23]:
# Label Encode movies
columns = ['MovieID']
train.ratings[columns] = train.ratings[columns].apply(preprocessing.LabelEncoder().fit_transform)
train.movies_emb_columns = train.movies_emb_columns + columns

In [24]:
# One Hot Encode users
#columns = ['Gender', 'Age', 'Occupation', 'Zipcode']
columns = ['Gender', 'Age', 'Occupation']
ohe = preprocessing.OneHotEncoder(categories='auto', sparse=False, dtype='uint8')
ohe.fit(train.ratings[columns])
train.ratings = pd.concat([train.ratings, pd.DataFrame(data=ohe.transform(train.ratings[columns]), columns=ohe.get_feature_names(columns))], axis=1)
train.users_ohe_columns = ohe.get_feature_names(columns)

assert train.ratings[train.users_ohe_columns].max().max()<=1, 'Error with ohe columns'

In [25]:
# One Hot Encode movies (non exclusive)
genres = ['Action', 'Adventure', 'Animation', 'Children', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 
          'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']

for genre in genres:
    genre = genre.replace('-', '')
    column = 'Genre_' + str(genre)
    train.ratings[column] = train.ratings['Genres'].apply(lambda x: 1 if genre in x else 0)
    train.movies_ohe_columns.append(column)
    
assert train.ratings[train.movies_ohe_columns].max().max()<=1, 'Error with ohe columns'

In [26]:
int_genres_gender = ""
for genre in train.movies_ohe_columns:
    int_genres_gender = int_genres_gender + '+' +genre + ':Gender'

int_genres_age = ""
for genre in train.movies_ohe_columns:
    int_genres_age = int_genres_age + '+' + genre + ':Age'
    
interact = patsy.dmatrix("0 + Gender:Age + Gender:Occupation + Age:Occupation"+int_genres_gender+int_genres_age, data=train.ratings.astype('object'), return_type='dataframe').astype('int8')
interact = interact.astype('uint8')
train.ratings = pd.concat([train.ratings, interact], axis=1)
train.interact_columns = interact.columns

In [27]:
# Drop unused columns
train.movies.drop(['Title', 'Genres'], inplace=True, axis=1)
train.ratings.drop(['Title', 'Genres', 'Zipcode'], inplace=True, axis=1)

In [28]:
train.to_tensor()

### DataLoaders

In [29]:
# Split
train_size = int(0.8 * len(train))
test_size = len(train) - train_size
train_dataset, valid_dataset = torch.utils.data.random_split(train, [train_size, test_size])

# Create dataloaders
dataloaders = {}
dataloaders['train'] = torch.utils.data.DataLoader(train_dataset, batch_size=4096, shuffle=True)
dataloaders['valid'] = torch.utils.data.DataLoader(valid_dataset, batch_size=4096, shuffle=True)

### Define Pytorch models

In [30]:
class deepnwide(nn.Module):
    """
    Hyperparameters:
        - module__size_emb
        - module__dropout
        - module__linear_size
}
    Best run: -0.9766627748807272 {'lr': 0.001, 'module__dropout': 0.2, 'module__size_emb': 30}

    """
    def __init__(self, users_emb, movies_emb, users_ohe, movies_ohe, interact, size_emb, y_range, dropout, linear_size):
        super().__init__()
        
        self.name = 'deepnwide'
        self.y_range = y_range

        # wide part - We don't need to specify nothing here
        
        # deep
        self.emb_UserID = nn.Embedding(len(torch.unique(users_emb[:, 0])), size_emb)
        self.emb_UserID.weight.data.uniform_(-.01, .01)
        self.emb_Gender = nn.Embedding(len(torch.unique(users_emb[:, 1])), size_emb)
        self.emb_Gender.weight.data.uniform_(-.01, .01)
        self.emb_Age = nn.Embedding(len(torch.unique(users_emb[:, 2])), size_emb)
        self.emb_Age.weight.data.uniform_(-.01, .01)
        self.emb_Occupation = nn.Embedding(len(torch.unique(users_emb[:, 3])), size_emb)
        self.emb_Occupation.weight.data.uniform_(-.01, .01)
        self.emb_MovieID = nn.Embedding(len(torch.unique(movies_emb[:, 0])), size_emb)
        self.emb_MovieID.weight.data.uniform_(-.01, .01)

        # hidden layers
        self.h1 = nn.Linear(5 * size_emb, linear_size)
        self.h2 = nn.Linear(linear_size, linear_size)
        self.h3 = nn.Linear(linear_size, linear_size)

        # Dropout layers
        self.dropout1 = nn.Dropout(p=dropout)
        self.dropout2 = nn.Dropout(p=dropout)
        self.dropout3 = nn.Dropout(p=dropout)

        # final dense layer 
        self.last_layer = nn.Linear((interact.shape[1]) + (movies_ohe.shape[1]) + (linear_size), 1)


    def forward(self, X):
        # Assign data
        user_emb = X[0]
        user_ohe = X[1]
        movie_emb = X[2]
        movie_ohe = X[3]
        interact = X[4]
        
        UserID = user_emb[:, 0]
        Gender = user_emb[:, 1]
        Age = user_emb[:, 2]
        Occupation = user_emb[:, 3]
        MovieID = movie_emb[:, 0]

        UserID = self.emb_UserID(UserID)
        Gender = self.emb_Gender(Gender)
        Age = self.emb_Age(Age)
        Occupation = self.emb_Occupation(Occupation)
        MovieID = self.emb_MovieID(MovieID)

        emb = torch.cat([UserID,
                         Age,
                         Gender,
                         Occupation,
                         MovieID],
                         dim=1)
        
        emb = F.relu(self.dropout1(self.h1(emb)))
        emb = F.relu(self.dropout2(self.h2(emb)))
        emb = F.relu(self.dropout3(self.h3(emb)))

        result = self.last_layer(torch.cat([interact.float(), movie_ohe.float(), emb.float()], dim=1))

        return (torch.sigmoid(result) * (self.y_range[1]-self.y_range[0]) + self.y_range[0]).squeeze()


model = deepnwide(train.users_emb, train.movies_emb, train.users_ohe, train.movies_ohe, train.interact, 60, train.y_range, 0.5, 100)
model.to(device)
print(model)

deepnwide(
  (emb_UserID): Embedding(70, 60)
  (emb_Gender): Embedding(2, 60)
  (emb_Age): Embedding(7, 60)
  (emb_Occupation): Embedding(19, 60)
  (emb_MovieID): Embedding(2159, 60)
  (h1): Linear(in_features=300, out_features=100, bias=True)
  (h2): Linear(in_features=100, out_features=100, bias=True)
  (h3): Linear(in_features=100, out_features=100, bias=True)
  (dropout1): Dropout(p=0.5, inplace=False)
  (dropout2): Dropout(p=0.5, inplace=False)
  (dropout3): Dropout(p=0.5, inplace=False)
  (last_layer): Linear(in_features=404, out_features=1, bias=True)
)


In [31]:
class twoembeds(torch.nn.Module):
    """
    Hyperparameters:
        module__size_emb
    """
    def __init__(self, size_emb, y_range):
        super().__init__()

        # set name of model
        self.name = 'twoembeds'
        self.y_range = y_range

        # User and movie embeddings
        self.emb_UserID = nn.Embedding(train.nusers, size_emb)
        self.emb_MovieID = nn.Embedding(train.nmovies, size_emb)
        self.emb_UserID.weight.data.uniform_(-.01, .01)
        self.emb_MovieID.weight.data.uniform_(-.01, .01)
        
        # User and movie embeddings weights
        self.emb_UserID_b = nn.Embedding(train.nusers, 1)
        self.emb_MovieID_b = nn.Embedding(train.nmovies, 1)
        self.emb_UserID_b.weight.data.uniform_(-.01, .01)
        self.emb_MovieID_b.weight.data.uniform_(-.01, .01)
 

    def forward(self, X):
        user_emb = X[0]
        user_ohe = X[1]
        movie_emb = X[2]
        movie_ohe = X[3]
        interact = X[4]

        UserID = user_emb[:, 0]
        MovieID = movie_emb[:, 0]

        user_emb = self.emb_UserID(UserID)
        movie_emb = self.emb_MovieID(MovieID)

        mult = (user_emb * movie_emb).sum(1)

        # add bias
        multb = mult + self.emb_UserID_b(UserID).squeeze() + self.emb_MovieID_b(MovieID).squeeze()

        multb = multb.float()

        return (torch.sigmoid(multb) * (self.y_range[1]-self.y_range[0]) + self.y_range[0]).squeeze()

        return multb


model = twoembeds(15, train.y_range)
model.to(device)
print(model)

twoembeds(
  (emb_UserID): Embedding(70, 15)
  (emb_MovieID): Embedding(2159, 15)
  (emb_UserID_b): Embedding(70, 1)
  (emb_MovieID_b): Embedding(2159, 1)
)


In [32]:
class ncf(torch.nn.Module):
    """
    Neural Collaborative Filtering: https://arxiv.org/pdf/1708.05031.pdf
    There is a matrix factorization part in this model and a deep learning one.
    
    Hyper parameters:
    - module__size_emb
    - module__dropout
    - module__linear_size
    """
    def __init__(self, users_emb, movies_emb, users_ohe, movies_ohe, interact, size_emb, dropout, linear_size):
        super().__init__()

        # set name of model
        self.name = 'ncf'
        
        ### GMF part
        # user embeddings
        self.gmf_embuserid = nn.Embedding(len(torch.unique(users_emb[:, 0])), size_emb)
        self.gmf_embuserid.weight.data.uniform_(-.01, .01)
        self.gmf_embgender = nn.Embedding(len(torch.unique(users_emb[:, 1])), size_emb)
        self.gmf_embgender.weight.data.uniform_(-.01, .01)
        self.gmf_embage = nn.Embedding(len(torch.unique(users_emb[:, 2])), size_emb)
        self.gmf_embage.weight.data.uniform_(-.01, .01)
        self.gmf_embocc = nn.Embedding(len(torch.unique(users_emb[:, 3])), size_emb)
        self.gmf_embocc.weight.data.uniform_(-.01, .01)
        # movie embeddings
        self.gmf_embmovieid = nn.Embedding(len(torch.unique(movies_emb[:, 0])), size_emb*4-len(train.movies_ohe_columns))
        self.gmf_embmovieid.weight.data.uniform_(-.01, .01)
        
        
        ### MLP part
        # user embeddings
        self.mlp_embuserid = nn.Embedding(len(torch.unique(users_emb[:, 0])), size_emb)
        self.mlp_embuserid.weight.data.uniform_(-.01, .01)
        self.mlp_embgender = nn.Embedding(len(torch.unique(users_emb[:, 1])), size_emb)
        self.mlp_embgender.weight.data.uniform_(-.01, .01)
        self.mlp_embage = nn.Embedding(len(torch.unique(users_emb[:, 2])), size_emb)
        self.mlp_embage.weight.data.uniform_(-.01, .01)
        self.mlp_embocc = nn.Embedding(len(torch.unique(users_emb[:, 3])), size_emb)
        self.mlp_embocc.weight.data.uniform_(-.01, .01)
        # movie embeddings
        self.mlp_embmovieid = nn.Embedding(len(torch.unique(movies_emb[:, 0])), size_emb)
        self.mlp_embmovieid.weight.data.uniform_(-.01, .01)
        # hidden layers
        self.h1 = nn.Linear(5*size_emb+len(train.movies_ohe_columns), linear_size)
        self.h2 = nn.Linear(linear_size, linear_size)
        self.h3 = nn.Linear(linear_size, linear_size)
        # Dropout layers
        self.dropout1 = nn.Dropout(p=dropout)
        self.dropout2 = nn.Dropout(p=dropout)
        self.dropout3 = nn.Dropout(p=dropout)
        
        # final dense layer 
        self.last_layer = nn.Linear(size_emb*4+linear_size, 1)
                                       
    def forward(self, X):
        user_emb = X[0]
        user_ohe = X[1]
        movie_emb = X[2]
        movie_ohe = X[3]
        interact = X[4]
        
        UserID = user_emb[:, 0]
        Gender = user_emb[:, 1]
        Age = user_emb[:, 2]
        Occupation = user_emb[:, 3]
        MovieID = movie_emb[:, 0]

        # GMF part
        gmf_embuserid = self.gmf_embuserid(UserID)
        gmf_embgender = self.gmf_embgender(Gender)
        gmf_embage = self.gmf_embage(Age)
        gmf_embocc = self.gmf_embocc(Occupation)
        gmf_embmovieid = self.gmf_embmovieid(MovieID)

        gmf_user_vector = torch.cat([gmf_embuserid,
                                    gmf_embgender,
                                    gmf_embage,
                                    gmf_embocc],
                                    dim=1)

        gmf_movie_vector = torch.cat([gmf_embmovieid, movie_ohe], 1)
        
        gmf_vector = (gmf_user_vector * gmf_movie_vector)

        
        # MLP part
        mlp_embuserid = self.mlp_embuserid(UserID)
        mlp_embgender = self.mlp_embgender(Gender)
        mlp_embage = self.mlp_embage(Age)
        mlp_embocc = self.mlp_embocc(Occupation)
        mlp_movieid = self.mlp_embmovieid(MovieID)
        
        mlp_vector = torch.cat([mlp_embuserid,
                                mlp_embgender,
                                mlp_embage,
                                mlp_embocc,
                                mlp_movieid,
                                movie_ohe],
                                dim=1)
        mlp_vector = F.relu(self.dropout1(self.h1(mlp_vector)))
        mlp_vector = F.relu(self.dropout2(self.h2(mlp_vector)))
        mlp_vector = F.relu(self.dropout3(self.h3(mlp_vector)))

        # Fusion
        result = torch.cat([gmf_vector, mlp_vector], dim=1)
        result = self.last_layer(result)
        
        return (torch.sigmoid(result) * (5-1) + 1).squeeze()

model = ncf(train.users_emb, train.movies_emb, train.users_ohe, train.movies_ohe, train.interact, 60, 0.5, 200)
model.to(device)
print(model)

ncf(
  (gmf_embuserid): Embedding(70, 60)
  (gmf_embgender): Embedding(2, 60)
  (gmf_embage): Embedding(7, 60)
  (gmf_embocc): Embedding(19, 60)
  (gmf_embmovieid): Embedding(2159, 222)
  (mlp_embuserid): Embedding(70, 60)
  (mlp_embgender): Embedding(2, 60)
  (mlp_embage): Embedding(7, 60)
  (mlp_embocc): Embedding(19, 60)
  (mlp_embmovieid): Embedding(2159, 60)
  (h1): Linear(in_features=318, out_features=200, bias=True)
  (h2): Linear(in_features=200, out_features=200, bias=True)
  (h3): Linear(in_features=200, out_features=200, bias=True)
  (dropout1): Dropout(p=0.5, inplace=False)
  (dropout2): Dropout(p=0.5, inplace=False)
  (dropout3): Dropout(p=0.5, inplace=False)
  (last_layer): Linear(in_features=440, out_features=1, bias=True)
)


### Skorch callbacks

In [33]:
# Earlystopping callback
earlystopping = EarlyStopping(monitor='valid_loss', patience=5, threshold=0.005)

In [34]:
# RMSE callback
def rmseloss(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))

rmse_scorer = make_scorer(rmseloss)

epoch_rmse = EpochScoring(rmse_scorer, name='rmse_score', lower_is_better=True)

In [35]:
# Checkpoint callback
checkpoint = Checkpoint(monitor='rmse_score_best', f_params='params.pt', f_optimizer='optimizer.pt', f_history='history.json', f_pickle='model')

In [36]:
# Learning rate scheduler callback
lr_scheduler = LRScheduler(policy="StepLR", step_size=7, gamma=0.1)

In [37]:
# Progressbar callback
progressbar = ProgressBar()

In [38]:
# Tensorboard
writer = SummaryWriter()
#%tensorboard --logdir 'runs/'

### Neural Collaborative Filtering

#### Manually specify hyperparamers 

In [39]:
ncfnet = NeuralNet(
    ncf,
    module__users_emb=train.users_emb,
    module__movies_emb=train.movies_emb,
    module__users_ohe=train.users_ohe,
    module__movies_ohe=train.movies_ohe,
    module__interact=train.interact,
    module__size_emb=30,
    module__dropout=0.5,
    module__linear_size=200,
    module__y_range=train.y_range,#### Manually specify hyperparamers 
    max_epochs=30,
    lr=0.0005,
    optimizer=torch.optim.Adam,
    criterion=torch.nn.MSELoss,
    device=device,
    iterator_train__batch_size=1024,
    iterator_train__num_workers=0,
    iterator_train__shuffle=True,
    iterator_valid__batch_size=4096,
    train_split=predefined_split(valid_dataset),
    callbacks=[
               earlystopping,
               epoch_rmse,
               #checkpoint,
               lr_scheduler,
               #TensorBoard(writer),
               #progressbar
               ]
)

In [40]:
#ncfnet.fit(train_dataset)

#### GridSearchCV

In [None]:
params = {
    'lr': [0.001, 0.0005, 0.0001],
    'module__size_emb': [30, 60, 120],
    'module__dropout': [0.2, 0.5],
    'module__linear_size': [100, 150, 200, 400]
}
gs = GridSearchCV(ncfnet,
                  params,
                  verbose=50,
                  refit=False,
                  #pre_dispatch=8,
                  n_jobs=8,
                  cv=2,
                  scoring='neg_mean_squared_error')

X_ds = SliceDataset(train, idx=0)
y_ds = SliceDataset(train, idx=1)
gs.fit(X_ds, y_ds)

print(gs.best_score_, gs.best_params_)

### Deep and Wide

#### Manually specify hyperparamers 

In [None]:
deepnwidenet = NeuralNet(
    deepnwide,
    module__users_emb=train.users_emb,
    module__movies_emb=train.movies_emb,
    module__users_ohe=train.users_ohe,
    module__movies_oh#### Manually specify hyperparamers e=train.movies_ohe,
    module__interact=train.interact,
    module__size_emb=30,
    module__y_range=train.y_range,
    module__dropout=0.2,
    max_epochs=30,
    lr=0.001,
    optimizer=torch.optim.Adam,
    criterion=torch.nn.MSELoss,
    device=device,
    iterator_train__batch_size=1024,
    iterator_train__num_workers=0,
    iterator_train__shuffle=True,
    iterator_valid__batch_size=4096,
    train_split=predefined_split(valid_dataset),
    callbacks=[
               earlystopping,
               epoch_rmse,
               #checkpoint,
               lr_scheduler,
               #TensorBoard(writer),
               #progressbar
               ]
)

In [None]:
#deepnwidenet.fit(train_dataset)

#### GridsearchCV

In [None]:
params = {
    'lr': [0.001, 0.01],
    'module__size_emb': [30, 60, 120],
    'module__dropout': [0.2, 0.5]
}
gs = GridSearchCV(deepnwidenet,
                  params,
                  verbose=50,
                  refit=False,
                  #pre_dispatch=8,
                  n_jobs=8,
                  cv=3,
                  scoring='neg_mean_squared_error')

X_ds = SliceDataset(train, idx=0)
y_ds = SliceDataset(train, idx=1)
gs.fit(X_ds, y_ds)

print(gs.best_score_, gs.best_params_)

### Two embeddings - Basic matrix factorization

#### Manually specify hyperparamers 

In [None]:
twoembedsnet = NeuralNet(
    twoembeds,
    module__size_emb=128,
    module__y_range=train.y_range,
    max_epochs=30,
    lr=0.001,
    optimizer=torch.optim.Adam,
    criterion=torch.nn.MSELoss,
    device=device,
    iterator_train__batch_size=4096,
    iterator_train__num_workers=0,
    iterator_train__shuffle=True,
    iterator_valid__batch_size=4096,
    train_split=predefined_split(valid_dataset),
    callbacks=[earlystopping,
               epoch_rmse,
               #checkpoint,
               lr_scheduler]
)

In [None]:
twoembedsnet.fit(train_dataset)

#### GridSearchCV

In [None]:
params = {
    'lr': [0.001, 0.01],
    'module__size_emb': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
}
gs = GridSearchCV(twoembedsnet,
                  params,
                  verbose=50,
                  refit=False,
                  #pre_dispatch=8,
                  n_jobs=8,
                  cv=3,
                  scoring='neg_mean_squared_error')

X_ds = SliceDataset(train, idx=0)
y_ds = SliceDataset(train, idx=1)
gs.fit(X_ds, y_ds)

print(gs.best_score_, gs.best_params_)

### Benchmark with scikit-surprise SVD algorithm

In [605]:
#!pip install surprise

In [608]:
from surprise import NormalPredictor
from surprise import SVD
from surprise import Dataset
from surprise import Reader
from surprise import accuracy
from surprise.model_selection import cross_validate, train_test_split, KFold

In [626]:
train[dataloaders['train'].dataset.indices][1]

tensor([4., 5., 3.,  ..., 1., 4., 4.])

In [628]:
user = train[dataloaders['train'].dataset.indices][0][0][:, 0].data.numpy()
movie = train[dataloaders['train'].dataset.indices][0][2][:, 0].data.numpy()
y = train[dataloaders['train'].dataset.indices][1].data.numpy()
df = pd.DataFrame({'user': user, 'movie': movie, 'y': y})
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(df[['user', 'movie', 'y']], reader)

In [629]:
data = Dataset.load_from_df(train.ratings.loc[dataloaders['train'].dataset.indices, ['UserID', 'MovieID', 'Rating']], reader)

In [630]:
a = train.ratings.loc[dataloaders['train'].dataset.indices, ['UserID', 'MovieID', 'Rating']]
b = pd.DataFrame({'UserID': user, 'MovieID': movie, 'Rating': y})

In [631]:
#data = Dataset.load_builtin('ml-1m')

In [632]:
# sample random trainset and testset
# test set is made of 25% of the ratings.
trainset, testset = train_test_split(data, test_size=.25)

# We'll use the famous SVD algorithm.
algo = SVD()

# Train the algorithm on the trainset, and predict ratings for the testset
algo.fit(trainset)
predictions = algo.test(testset)

# Then compute RMSE
accuracy.rmse(predictions)

RMSE: 0.8918


0.8917552064774448