# Collaborative Filtering - Predicting Movie Ratings (MovieLens Dataset)

This notebook implements a collaborative filtering algorithm in PyTorch to predict movie ratings (MovieLens dataset).

Description: 
- MovieLens is a tabular dataset containing user IDs, movie IDs, and the ratings that users gave to movies
- The objective is to predict the movie ratings and recommend movies unseen by the users.

We're going to use the MovieLens 100K dataset, which has 100,000 movie reviews. 

author: Jorge Ivan Avalos Lopez & Jose Alberto Moreno
- python: 3.8.3
- pytorch: 1.6.0
- sklearn: 0.23.1

## 1.- Pre-processing data 

In [None]:
import pandas as pd
import shelve
import numpy as np
import os

# Directory that holds the MovieLens files used throughout the notebook
dataPath = "./Data/"
os.listdir(dataPath)  # Let's check which dataset files are present

In [None]:
# Read the tab-separated ratings file; it has no header row, so the column names are supplied here
data = pd.read_csv(
    dataPath + "u.data", delimiter="\t", header=None, names=["User", "Movie", "Rating", "Timestamp"]
)
data.head()

In [None]:
data.info()  # Let's check the dtypes

In [None]:
data["Rating"].unique()  # Let's check which rating values occur

In [None]:
# Sub-crosstable
# Take a random 0.1% sample of the ratings and pivot it into a User x Movie table
data_sample = data.sample(frac=0.001)
cross_tabulated = pd.crosstab(
    data_sample.User, data_sample.Movie, values=data_sample.Rating, aggfunc="first"
)
cross_tabulated  # mostly NaN: a NaN cell means that user has no rating for that movie in the sample

In [None]:
# Read the mapping from movie ID to title: the file is "|"-separated, has no header row,
# and only its first two columns are kept
movies = pd.read_csv(
    dataPath + "u.item",
    delimiter="|",
    encoding="latin-1",
    header=None,
    usecols=(0, 1),
    names=["Movie", "Title"],
)
movies.head()

In [None]:
# Join the ratings with the movie titles on the shared Movie column
ratings = data.merge(movies, on="Movie")
ratings.head()

In [None]:
# Subtract one from every User and Movie ID so they can be used directly as
# 0-based row indices of the embedding matrices.
# Note: this matters because an ID equal to the number of embedding rows would be
# out of range and the embedding lookup would raise an error.
ratings["User"] = ratings["User"] - 1
ratings["Movie"] = ratings["Movie"] - 1

In [None]:
# Sanity check of the shifted ID range via summary statistics
# ratings["User"].describe() # we observe that User has a min value of 0
ratings["Movie"].describe()

In [None]:
# Persist the merged DataFrame under the "ratings" key; the context manager
# closes the shelf even if the assignment raises
with shelve.open(dataPath + "ratings.db") as shelve_data:
    shelve_data["ratings"] = ratings

## 2.- Building movieDataset Dataset

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
from torch.optim import lr_scheduler, Adam
import torch.nn as nn
from sklearn.model_selection import train_test_split
import shelve
import numpy as np
import math

In [None]:
class movieDataset(Dataset):
    """Train or validation view over the ratings DataFrame stored in a shelve file.

    The DataFrame saved under the ``"ratings"`` key is split with
    ``train_test_split``; depending on ``train`` the instance serves either the
    training rows or the validation rows. Build the training and the validation
    instance with the same integer ``random_state`` so both are cut from the
    same split; with ``random_state=None`` each instance draws its own split
    and the two sets can overlap.

    Args:
        path (str): Path of the shelve file holding the ``"ratings"`` DataFrame.
        transform (callable, optional): Applied to each ``(x, y)`` sample before
            it is returned.
        train (bool): Serve the training split when True, otherwise the
            validation split.
        split_data (dict, optional): ``test_size`` and ``random_state`` passed to
            ``train_test_split``. ``None`` or missing keys fall back to
            ``{"test_size": 0.2, "random_state": None}``.

    Attributes:
        n_users (int): Rows needed by a user embedding table (largest user ID + 1).
        n_movies (int): Rows needed by a movie embedding table (largest movie ID + 1).
    """

    def __init__(self, path, transform=None, train=True, split_data=None):
        super().__init__()

        # Merge the caller's options over the defaults. Building a fresh dict per
        # instance avoids a shared mutable default argument and lets callers
        # specify only the keys they care about.
        split_options = {"test_size": 0.2, "random_state": None}
        if split_data is not None:
            split_options.update(split_data)

        self._path = path
        self._transform = transform
        self._train = train
        self._split_data = split_options

        # Read the dataset from the shelve object
        with shelve.open(path) as data:
            self._ratings = data["ratings"]  # pandas DataFrame

        # Features are the (User, Movie) ID pair, the label is the Rating column
        self._x_data = self._ratings[["User", "Movie"]]
        self._y_data = self._ratings[["Rating"]]

        # Split the dataset into train and validation parts
        self._x_train, self._x_val, self._y_train, self._y_val = train_test_split(
            self._x_data,
            self._y_data,
            test_size=self._split_data["test_size"],
            random_state=self._split_data["random_state"],
        )

        # IDs are used as 0-based embedding indices, so the tables need
        # max(ID) + 1 rows. This equals the number of distinct IDs when the IDs
        # are contiguous, and stays in range when some IDs are missing.
        self.n_users = int(self._ratings["User"].max()) + 1
        self.n_movies = int(self._ratings["Movie"].max()) + 1

        # Cardinality of the split this instance serves
        if self._train:
            self._n_samples = len(self._x_train)
        else:
            self._n_samples = len(self._x_val)

    def __getitem__(self, index):
        """Return the sample at ``index`` from the active split.

        Args:
            index (int): Positional index into the active split.

        Returns:
            tuple: ``(input vector, label vector)`` as pandas objects, or whatever
            ``transform`` returns for that pair when a transform was given.
        """
        if self._train:
            sample = self._x_train.iloc[index, :], self._y_train.iloc[index, :]
        else:
            sample = self._x_val.iloc[index, :], self._y_val.iloc[index, :]

        if self._transform:
            sample = self._transform(sample)

        return sample

    def __len__(self):
        """Return the number of samples in the active split."""
        return self._n_samples


class ToTensor:
    """Callable transform that turns a pandas ``(x, y)`` sample into tensors.

    ``x`` becomes an integer (long) tensor and ``y`` a float tensor with all
    size-1 dimensions squeezed away.
    """

    def __call__(self, sample):
        """Convert one sample.

        Args:
            sample (tuple): ``(x, y)`` pair of objects exposing ``.values``.

        Returns:
            (torch.Tensor, torch.Tensor): long features and float label.
        """
        features, label = sample
        feature_tensor = torch.tensor(features.values).long()
        label_tensor = torch.tensor(label.values).squeeze().float()
        return feature_tensor, label_tensor

## 3.- Building a collaborative Filtering model

In [None]:
class CollFilt(nn.Module):
    """Dot-product collaborative filtering model with user and movie biases.

    Args:
        n_users (int): Number of rows of the user embedding tables.
        n_movies (int): Number of rows of the movie embedding tables.
        n_factors (int): Width of the latent factor vectors.
        output_range (tuple): ``(min, max)`` bounds of the predictions.
    """

    def __init__(self, n_users, n_movies, n_factors, output_range=(0, 5.5)):
        super().__init__()
        self.output_range = output_range

        # Latent factors plus one scalar bias per user
        self.user_factors = nn.Embedding(n_users, n_factors)
        self.user_bias = nn.Embedding(n_users, 1)

        # Latent factors plus one scalar bias per movie
        self.movie_factors = nn.Embedding(n_movies, n_factors)
        self.movie_bias = nn.Embedding(n_movies, 1)

        self.sigmoid = nn.Sigmoid()

    def forward(self, t_input):
        """Predict one rating per row of ``t_input``.

        Args:
            t_input (torch.Tensor): Integer tensor whose column 0 holds user
                indices and column 1 holds movie indices.

        Returns:
            torch.Tensor: One prediction per row, squashed into ``output_range``.
        """
        user_ids, movie_ids = t_input[:, 0], t_input[:, 1]
        interaction = self.user_factors(user_ids) * self.movie_factors(movie_ids)
        bias = self.user_bias(user_ids)[:, 0] + self.movie_bias(movie_ids)[:, 0]
        score = interaction.sum(dim=1) + bias
        return self.sigmoid_range(score, self.output_range)

    def sigmoid_range(self, t_input, output_range):
        """Map ``t_input`` through a sigmoid rescaled to ``(min, max)``."""
        low, high = output_range
        span = high - low
        return span * self.sigmoid(t_input) + low

## 4.- Training and Evaluating the Collaborative Filtering Model

In [None]:
def train_model(
    model,
    loss,
    optimizer,
    scheduler,
    data_train,
    data_val,
    num_epochs=10,
    batch_size=128,
    device="cuda",
    history=None,
):
    """Train ``model`` and report the average train/validation loss after every epoch.

    Args:
        model (nn.Module) : Model to train; it must already live on ``device``
            (only the batches are moved here).
        loss (nn.lossFunction) : Loss function to minimize.
        optimizer (torch.optim.optimizer) : Optimizer algorithm.
        scheduler : Learning rate scheduler; it is stepped once per training
            batch (the granularity ``OneCycleLR`` expects).
        data_train (torch.utils.data.Dataset) : Dataset instance with the training data.
        data_val (torch.utils.data.Dataset) : Dataset instance with the validation data.
        num_epochs (int) : Number of training epochs.
        batch_size (int) : Number of samples per batch.
        device (str) : Device type the batches are moved to.
        history (dict, optional) : If given, the per-epoch average losses are
            appended to the lists under its ``"train"`` and ``"val"`` keys.
    return:
        model (nn.Module) : Model trained (left in evaluation mode).
    """

    # Build the DataLoader objects that produce the batches
    trainloader = DataLoader(dataset=data_train, batch_size=batch_size, shuffle=True)
    valloader = DataLoader(dataset=data_val, batch_size=batch_size, shuffle=False)

    # Number of batches per epoch; len() of a DataLoader is already an integer
    n_iterations_train = len(trainloader)
    n_iterations_val = len(valloader)

    for epoch in range(num_epochs):
        # Validation switches the model to eval mode, so training mode has to be
        # restored at the start of every epoch (matters for dropout/batch norm).
        model.train()
        train_error = 0
        for x_train, y_train in trainloader:
            x_train, y_train = x_train.to(device), y_train.to(device)
            optimizer.zero_grad()
            output = model(x_train)
            batch_loss = loss(output, y_train)
            batch_loss.backward()
            optimizer.step()
            train_error += batch_loss.item()
            scheduler.step()
        train_error_avg = train_error / n_iterations_train
        print("Train -> epoch : {0}/{1}, loss : {2}".format(epoch + 1, num_epochs, train_error_avg))
        if history is not None:
            history.setdefault("train", []).append(train_error_avg)

        # Evaluate once per epoch without tracking gradients
        model.eval()
        with torch.no_grad():
            val_error = 0
            for x_val, y_val in valloader:
                x_val, y_val = x_val.to(device), y_val.to(device)
                output = model(x_val)
                val_error += loss(output, y_val).item()

            val_error_avg = val_error / n_iterations_val
            print(
                "Test -> epoch : {0}/{1}, loss : {2}".format(epoch + 1, num_epochs, val_error_avg)
            )
            if history is not None:
                history.setdefault("val", []).append(val_error_avg)

        print("-" * 50)

    return model

In [None]:
# Location of the shelve file that holds the ratings DataFrame
dataPath = "./Data/ratings.db"
# A fixed random_state makes the train and validation datasets share one split
split_data = {"test_size": 0.2, "random_state": 848}

# Defining the Dataset objects for training and validation
data_train = movieDataset(dataPath, transform=ToTensor(), split_data=split_data)
data_val = movieDataset(dataPath, transform=ToTensor(), train=False, split_data=split_data)

# Hyperparameters of the CollFilt model and the training
n_users = data_train.n_users
n_movies = data_train.n_movies
n_factors = 50
# Use the GPU when one is available, otherwise fall back to the CPU instead of crashing
device = "cuda" if torch.cuda.is_available() else "cpu"
weight_decay = 0.001
output_range = (0, 5.5)
num_epochs = 15
batch_size = 64

# Instantiating the model
model = CollFilt(n_users, n_movies, n_factors, output_range).to(device)

# Defining optimizer and learning rate scheduler
optimizer = torch.optim.Adam(model.parameters(), weight_decay=weight_decay)
# steps_per_epoch equals the number of training batches because train_model
# steps the scheduler once per batch
scheduler = lr_scheduler.OneCycleLR(
    optimizer,
    max_lr=0.05,
    steps_per_epoch=math.ceil(len(data_train) / batch_size),
    epochs=num_epochs,
)
# Defining the loss function
loss = nn.MSELoss()
# Training
model_trained = train_model(
    model,
    loss,
    optimizer,
    scheduler,
    data_train,
    data_val,
    num_epochs=num_epochs,
    batch_size=batch_size,
    device=device,
)

In [None]:
# Save the trained weights (state dict only) next to the dataset. The directory
# is spelled "./Data/" like every other path in this notebook; the lowercase
# spelling only resolves to the same folder on case-insensitive filesystems.
torch.save(model_trained.state_dict(), "./Data/CollFilt")

## 5.- Training and Evaluating the Deep Collaborative Filtering Model

In [None]:
class CollFiltDNN(nn.Module):
    """Collaborative filtering model that feeds both embeddings through a small MLP.

    Args:
        n_users (int): Number of rows of the user embedding table.
        n_movies (int): Number of rows of the movie embedding table.
        n_factors (int): Width of each embedding vector.
        dict_arch (dict): Layer sizes. ``dict_arch["layer1"]["input_dim"]`` is the
            number of hidden units and ``dict_arch["layer2"]["input_dim"]`` the
            width of the final linear layer (despite the key name, both are
            used as output widths).
        output_range (tuple): ``(min, max)`` bounds of the predictions.
    """

    def __init__(self, n_users, n_movies, n_factors, dict_arch, output_range):
        super().__init__()
        self.output_range = output_range
        self.n_users, self.n_movies, self.n_factors = n_users, n_movies, n_factors
        self.dict_arch = dict_arch

        self.user_factors = nn.Embedding(n_users, n_factors)
        self.movie_factors = nn.Embedding(n_movies, n_factors)

        # The MLP input is the user vector concatenated with the movie vector
        concat_width = 2 * self.n_factors
        hidden_width = self.dict_arch["layer1"]["input_dim"]
        final_width = self.dict_arch["layer2"]["input_dim"]

        # Define layers
        self.layer1 = nn.Sequential(nn.Linear(concat_width, hidden_width), nn.ReLU())
        self.layer2 = nn.Sequential(nn.Linear(hidden_width, final_width))

        self.dnn = nn.Sequential(self.layer1, self.layer2)
        self.sigmoid = nn.Sigmoid()

    def forward(self, t_input):
        """Predict one rating per row of ``t_input``.

        Args:
            t_input (torch.Tensor): Integer tensor whose column 0 holds user
                indices and column 1 holds movie indices.

        Returns:
            torch.Tensor: Column 0 of the network output after it has been
            squashed into ``output_range``.
        """
        user_vecs = self.user_factors(t_input[:, 0])
        movie_vecs = self.movie_factors(t_input[:, 1])
        features = torch.cat((user_vecs, movie_vecs), dim=1)
        scaled = self.sigmoid_range(self.dnn(features), self.output_range)
        return scaled[:, 0]

    def sigmoid_range(self, t_input, output_range):
        """Map ``t_input`` through a sigmoid rescaled to ``(min, max)``."""
        low, high = output_range
        span = high - low
        return span * self.sigmoid(t_input) + low

In [None]:
# Location of the shelve file that holds the ratings DataFrame
dataPath = "./Data/ratings.db"
# A fixed random_state makes the train and validation datasets share one split
split_data = {"test_size": 0.2, "random_state": 848}

data_train = movieDataset(dataPath, transform=ToTensor(), split_data=split_data)
data_val = movieDataset(dataPath, transform=ToTensor(), train=False, split_data=split_data)

n_users = data_train.n_users
n_movies = data_train.n_movies
n_factors = 50
# Use the GPU when one is available, otherwise fall back to the CPU instead of crashing
device = "cuda" if torch.cuda.is_available() else "cpu"
weight_decay = 0.01
output_range = (0, 5.5)
num_epochs = 10
batch_size = 64


# "input_dim" is the key CollFiltDNN reads for each layer's output width:
# 100 hidden units followed by a single output unit
dict_arch = {"layer1": {"input_dim": 100}, "layer2": {"input_dim": 1}}


# Instantiating the model
model = CollFiltDNN(n_users, n_movies, n_factors, dict_arch, output_range).to(device)


# Defining optimizer and a learning rate scheduler
optimizer = torch.optim.Adam(model.parameters(), weight_decay=weight_decay)
# steps_per_epoch equals the number of training batches because train_model
# steps the scheduler once per batch
scheduler = lr_scheduler.OneCycleLR(
    optimizer,
    max_lr=0.05,
    steps_per_epoch=math.ceil(len(data_train) / batch_size),
    epochs=num_epochs,
)
# Defining the loss function
loss = nn.MSELoss()
# Training
model_trained = train_model(
    model,
    loss,
    optimizer,
    scheduler,
    data_train,
    data_val,
    num_epochs=num_epochs,
    batch_size=batch_size,
    device=device,
)