# Step 4. Modelling

**Author:** Danis Alukaev <br>
**Email:** d.alukaev@innopolis.university <br>

This notebook collects my attempts to construct a baseline model. I considered the well-known approach proposed by S. Rendle in ["Factorization Machines, 2010"](https://www.csie.ntu.edu.tw/~b97053/paper/Rendle2010FM.pdf). In short, this model extends traditional matrix factorization by also learning pairwise interactions between different feature values.  

## 0. Prerequisites

In [1]:
import pandas as pd
from pathlib import Path
import time
import math

import torch as torch
import torch.utils.data as data
import torch.nn as nn
import torch.optim as optim

In [2]:
# Directory holding the MovieLens-1M tables and the path of each table inside it.
DATA_DIR = Path("./data/movielens-1m/ml-1m")
MOVIES_FILE_PATH, USERS_FILE_PATH, RATINGS_FILE_PATH = (
    DATA_DIR / file_name for file_name in ("movies.dat", "users.dat", "ratings.dat")
)

# Options handed to every pd.read_csv call below. The tables use the
# multi-character separator '::', which pandas parses with its python engine.
ENCODING = 'latin-1'
ENGINE = 'python'

In [3]:
# Run on the GPU when CUDA is usable; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [4]:
# Mini-batch size used by both the train and the test DataLoader.
batch_size = 1024

# Seed PyTorch's global RNG so that weight initialisation and batch shuffling
# are repeatable across "Restart & Run All".
SEED = 42
torch.manual_seed(SEED);

## 1. Data Preparation

### 1.1 Movielens-1M

In [5]:
# Load the movie table and attach a dense integer code per movie_id
# (codes follow the sorted order of the distinct ids).
names = ['movie_id', 'title', 'genres']
movies = (
    pd.read_csv(MOVIES_FILE_PATH, sep='::', names=names, encoding=ENCODING, engine=ENGINE)
    .assign(movie_index=lambda frame: frame['movie_id'].astype('category').cat.codes)
)

In [6]:
# Load the user table, then give every categorical user attribute a dense
# integer code stored next to it as '<column>_index'.
names = ['user_id', 'gender', 'age', 'occupation', 'zipcode']
users = pd.read_csv(USERS_FILE_PATH, sep='::', names=names, encoding=ENCODING, engine=ENGINE)
for raw_column in ('user_id', 'gender', 'age', 'occupation'):
    users[f'{raw_column}_index'] = users[raw_column].astype('category').cat.codes

In [7]:
# Load the interaction table: one row per (user, movie) rating event.
names = ['user_id', 'movie_id', 'rating', 'time']
ratings = pd.read_csv(
    RATINGS_FILE_PATH,
    names=names,
    sep='::',
    engine=ENGINE,
    encoding=ENCODING,
)

In [8]:
# Enrich every rating with the columns of its movie and of its user
# (left joins keyed on movie_id and user_id respectively).
ratings = (
    ratings
    .join(movies.set_index('movie_id'), on='movie_id')
    .join(users.set_index('user_id'), on='user_id')
)

In [9]:
# Encoded columns fed to the model, in the order they appear in each input row.
features = [
    'user_id_index',
    'movie_index',
    'age_index',
    'gender_index',
    'occupation_index',
]

In [10]:
# Size of each feature's id space = largest category code + 1, taken from the
# table in which the codes were assigned. Counting the distinct values that
# occur in `ratings` under-estimates the size whenever some code never occurs
# there (e.g. a movie nobody rated): the highest code then exceeds the count,
# and after adding offsets those ids would land inside the next feature's
# range and silently share embedding rows with it.
feature_sources = {
    'user_id_index': users,
    'movie_index': movies,
    'age_index': users,
    'gender_index': users,
    'occupation_index': users,
}
features_sizes = {
    column: int(source[column].max()) + 1
    for column, source in feature_sources.items()
}

# Lay the features out back to back in a single shared id space:
# each feature starts where the previous one ended.
next_offset = 0
features_offsets = {}
for k, v in features_sizes.items():
    features_offsets[k] = next_offset
    next_offset += v

In [11]:
# Shift every feature's local codes into the shared id space.
# Vectorised add instead of a per-row Python lambda; the explicit int64 cast
# keeps the small integer dtypes produced by `.cat.codes` from overflowing.
# NOTE: this cell is not idempotent - re-running it adds the offsets again.
for column in features:
    ratings[column] = ratings[column].astype('int64') + features_offsets[column]

In [12]:
# Persist the joined, offset-encoded ratings so the modelling part can start from disk.
prepared_csv_path = DATA_DIR / "movielens-1m.csv"
ratings.to_csv(prepared_csv_path, index=False)

### 1.2 PyTorch dataset

In [13]:
# Reload the prepared table and keep only the model inputs plus the target.
model_columns = features + ['rating']
df = pd.read_csv(DATA_DIR / "movielens-1m.csv").loc[:, model_columns]

In [14]:
# Inputs: one row of integer embedding ids per rating. Target: the rating as float32.
feature_matrix = df[features].to_numpy()
rating_vector = df['rating'].to_numpy()
X = torch.tensor(feature_matrix)
y = torch.tensor(rating_vector).float()
dataset = data.TensorDataset(X, y)

In [15]:
# 90 % of the samples go to training; whatever remains is held out for testing.
dataset_size = len(dataset)
train_n = int(dataset_size * 0.9)
test_n = dataset_size - train_n
splits = [train_n, test_n]

In [16]:
# Seed the split with its own generator so the same rows end up in the
# hold-out set on every run; otherwise scores are not comparable between runs.
split_generator = torch.Generator().manual_seed(42)
trainset, testset = data.random_split(dataset, splits, generator=split_generator)

# Only the training batches need to be reshuffled every epoch; the test loss
# does not depend on the order in which the test batches are visited.
trainloader = data.DataLoader(trainset, batch_size=batch_size, shuffle=True)
testloader = data.DataLoader(testset, batch_size=batch_size, shuffle=False)

## 2. Designing Model

In [17]:
class FactorizationMachine(nn.Module):
    """Implementation of a Factorization Machine according to "Factorization
    Machines" by Rendle.
    https://www.csie.ntu.edu.tw/~b97053/paper/Rendle2010FM.pdf

    All features share one id space: each input row is a list of integer ids,
    and every id owns one scalar weight (``bias``) and one ``k``-dimensional
    factor vector (``embeddings``).
    """

    def __init__(self, n, k):
        """Constructor of the FactorizationMachine class.

        Parameters
        ----------
        n : int
            size of the shared id space (number of rows in both lookup
            tables); anything accepted by ``int()`` works, including a
            0-dim tensor

        k : int
            size of embedding to use

        Returns
        -------
        None
        """
        super().__init__()
        n, k = int(n), int(k)
        self.w0 = nn.Parameter(torch.zeros(1))
        self.bias = nn.Embedding(n, 1)
        self.embeddings = nn.Embedding(n, k)
        self._init_trunc_normal()

    def _init_trunc_normal(self, mean=0., std=0.01):
        """Initialize weights via truncated normal function.

        Implemented according to "An Exploration of Word Embedding
        Initialization in Deep-Learning Tasks" by Kocmi T. and Bojar O.
        https://arxiv.org/pdf/1711.09160.pdf

        Parameters
        ----------
        mean : float (default: 0.00)
            mean of normal distribution

        std : float (default: 0.01)
            standard deviation of normal distribution

        Returns
        -------
        None
        """
        with torch.no_grad():
            # Draw N(0, 1), fold the samples into (-2, 2) with fmod, then
            # rescale by `std` and shift by `mean`.
            self.embeddings.weight.normal_().fmod_(2).mul_(std).add_(mean)
            self.bias.weight.normal_().fmod_(2).mul_(std).add_(mean)

    def forward(self, x):
        """Compute interactions using Lemma 3.1.

        Parameters
        ----------
        x : torch.LongTensor of shape (batch, fields)
            integer ids into the shared lookup tables, one row per sample

        Returns
        -------
        torch.Tensor of shape (batch,)
            predictions squashed into the open interval (0, 5.5)
        """
        # Drop only the trailing size-1 embedding axis. A bare squeeze() would
        # also drop the batch axis (batch of one row) or the fields axis (one
        # feature per row) and make the following sum(1) fail.
        bias = self.bias(x).squeeze(-1).sum(1)
        embeded = self.embeddings(x)
        # Lemma 3.1: sum_{i<j} <v_i, v_j> = 0.5 * sum_f ((sum_i v_if)^2 - sum_i v_if^2)
        pow_of_sum = embeded.sum(dim=1).pow(2)
        sum_of_pow = embeded.pow(2).sum(dim=1)
        pairwise = (pow_of_sum - sum_of_pow).sum(1) * 0.5
        # NOTE(review): the 5.5 ceiling presumably leaves head-room so that the
        # highest rating stays reachable by the sigmoid - confirm the target range.
        y = torch.sigmoid(self.w0 + bias + pairwise) * 5.5
        return y

In [18]:
class TrainingRoutine:
    """Owns a FactorizationMachine together with its optimizer, learning-rate
    schedule and loss, and drives the train / evaluate loop over two loaders.
    """

    def __init__(self, n, k, trainloader, testloader,
                 lr=1e-3, weight_decay=1e-5, epochs=10, device="cuda"):
        """Build the model and everything needed to optimise it.

        Parameters
        ----------
        n : int
            size of feature vector

        k : int
            size of embedding to use

        trainloader : torch.utils.data.DataLoader
            batches used to update the model

        testloader : torch.utils.data.DataLoader
            batches used only for evaluation

        lr : float
            initial learning rate of Adam

        weight_decay : float
            weight decay passed to Adam

        epochs : int
            how many passes over the training data `fit` performs

        device : str
            device the model and every batch are moved to

        Returns
        -------
        None
        """
        self.n, self.k = n, k
        self.trainloader, self.testloader = trainloader, testloader
        self.lr, self.weight_decay = lr, weight_decay
        self.epochs = epochs
        self.device = device

        self.model = FactorizationMachine(n, k).to(device)
        self.optimizer = optim.Adam(self.model.parameters(), lr=lr, weight_decay=weight_decay)
        # Multiply the learning rate by 0.1 at epochs 5, 10, ... (up to `epochs`).
        lr_milestones = list(range(5, epochs + 1, 5))
        self.scheduler = optim.lr_scheduler.MultiStepLR(self.optimizer,
                                                        milestones=lr_milestones,
                                                        gamma=0.1)
        self.criterion = nn.MSELoss().to(device)

    def fit(self):
        "Run every epoch (train, evaluate, step the schedule) and return the model."
        for epoch_idx in range(self.epochs):
            train_mse = self._train_one_epoch()
            test_mse = self._test()
            self.scheduler.step()
            # Both losses are mean squared errors; their square roots are reported.
            train_rmse = math.sqrt(train_mse)
            test_rmse = math.sqrt(test_mse)
            print(f"Epoch #{epoch_idx + 1} | Train loss: {train_rmse:.4f} | Test loss: {test_rmse:.4f} |")
        return self.model

    def _train_one_epoch(self):
        "Do one optimisation pass over the train loader; return the per-sample mean loss."
        self.model.train()
        n_samples = len(self.trainloader.dataset)
        loss_total = 0
        for inputs, targets in self.trainloader:
            inputs_dev = inputs.to(self.device)
            targets_dev = targets.to(self.device)
            self.optimizer.zero_grad()
            batch_loss = self.criterion(self.model(inputs_dev), targets_dev)
            # The criterion averages over the batch, so weight it by the batch
            # size to stay exact when the last batch is smaller.
            loss_total += batch_loss.item() * inputs.shape[0]
            batch_loss.backward()
            self.optimizer.step()
        return loss_total / n_samples

    def _test(self):
        "Evaluate on the test loader without gradients; return the per-sample mean loss."
        self.model.eval()
        n_samples = len(self.testloader.dataset)
        loss_total = 0
        with torch.no_grad():
            for inputs, targets in self.testloader:
                predictions = self.model(inputs.to(self.device))
                batch_loss = self.criterion(predictions, targets.to(self.device))
                loss_total += batch_loss.item() * inputs.shape[0]
        return loss_total / n_samples

In [19]:
# The lookup tables must cover every id that occurs in X, i.e. 0 .. X.max().
# int(...) turns the 0-dim tensor returned by X.max() into the plain integer
# the model constructor documents. 120 is the embedding size k.
training = TrainingRoutine(int(X.max()) + 1, 120, trainloader, testloader, device=device)

In [20]:
# Run all epochs; fit() prints the square root of the train and test MSE after
# each epoch and returns the trained FactorizationMachine.
model = training.fit()

Epoch #1 | Train loss: 0.9423 | Test loss: 0.9132 |
Epoch #2 | Train loss: 0.8988 | Test loss: 0.8989 |
Epoch #3 | Train loss: 0.8757 | Test loss: 0.8840 |
Epoch #4 | Train loss: 0.8510 | Test loss: 0.8732 |
Epoch #5 | Train loss: 0.8274 | Test loss: 0.8661 |
Epoch #6 | Train loss: 0.7913 | Test loss: 0.8621 |
Epoch #7 | Train loss: 0.7856 | Test loss: 0.8613 |
Epoch #8 | Train loss: 0.7815 | Test loss: 0.8610 |
Epoch #9 | Train loss: 0.7780 | Test loss: 0.8608 |
Epoch #10 | Train loss: 0.7746 | Test loss: 0.8608 |


## 3. Inference

In [21]:
# One row per distinct movie id occurring in `ratings`; there 'movie_index'
# already holds the offset id used by the model's lookup tables.
movies = ratings.drop_duplicates('movie_index').copy()

# Build the index tensor once and look both tables up without tracking
# gradients - this is inference only.
movie_ids = torch.tensor(movies['movie_index'].values, device=device).long()
with torch.no_grad():
    movie_embeddings = model.embeddings(movie_ids)
    movie_biases = model.bias(movie_ids)

movies['embedding'] = movie_embeddings.tolist()
# The bias table has shape (n_movies, 1); flatten it to one scalar per row.
movies['bias'] = movie_biases.squeeze(1).cpu().numpy()

In [22]:
# Look the embedding ids up in `ratings` instead of hard-coding them, so they
# stay correct whatever offsets the preparation step assigned.
# NOTE(review): assumes the raw tables code male users as 'M' and the
# 18-25 age group as 18 - confirm against users.dat.
man_index = int(ratings.loc[ratings['gender'] == 'M', 'gender_index'].iloc[0])
age18_25_index = int(ratings.loc[ratings['age'] == 18, 'age_index'].iloc[0])

man_embedding = model.embeddings(torch.tensor(man_index, device=device))
age18_25_embedding = model.embeddings(torch.tensor(age18_25_index, device=device))
metadata_embedding = man_embedding + age18_25_embedding

# Score of a movie for this profile: its bias plus the dot product between its
# embedding and the summed profile embedding. Terms that are the same for every
# movie are omitted because they do not change the ordering.
rankings = movie_biases.squeeze() + (metadata_embedding * movie_embeddings).sum(1)
movies.iloc[rankings.argsort(descending=True).cpu()]['title'].head(10).tolist()

['Usual Suspects, The (1995)',
 'Shawshank Redemption, The (1994)',
 'American Beauty (1999)',
 'Godfather, The (1972)',
 'Star Wars: Episode V - The Empire Strikes Back (1980)',
 'Braveheart (1995)',
 'Life Is Beautiful (La Vita è bella) (1997)',
 'Sanjuro (1962)',
 "Schindler's List (1993)",
 'Star Wars: Episode IV - A New Hope (1977)']