# Recommender System Approaches
This notebooks was written for the purpose of experimenting with different algorithms in recommender systems. 

We reserve the following assumptions for all the algorithms:
- Implicit feedback with binary data. We only know if a user interacted with an item (1), or not (0).
- Data is split using the user-based, time-ordered, leave-one-out strategy. This means that for each users we sort their interactions and split into train/val/test in the following way. The last interaction is the test data. The one before is the validation data. The rest is the training data.
- We experiment on 1 month extract of LFM2b (last month)
- Evaluate on NDCG and Hit Ratio

In [99]:
from protorec_dataset import get_protorecdataset_dataloader
from utilities.utils import reproducible, print_results
from utilities.eval import Evaluator

from scipy import sparse as sp
from scipy.sparse import linalg as sp_lin
import scipy as sc
import numpy as np
import bottleneck as bn
from functools import partial

from tqdm.notebook import tqdm,trange

from torch import nn 
from utilities.utils import general_weight_init

SEED = 1391075

In [2]:
train_loader = get_protorecdataset_dataloader(
                    data_path='./data/lfm2b-1m',
                    split_set='train',
                    n_neg=10,
                    neg_strategy='uniform',
                    batch_size=64,
                    shuffle=True,
                    num_workers=8,
)
val_loader = get_protorecdataset_dataloader(
                    data_path='./data/lfm2b-1m',
                    split_set='val',
                    n_neg=99,
                    neg_strategy='uniform',
                    batch_size=64,
                    num_workers=8,
)

test_loader = get_protorecdataset_dataloader(
                    data_path='./data/lfm2b-1m',
                    split_set='test',
                    n_neg=99,
                    neg_strategy='uniform',
                    batch_size=64,
                    num_workers=8,
)

Loading data
Built ProtoRecDataset module 
- data_path: ./data/lfm2b-1m 
- n_users: 3555 
- n_items: 77985 
- n_interactions: 870255 
- split_set: train 
- n_neg: 10 
- neg_strategy: uniform 

Loading data
Built ProtoRecDataset module 
- data_path: ./data/lfm2b-1m 
- n_users: 3555 
- n_items: 77985 
- n_interactions: 3555 
- split_set: val 
- n_neg: 99 
- neg_strategy: uniform 

Loading data
Built ProtoRecDataset module 
- data_path: ./data/lfm2b-1m 
- n_users: 3555 
- n_items: 77985 
- n_interactions: 3555 
- split_set: test 
- n_neg: 99 
- neg_strategy: uniform 



# Random Algorithm
For each user, recommend items sampled u.a.r 

In [3]:
reproducible(SEED)

evaluator = Evaluator(test_loader.dataset.n_users)

for u_idxs, i_idxs, labels in test_loader:
    n_batch_users, n_batch_items = i_idxs.shape
    # Generate random scores
    out = np.random.rand(n_batch_users,n_batch_items)
    
    evaluator.eval_batch(out)

metrics_values = evaluator.get_results()
print_results(metrics_values)

ndcg@1     : 0.007
hit_ratio@1 : 0.007
ndcg@3     : 0.020
hit_ratio@3 : 0.030
ndcg@5     : 0.028
hit_ratio@5 : 0.049
ndcg@10    : 0.046
hit_ratio@10 : 0.106
ndcg@50    : 0.127
hit_ratio@50 : 0.495


# Popular Items
For each item, assign a score that depends on the popularity of the item

In [4]:
reproducible(SEED)

evaluator = Evaluator(test_loader.dataset.n_users)

for u_idxs, i_idxs, labels in test_loader:
    
    # Look-up of the popularity (compared to the training data)
    out = test_loader.dataset.pop_distribution[i_idxs]
    
    evaluator.eval_batch(out)

metrics_values = evaluator.get_results()
print_results(metrics_values)

ndcg@1     : 0.046
hit_ratio@1 : 0.046
ndcg@3     : 0.081
hit_ratio@3 : 0.106
ndcg@5     : 0.102
hit_ratio@5 : 0.156
ndcg@10    : 0.132
hit_ratio@10 : 0.248
ndcg@50    : 0.205
hit_ratio@50 : 0.585


# SVD (constrainted matrix factorization)

In [None]:
csr_matrix = train_loader.dataset.csr_matrix
csr_matrix = csr_matrix.asfptype() # casting to float

u,s,vt = sp_lin.svds(csr_matrix,k=100)

users_factors = u * s
items_factors = vt.T

reproducible(SEED)

evaluator = Evaluator(test_loader.dataset.n_users)

for u_idxs, i_idxs, labels in test_loader:
    
    
    batch_users = users_factors[u_idxs]
    batch_items = items_factors[i_idxs]
    
    out = (batch_items * batch_users[:,None,:]).sum(axis=-1) # Carrying out the dot product
    
    
    evaluator.eval_batch(out)

metrics_values = evaluator.get_results()
print_results(metrics_values)

ndcg@1     : 0.188
hit_ratio@1 : 0.188
ndcg@3     : 0.272
hit_ratio@3 : 0.335
ndcg@5     : 0.307
hit_ratio@5 : 0.419
ndcg@10    : 0.350
hit_ratio@10 : 0.551
ndcg@50    : 0.419
hit_ratio@50 : 0.862


# User-based KNN (Collaborative Filtering)
There are numerous extensions of KNN. Here we just consider variation in terms of the similarity functions used.
In the end, we generate a n_user x n_user similarity matrix. The prediction for an item i and and user u is given by the weighted average of the similarities of user u and its top-k neighbours. 

In [4]:
csr_matrix = train_loader.dataset.csr_matrix

def take_only_top_k(sim_mtx,k=100):
    # This method slims down the similarity matrix by only picking the top-k most similar items. It also allows a faster computation of the prediction (as you can see in evaluate_user_knn and evaluate_item_knn)
    
    new_data = []
    new_indices = []
    new_indptr = [0]
    
    n_entities = sim_mtx.shape[0]
    
    cumulative_sum = 0
    
    for idx in range(n_entities):
        start_idx = sim_mtx.indptr[idx]
        end_idx = sim_mtx.indptr[idx + 1]
        
        data = sim_mtx.data[start_idx:end_idx]
        ind = sim_mtx.indices[start_idx:end_idx]
        
        # Avoding taking the user/item itself
        self_idx = np.where(ind == idx)[0][0]
        data[self_idx] = 0.
        
        top_k_indxs = np.argsort(-data)[:k] 
        
        top_k_data = data[top_k_indxs]
        top_k_indices = ind[top_k_indxs]
        
        new_data +=  list(top_k_data)
        new_indices += list(top_k_indices)
        
        cumulative_sum += len(top_k_data)
        
        new_indptr.append(cumulative_sum)
        
        
    return sp.csr_matrix((new_data,new_indices,new_indptr),shape=sim_mtx.shape)


def evaluate_user_knn(matrix,sim_fun,k=100):
    # matrix is the user x item binary matrix
    # sim_fun is the function that carries out the similarity between entities in the matrix (always considering the entities on the rows)
    # k is the number of neighbours
    
    
    sim_mtx = take_only_top_k(sim_fun(matrix),k)
    
    pred_mtx = sim_mtx @ matrix 
    pred_mtx = pred_mtx.toarray() # Ugly solution but I spent alreday too much on this

    reproducible(SEED)

    evaluator = Evaluator(test_loader.dataset.n_users)

    for u_idxs, i_idxs, labels in test_loader:

        out = pred_mtx[u_idxs[:,None],i_idxs]
        evaluator.eval_batch(out)

    metrics_values = evaluator.get_results()
    print_results(metrics_values)
    
csr_matrix = train_loader.dataset.csr_matrix

def evaluate_item_knn(matrix,sim_fun,k=100):
    
    sim_mtx = take_only_top_k(sim_fun(matrix.T),k)
    
    pred_mtx = matrix @ sim_mtx.T # Note that the item matrix has to be transposed since we took the top-k for each row!
    pred_mtx = pred_mtx.toarray() # Ugly solution but I spent alreday too much on this
    
    reproducible(SEED)

    evaluator = Evaluator(test_loader.dataset.n_users)

    for u_idxs, i_idxs, labels in test_loader:
        
        out = pred_mtx[u_idxs[:,None],i_idxs]
        evaluator.eval_batch(out)

    metrics_values = evaluator.get_results()
    print_results(metrics_values)

## Similarity Estimation

#### Jaccard Similarity
Defined a $\frac{|I_x \cap I_y|}{|I_x \cup I_y|}$ where $I_x$ and $I_y$ are the set of items interacted by user x and y respectively. 

**Attempt to a intuitive explanation/reading the formula**:
Two users are considered similar if they have a high number of items that both have consumed. This quantity is normalized by their 'total' items consumed. The last part allows to compare users with different number of items consumed. 

In [5]:
def compute_jaccard_sim_mtx(matrix):
    jaccard_sim_mtx = (matrix @ matrix.T)

    counts = np.array(matrix.sum(axis=1)).squeeze()
    try:
        union = counts.T + counts - jaccard_sim_mtx # unfortunately this consumes too much memory since it materialize a n_entities x n_entities matrix.
        jaccard_sim_mtx = sp.csr_matrix(jaccard_sim_mtx / union)
    except Exception as e:
        print('Resorting to slower method (never checked if it terminates though)')
        rows_nz,cols_nz = jaccard_sim_mtx.nonzero()
        jaccard_sim_mtx[rows_nz,cols_nz] = jaccard_sim_mtx[rows_nz,cols_nz] / (counts[rows_nz] + counts[cols_nz] - jaccard_sim_mtx[rows_nz,cols_nz])    
    
    return jaccard_sim_mtx

evaluate_user_knn(csr_matrix,compute_jaccard_sim_mtx)

ndcg@1     : 0.224
hit_ratio@1 : 0.224
ndcg@3     : 0.309
hit_ratio@3 : 0.372
ndcg@5     : 0.341
hit_ratio@5 : 0.451
ndcg@10    : 0.378
hit_ratio@10 : 0.566
ndcg@50    : 0.403
hit_ratio@50 : 0.662


### Cosine Similarity
Defined as $\frac{\sum_i u_x^i \cdot u_y^i}{\sqrt{\sum_i {u_x^i}^2} \sqrt{\sum_i {u_y^i}^2}}$ where $u_x$ and $u_y$ are the row vectors of the user-item interaction matrix

**Attempt to a intuitive explanation/reading the formula**: (Note that in the case of binary data as it is now, the cosine similarity is ~similar to jaccard. The difference is in the denominator: while for jaccard you simiply sum the elements in the union, for cosine we sum the square roots of the sum of the elements in each set.). The cosine similarity measures the angle between two vectors. Assuming that each item represent a specific dimension, the cosine ~checks how these vectors are oriented in the multidimensional space. If the vectors are oriented in the same direction (regardless of their intensity/length), then they are similar. If they are oriented in opposite directions, then they are dissimilar.


In [6]:
def compute_cosine_sim_mtx(matrix):
    
    norms = sp_lin.norm(matrix,axis=1)

    normalized_matrix = sp.csr_matrix((matrix.T / norms).T)

    cosine_sim_mtx = normalized_matrix @ normalized_matrix.T
    
    return cosine_sim_mtx

evaluate_user_knn(csr_matrix,compute_cosine_sim_mtx)

ndcg@1     : 0.226
hit_ratio@1 : 0.226
ndcg@3     : 0.321
hit_ratio@3 : 0.390
ndcg@5     : 0.354
hit_ratio@5 : 0.471
ndcg@10    : 0.393
hit_ratio@10 : 0.590
ndcg@50    : 0.434
hit_ratio@50 : 0.763


### Pearson Correlation
Defined as $\frac{\sum_i (u_x^i - \bar{u_x}) \cdot (u_y^i - \bar{u_y})}{\sqrt{\sum_i {(u_x^i - \bar{u_x})}^2} \sqrt{\sum_i {(u_y^i - \bar{u_y})}^2}}$ where $u_x$ and $u_y$ are the row vectors of the user-item interaction matrix and $\bar{u_x}$ and $\bar{u_y}$ the respective means

**Attempt to a intuitive explanation/reading the formula**: In general, pearson correlation evaluates if two measurments are linearly dependent to each other (i.e. if you can write Y=aX + b where X,Y are user vector ratings and a,b are parameters). The definition is similar to the cosine similarity but by subtracting the mean of the measurments , computed over *all* the items. Note that, otherwise, the mean would be 1, and thus leading to 0 everywhere when removing the mean (1-1=0). This mean, then, should take into account some information about the number consumed by the users (the higher the number of items consumed, the bigger the mean), similarly what the denominator does for the jaccard similarity. The mean is then removed by the vectors (only non-zero entries) and the definition of cosine similarity is applied. The mean is removed only to these entries because... performance? Removing the mean to 0s makes them go below 0. Does it make sense at all to have a negative similarity? Looking at the results, it seems that the metrics  are even better for the dense definition below



In [7]:
def compute_pearson_sim_mtx(matrix):

    means = np.array(matrix.mean(axis=1)).flatten()
    matrix_no_mean = matrix.copy().asfptype()

    for indx in range(matrix.shape[0]):
        matrix_no_mean.data[matrix.indptr[indx]:matrix.indptr[indx+1]] -= means[indx]

    norms_no_mean = sp_lin.norm(matrix_no_mean,axis=1)

    normalized_matrix_no_mean = sp.csr_matrix((matrix_no_mean.T / norms_no_mean).T)

    pearson_sim_mtx = normalized_matrix_no_mean @ normalized_matrix_no_mean.T

    return pearson_sim_mtx

evaluate_user_knn(csr_matrix,compute_pearson_sim_mtx)

ndcg@1     : 0.226
hit_ratio@1 : 0.226
ndcg@3     : 0.321
hit_ratio@3 : 0.390
ndcg@5     : 0.354
hit_ratio@5 : 0.471
ndcg@10    : 0.393
hit_ratio@10 : 0.590
ndcg@50    : 0.434
hit_ratio@50 : 0.763


In [9]:
# REMOVING THE MEAN TO ALL ENTRIES
### Watch out! We make dense a 3555 x 77985 matrix! Make sure you have enough memory!

def compute_pearson_dense_sim_mtx(matrix):
    
    means = np.array(matrix.mean(axis=1)).flatten()

    matrix_no_mean = matrix - means[:,None]

    norms_no_mean = sc.linalg.norm(matrix_no_mean,axis=1)

    normalized_matrix_no_mean = (matrix_no_mean.T / norms_no_mean).T

    pearson_dense_sim_mtx = sp.csr_matrix(normalized_matrix_no_mean @ normalized_matrix_no_mean.T)
    return pearson_dense_sim_mtx

evaluate_user_knn(csr_matrix,compute_pearson_dense_sim_mtx)

ndcg@1     : 0.226
hit_ratio@1 : 0.226
ndcg@3     : 0.322
hit_ratio@3 : 0.391
ndcg@5     : 0.355
hit_ratio@5 : 0.473
ndcg@10    : 0.393
hit_ratio@10 : 0.590
ndcg@50    : 0.434
hit_ratio@50 : 0.758


### Asymmetric Cosine Similarity
Defined as $\frac{|I_x \cap I_y|}{|I_x|^\alpha |I_y|^{1-\alpha}}$ where $I_x$ and $I_y$ are the set of items interacted by user x and y respectively and $\alpha$ is a hyperparameter. (see also [here](https://dl.acm.org/doi/pdf/10.1145/2507157.2507189))

**Attempt to a intuitive explanation/reading the formula**: The cosine similarity can be generalized in such a way to give different weght to the different cardilnalities of the  item sets. The paper reports this example for the items. Imagine you are comparing two songs and you want to estimates their similarity. Assume that song A is a popular song from (changing the band) Muse e.g. Madness and song B a more unknown song by them e.g. Unintended. If we know that a user consumed B, it is likely that the user is a Muse's fan and will also consume A. If we know that user consumed A, it is not clear if the user just listened to this popular song or is a fan of Muse (and consume song B). This leads to: song B is more similar to song A than song A is to B. For this reason, we scale the weight given a specific alpha. As an example, consider that Song A got 100 users, Song B 12 users and only 10 users listened to both Song A and B

In [10]:
def compute_asymmcosine_sim_mtx(alpha,matrix):
    
    sums = np.squeeze(np.asarray(matrix.sum(axis=1)))

    sums_alpha = np.power(sums,alpha)
    sums_1_min_alpha = np.power(sums,1-alpha)

    denominator = np.outer(sums_alpha,sums_1_min_alpha)

    asymmetric_sim_mtx = sp.csr_matrix((matrix @ matrix.T)/denominator)
    return asymmetric_sim_mtx

evaluate_user_knn(csr_matrix,partial(compute_asymmcosine_sim_mtx,0.8))

ndcg@1     : 0.214
hit_ratio@1 : 0.214
ndcg@3     : 0.305
hit_ratio@3 : 0.373
ndcg@5     : 0.342
hit_ratio@5 : 0.461
ndcg@10    : 0.379
hit_ratio@10 : 0.577
ndcg@50    : 0.430
hit_ratio@50 : 0.796


### Sørensen–Dice Coefficient
Defined as $2\frac{|I_x \cap I_y|}{|I_x| + |I_y|}$ where $I_x$ and $I_y$ are the set of items interacted by user x and y respectively. (see also [here](https://en.wikipedia.org/wiki/S%C3%B8rensen%E2%80%93Dice_coefficient))

**Attempt to a intuitive explanation/reading the formula**:
No real explanation for this one. It basically just give twice the weight to the shared information. (It is also very similar to Jaccard)

In [11]:
def compute_sorensendice_sim_mtx(matrix):
    
    intersection = (matrix @ matrix.T)

    counts = matrix.sum(axis=1)
    counts_sum = counts + counts.T


    sorensedice_sim_mtx = sp.csr_matrix(2 * intersection / counts_sum)
    return sorensedice_sim_mtx

evaluate_user_knn(csr_matrix,compute_sorensendice_sim_mtx)

ndcg@1     : 0.228
hit_ratio@1 : 0.228
ndcg@3     : 0.319
hit_ratio@3 : 0.384
ndcg@5     : 0.358
hit_ratio@5 : 0.479
ndcg@10    : 0.396
hit_ratio@10 : 0.596
ndcg@50    : 0.433
hit_ratio@50 : 0.753


### Tversky Index
Defined as $\frac{|I_x \cap I_y|}{|I_x \cap I_y| + \alpha|I_x - I_y| + \beta|I_y - I_x|}$ where $I_x$ and $I_y$ are the set of items interacted by user x and y respectively and $\alpha$ and $\beta$ are hyperparameters. (see also [here](https://en.wikipedia.org/wiki/Tversky_index))

**Attempt to a intuitive explanation/reading the formula**:
As Asymmetric Cosine, the Tversky index is an asymmetric similarity function. It basically gives a 'different' weighting scheme for the common/ not-in-common items compared to asymmetric cosine. 

In [12]:
def compute_tversky_sim_mtx(alpha,beta,matrix):
    
    intersection = (matrix @ matrix.T)

    counts = matrix.sum(axis=1)
    complement = counts - intersection

    tversky_sim_mtx = sp.csr_matrix(intersection/(intersection + alpha * complement + beta * complement.T))
    return tversky_sim_mtx

evaluate_user_knn(csr_matrix,partial(compute_tversky_sim_mtx,0.8,0.6))

ndcg@1     : 0.228
hit_ratio@1 : 0.228
ndcg@3     : 0.318
hit_ratio@3 : 0.382
ndcg@5     : 0.356
hit_ratio@5 : 0.475
ndcg@10    : 0.394
hit_ratio@10 : 0.593
ndcg@50    : 0.434
hit_ratio@50 : 0.761


# Item-based KNN (Collaborative Filtering)
We generate a n_items x n_items similarity matrix. The prediction for an item i and and user u is given by the weighted average of the similarities of item i and its top-k neighbours that also were consumed by user u. 

**Most of these functions should be further optimized in order to work for huge similarity matrices**

### Jaccard Similarity

In [7]:
evaluate_item_knn(csr_matrix,compute_jaccard_sim_mtx)

Resorting to slower method


0it [00:00, ?it/s]

KeyboardInterrupt: 

### Cosine Similarity

In [8]:
evaluate_item_knn(csr_matrix,compute_cosine_sim_mtx)

ndcg@1     : 0.359
hit_ratio@1 : 0.359
ndcg@3     : 0.462
hit_ratio@3 : 0.535
ndcg@5     : 0.489
hit_ratio@5 : 0.602
ndcg@10    : 0.509
hit_ratio@10 : 0.664
ndcg@50    : 0.519
hit_ratio@50 : 0.702


### Asymmetric Cosine Similarity 

In [9]:
evaluate_item_knn(csr_matrix,partial(compute_asymmcosine_sim_mtx,0.8))

NameError: name 'compute_asymmcosine_sim_mtx' is not defined

# SLIM

In [None]:
from algorithms.slim_parallel import SLIM_parallel

W = SLIM_parallel(sp.csc_matrix(csr_matrix),1e-1,1e-1,100)

Running on 8 cores


0 -> 9748:  60%|████████████████▊           | 5858/9748 [25:57<20:47,  3.12it/s]

In [None]:
W

In [None]:
def evaluate_slim(matrix):
    
    pred_mtx = matrix @ W
    pred_mtx = pred_mtx.toarray() 
    
    reproducible(SEED)
    
    evaluator = Evaluator(test_loader.dataset.n_users)

    for u_idxs, i_idxs, labels in test_loader:

        out = pred_mtx[u_idxs[:,None],i_idxs]
        evaluator.eval_batch(out)

    metrics_values = evaluator.get_results()
    print_results(metrics_values)

# EASE

In [None]:
from algorithms.ease import 

# Deep Matrix Factorization Models
[paper](https://www.ijcai.org/Proceedings/2017/0447.pdf)

In [102]:
class dmf(nn.Module):
    def __init__(self, matrix:sp.spmatrix, middle_dim:int = 100, final_dim:int = 64,device='cpu'):
        
        super(dmf, self).__init__()
        self.matrix = matrix
        self.n_users = matrix.shape[0]
        self.n_items = matrix.shape[1]
        self.middle_dim = middle_dim
        self.final_dim = final_dim
        self.device = device
        
        
        # Going with a two-layer NN
        self.user_nn = nn.Sequential(
            nn.Linear(self.n_items,self.middle_dim),
            nn.ReLU(),
            nn.Linear(self.middle_dim,self.final_dim)
        )
        
        self.item_nn = nn.Sequential(
            nn.Linear(self.n_users,self.middle_dim),
            nn.ReLU(),
            nn.Linear(self.middle_dim,self.final_dim)
        )
        self.cosine_fun = nn.CosineSimilarity(dim=-1)
        
        self.apply(general_weight_init)
    
    
    def forward(self,u_idxs,i_idxs):
        
        # User pass
        u_vec = self.sparse_to_device(self.matrix[u_idxs])
        u_vec = self.user_nn(u_vec)
        
        # Item pass
        # We assume negative sampling has been applied
        i_vec = self.sparse_to_device(self.matrix[:,i_idxs.flatten()]).T
        i_vec = self.item_nn(i_vec)
        i_vec = i_vec.reshape(list(i_idxs.shape) + [-1])
        
        # Cosine 
        sim = self.cosine_fun(u_vec[:,None,:], i_vec)
        
        return sim
        
        
    def sparse_to_device(self,array):
        array.toarray()
        return torch.tensor(array.toarray(),dtype=torch.float).to(self.device)

def train_dmf(model:nn.Module, train_loader: torch.utils.data.DataLoader, n_epochs:int = 50,device = 'cpu'):
    
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
    for epoch in range(n_epochs):

        model.train()

        epoch_train_loss = 0

        for u_idxs, i_idxs, labels in tqdm(train_loader):
            u_idxs = u_idxs.to(device)
            i_idxs = i_idxs.to(device)
            labels = labels.to(device)

            out = model(u_idxs, i_idxs)
            
            loss = nn.BCEWithLogitsLoss()(out.flatten(),labels.flatten())
                        
            epoch_train_loss += loss.item()
            
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            
            

        epoch_train_loss /= len(train_loader)
        print("Epoch {} - Epoch Avg Train Loss {:.3f} \n".format(epoch, epoch_train_loss))

    return model

In [104]:
dmf_model = dmf(csr_matrix)
trained_model = train_dmf(dmf_model,train_loader)

  0%|          | 0/13598 [00:00<?, ?it/s]

Epoch 0 - Epoch Avg Train Loss 0.404 



  0%|          | 0/13598 [00:00<?, ?it/s]

Epoch 1 - Epoch Avg Train Loss 0.403 



  0%|          | 0/13598 [00:00<?, ?it/s]

Epoch 2 - Epoch Avg Train Loss 0.403 



  0%|          | 0/13598 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [105]:
dmf_model

dmf(
  (user_nn): Sequential(
    (0): Linear(in_features=77985, out_features=100, bias=True)
    (1): ReLU()
    (2): Linear(in_features=100, out_features=64, bias=True)
  )
  (item_nn): Sequential(
    (0): Linear(in_features=3555, out_features=100, bias=True)
    (1): ReLU()
    (2): Linear(in_features=100, out_features=64, bias=True)
  )
  (cosine_fun): CosineSimilarity()
)

In [90]:
ran = sp.csr_matrix(sp.random(1000,5000))
model = dmf(sp.csr_matrix(ran))
u_idxs = torch.randint(high=1000,size=(10,))
i_idxs = torch.randint(high=5000,size=(10,5))
model(u_idxs,i_idxs)

torch.Size([10, 5, 64])


tensor([[ 0.0969,  0.0921,  0.1482,  0.2880,  0.0128],
        [-0.0506,  0.1868,  0.0706,  0.1106,  0.0961],
        [ 0.0128, -0.0221, -0.2083,  0.0252, -0.1793],
        [ 0.0917,  0.0296,  0.1250,  0.0412,  0.1849],
        [ 0.1244, -0.0803, -0.1423, -0.0265,  0.0140],
        [ 0.1159,  0.0045,  0.0973,  0.1033, -0.0479],
        [ 0.0464,  0.1173, -0.0350,  0.0558, -0.0056],
        [ 0.0616, -0.1558,  0.2952,  0.2151, -0.0018],
        [ 0.0887,  0.2490,  0.1356,  0.3124,  0.1213],
        [-0.1694, -0.0566, -0.0799, -0.1195,  0.0241]], grad_fn=<DivBackward0>)

In [86]:
n_epochs = 50

for epoch in range(n_epochs):

    self.model.train()

    epoch_train_loss = 0

    for u_idxs, i_idxs, labels in self.train_loader:
        u_idxs = u_idxs.to(self.device)
        i_idxs = i_idxs.to(self.device)
        labels = labels.to(self.device)

        out = self.model(u_idxs, i_idxs)

        loss = self.model.module.loss_func(out, labels)

        epoch_train_loss += loss.item()

        loss.backward()
        self.optimizer.step()
        self.optimizer.zero_grad()

    epoch_train_loss /= len(self.train_loader)
    print("Epoch {} - Epoch Avg Train Loss {:.3f} \n".format(epoch, epoch_train_loss))

    metrics_values = self.val()
    curr_value = metrics_values[self.optimizing_metric]
    print('Epoch {} - Avg Val Value {:.3f} \n'.format(epoch, curr_value))
    tune.report({**metrics_values, 'epoch_train_loss': epoch_train_loss})

    if curr_value > best_value:
        best_value = curr_value
        print('Epoch {} - New best model found (val value {:.3f}) \n'.format(epoch, curr_value))
        with tune.checkpoint_dir(0) as checkpoint_dir:
            torch.save(self.model.module.state_dict(), os.path.join(checkpoint_dir, 'best_model.pth'))


tensor([[4164, 1614, 2889, 3548, 3035],
        [1384, 2288,  449, 4561, 2573],
        [2294, 3702, 1056, 2849, 4373],
        [4383, 4243, 3606, 4536,    1],
        [  66,  271,  970, 1679, 1848],
        [4810, 3644,  186,  599, 4784],
        [2940, 4984, 2397, 1612, 4747],
        [2634, 4905, 3054,  693, 1766],
        [  48, 1929, 2009,  475, 2511],
        [2875,  113, 2030, 3618,  494]])