# Matrix completion

In [None]:
"""
Matrix completion on toy and Movielens datasets
JJV for Deep Learning course, 2022
"""
import torch
from torch import nn
from torch.utils.tensorboard import SummaryWriter
from tqdm import tqdm
import numpy as np
from sklearn.model_selection import train_test_split
import pandas as pd


# Optimiser step size and latent dimension, shared by both datasets.
LEARNING_RATE = 0.1
EMBEDDING_SIZE = 3


# Dataset selector: 'toy' builds a small synthetic low-rank matrix, any other
# value loads the Movielens CSV files from disk.
DATA = 'toy'
# DATA = 'movielens'
if DATA == 'toy':
    N_EPOCHS, DISPLAY_EPOCH_EVERY, BATCH_SIZE = 1000, 100, 50
    N, K, M = 10, 3, 5
    # Random user (N x K) and item (M x K) factors; R = U V^T is the full
    # rating matrix that the models are asked to recover.
    U = np.random.normal(size=(N, K))
    V = np.random.normal(size=(M, K))
    R = U @ V.T
    # One sample per cell of R, in row-major order. Item indices are shifted
    # by N so that they never collide with user indices.
    # (Can also be done using pd.unstack.)
    X = [(row, N + col) for row in range(N) for col in range(M)]
    y = [R[row, col] for row in range(N) for col in range(M)]
else:
    N_EPOCHS, DISPLAY_EPOCH_EVERY, BATCH_SIZE = 50, 2, 1000
    df = pd.read_csv('ml-latest-small/ratings.csv')
    films = pd.read_csv('ml-latest-small/movies.csv')
    df = pd.merge(df, films, on='movieId')
    # Re-encode the raw IDs as contiguous 0-based codes (sorted-ID order).
    user_codes = np.unique(df['userId'], return_inverse=True)[1]
    item_codes = np.unique(df['movieId'], return_inverse=True)[1]
    df['user'] = user_codes
    df['item'] = item_codes
    N, M = df['user'].nunique(), df['item'].nunique()
    # Shift item codes past the user codes so both index ranges are disjoint.
    df['item'] += N
    X = torch.LongTensor(df[['user', 'item']].to_numpy())
    y = torch.Tensor(df['rating'])

# Convert the samples to tensors with explicit dtypes. `torch.as_tensor`
# accepts both the Python lists built for the toy dataset and the tensors
# already created for Movielens (in which case it does not copy), unlike the
# legacy `torch.LongTensor(...)` / `torch.Tensor(...)` type constructors.
# Indices must be int64 for embedding lookups; targets are float32 for MSE.
X = torch.as_tensor(X, dtype=torch.long)
y = torch.as_tensor(y, dtype=torch.float32)
# Hold out 20% of the (user, item, rating) samples for evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=True)
train_dataset = torch.utils.data.TensorDataset(X_train, y_train)
test_dataset = torch.utils.data.TensorDataset(X_test, y_test)
# Mini-batches are reshuffled at every epoch.
train_iter = torch.utils.data.DataLoader(
    train_dataset, batch_size=BATCH_SIZE, shuffle=True)

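- $X$ contains pairs of (user, item) indices. Note that the data has been preprocessed so that user and item indices are disjoint: users are numbered 0 to $N - 1$ and item indices start at $N$ (for `ml-latest-small`, which has 610 users, users are 0 to 609 and item indices start at 610).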
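- $y$ contains the ratings: values between 0.5 and 5 (in half-star steps) for Movielens, and the real-valued entries of $R = U V^T$ for the toy dataset.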

## Toy dataset

1. Write a first model using `nn.Sequential` that computes embeddings of size `EMBEDDING_SIZE` for both user and item considered; concatenates them, then feeds them to a linear layer for scalar prediction. Check `outputs.shape` to debug your code. Observe the train loss.

2. Then modify the `CF` class to get a model that computes $u_i^T v_j$ for an $(i, j)$ user-item pair. What happens if `EMBEDDING_SIZE` is too high, say 20? When it is too small?

## Movielens dataset

1. Evaluate the performance of both models on the Movielens dataset.

2. We would like to combine both models. Indeed, people do not all have the same rating habits: some people rate between 0 and 2, others between 4 and 5. Improve your model so that the predictions are:
$$ f(i, j) = \mu^U_{i} + \mu^I_{j} + u_i^T v_j$$
where $\mu^U$ (resp. $\mu^I$) is a vector of user (resp. item) biases, and $u_i$ and $v_j$ are embeddings of size `EMBEDDING_SIZE`.
3. How would you implement L2 regularization? (Trick: use `weight_decay=1e-4` in your optimizer.) But without this trick, how would you do it?

In [None]:
# Exercise placeholder for the first model (toy dataset, question 1): replace
# the `...` below with layers that embed the user and item indices of each
# sample with vectors of size EMBEDDING_SIZE, concatenate the two embeddings,
# and feed them to a linear layer producing a scalar prediction.
# NOTE(review): the `...` literal is not a layer, so this cell is expected to
# fail until the placeholder has been filled in.
model = nn.Sequential(
    ...  # Your code here
)


class CF(nn.Module):
    """
    Recommender system

    Exercise skeleton for a collaborative-filtering model. Both methods are
    intentionally left empty ("Your code here"); as written, the module
    registers no parameters and `forward` returns None.
    """
    def __init__(self, embedding_size):
        """
        Set up the module.

        embedding_size: requested size of the embeddings; the commented-out
        instantiation in this notebook passes EMBEDDING_SIZE. It is not used
        until the exercise code is written.
        """
        super().__init__()
        # Your code here

    def forward(self, x):
        """
        Compute predictions for a batch.

        x: in this notebook the model is called on batches from `train_iter`
        and on `X_test`, i.e. tensors of (user, item) index pairs.
        NOTE(review): the training loop flattens the result and compares it
        to one rating per sample, so one value per row of x is presumably
        expected -- confirm once implemented.
        """
        # Your code here
        pass


# Training script: minimise the mean squared error between predicted and
# observed ratings, and periodically report train and held-out test MSE.
# Uncomment the next line to train the CF class instead of the nn.Sequential
# model defined earlier.
# model = CF(EMBEDDING_SIZE)
loss_function = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
# optimizer = torch.optim.SGD(model.parameters(), lr=LEARNING_RATE)
losses = []  # Keeps `losses` defined even if N_EPOCHS is 0
for epoch in tqdm(range(N_EPOCHS)):
    model.train()  # Back to training mode after a possible evaluation pass
    losses = []  # Per-batch training losses of the current epoch only
    for indices, target in train_iter:
        # Flatten to one prediction per sample. Unlike `.squeeze()`, this
        # keeps a 1-D tensor even when the batch holds a single sample, so
        # the loss never has to broadcast a 0-d output against the target.
        outputs = model(indices).reshape(-1)
        # print(outputs.shape)
        loss = loss_function(outputs, target)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        losses.append(loss.item())

    if epoch % DISPLAY_EPOCH_EVERY == 0:
        print(f"Epoch {epoch}: Train MSE {np.mean(losses)}")

        # Evaluate on the held-out set in eval mode and without building an
        # autograd graph: no gradients are needed here.
        model.eval()
        with torch.no_grad():
            y_pred = model(X_test).reshape(-1)
            test_loss = loss_function(y_pred, y_test)
        # `.item()` prints a plain float instead of a tensor repr.
        print('Test MSE', test_loss.item())


# Export the learned item embeddings, labelled with movie titles, so that
# they can be explored in TensorBoard's embedding projector.
# The titles come from `df`, which is only built when DATA is not 'toy', so
# the export is skipped for the toy dataset instead of raising a NameError.
if DATA != 'toy':
    # NOTE(review): this assumes the second parameter tensor of the model is
    # an embedding table indexed by the shared user/item index, so that rows
    # N and onwards are the items -- confirm against the model actually
    # trained.
    item_embeddings = list(model.parameters())[1][N:]
    user_meta = pd.DataFrame(np.arange(N), columns=('item',))
    user_meta['title'] = ''
    # One (index, title) row per item, ordered like the embedding rows.
    item_meta = df.sort_values('item')[['item', 'title']].drop_duplicates()
    # Metadata for the full user + item table; not passed to add_embedding
    # below, which only exports the item rows.
    metadata = pd.concat((user_meta, item_meta), axis=0)
    # The context manager closes the writer even if the export fails.
    with SummaryWriter(log_dir='logs/embeddings') as writer:  # TBoard
        writer.add_embedding(
            item_embeddings, metadata=item_meta.values.tolist(),
            metadata_header=item_meta.columns.tolist())
else:
    print('Skipping TensorBoard embedding export: '
          'item titles are only available for the Movielens dataset')