# Model with 2 embeddings and `CosineEmbeddingLoss`

In [23]:
import pandas as pd
# Interaction table for training; it is split into train/validation parts further down.
data = pd.read_csv("../data/diaries_quotes_emb_twitter_interactions.csv")

## 1. Define a model
We have two encoders: one embeds the diary entry and the other embeds the quote. As before, we compare the two embeddings by cosine similarity, and we train the model with [`CosineEmbeddingLoss`](https://pytorch.org/docs/stable/generated/torch.nn.CosineEmbeddingLoss.html).

This loss function measures whether two inputs are similar or dissimilar using cosine similarity, and is typically used for learning nonlinear embeddings or for semi-supervised learning. It expects a target label of `1` for pairs that should be similar and `-1` for pairs that should be dissimilar.

In [22]:
import torch
from torch import nn, optim
from torch.nn import functional as F
from transformers import AutoTokenizer, RobertaModel

# Hugging Face checkpoint name; the tokenizer and both DSSM encoders are loaded from it.
BASE_MODEL = "roberta-base"
# Single tokenizer instance, used by the collate function for diaries and quotes alike.
tokenizer_reddit = AutoTokenizer.from_pretrained(BASE_MODEL)

class DSSM(nn.Module):
    """Two-tower model with one encoder for diaries and one for quotes.

    Each tower is a separate ``RobertaModel`` instance loaded from
    ``BASE_MODEL`` without the pooling layer, so the towers start from the
    same pretrained weights but are trained independently.
    """

    @staticmethod
    def _load_encoder():
        """Return a fresh pretrained encoder with the pooling layer disabled."""
        return RobertaModel.from_pretrained(BASE_MODEL, add_pooling_layer=False)

    def __init__(self):
        """Create the diary tower and the quote tower."""
        super().__init__()
        self.diary_emb = self._load_encoder()
        self.quote_emb = self._load_encoder()

    def forward(self, diary, quote):
        """Encode both inputs and return the raw encoder outputs.

        Args:
            diary: mapping of tokenizer outputs, unpacked as keyword
                arguments into the diary encoder.
            quote: mapping of tokenizer outputs, unpacked as keyword
                arguments into the quote encoder.

        Returns:
            Tuple ``(diary_output, quote_output)`` exactly as produced by the
            two encoders.
        """
        diary_output = self.diary_emb(**diary)
        quote_output = self.quote_emb(**quote)
        return diary_output, quote_output


config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

## 2. Prepare data

In [24]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation (fixed seed for a reproducible split)
# and give both parts a fresh 0..n-1 index.
data_train, data_test = (
    part.reset_index(drop=True)
    for part in train_test_split(data, test_size=0.2, random_state=42)
)

In [25]:
from torch.utils.data import DataLoader
import numpy as np

def collate_batch(batch):
    """Tokenize a list of dataset rows into model-ready tensors.

    Args:
        batch: sequence of rows. Columns 0 and 1 are tokenized as text and
            column 2 becomes the label tensor.
            NOTE(review): presumably (diary, quote, label) - confirm against
            the CSV column order.

    Returns:
        Tuple ``(first_text_tokens, second_text_tokens, labels)`` where the
        first two items are tokenizer outputs padded to the longest sequence
        in the batch and ``labels`` is a tensor built from column 2.
    """
    np_batch = np.array(batch)

    # truncation=True caps every sequence at the tokenizer's maximum model
    # length. Without it a single long text produces more positions than the
    # encoder supports and also inflates the padded batch (and GPU memory).
    tokenizer_kwargs = dict(padding=True, truncation=True, return_tensors='pt')

    first_texts = tokenizer_reddit(np_batch[:, 0].tolist(), **tokenizer_kwargs)
    second_texts = tokenizer_reddit(np_batch[:, 1].tolist(), **tokenizer_kwargs)
    labels = torch.tensor(np_batch[:, 2].tolist())
    return first_texts, second_texts, labels

# Both loaders share the batch size and the tokenizing collate function;
# only the training loader reshuffles its rows every epoch.
BATCH_SIZE = 16
_loader_kwargs = dict(batch_size=BATCH_SIZE, collate_fn=collate_batch)

train_dataloader = DataLoader(data_train.to_numpy(), shuffle=True, **_loader_kwargs)
val_dataloader = DataLoader(data_test.to_numpy(), **_loader_kwargs)

## 3. Train the model

In [26]:
# Prefer the GPU, but fall back to the CPU so the notebook still runs on
# machines without CUDA instead of failing at `model.to(device)`.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [27]:
# Build the two-tower model directly on the target device
# (`nn.Module.to` moves the parameters in place and returns the module).
model = DSSM().to(device)

# Cosine-based pair loss and the optimizer over the parameters of both towers.
loss_fn = nn.CosineEmbeddingLoss()
optimizer = optim.Adam(params=model.parameters(), lr=1e-5)

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

We use [WandB](https://wandb.ai/) to track our experiments.

In [9]:
# One-time interactive login; the API key is appended to the user's netrc file.
!wandb login

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit: 
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [30]:
import wandb

# Start a W&B run. `config` is descriptive metadata only, so it has to mirror
# the values the notebook really uses; otherwise logged runs are misleading.
wandb.init(
    # set the wandb project where this run will be logged
    project="Quotes RecSys",

    # track hyperparameters and run metadata
    config={
        "learning_rate": 1e-5,  # same value as the Adam learning rate
        "architecture": "Our DSSM",
        "dataset": "diaries_quotes_emb_twitter_interactions",
        "epochs": 20,  # keep in sync with `epochs` in the training loop (was logged as 100)
        "base_emb_model": BASE_MODEL,  # was hard-coded to "roberta-large"
    }
)

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
train/loss,█▇▅▄▄▄▃▃▃▃▃▃▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁
val/loss,█▇▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▂▁▁▁▁▁▁

0,1
train/loss,0.01623
val/loss,0.21403


In [31]:
from tqdm import tqdm

epochs = 20


def _pair_loss(batch):
    """Return the cosine-embedding loss for one collated batch.

    The batch is moved to ``device`` and run through both towers. The vector
    at sequence position 0 of each tower's first output element is used as
    the embedding of that input (for a Hugging Face ``RobertaModel`` the first
    output element is the last hidden state).

    NOTE(review): ``CosineEmbeddingLoss`` expects labels of 1 / -1 - confirm
    that the label column of the dataset uses that encoding.
    """
    texts, quotes, labels = (part.to(device) for part in batch)
    diary_out, quote_out = model(texts, quotes)
    return loss_fn(diary_out[0][:, 0, :], quote_out[0][:, 0, :], labels)


# NOTE: the recorded run of this loop stopped with a CUDA OutOfMemoryError on
# the first steps; if that recurs, lower the batch size or the sequence length.
for epoch in range(epochs):
    # ---- training pass ----
    model.train()
    train_loss = 0.0
    n_batches = 0  # counted explicitly so the average never relies on a leaked loop index
    pbar = tqdm(train_dataloader)
    for n_batches, batch in enumerate(pbar, start=1):
        optimizer.zero_grad()
        loss = _pair_loss(batch)
        loss.backward()
        optimizer.step()

        train_loss += loss.item()
        pbar.set_description(f"Training loss: {train_loss / n_batches}")

    # Mean loss per batch; max(..., 1) keeps an empty loader from raising.
    train_loss = train_loss / max(n_batches, 1)

    # ---- validation pass (no gradients, eval mode) ----
    model.eval()
    val_loss = 0.0
    n_batches = 0
    with torch.no_grad():
        pbar = tqdm(val_dataloader)
        for n_batches, batch in enumerate(pbar, start=1):
            val_loss += _pair_loss(batch).item()
            pbar.set_description(f"Validation loss: {val_loss / n_batches}")

    val_loss = val_loss / max(n_batches, 1)

    # One W&B step per epoch with both mean losses.
    wandb.log({"train/loss": train_loss, "val/loss": val_loss})

    # TODO: save model if val_loss is lower than previous val_loss

Training loss: 0.6679766178131104:   4%|▍         | 1/26 [00:02<00:56,  2.24s/it]


OutOfMemoryError: ignored

In [None]:
# Persist only the weights (state dict) of the model as it stands after the training loop.
torch.save(model.state_dict(), "../models/dssm_twitter.pt")