# Imports


In [1]:
import os
from warnings import filterwarnings

filterwarnings("ignore")

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import torch
import torch.nn as nn

from transformers import BertTokenizer, BertForQuestionAnswering, BertConfig

from captum.attr import visualization as viz
from captum.attr import LayerConductance, LayerIntegratedGradients

%matplotlib inline

# Run on the first CUDA GPU when PyTorch reports one; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print(device)

cpu


In [23]:
# Notebook-wide configuration constants.
RANDOM_SEED = 42  # defined here, but no visible cell passes it to a seeding call
DATA_DIR = "data"  # not referenced by any other visible cell
MODEL_DIR = "models"  # only referenced by the disabled download snippet below
# Disabled snippet: would build MODEL_PATH under MODEL_DIR and download MODEL_URL to it.
# NOTE(review): MODEL_URL is a GitHub "/blob/" page link; wget on it presumably saves
# the HTML page rather than the raw .pt file -- confirm before re-enabling.
# MODEL_NAME = "imdb-model-cnn-large.pt"
# MODEL_PATH = os.path.join(MODEL_DIR, MODEL_NAME)
# MODEL_URL = "https://github.com/pytorch/captum/blob/master/tutorials/models/imdb-model-cnn-large.pt"
# !wget -O $MODEL_PATH $MODEL_URL

# Read Model


In [24]:
from transformers import BertTokenizer, BertModel
import torch.nn.functional as F
import random

# The tokenizer and the encoder are loaded from the same pretrained checkpoint;
# naming it once keeps the two from_pretrained calls in sync.
checkpoint = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(checkpoint)
model = BertModel.from_pretrained(checkpoint)

tokenizer_config.json: 100%|██████████| 28.0/28.0 [00:00<00:00, 23.1kB/s]
vocab.txt: 100%|██████████| 232k/232k [00:00<00:00, 8.05MB/s]
tokenizer.json: 100%|██████████| 466k/466k [00:00<00:00, 8.46MB/s]
config.json: 100%|██████████| 570/570 [00:00<00:00, 5.09MB/s]
model.safetensors: 100%|██████████| 440M/440M [00:46<00:00, 9.56MB/s] 


In [142]:
class EmbeddingModel:
    """Pairs a model with its tokenizer to embed text and score token similarity.

    The wrapper stores the two objects as given; it does not move them between
    devices and does not disable autograd itself, so tensors returned by
    ``get_embeddings`` stay attached to whatever graph the model call builds.
    """

    def __init__(self, model, tokenizer):
        """Store the model and tokenizer used by the other methods."""
        self.model = model
        self.tokenizer = tokenizer

    def get_embeddings(self, text, state="last_hidden_state"):
        """Tokenize ``text``, run the model, and return one field of its output.

        Args:
            text: Input handed to the tokenizer. It is encoded without special
                tokens and returned as PyTorch tensors.
            state: Name of the model-output field to return. Defaults to
                ``"last_hidden_state"``.

        Returns:
            A tuple ``(embeddings, input_ids)``: the selected output field and
            the ``"input_ids"`` entry of the tokenizer output.

        Raises:
            ValueError: If the model output has no non-None value under
                ``state`` (for example a misspelled field name). Previously
                this case silently returned ``None``.
        """
        encoded_input = self.tokenizer(
            text, add_special_tokens=False, return_tensors="pt"
        )
        output = self.model(**encoded_input)
        embeddings = output.get(state)
        if embeddings is None:
            # Fail here, with the valid choices, instead of letting a None
            # propagate into later tensor operations.
            raise ValueError(
                f"Model output has no field {state!r}; "
                f"available fields: {list(output.keys())}"
            )
        return embeddings, encoded_input["input_ids"]

    def get_similarity(self, query_emb, doc_emb, doc_ids):
        """Cosine similarity between a query vector and each document token vector.

        Args:
            query_emb: Query embedding. It must be 2-D, because it is
                unsqueezed at dim 1 and then expanded with three sizes
                (presumably ``(batch, hidden)`` -- confirm against caller).
            doc_emb: Document embeddings whose dim 1 is the number of positions
                the query is repeated over (presumably
                ``(batch, tokens, hidden)`` -- confirm against caller).
            doc_ids: Token ids; only ``doc_ids[0]`` is converted back to tokens.

        Returns:
            A tuple ``(cos_sim, doc_tokens)``: similarities computed over the
            last dimension, and the token strings for ``doc_ids[0]``.
        """
        num_positions = doc_emb.shape[1]
        # Repeat the query along a new dim 1 so it lines up with doc_emb.
        query_per_position = query_emb.unsqueeze(1).expand(-1, num_positions, -1)
        cos_sim = F.cosine_similarity(query_per_position, doc_emb, dim=-1)
        doc_tokens = self.tokenizer.convert_ids_to_tokens(doc_ids[0])
        return cos_sim, doc_tokens


emb = EmbeddingModel(model, tokenizer)

query = "bottle of red wine"
document = "coconuts, pasta, hawaiian shirts, pizza!"

# The query uses the model's "pooler_output" field; the document uses the
# default "last_hidden_state" field.
query_emb, _ = emb.get_embeddings(query, state="pooler_output")
doc_emb, doc_ids = emb.get_embeddings(document)

cos_sim, doc_tokens = emb.get_similarity(query_emb, doc_emb, doc_ids)
print("Cosine similarity:", cos_sim)
print("Document tokens:", doc_tokens)

# Min-max rescale the similarities to [-1, 1] for display.
sim_min = torch.min(cos_sim)
sim_span = torch.max(cos_sim) - sim_min
if sim_span > 0:
    cos_sim = 2 * (cos_sim - sim_min) / sim_span - 1
else:
    # All similarities are equal (e.g. a single-token document): the rescale
    # would be 0 / 0 and produce NaN, so use a neutral 0 for every token instead.
    cos_sim = torch.zeros_like(cos_sim)
cos_sim_l = cos_sim.detach().numpy()[0].tolist()

Cosine similarity: tensor([[-0.0400, -0.0251, -0.0267, -0.0469, -0.0344, -0.0574, -0.0384, -0.0373,
         -0.0472, -0.0443]], grad_fn=<SumBackward1>)
Document tokens: ['coconut', '##s', ',', 'pasta', ',', 'hawaiian', 'shirts', ',', 'pizza', '!']


In [143]:
from IPython.display import display, HTML
from captum.attr import visualization


def visualize_text_x(datarecords):
    """Display the word-importance markup of each record as one HTML table row.

    Args:
        datarecords: Iterable of records exposing ``raw_input_ids`` and
            ``word_attributions``; both are passed to
            ``visualization.format_word_importances`` to produce the row content.

    Returns:
        None. The table is shown through ``IPython.display.display``.
    """
    # Use a style attribute for the width; the earlier `<table width: 100%>`
    # was not a valid attribute.
    dom = ['<table style="width: 100%">']
    # The (empty) header cell now sits inside its own row.
    rows = ["<tr><th></th></tr>"]
    for datarecord in datarecords:
        row_content = visualization.format_word_importances(
            datarecord.raw_input_ids, datarecord.word_attributions
        )
        # Close each row with </tr>; it used to end with a second opening <tr>.
        rows.append("".join(["<tr>", row_content, "</tr>"]))

    dom.append("".join(rows))
    dom.append("</table>")
    html = HTML("".join(dom))
    display(html)


# visualize_text_x reads only word_attributions and raw_input_ids from each
# record; the remaining fields keep their placeholder values.
vis_data_records = [
    visualization.VisualizationDataRecord(
        word_attributions=cos_sim_l,
        pred_prob=0,
        pred_class=0,
        true_class=0,
        attr_class=0,
        attr_score=0,
        raw_input_ids=doc_tokens,
        convergence_score=1,
    ),
]
visualize_text_x(vis_data_records)

0
"coconut ##s , pasta , hawaiian shirts , pizza !"
