<a href="https://colab.research.google.com/github/jstenner/ainotebooks/blob/main/CLIP_Text_Token_Similarity_Listing_(Public).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Setup: install and load dependencies
Expand this section if you want to change the CLIP model or other internals; otherwise, just run it.

In [None]:
%%capture
# %%capture hides this cell's output (pip's install log).
# Install CLIP straight from its GitHub repository; --no-deps tells pip not to
# install any of the package's dependencies.
%pip install --no-deps git+https://github.com/openai/CLIP.git
# Installed explicitly because of --no-deps above — presumably the packages CLIP
# imports at runtime (TODO confirm against CLIP's requirements). Versions are not pinned.
%pip install --no-deps ftfy regex tqdm

In [None]:
import torch
import gc
import warnings
# Suppress every Python warning for the rest of the session: no category or
# module filter is given, so this hides all of them.
warnings.filterwarnings('ignore')
# Disable autograd globally; the cells visible here only read embedding weights.
torch.set_grad_enabled(False)

def clear_mem():
    """Free unreferenced Python objects, then release cached CUDA memory.

    The garbage collector runs first so that any tensors it frees are already
    back in PyTorch's caching allocator when ``torch.cuda.empty_cache()`` is
    called; in the opposite order that memory would stay cached until the
    next call.

    Returns:
        None.
    """
    gc.collect()
    torch.cuda.empty_cache()

In [None]:
import clip
# Repeats the global autograd switch from the setup cell (redundant if that cell already ran).
torch.set_grad_enabled(False)

# Load the CLIP checkpoint; clip.load returns two objects, kept here as the model
# (`perceptor`) and `clip_preprocess`. No device argument is passed, so clip.load
# uses its own default device.
perceptor, clip_preprocess = clip.load('ViT-B/16')
# Alternative checkpoint: comment out the line above and uncomment this one to use it.
#perceptor, clip_preprocess = clip.load('ViT-B/32')
# Put the model in eval mode, cast it with .float(), and turn off parameter gradients.
# The trailing ';' keeps the notebook from echoing the model's repr.
perceptor.eval().float().requires_grad_(False);

# Separate tokenizer instance; the functions below use its encode()/decode().
tokenizer = clip.simple_tokenizer.SimpleTokenizer()

100%|████████████████████████████████████████| 335M/335M [00:03<00:00, 101MiB/s]


In [None]:
def similar_words(target_word="cool", topk=32):
    """Print the tokens whose embeddings are most similar to ``target_word``.

    The word must encode to exactly one token. Its row of
    ``perceptor.token_embedding.weight`` is compared by cosine similarity
    against every row of that matrix, and the best matches (excluding the
    target token itself) are printed as ``"<decoded token>"     <similarity>``,
    most similar first.

    Args:
        target_word: Text to look up; must map to a single token.
        topk: Maximum number of neighbours to print. Values larger than the
            vocabulary are clamped; values <= 0 print nothing.

    Returns:
        None. Results (or an explanatory message) are written to stdout.
    """
    target_tokens = tokenizer.encode(target_word)
    if not target_tokens:
        # Empty / whitespace-only input encodes to nothing; indexing [0] would raise.
        print("This word produced no tokens, can't use it!")
        return
    if len(target_tokens) > 1:
        print("This word uses more than one token, can't use it!")
        return

    target_id = target_tokens[0]
    embeddings = perceptor.token_embedding.weight.detach()
    # (1, dim) against (vocab, dim) broadcasts to one similarity per vocabulary row.
    token_sim = torch.cosine_similarity(embeddings[target_id, None], embeddings, -1)

    wanted = max(int(topk), 0)
    # Ask for one extra hit to make room for the target itself, but never more
    # than the vocabulary holds (torch.topk raises if k is out of range).
    k = min(wanted + 1, token_sim.shape[0])
    top_token_sim = torch.topk(token_sim, k, -1, True, True)

    # Drop the target by id rather than by position: a token with an identical
    # embedding ties at the top and may be ranked ahead of the target.
    keep = top_token_sim.indices != target_id
    top_indices = top_token_sim.indices[keep][:wanted]
    top_values = top_token_sim.values[keep][:wanted]

    for index, value in zip(top_indices.tolist(), top_values.tolist()):
        print('"' + tokenizer.decode([index]) + '"', "   ", value)
    return

In [None]:
from IPython.display import HTML, display

def similar_words_fancy(target_word="cool", topk=32):
    """Display the tokens most similar to ``target_word`` as an HTML table.

    The word must encode to exactly one token. Its row of
    ``perceptor.token_embedding.weight`` is compared by cosine similarity
    against every row of that matrix; the best matches (excluding the target
    token itself) are rendered, most similar first, as two-column table rows
    (decoded token, similarity) inside a four-column CSS layout.

    Args:
        target_word: Text to look up; must map to a single token.
        topk: Maximum number of neighbours to show. Values larger than the
            vocabulary are clamped; values <= 0 give an empty table.

    Returns:
        None. The table is shown with ``display(HTML(...))``; if the word is
        not a single token a message is printed instead.
    """
    import html  # stdlib; imported locally so this cell needs no extra setup

    target_tokens = tokenizer.encode(target_word)
    if not target_tokens:
        # Empty / whitespace-only input encodes to nothing; indexing [0] would raise.
        print("This word produced no tokens, can't use it!")
        return
    if len(target_tokens) > 1:
        print("This word uses more than one token, can't use it!")
        return

    target_id = target_tokens[0]
    embeddings = perceptor.token_embedding.weight.detach()
    # (1, dim) against (vocab, dim) broadcasts to one similarity per vocabulary row.
    token_sim = torch.cosine_similarity(embeddings[target_id, None], embeddings, -1)

    wanted = max(int(topk), 0)
    # One extra hit makes room for the target itself; clamp to the vocabulary
    # size because torch.topk raises if k is out of range.
    k = min(wanted + 1, token_sim.shape[0])
    top_token_sim = torch.topk(token_sim, k, -1, True, True)

    # Drop the target by id rather than by position: a token with an identical
    # embedding ties at the top and may be ranked ahead of the target.
    keep = top_token_sim.indices != target_id
    top_indices = top_token_sim.indices[keep][:wanted]
    top_values = top_token_sim.values[keep][:wanted]

    # Token text is escaped because decoded tokens can contain '<', '>' or '&',
    # which would otherwise be parsed as markup and corrupt the table.
    table_build = "".join(
        "<tr><td>" + html.escape(tokenizer.decode([index])) + "</td><td>" + str(value) + "</td></tr>"
        for index, value in zip(top_indices.tolist(), top_values.tolist())
    )

    table_built = """
    <style>
    #output-body {
        display: flex;
        align-items: left;
        justify-content: left;
    }
    th {
      text-align: left;
    }
    .treecolumn{
        column-count: 4;
    }
    </style>
    <div class="treecolumn">
    <table style="width:100%">
    """ + table_build + """
    </table>
    </div>
    """
    display(HTML(table_built))

### Play Area

In [None]:
#@title Best matching tokens
#@markdown Shows up to `topk` tokens, each listed as its decoded text next to its cosine similarity to the input token.
#@markdown <br>Some tokens may appear twice, with and without a trailing space: "cool" (part of a longer word) and "cool " (a whole word) are different tokens.
#@markdown <br>The list reads top-down, then left-to-right, because the table is laid out in CSS columns.

word = "uwu" #@param {type:"string"}
topk = 16 #@param {type:"integer"}

# Plain-text alternative to the HTML table; uncomment to use it instead.
#similar_words(word, topk)
similar_words_fancy(word, topk)

0,1
owo,0.3114655911922455
🥺,0.2836507558822632
:>,0.2684445083141327
😭💕,0.2445578724145889
🥳,0.2379431128501892
😭😭,0.2361124753952026
ㅋㅋㅋ,0.232012540102005
😭😭😭,0.2307145446538925
smol,0.2303338944911956
🥺,0.2263646274805069
