In [1]:
%load_ext autoreload
%autoreload 2

import pandas as pd
from boolrank import DualSiglip2Model
from my_processing import paths_to_dataset

# Assemble the JSONL inputs: the main training file first, then the two
# corpora that are reserved for evaluation.
data_path = "training"
paths = [f"data/{data_path}.jsonl"]
paths += ["data/TAR_data.jsonl", "data/sysrev_conv.jsonl"]

# 'TAR' and 'sysrev' are passed as test-only sources; training draws from
# the sources listed in `train_sources`.
# Alternative training mix that also includes 'pubmed-query':
#   train_sources=['pubmed-searchrefiner','pubmed-query','raw-jsonl']
dataset = paths_to_dataset(
    paths,
    test_only_sources=['TAR', 'sysrev'],
    train_sources=['pubmed-searchrefiner', 'raw-jsonl'],
)

data/training.jsonl
data/TAR_data.jsonl
data/sysrev_conv.jsonl


Finding similar: 100%|██████████| 2088/2088 [00:00<00:00, 4023.77it/s]
Finding similar: 100%|██████████| 343/343 [00:00<00:00, 14667.19it/s]
Finding similar: 100%|██████████| 79/79 [00:00<00:00, 25976.01it/s]
Finding similar: 100%|██████████| 50/50 [00:00<00:00, 33151.31it/s]
Finding similar: 100%|██████████| 40/40 [00:00<00:00, 40031.53it/s]
Finding similar: 100%|██████████| 3782/3782 [00:03<00:00, 1055.93it/s]


In [2]:
# Build a preview frame of (at most) N training examples.
N = 10000   # upper bound on the number of rows kept
SEED = 42   # fixed seed so the sampled subset is identical on every run

train_split = dataset["train"]  # look the split up once instead of per column
df = pd.DataFrame({
    "nl": train_split["nl_query"],
    "bool": train_split["bool_query"],
    "quality": train_split["quality"],
    "source": train_split["source"],
})

# Shuffle/sub-sample reproducibly; min() keeps sample() from raising when the
# split holds fewer than N rows.
df = df.sample(min(N, df.shape[0]), random_state=SEED).reset_index(drop=True)
df

Unnamed: 0,nl,bool,quality,source
0,Measuring outcomes in people who have had a st...,"(""Telemedicine""[Mesh] OR ""Videoconferencing""[M...",0.313163,pubmed-searchrefiner
1,High-Temperature Short-Time and Holder Pasteur...,(((human milk[Tw]) OR (donor milk[Tw]) OR (don...,0.089475,pubmed-searchrefiner
2,Clinical efficacy of home-use blue-light thera...,Acne[tiab] OR Blackheads[tiab] OR Whiteheads[t...,0.004015,pubmed-searchrefiner
3,Mobile clinic in Massachusetts associated with...,"(""Mobile Health Units"" OR ""Mobile health"" OR ""...",0.626325,pubmed-searchrefiner
4,Laparoscopic versus open surgical techniques f...,"(""hernia, ventral""[MeSH Terms] OR (hernia*[Ti...",0.549417,raw-jsonl
...,...,...,...,...
3777,Laparoscopic and transanal pull-through for Hi...,"((""child*""[Title/Abstract] OR ""stepchild*""[Tit...",0.626325,pubmed-searchrefiner
3778,Epidemiology and outcomes of candidemia in 201...,"""candida""[All Fields] AND ""glabrata""[All Field...",0.313163,pubmed-searchrefiner
3779,Control methods for <I>Aedes albopictus</I> an...,"((Dengue OR ""yellow fever"" OR "" chikungunya"" ...",0.549417,raw-jsonl
3780,The clinical and histological effect of home-u...,"((""Acne Vulgaris""[Mesh] OR Acne[tiab] OR Black...",0.004094,pubmed-searchrefiner


In [62]:
import nltk
import numpy as np

# Draw a reproducible sample of distinct English words to embed.
nltk.download("words")

SEED = 42        # fixed seed so the same words are drawn on every run
N_WORDS = 10000  # number of words to sample

# De-duplicate (in case the corpus lists a word more than once) and sort so
# the seeded draw does not depend on set iteration order.
vocabulary = sorted(set(nltk.corpus.words.words()))

# replace=False: the previous np.random.choice call sampled WITH replacement,
# so the same word could be embedded and plotted several times.
rng = np.random.default_rng(SEED)
words = rng.choice(
    vocabulary,
    size=min(N_WORDS, len(vocabulary)),
    replace=False,
).tolist()

[nltk_data] Downloading package words to
[nltk_data]     C:\Users\Simon\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


In [161]:
import torch
from pathlib import Path

# Alternative encoder/checkpoint (swap in for the BioBERT pair below):
# model = DualSiglip2Model('BAAI/bge-small-en-v1.5')
# model.load(r"models\clip\bge-small-en-v1.5\b16_lr1E-05_(pubmed-que_pubmed-sea_raw-jsonl)^4\checkpoint-11288\model.safetensors")

# Build the checkpoint path from components so it resolves on POSIX as well as
# Windows (the previous backslash-separated raw string only worked on Windows).
checkpoint_path = Path(
    "models",
    "clip",
    "biobert-v1.1",
    "b16_lr1E-05_(pubmed-que_pubmed-sea_raw-jsonl)^4",
    "checkpoint-14110",
    "model.safetensors",
)
model = DualSiglip2Model('dmis-lab/biobert-v1.1')
model.load(str(checkpoint_path))

# Encode without building an autograd graph: this is inference only, and the
# result is converted straight to NumPy.
# NOTE(review): if DualSiglip2Model is left in training mode (dropout active),
# embeddings may not be deterministic — confirm it switches to eval internally.
with torch.no_grad():
    # To embed the dataset's Boolean queries instead of the random words:
    # embeddings = model.encode_bool(df["bool"].tolist(), batch_size=200).detach().cpu().numpy()
    embeddings = model.encode_bool(words, batch_size=200).detach().cpu().numpy()

# Release cached GPU blocks now that the embeddings live in host memory.
torch.cuda.empty_cache()

In [182]:
import umap

# Project the embeddings to 3 dimensions with UMAP.
SEED = 42  # UMAP is stochastic; fix the seed so the layout is reproducible

# NOTE(review): three components are computed but the scatter plot in this
# notebook only uses x and y — confirm z is consumed elsewhere, or switch to
# n_components=2 for a layout optimised for a 2-D plot.
um = umap.UMAP(n_neighbors=15, n_components=3, random_state=SEED)
trans = um.fit_transform(embeddings)

# One coordinate array per projected axis.
x = trans[:, 0]
y = trans[:, 1]
z = trans[:, 2]

# When `embeddings` come from df["bool"], attach the coordinates to the frame
# and shorten the long text columns for hover display:
# df["x"] = x
# df["y"] = y

# def cutoff(n): return lambda x: x if len(x) < n else x[:n] + "..."
# cut = 100
# df["nl"] = df["nl"].map(cutoff(cut))
# df["bool"] = df["bool"].map(cutoff(cut))

In [187]:
# Free-text query to compare against the stored embeddings.
# Alternative: take the first natural-language query from the test split:
# query = dataset["test"]["pubmed-searchrefiner"]["nl_query"][0]
query = "cancer"

# Encode with the text encoder and move the result to host memory as NumPy.
query_emb = (
    model.encode_text(query)
    .detach()
    .cpu()
    .numpy()
)

In [188]:
# Score every stored embedding against the query embedding.
# NOTE(review): both inputs are NumPy arrays here, yet the next cell passes
# `similarity` to torch.zeros_like, so get_similarities presumably returns a
# CPU torch tensor — confirm against the model implementation.
similarity = model.get_similarities(embeddings, query_emb)
# Optional: store the scores on the frame when embedding df["bool"].
# df["sim"] = similarity
# df["sim"].values

In [189]:
# Indices of the 100 highest-similarity entries (ascending sort of the negated scores).
top_n = (-similarity).argsort()[:100]

# Opacity per point: 0.9 + 0.1 for the top matches, 0.1 for everything else.
mask = torch.zeros_like(similarity).numpy()
mask[top_n] = 0.9
mask += 0.1

# Inverse indicator: 0 for the top matches, 1 for everything else.
bool_mask = torch.ones_like(similarity).numpy()
bool_mask[top_n] = 0

In [190]:
import plotly.express as px

# fig = px.scatter(
#     df, x="x", y="y",
#     color="source",               # color by data source
#     # size="quality",               # optionally size by quality score
#     # opacity=df["sim"],
#     # size="sim",
#     # opacity=mask,
#     hover_data={
#         "nl": True,
#         "bool": True,
#         "quality": True,
#         "sim": True,
#         "x": False, "y": False,
#     },
#     title=f"UMAP of {len(df)} Text Embeddings"
# )

# 2-D scatter of the projected embeddings; points closest to the query are
# drawn opaque and all others faint (per-point opacity comes from `mask`).
# Optional tweaks kept for reference:
#   color=bool_mask                                   -> colour by top-match indicator
#   fig.update_traces(marker=dict(opacity=0.7, line_width=0.5))
fig = px.scatter(
    x=x,
    y=y,
    opacity=mask,
    hover_data={"word": words},
    title=f"Visualization of {len(x)} Embeddings",
).update_layout(width=800, height=600)

fig.show()