In [1]:
%load_ext autoreload
%autoreload 2

import pandas as pd
from sklearn.manifold import TSNE
import plotly.express as px
from boolrank import DualSiglip2Model
from my_processing import paths_to_dataset

# Load data
# Three JSONL inputs: the main file selected by `data_path`, plus the TAR
# and sysrev files.
data_path = "training"
paths = [
    f"data/{data_path}.jsonl",
    "data/TAR_data.jsonl",
    "data/sysrev_conv.jsonl",
]

# Passed as `test_only_sources`; presumably these never enter the train
# split — confirm against my_processing.paths_to_dataset.
held_out_sources = ['TAR', 'sysrev']

# 'pubmed-query' is currently excluded; the alternative configuration was
# ['pubmed-searchrefiner', 'pubmed-query', 'raw-jsonl'].
training_sources = ['pubmed-searchrefiner', 'raw-jsonl']

dataset = paths_to_dataset(
    paths,
    test_only_sources=held_out_sources,
    train_sources=training_sources,
)

data/training.jsonl
data/TAR_data.jsonl
data/sysrev_conv.jsonl


Finding similar: 100%|██████████| 2088/2088 [00:00<00:00, 4067.38it/s]
Finding similar: 100%|██████████| 343/343 [00:00<00:00, 16703.58it/s]
Finding similar: 100%|██████████| 79/79 [00:00<00:00, 31443.35it/s]
Finding similar: 100%|██████████| 50/50 [00:00<00:00, 19929.22it/s]
Finding similar: 100%|██████████| 40/40 [00:00<00:00, 20015.77it/s]
Finding similar: 100%|██████████| 3782/3782 [00:03<00:00, 1080.21it/s]


In [2]:
# Flatten the training split into a DataFrame and draw a reproducible sample.
N = 10000          # upper bound on the number of rows kept for the plot
SAMPLE_SEED = 42   # fixed so the same rows are drawn on every Restart & Run All

train_split = dataset["train"]
df = pd.DataFrame({
    "nl": train_split["nl_query"],
    "bool": train_split["bool_query"],
    "quality": train_split["quality"],
    "source": train_split["source"]
})

# Cap at the frame size: sample() without replacement raises when asked for
# more rows than exist. With fewer than N rows this is a seeded shuffle of
# the whole frame.
df = (
    df.sample(n=min(N, len(df)), random_state=SAMPLE_SEED)
      .reset_index(drop=True)
)
df

Unnamed: 0,nl,bool,quality,source
0,Autism after adolescence: population-based 13-...,"(ASD[tiab] OR autism [tiab] OR ""autism spectru...",0.626325,pubmed-searchrefiner
1,Targeted therapy for metastatic renal cell car...,((randomized controlled trial.pt OR controlle...,0.549417,raw-jsonl
2,The Tele.TAnDem Intervention: Telephone-based ...,"(exp ""Osteoarthritis,Hip""/ OR (Coxarthros* OR ...",1.252650,pubmed-searchrefiner
3,Laparoscopic compared with robotic sacrocolpop...,((Pelvic Organ Prolapse[Mesh Terms:noexp])),1.252650,pubmed-searchrefiner
4,The effect of initial serum neuron-specific en...,"""Vasospasm, Intracranial""[Mesh] OR ""Intracrani...",0.250530,pubmed-searchrefiner
...,...,...,...,...
3777,The Impact of Community-acquired Pneumonia on ...,"(('adult':ti,ab[All Fields] OR 'older adults':...",0.313163,pubmed-searchrefiner
3778,High versus low medium chain triglyceride cont...,"(((""Infant Food""[Mesh] OR ""Infant Formula""[Me...",0.549417,raw-jsonl
3779,A randomized controlled study for the treatmen...,"((""Acne Vulgaris""[Mesh] OR Acne[tiab] OR Black...",0.003890,pubmed-searchrefiner
3780,Concurrent chemoradiotherapy in non-small cell...,(randomized controlled trial [pt] OR controll...,0.549417,raw-jsonl


In [5]:
import torch

model = DualSiglip2Model('BAAI/bge-small-en-v1.5', loss_type="clip")

# NOTE: run this cell before any cell that shortens df["bool"] for display;
# otherwise the truncated query text is what gets embedded.

# Inference only: no_grad() stops autograd from recording the forward pass,
# so no graph is kept alive for the (large) batches. The result was already
# being detached, so the returned values are unchanged.
with torch.no_grad():
    embeddings = (
        model.encode_bool(df["bool"].tolist(), batch_size=1000)
        .detach()
        .cpu()
        .numpy()
    )

# Hand cached GPU blocks back to the driver now that the embeddings live in
# host memory as a NumPy array.
torch.cuda.empty_cache()

In [4]:
# Project the embeddings to 2-D with t-SNE. The seed is fixed so the layout
# is repeatable for a given `embeddings` array.
tsne_params = dict(
    n_components=2,
    perplexity=30,
    learning_rate=200,
    max_iter=1000,
    random_state=42,
    init="pca",
)
tsne = TSNE(**tsne_params)
coords = tsne.fit_transform(embeddings)

# One column per t-SNE axis; row i of `coords` belongs to row i of `df`.
xs, ys = coords.T
df["x"] = xs
df["y"] = ys

def cutoff(n):
    """Build a function that shortens text to at most ``n`` characters.

    The returned callable gives back its argument unchanged when it is a
    string of ``n`` characters or fewer, and otherwise the first ``n``
    characters followed by ``"..."``. Non-string values (e.g. ``None`` or
    NaN from a column with missing entries) are passed through untouched
    instead of raising.
    """
    def _shorten(x):
        # `<=`: a string of exactly n characters loses nothing, so it must
        # not get an ellipsis appended.
        if not isinstance(x, str) or len(x) <= n:
            return x
        return x[:n] + "..."
    return _shorten
cut = 100  # max characters of query text shown in a hover tooltip

# Shorten the text on a copy used only for plotting. Overwriting df["nl"] /
# df["bool"] in place would leave truncated queries behind for any cell that
# is (re-)run afterwards, e.g. one that embeds df["bool"].
plot_df = df.assign(
    nl=df["nl"].map(cutoff(cut)),
    bool=df["bool"].map(cutoff(cut)),
)

fig = px.scatter(
    plot_df, x="x", y="y",
    color="source",               # color by data source
    # size="quality",               # optionally size by quality score
    hover_data={
        "nl": True,
        "bool": True,
        "quality": True,
        "x": False, "y": False    # raw t-SNE coordinates carry no meaning
    },
    # Report the number of points actually plotted; N is only an upper bound
    # on the sample size and overstates it when the split is smaller.
    title=f"t-SNE of {len(plot_df)} Text Embeddings"
)

fig.update_traces(marker=dict(opacity=0.7, line_width=0.5))
fig.update_layout(width=800, height=600)
fig.show()