In [1]:
# install openTSNE
# !conda install -c pytorch -c nvidia faiss-gpu -y
!pip install openTSNE duckdb plotly huggingface_hub pandas umap-learn



In [None]:
# Import the packages used throughout the notebook

from huggingface_hub import hf_hub_download, login
import pandas as pd
import duckdb
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from plotly import colors
import openTSNE
import umap
import os

In [None]:
# Authenticate with the Hugging Face Hub without hardcoding a credential in the notebook.
# The token is read from the HF_TOKEN environment variable; when it is unset or empty,
# None is passed so that `login` can ask for the token instead of receiving an empty string.
login(token=os.environ.get("HF_TOKEN") or None)

In [None]:
# Collect a local path for every embedding shard (embeddings_1.db ... embeddings_29.db),
# downloading a shard from the dataset repo only when it is not already on disk.
local_db_files = []
for i in range(1, 30):
    # Flat location that this notebook has always checked (e.g. manually placed files).
    flat_path = f'data/embeddings_{i}.db'
    # Location hf_hub_download is expected to write to: the repo-relative filename
    # ('embeddings/embeddings_{i}.db') placed under local_dir ('data').
    # Checking only the flat path would miss previously downloaded shards.
    nested_path = f'data/embeddings/embeddings_{i}.db'

    if os.path.exists(flat_path):
        local_db_files.append(flat_path)
    elif os.path.exists(nested_path):
        local_db_files.append(nested_path)
    else:
        _f = hf_hub_download(
            repo_id="lalit3c/S2_CS_PHY_PYSCH_papers",
            repo_type="dataset",  # Important: specify it's a dataset repo
            filename=f'embeddings/embeddings_{i}.db',
            local_dir='data',
            local_dir_use_symlinks=False
        )
        # Use the path reported by the hub client rather than reconstructing it.
        local_db_files.append(_f)



In [5]:
# Open a DuckDB connection (no database file given) and set its thread count to 8.
con = duckdb.connect()
con.execute("PRAGMA threads=8")

# Attach each embedding shard under an alias derived from its position: db0, db1, ...
for shard_idx in range(len(local_db_files)):
    shard_path = local_db_files[shard_idx]
    con.execute(f"ATTACH '{shard_path}' AS db{shard_idx}")


In [None]:
# File names of the two paper-metadata databases stored at the root of the dataset repo.
s2_paper_db = 'S2_papers_cleaned.db'
s2_addeditional_paper_db = 'S2_papers_cleaned_additional_papers.db'

# Arguments common to both downloads.
_hub_download_kwargs = dict(
    repo_id="lalit3c/S2_CS_PHY_PYSCH_papers",
    repo_type="dataset",  # Important: specify it's a dataset repo
    local_dir='data',
    local_dir_use_symlinks=False,
)

# Fetch each database only when it is not already present under data/.
_main_db_present = os.path.exists(os.path.join('data', s2_paper_db))
if not _main_db_present:
    local_cleaned = hf_hub_download(filename=s2_paper_db, **_hub_download_kwargs)

_additional_db_present = os.path.exists(os.path.join('data', s2_addeditional_paper_db))
if not _additional_db_present:
    local_cleaned_additional = hf_hub_download(
        filename=s2_addeditional_paper_db, **_hub_download_kwargs
    )

In [None]:
# Attach the two metadata databases under the aliases title_db0 and title_db1.
title_db_names = [s2_paper_db, s2_addeditional_paper_db]
for db_idx, db_name in enumerate(title_db_names):
    attach_sql = f"ATTACH 'data/{db_name}' AS title_db{db_idx}"
    con.execute(attach_sql)

In [None]:
# Corpus IDs of the most-cited papers, used below as the query points for the
# nearest-neighbour search. They are written here as strings; a later cell rebinds
# `query_ids` to the ids actually returned by the database.
query_ids = ['206594692', '6628106', '13756489', '14124313', '3719281', '10328909', '225039882', '218971783', '206592484', '4650265', '5808102', '206594738', '2930547', '9433631', '1629541', '231591445', '1957433', '5959482', '3144218', '1033682', '54465873', '206593880', '11212020', '215827080', '21889700', '198953378', '206770307', '5590763', '10716717', '219955663', '4714433', '15019293', '3292002', '204838007', '28695052', '12670695', '740063', '4555207', '7200347', '211096730', '28637672', '7961699', '6200260', '167217261', '6706414', '49867180', '245335280', '3429309', '13740328', '13029170', '46701966', '216078090', '60814714', '257219404', '786357', '246426909', '3960646', '2375110', '15798713', '16120223', '218889832', '604334', '61050894', '7624311', '3638670', '1023605', '201646309', '1799558', '259950998', '11758569', '4766599', '235458009', '16326763', '216080778', '206592766', '5201925', '9672033', '207930212', '3488815', '5299559', '6719686', '15238391', '502946', '436933', '3626819', '49670925', '54482423', '204960716', '3252915', '8236317', '207238980', '211227', '8485068', '12803511', '19011676', '980236', '3162051', '3641284', '6447277', '1055111']

In [13]:
# Install the DuckDB `vss` extension and load it into this connection.
vss_setup_sql = "INSTALL vss;LOAD vss;"
con.execute(vss_setup_sql)

<_duckdb.DuckDBPyConnection at 0x7f2a9f35df70>

In [14]:
# One SELECT per attached shard (db0, db1, ...), stitched together with UNION ALL.
shard_selects = [
    f"SELECT * FROM db{i}.embeddings"
    for i, _ in enumerate(local_db_files)
]
union_sql = "\nUNION ALL\n".join(shard_selects)

In [15]:
# Materialise the union of all shards as a single table. Because of IF NOT EXISTS,
# the statement leaves an already-existing global_embeddings table untouched.
create_global_sql = f"""
CREATE TABLE IF NOT EXISTS global_embeddings AS
{union_sql};
"""
con.execute(create_global_sql)

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

<_duckdb.DuckDBPyConnection at 0x7f2a9f35df70>

In [18]:
# Look up the embedding of every requested query paper.
query_lookup_sql = """
SELECT corpusid, embedding
FROM global_embeddings
WHERE corpusid IN ?
"""
rows = con.execute(query_lookup_sql, [query_ids]).fetchall()

# Rebind query_ids to the ids that were actually found, in the order the rows came back,
# and stack their embeddings into one float32 matrix with one row per query.
query_ids = []
_query_vectors = []
for _row in rows:
    query_ids.append(_row[0])
    _query_vectors.append(np.array(_row[1], dtype="float32"))
query_embeddings = np.vstack(_query_vectors)


In [19]:
# Combine the paper metadata (id, title, publication date) from both attached
# metadata databases into one `titles` table; skipped when the table already exists.
create_titles_sql = """
CREATE TABLE IF NOT EXISTS titles AS
SELECT corpusid, title, publication_date FROM title_db0.papers_with_abstracts
UNION ALL
SELECT corpusid, title, publication_date FROM title_db1.papers_with_abstracts;
"""
con.execute(create_titles_sql)

<_duckdb.DuckDBPyConnection at 0x7f2a9f35df70>

In [20]:
from tqdm import tqdm

# Publication date of a single paper.
pub_date_sql = """
        SELECT publication_date
        FROM titles
        WHERE corpusid = ?
        """

# The 100 papers closest (by cosine distance) to a given embedding, restricted to
# papers whose publication date is strictly earlier than the given date.
nearest_sql = """
        SELECT
            g.corpusid,
            array_cosine_distance(g.embedding, ?::FLOAT[768]) AS distance
        FROM global_embeddings AS g
        JOIN titles AS t USING(corpusid)
        WHERE t.publication_date < ?
        ORDER BY distance
        LIMIT 100;
        """

# query corpusid -> list of (neighbor corpusid, cosine distance), nearest first
neighbors = {}

progress = tqdm(zip(query_ids, query_embeddings), total=len(query_embeddings))
for query_id, query_vec in progress:
    # DuckDB expects the vector as a plain list of floats
    vec_as_list = query_vec.tolist()

    date_row = con.execute(pub_date_sql, [query_id]).fetchone()
    query_pub_date = date_row[0]

    candidates = con.execute(nearest_sql, [vec_as_list, query_pub_date]).fetchall()

    # Drop the query paper itself if it shows up, keeping (corpusid, distance) pairs
    kept = []
    for cid, dist in candidates:
        if cid == query_id:
            continue
        kept.append((cid, dist))
    neighbors[query_id] = kept[:100]

100%|██████████| 92/92 [00:39<00:00,  2.32it/s]


In [21]:
# Every id we need data for: the query papers first, then all of their neighbours
# (ids may repeat when a paper is a neighbour of several queries).
all_ids = list(query_ids)
for nlist in neighbors.values():
    all_ids.extend(nid for nid, _dist in nlist)

# Embedding and title for each of those papers; the LEFT JOIN keeps papers
# that have no row in `titles` (their title comes back as NULL).
embeddings_with_titles_sql = """
SELECT g.corpusid, g.embedding, t.title
FROM global_embeddings AS g
LEFT JOIN titles AS t USING(corpusid)
WHERE g.corpusid IN ?
"""
rows = con.execute(embeddings_with_titles_sql, [all_ids]).fetchall()


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

In [22]:
# Build two lookups keyed by corpusid: one for the float32 embedding vector and
# one for the title. When an id occurs in several rows, the last row wins.
embedding_map = {}
title_map = {}
for _corpusid, _embedding, _title in rows:
    embedding_map[_corpusid] = np.array(_embedding, dtype="float32")
    title_map[_corpusid] = _title


In [24]:
# === t-SNE per query ===
# For every query paper, embed the query together with its neighbours in 2-D and
# collect one DataFrame per query; the frames are concatenated into `df` at the end.
from sklearn.manifold import TSNE

combined_data = []
for qid, qemb in tqdm(zip(query_ids, query_embeddings), total=len(query_ids)):
    query_neighbors = neighbors[qid]

    # Row 0 is always the query itself; the remaining rows are its neighbours.
    X_query = np.vstack([qemb] + [embedding_map[nid] for nid, _ in query_neighbors])
    labels = [("query", qid)] + [("neighbor", qid)] * len(query_neighbors)
    titles = [title_map.get(qid, "Unknown")] + [title_map.get(nid, "Unknown") for nid, _ in query_neighbors]
    distances = [0.0] + [dist for _, dist in query_neighbors]

    n_points = X_query.shape[0]
    if n_points < 2:
        # The neighbour search only keeps earlier-dated papers, so a query can end up
        # with no neighbours at all. t-SNE cannot embed a single sample; place it at
        # the origin instead of failing.
        tsne_2d = np.zeros((n_points, 2), dtype="float32")
    else:
        tsne_2d = TSNE(
            n_components=2,
            # scikit-learn requires perplexity < n_samples; keep the usual 15 and
            # only lower it for queries with very few neighbours.
            perplexity=min(15, n_points - 1),
            learning_rate='auto',
            metric='cosine',
            random_state=42,
            init='pca'
        ).fit_transform(X_query)

    df_q = pd.DataFrame({
        'x': tsne_2d[:, 0],
        'y': tsne_2d[:, 1],
        'type': [l[0] for l in labels],
        'query_id': [l[1] for l in labels],
        'title': titles,
        'cosine_distance': distances
    })
    combined_data.append(df_q)

if combined_data:
    df = pd.concat(combined_data, ignore_index=True)
else:
    # pd.concat raises on an empty list; fall back to an empty frame with the same columns.
    df = pd.DataFrame(columns=['x', 'y', 'type', 'query_id', 'title', 'cosine_distance'])

92it [00:16,  5.73it/s]


In [25]:

# === Assign colors per query ===
# Sample the "tab20" colormap at evenly spaced positions in [0, 1), one per query id.
# NOTE: rebinding `colors` here shadows the `plotly.colors` module imported at the top.
unique_queries = sorted(df['query_id'].unique())
num_queries = len(unique_queries)
cmap = plt.get_cmap("tab20")

colors = []
for position in range(num_queries):
    colors.append(cmap(position / num_queries))

def rgba_to_hex(rgba):
    """Convert a color with float channels in [0, 1] to a ``#rrggbb`` hex string.

    Parameters
    ----------
    rgba : iterable of float
        ``(r, g, b, a)`` or ``(r, g, b)``. Any channel after the first three
        (the alpha value) is ignored.

    Returns
    -------
    str
        Lowercase hex string of the form ``#rrggbb``.
    """
    r, g, b, *_ignored = rgba

    def _to_byte(value):
        # Round to the nearest integer instead of truncating, so that values such
        # as 0.9999 or k/255 with floating-point error do not come out one step
        # too low. Clamp to 0..255 so each channel is always exactly two hex digits.
        return min(255, max(0, int(round(float(value) * 255))))

    return f"#{_to_byte(r):02x}{_to_byte(g):02x}{_to_byte(b):02x}"

# Convert the sampled colors to hex and pair them with the sorted query ids,
# then tag every row of df with the color of the query it belongs to.
hex_colors = list(map(rgba_to_hex, colors))
query_to_color = dict(zip(unique_queries, hex_colors))
df['color'] = df['query_id'].map(query_to_color)
df

Unnamed: 0,x,y,type,query_id,title,cosine_distance,color
0,-1.349863,-1.654713,query,1023605,"Inception-v4, Inception-ResNet and the Impact ...",0.000000,#1f77b4
1,-0.895953,-2.121089,neighbor,1023605,Deep Residual Learning for Image Recognition,0.035079,#1f77b4
2,-0.615728,-0.698985,neighbor,1023605,Resnet in Resnet: Generalizing Residual Archit...,0.053926,#1f77b4
3,-1.993905,1.048468,neighbor,1023605,Delving Deep into Rectifiers: Surpassing Human...,0.054637,#1f77b4
4,-1.847369,2.563065,neighbor,1023605,Fast and Accurate Deep Network Learning by Exp...,0.055968,#1f77b4
...,...,...,...,...,...,...,...
9287,1.994541,-6.411585,neighbor,6447277,Understanding Deep Architectures using a Recur...,0.078342,#17becf
9288,1.820635,0.968764,neighbor,6447277,Relay Backpropagation for Effective Learning o...,0.078456,#17becf
9289,3.180607,-1.237798,neighbor,6447277,Do Deep Nets Really Need to be Deep?,0.078486,#17becf
9290,4.190481,10.455592,neighbor,6447277,Understanding learned CNN features through Fil...,0.078525,#17becf


In [None]:
# Write the per-query t-SNE coordinates and metadata to CSV, without the index column.
tsne_csv_path = "data/ai_top_papers_tsne_per_query.csv"
df.to_csv(tsne_csv_path, index=False)