In [4]:
import lancedb
import polars as pl
import torch
from lancedb.embeddings import get_registry
from lancedb.pydantic import Vector, LanceModel

In [5]:
# Look up LanceDB's embedding-function registry and build the
# "sentence-transformers" embedder that the table schema relies on.
registry = get_registry()
# Use the GPU when PyTorch can see one; otherwise fall back to CPU so the
# notebook still runs end-to-end on machines without CUDA.
device = "cuda" if torch.cuda.is_available() else "cpu"
func = registry.get("sentence-transformers").create(device=device)

In [6]:
class ArxivPapers(LanceModel):
    """Row schema for the arXiv papers table.

    ``key``, ``titles`` and ``terms`` are plain columns. ``abstracts`` is the
    text column registered as the embedding source for ``func``, and
    ``vector`` is the column registered to hold the embedding.
    """

    # Integer row identifier.
    key: int
    titles: str
    terms: str
    # Source column: this is the text that `func` embeds.
    abstracts: str = func.SourceField()
    # Vector column: its dimensionality comes from `func.ndims()`, so it always
    # matches whatever model `func` was created with.
    vector: Vector(func.ndims()) = func.VectorField()

In [8]:
# Connect to the LanceDB database identified by the URI "arxivdb".
db_uri = "arxivdb"
db = lancedb.connect(db_uri)

In [None]:
# Create the "arxiv" table from the ArxivPapers schema, requesting
# "overwrite" mode for the creation.
table = db.create_table(
    "arxiv",
    schema=ArxivPapers,
    mode="overwrite",
)

In [10]:
# Peek at the freshly created table: no rows yet, but the displayed schema
# confirms the columns and the fixed-size vector type derived from `func`.
table.head()

pyarrow.Table
key: int64 not null
titles: string not null
terms: string not null
abstracts: string not null
vector: fixed_size_list<item: float>[384] not null
  child 0, item: float
----
key: []
titles: []
terms: []
abstracts: []
vector: []

In [11]:
# Dataset source: https://www.kaggle.com/datasets/spsayakpaul/arxiv-paper-abstracts
# The CSV is expected to sit next to this notebook.
csv_path = "arxiv_abstracts.csv"
df = pl.read_csv(csv_path)
df.head()

terms,titles,abstracts
str,str,str
"""['cs.LG']""","""Multi-Level At…","""Graph neural n…"
"""['cs.LG', 'cs.…","""Decision Fores…","""Deep networks …"
"""['cs.LG', 'cs.…","""Power up! Robu…","""Graph convolut…"
"""['cs.LG', 'cs.…","""Releasing Grap…","""With the incre…"
"""['cs.LG']""","""Recurrence-Awa…","""Machine learni…"


In [12]:
# Give every row a 0-based integer "key" (0 .. n-1), matching the `key` field
# of the table schema.
n = df.height
key_column = pl.Series("key", range(n))
df = df.with_columns(key_column)

In [13]:
# Reorder the columns so that "key" comes first.
column_order = [
    "key",
    "abstracts",
    "titles",
    "terms",
]
df = df.select(column_order)

In [14]:
# Display the prepared frame (polars shows only the first and last rows) to
# check the new "key" column and the column order before inserting.
df

key,abstracts,titles,terms
i64,str,str,str
0,"""Graph neural n…","""Multi-Level At…","""['cs.LG']"""
1,"""Deep networks …","""Decision Fores…","""['cs.LG', 'cs.…"
2,"""Graph convolut…","""Power up! Robu…","""['cs.LG', 'cs.…"
3,"""With the incre…","""Releasing Grap…","""['cs.LG', 'cs.…"
4,"""Machine learni…","""Recurrence-Awa…","""['cs.LG']"""
…,…,…,…
56176,"""Despite the gr…","""Mining Spatio-…","""['cs.CV', 'cs.…"
56177,"""This paper pre…","""Wav2Letter: an…","""['cs.LG', 'cs.…"
56178,"""The popular Q-…","""Deep Reinforce…","""['cs.LG']"""
56179,"""Principal comp…","""Generalized Lo…","""['stat.ML', 'c…"


In [15]:
# Insert a random subset of the papers into the table. The frame carries no
# "vector" column; the table fills it in from "abstracts" on insert.
# A fixed seed keeps the inserted rows identical across Restart & Run All, and
# the size is clamped so a CSV with fewer rows than SAMPLE_SIZE does not raise.
SAMPLE_SIZE = 1000
SAMPLE_SEED = 42
table.add(df.sample(n=min(SAMPLE_SIZE, df.height), seed=SAMPLE_SEED))

In [20]:
# Run a search with no query and show the returned rows as a polars
# DataFrame, including the stored embedding vectors.
(
    table
    .search()
    .to_polars()
)

key,titles,terms,abstracts,vector
i64,str,str,str,"array[f32, 384]"
37051,"""History for Vi…","""['cs.CV', 'cs.…","""Visual Dialog …","[0.010497, -0.048358, … 0.014043]"
9924,"""Black-box Off-…","""['cs.LG', 'cs.…","""Off-policy est…","[-0.028978, -0.062137, … 0.039674]"
27965,"""AniGAN: Style-…","""['cs.CV', 'cs.…","""In this paper,…","[-0.091088, -0.038203, … -0.01065]"
22540,"""Relative Affer…","""['cs.CV', 'ees…","""Abnormalities …","[-0.051458, -0.065525, … -0.027434]"
29326,"""Generating Dif…","""['cs.CV']""","""Diffusion magn…","[-0.0647, -0.112234, … -0.029517]"
45616,"""Self-Supervise…","""['cs.CV']""","""Self-supervise…","[-0.012768, -0.03064, … -0.040037]"
45476,"""Deep Two-View …","""['cs.CV']""","""Two-view struc…","[-0.052596, -0.028142, … -0.05199]"
48038,"""Parameter Prio…","""['stat.ML', 'c…","""We develop sim…","[-0.065217, -0.13041, … 0.031038]"
25812,"""AdaBins: Depth…","""['cs.CV']""","""We address the…","[-0.08025, -0.115382, … -0.054427]"
52165,"""U-GAT: Multimo…","""['cs.CV', 'cs.…","""During the fir…","[0.032779, -0.049483, … -0.003957]"
