In [1]:
import lance
import duckdb
import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.dataset
import shutil

## Creating datasets

Via pyarrow it's really easy to create lance datasets

Create a dataframe

In [2]:
df = pd.DataFrame({"a": [5]})
df

Unnamed: 0,a
0,5


Write it to lance

In [3]:
shutil.rmtree("/tmp/test_df.lance", ignore_errors=True)

dataset = lance.write_dataset(df, "/tmp/test_df.lance")
dataset.to_table().to_pandas()

Unnamed: 0,a
0,5


### Converting from parquet

In [4]:
shutil.rmtree("/tmp/test.parquet", ignore_errors=True)
shutil.rmtree("/tmp/test_parquet.lance", ignore_errors=True)

tbl = pa.Table.from_pandas(df)
pa.dataset.write_dataset(tbl, "/tmp/test.parquet", format='parquet')

parquet = pa.dataset.dataset("/tmp/test.parquet")
parquet.to_table().to_pandas()

Unnamed: 0,a
0,5


Write to lance in 1 line

In [5]:
dataset = lance.write_dataset(parquet, "/tmp/test_parquet.lance")

In [6]:
# make sure it's the same
dataset.to_table().to_pandas()

Unnamed: 0,a
0,5


## Versioning

We can append rows

In [7]:
df = pd.DataFrame({"a": [10]})
tbl = pa.Table.from_pandas(df)
dataset = lance.write_dataset(tbl, "/tmp/test.lance", mode="append")

dataset.to_table().to_pandas()

Unnamed: 0,a
0,5
1,10


We can overwrite the data and create a new version

In [8]:
df = pd.DataFrame({"a": [50, 100]})
tbl = pa.Table.from_pandas(df)
dataset = lance.write_dataset(tbl, "/tmp/test.lance", mode="overwrite")

In [9]:
dataset.to_table().to_pandas()

Unnamed: 0,a
0,50
1,100


The old version is still there

In [10]:
dataset.versions()

[{'version': 2,
  'timestamp': datetime.datetime(2023, 2, 10, 12, 31, 16),
  'metadata': {}},
 {'version': 3,
  'timestamp': datetime.datetime(2023, 2, 10, 12, 31, 16),
  'metadata': {}},
 {'version': 1,
  'timestamp': datetime.datetime(2023, 2, 10, 12, 31, 16),
  'metadata': {}}]

In [11]:
lance.dataset('/tmp/test.lance', version=1).to_table().to_pandas()

Unnamed: 0,a
0,5


In [12]:
lance.dataset('/tmp/test.lance', version=2).to_table().to_pandas()

Unnamed: 0,a
0,5
1,10


## Vectors

### Data preparation

For this tutorial let's use the Sift 1M dataset:

- Download `ANN_SIFT1M` from: http://corpus-texmex.irisa.fr/
- Direct link should be `ftp://ftp.irisa.fr/local/texmex/corpus/sift.tar.gz`
- Download and then unzip the tarball

In [13]:
!rm -rf sift* vec_data.lance
!wget ftp://ftp.irisa.fr/local/texmex/corpus/sift.tar.gz
!tar -xzf sift.tar.gz

--2023-02-10 12:31:16--  ftp://ftp.irisa.fr/local/texmex/corpus/sift.tar.gz
           => ‘sift.tar.gz’
Resolving ftp.irisa.fr (ftp.irisa.fr)... 131.254.254.45, 2001:660:7303:254::45
Connecting to ftp.irisa.fr (ftp.irisa.fr)|131.254.254.45|:21... connected.
Logging in as anonymous ... Logged in!
==> SYST ... done.    ==> PWD ... done.
==> TYPE I ... done.  ==> CWD (1) /local/texmex/corpus ... done.
==> SIZE sift.tar.gz ... 168280445
==> PASV ... done.    ==> RETR sift.tar.gz ... done.
Length: 168280445 (160M) (unauthoritative)


2023-02-10 12:31:46 (5.96 MB/s) - ‘sift.tar.gz’ saved [168280445]



Convert it to Lance

In [14]:
import struct

uri = "vec_data.lance"

with open("sift/sift_base.fvecs", mode="rb") as fobj:
    buf = fobj.read()
    data = np.array(struct.unpack("<128000000f", buf[4 : 4 + 4 * 1000000 * 128]))

    schema = pa.schema([
        pa.field("id", pa.uint32(), False),
        pa.field("vector", pa.list_(pa.float32(), 128), False)
    ])
    table = pa.Table.from_arrays([
        pa.array(range(1000000), type=pa.uint32()),
        pa.FixedSizeListArray.from_arrays(pa.array(data, type=pa.float32()), list_size=128)
    ], schema=schema)

    lance.write_dataset(table, uri, max_rows_per_group=8192, max_rows_per_file=1024*1024)

In [15]:
uri = "vec_data.lance"
sift1m = lance.dataset(uri)

### KNN (no index)

Sample 100 vectors as query vectors

In [16]:
import duckdb
vtable = sift1m.to_table() # Next release of DuckDB would no longer require this
samples = duckdb.query("SELECT vector FROM vtable USING SAMPLE 100").to_df().vector
samples

0     [0.0, 39.0, 35.0, 2.0, 11.0, 43.0, 20.0, 4.0, ...
1     [16.0, 15.0, 3.0, 22.0, 25.0, 12.0, 11.0, 6.0,...
2     [0.0, 0.0, 0.0, 3.0, 23.0, 1.793662034335766e-...
3     [12.0, 10.0, 6.0, 18.0, 102.0, 75.0, 12.0, 0.0...
4     [18.0, 11.0, 2.0, 0.0, 0.0, 0.0, 10.0, 82.0, 2...
                            ...                        
95    [18.0, 104.0, 92.0, 6.0, 9.0, 13.0, 0.0, 133.0...
96    [0.0, 0.0, 0.0, 0.0, 19.0, 0.0, 0.0, 0.0, 0.0,...
97    [38.0, 15.0, 0.0, 0.0, 0.0, 6.0, 1.79366203433...
98    [2.0, 7.0, 16.0, 86.0, 23.0, 4.0, 5.0, 11.0, 3...
99    [3.0, 7.0, 35.0, 54.0, 21.0, 15.0, 8.0, 8.0, 0...
Name: vector, Length: 100, dtype: object

Call nearest neighbors (no ANN index here)

In [17]:
import time

start = time.time()
tbl = sift1m.to_table(columns=["id"], nearest={"column": "vector", "q": samples[0], "k": 10})
end = time.time()

print(f"Time(sec): {end-start}")
print(tbl.to_pandas())

Time(sec): 0.06611108779907227
       id                                             vector    score
0  559106  [0.0, 39.0, 35.0, 2.0, 11.0, 43.0, 20.0, 4.0, ...      0.0
1  977711  [21.0, 17.0, 56.0, 33.0, 67.0, 77.0, 5.0, 1.0,...  46279.0
2  426365  [0.0, 0.0, 49.0, 25.0, 0.0, 0.0, 0.0, 0.0, 0.0...  54707.0
3   24530  [2.0, 49.0, 128.0, 7.0, 0.0, 0.0, 0.0, 0.0, 11...  59340.0
4  913985  [0.0, 0.0, 56.0, 16.0, 3.0, 0.0, 0.0, 0.0, 0.0...  62090.0
5  429977  [33.0, 85.0, 51.0, 16.0, 10.0, 8.0, 1.0, 16.0,...  63941.0
6  224480  [0.0, 1.0, 49.0, 4.0, 0.0, 0.0, 1.0, 3.0, 2.0,...  64691.0
7  896054  [3.0, 129.0, 130.0, 16.0, 0.0, 0.0, 0.0, 0.0, ...  65923.0
8  238670  [4.0, 21.0, 134.0, 14.0, 10.0, 3.0, 1.0, 1.0, ...  66527.0
9  861740  [0.0, 0.0, 37.0, 24.0, 0.0, 0.0, 0.0, 6.0, 10....  67687.0


Without the index this is scanning through the whole dataset to compute the distance. <br/>

For real-time serving we can do much better with an ANN index

### Build index

Now let's build an index. We haven't implemented HNSW but IVF+PQ is shown here

**NOTE** If you'd rather not wait for index build, you can download a version with the index pre-built from [here](https://eto-public.s3.us-west-2.amazonaws.com/datasets/sift/sift_ivf256_pq16.tar.gz) and skip the next cell

In [18]:
sift1m.create_index("vector",
                    index_type="IVF_PQ", 
                    num_partitions=256,  # IVF
                    num_sub_vectors=16)  # PQ

Building vector index: IVF256,PQ16
Sample 65536 out of 1000000 to train kmeans of 128 dim, 256 clusters
Sample 65536 out of 1000000 to train kmeans of 8 dim, 256 clusters
Sample 65536 out of 1000000 to train kmeans of 8 dim, 256 clusters
Sample 65536 out of 1000000 to train kmeans of 8 dim, 256 clusters
Sample 65536 out of 1000000 to train kmeans of 8 dim, 256 clusters
Sample 65536 out of 1000000 to train kmeans of 8 dim, 256 clusters
Sample 65536 out of 1000000 to train kmeans of 8 dim, 256 clusters
Sample 65536 out of 1000000 to train kmeans of 8 dim, 256 clusters
Sample 65536 out of 1000000 to train kmeans of 8 dim, 256 clusters
Sample 65536 out of 1000000 to train kmeans of 8 dim, 256 clusters
Sample 65536 out of 1000000 to train kmeans of 8 dim, 256 clusters
Sample 65536 out of 1000000 to train kmeans of 8 dim, 256 clusters
Sample 65536 out of 1000000 to train kmeans of 8 dim, 256 clusters
Sample 65536 out of 1000000 to train kmeans of 8 dim, 256 clusters
Sample 65536 out of 10000

### Try nearest neighbors again with ANN index

Let's look for nearest neighbors again

In [19]:
sift1m = lance.dataset(uri)

In [20]:
import time

tot = 0
for q in samples:
    start = time.time()
    tbl = sift1m.to_table(nearest={"column": "vector", "q": q, "k": 10})
    end = time.time()
    tot += (end - start)

print(f"Avg(sec): {tot / len(samples)}")
print(tbl.to_pandas())

Avg(sec): 0.000644688606262207
       id                                             vector         score
0  849075  [3.0, 7.0, 35.0, 54.0, 21.0, 15.0, 8.0, 8.0, 0...  13057.361328
1  249096  [11.0, 2.0, 20.0, 15.0, 28.0, 31.0, 2.0, 0.0, ...  47776.257812
2   71979  [0.0, 0.0, 40.0, 75.0, 33.0, 1.0, 0.0, 0.0, 0....  48108.562500
3  570048  [0.0, 0.0, 48.0, 111.0, 2.0, 7.0, 15.0, 3.0, 0...  50237.187500
4  726654  [0.0, 0.0, 3.0, 19.0, 102.0, 15.0, 0.0, 0.0, 0...  52966.761719
5  920025  [0.0, 0.0, 67.0, 98.0, 27.0, 0.0, 0.0, 0.0, 0....  53882.226562
6  722526  [0.0, 0.0, 6.0, 143.0, 9.0, 0.0, 0.0, 0.0, 0.0...  54794.750000
7  447369  [0.0, 0.0, 11.0, 61.0, 19.0, 13.0, 18.0, 0.0, ...  55112.449219
8  832434  [0.0, 0.0, 12.0, 37.0, 30.0, 3.0, 1.0, 0.0, 0....  56225.007812
9  481941  [0.0, 0.0, 0.0, 16.0, 98.0, 33.0, 0.0, 0.0, 0....  56493.011719


The latency vs recall is tunable via:
- nprobes: how many IVF partitions to search
- refine_factor: determines how many vectors are retrieved during re-ranking

In [21]:
%%time

sift1m.to_table(nearest={"column": "vector", 
                         "q": samples[0], 
                         "k": 10, 
                         "nprobes": 10, 
                         "refine_factor": 5}).to_pandas()

CPU times: user 2.44 ms, sys: 2.38 ms, total: 4.81 ms
Wall time: 2.75 ms


Unnamed: 0,id,vector,score
0,559106,"[0.0, 39.0, 35.0, 2.0, 11.0, 43.0, 20.0, 4.0, ...",0.0
1,977711,"[21.0, 17.0, 56.0, 33.0, 67.0, 77.0, 5.0, 1.0,...",46279.0
2,426365,"[0.0, 0.0, 49.0, 25.0, 0.0, 0.0, 0.0, 0.0, 0.0...",54707.0
3,24530,"[2.0, 49.0, 128.0, 7.0, 0.0, 0.0, 0.0, 0.0, 11...",59340.0
4,913985,"[0.0, 0.0, 56.0, 16.0, 3.0, 0.0, 0.0, 0.0, 0.0...",62090.0
5,429977,"[33.0, 85.0, 51.0, 16.0, 10.0, 8.0, 1.0, 16.0,...",63941.0
6,224480,"[0.0, 1.0, 49.0, 4.0, 0.0, 0.0, 1.0, 3.0, 2.0,...",64691.0
7,896054,"[3.0, 129.0, 130.0, 16.0, 0.0, 0.0, 0.0, 0.0, ...",65923.0
8,324713,"[1.0, 1.0, 51.0, 12.0, 0.0, 0.0, 21.0, 69.0, 1...",68300.0
9,797627,"[1.0, 60.0, 40.0, 1.0, 33.0, 55.0, 3.0, 0.0, 3...",68375.0


q => sample vector

k => how many neighbors to return

nprobes => how many partitions (in the coarse quantizer) to probe

refine_factor => controls "re-ranking". If k=10 and refine_factor=5 then retrieve 50 nearest neighbors by ANN and re-sort using actual distances then return top 10. This improves recall without sacrificing performance too much

**NOTE** the latencies above include file io as lance currently doesn't hold anything in memory. Along with index building speed, creating a purely in memory version of the dataset would make the biggest impact on performance.

### Features and vector can be retrieved together

Usually we have other feature or metadata columns that need to be stored and fetched together.
If you're managing data and the index separately, you to do a bunch of annoying plumbing to put stuff together. With Lance it's a single call

In [22]:
tbl = sift1m.to_table()
tbl = tbl.append_column("item_id", pa.array(range(len(tbl))))
tbl = tbl.append_column("revenue", pa.array((np.random.randn(len(tbl))+5)*1000))
tbl.to_pandas()

Unnamed: 0,id,vector,item_id,revenue
0,0,"[0.0, 16.0, 35.0, 5.0, 32.0, 31.0, 14.0, 10.0,...",0,4548.441366
1,1,"[1.8e-43, 14.0, 35.0, 19.0, 20.0, 3.0, 1.0, 13...",1,4111.953914
2,2,"[33.0, 1.8e-43, 0.0, 1.0, 5.0, 3.0, 44.0, 40.0...",2,4106.037798
3,3,"[23.0, 10.0, 1.8e-43, 12.0, 47.0, 14.0, 25.0, ...",3,4104.232698
4,4,"[27.0, 29.0, 21.0, 1.8e-43, 1.0, 1.0, 0.0, 0.0...",4,6065.668134
...,...,...,...,...
999995,999995,"[8.0, 9.0, 5.0, 0.0, 10.0, 39.0, 72.0, 68.0, 3...",999995,4266.285842
999996,999996,"[3.0, 28.0, 55.0, 29.0, 35.0, 12.0, 1.0, 2.0, ...",999996,4107.380332
999997,999997,"[0.0, 13.0, 41.0, 72.0, 40.0, 9.0, 0.0, 0.0, 0...",999997,6376.527926
999998,999998,"[41.0, 121.0, 4.0, 0.0, 0.0, 0.0, 0.0, 0.0, 24...",999998,5320.111631


In [23]:
sift1m = lance.write_dataset(tbl, uri, mode="overwrite")

In [24]:
sift1m.to_table(columns=["revenue"], nearest={"column": "vector", "q": samples[0], "k": 10}).to_pandas()

Unnamed: 0,revenue,vector,score
0,6018.616861,"[0.0, 39.0, 35.0, 2.0, 11.0, 43.0, 20.0, 4.0, ...",0.0
1,4752.182805,"[21.0, 17.0, 56.0, 33.0, 67.0, 77.0, 5.0, 1.0,...",46279.0
2,4471.487709,"[0.0, 0.0, 49.0, 25.0, 0.0, 0.0, 0.0, 0.0, 0.0...",54707.0
3,4199.64164,"[2.0, 49.0, 128.0, 7.0, 0.0, 0.0, 0.0, 0.0, 11...",59340.0
4,6747.487293,"[0.0, 0.0, 56.0, 16.0, 3.0, 0.0, 0.0, 0.0, 0.0...",62090.0
5,4117.257633,"[33.0, 85.0, 51.0, 16.0, 10.0, 8.0, 1.0, 16.0,...",63941.0
6,4631.559643,"[0.0, 1.0, 49.0, 4.0, 0.0, 0.0, 1.0, 3.0, 2.0,...",64691.0
7,4600.905378,"[3.0, 129.0, 130.0, 16.0, 0.0, 0.0, 0.0, 0.0, ...",65923.0
8,3532.79516,"[4.0, 21.0, 134.0, 14.0, 10.0, 3.0, 1.0, 1.0, ...",66527.0
9,6240.809896,"[0.0, 0.0, 37.0, 24.0, 0.0, 0.0, 0.0, 6.0, 10....",67687.0
