Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 4 additions & 2 deletions python/python/lance/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,8 @@
from pyarrow import RecordBatch, Schema
from pyarrow._compute import Expression

from .lance import __version__, _Dataset, _Scanner, _write_dataset
from .fragment import LanceFragment
from .lance import __version__, _Dataset, _Scanner, _write_dataset


class LanceDataset(pa.dataset.Dataset):
Expand Down Expand Up @@ -193,7 +193,9 @@ def replace_schema(self, schema: Schema):
"""
raise NotImplementedError("not changing schemas yet")

def get_fragments(self, filter: Optional[Expression] = None) -> Iterator[pa.dataset.Fragment]:
def get_fragments(
self, filter: Optional[Expression] = None
) -> Iterator[pa.dataset.Fragment]:
"""Get all fragments from the dataset.

Note: filter is not supported yet.
Expand Down
3 changes: 2 additions & 1 deletion python/python/lance/fragment.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,8 @@
"""Dataset Fragment"""

from __future__ import annotations
from typing import Union, Optional, Iterator

from typing import Iterator, Optional, Union

import pyarrow as pa

Expand Down
14 changes: 0 additions & 14 deletions python/python/lance/vector.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,13 +93,6 @@ def vec_to_table(
values = list(data.values())
if check_ndim:
ndim = _validate_ndim(values, ndim)
if ndim % 8 != 0:
raise ValueError(
"Vector dimensions should be multiples of 8 "
"for SIMD performance. To continue creating "
f"vectors with {ndim}-dimensions, "
"set `check_ndim=False"
)
vectors = _normalize_vectors(values, ndim)
ids = pa.array(data.keys())
arrays = [ids, vectors]
Expand All @@ -112,13 +105,6 @@ def vec_to_table(
raise ValueError(f"names cannot be more than 1 got {len(names)}")
if check_ndim:
ndim = _validate_ndim(data, ndim)
if ndim % 8 != 0:
raise ValueError(
"Vector dimensions should be multiples of 8 "
"for SIMD performance. To continue creating "
f"vectors with {ndim}-dimensions, "
"set `check_ndim=False"
)
vectors = _normalize_vectors(data, ndim)
arrays = [vectors]
else:
Expand Down
3 changes: 1 addition & 2 deletions python/python/tests/test_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,8 @@
from pathlib import Path
from unittest import mock

import pandas as pd

import lance
import pandas as pd
import pandas.testing as tm
import polars as pl
import pyarrow as pa
Expand Down
19 changes: 19 additions & 0 deletions python/python/tests/test_lance.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
import pyarrow as pa
import pyarrow.dataset
import pytest
from lance.vector import vec_to_table


def test_table_roundtrip(tmp_path):
Expand Down Expand Up @@ -104,6 +105,24 @@ def l2sq(vec, mat):
return np.sum((mat - vec) ** 2, axis=1)


def test_nearest_cosine(tmp_path):
    """Nearest-neighbor search with metric="cosine" returns cosine distances.

    Writes a tiny two-vector dataset, runs a nearest query against it, and
    checks every returned score against the independent numpy reference
    implementation ``cosine_distance`` defined below.
    """
    table = vec_to_table([[2.5, 6], [7.4, 3.3]])
    lance.write_dataset(table, tmp_path)

    query = [483.5, 1384.5]
    ds = lance.dataset(tmp_path)
    results = ds.to_table(
        nearest={"column": "vector", "q": query, "k": 10, "metric": "cosine"}
    ).to_pandas()
    for score, vector in zip(results.score, results.vector):
        # Score must agree with the reference computation to ~1e-6.
        assert score == pytest.approx(cosine_distance(vector, query), abs=1e-6)
        # Both stored vectors and the query lie in the positive quadrant,
        # so the cosine distance stays within [0, 1] here.
        assert 0 <= score <= 1


def cosine_distance(vec1, vec2):
    """Reference implementation: cosine distance = 1 - cosine similarity.

    Returns 0 for parallel vectors, 1 for orthogonal ones, and 2 for
    anti-parallel ones.
    """
    norm_product = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    return 1 - np.dot(vec1, vec2) / norm_product


def test_count_rows(tmp_path):
df = pd.DataFrame({"values": range(100)})
tbl = pa.Table.from_pandas(df)
Expand Down
4 changes: 2 additions & 2 deletions rust/src/index/vector/pq.rs
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ use crate::arrow::*;
use crate::dataset::ROW_ID;
use crate::index::{pb, vector::kmeans::train_kmeans, vector::SCORE_COL};
use crate::io::object_reader::{read_fixed_stride_array, ObjectReader};
use crate::linalg::{l2::l2_distance_batch, normalize::normalize};
use crate::linalg::{l2::l2_distance_batch, norm_l2::norm_l2};
use crate::{Error, Result};

/// Product Quantization Index.
Expand Down Expand Up @@ -133,7 +133,7 @@ impl PQIndex {
let mut xy_table: Vec<f32> = vec![];
let mut y_norm_table: Vec<f32> = vec![];

let x_norm = normalize(key.values()).powi(2);
let x_norm = norm_l2(key.values()).powi(2);

let sub_vector_length = self.dimension / self.num_sub_vectors;
for i in 0..self.num_sub_vectors {
Expand Down
2 changes: 1 addition & 1 deletion rust/src/linalg.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
pub mod cosine;
pub mod dot;
pub mod l2;
pub mod normalize;
pub mod norm_l2;

#[cfg(target_arch = "x86_64")]
pub mod x86_64;
39 changes: 32 additions & 7 deletions rust/src/linalg/cosine.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ use arrow_array::Float32Array;
use num_traits::real::Real;

use super::dot::dot;
use super::normalize::normalize;
use super::norm_l2::norm_l2;

/// Cosine Distance
pub trait Cosine {
Expand All @@ -37,7 +37,7 @@ impl Cosine for [f32] {

#[inline]
fn cosine(&self, other: &[f32]) -> f32 {
let x_norm = normalize(self);
let x_norm = norm_l2(self);
self.cosine_fast(x_norm, other)
}

Expand Down Expand Up @@ -79,7 +79,7 @@ pub fn cosine_distance(from: &[f32], to: &[f32]) -> f32 {
///
/// <https://en.wikipedia.org/wiki/Cosine_similarity>
pub fn cosine_distance_batch(from: &[f32], to: &[f32], dimension: usize) -> Arc<Float32Array> {
let x_norm = normalize(from);
let x_norm = norm_l2(from);

let dists = unsafe {
Float32Array::from_trusted_len_iter(
Expand All @@ -94,6 +94,9 @@ pub fn cosine_distance_batch(from: &[f32], to: &[f32], dimension: usize) -> Arc<
mod x86_64 {
use std::arch::x86_64::*;
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could you sort the imports into sections — std / third party / lance — with a blank line between each?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

fixed


use super::dot;
use super::norm_l2;

pub(crate) mod avx {
use super::*;

Expand All @@ -102,7 +105,7 @@ mod x86_64 {
unsafe {
use crate::linalg::x86_64::avx::add_f32_register;

let len = x_vector.len();
let len = x_vector.len() / 8 * 8;
let mut xy = _mm256_setzero_ps();
let mut y_sq = _mm256_setzero_ps();
for i in (0..len).step_by(8) {
Expand All @@ -111,7 +114,12 @@ mod x86_64 {
xy = _mm256_fmadd_ps(x, y, xy);
y_sq = _mm256_fmadd_ps(y, y, y_sq);
}
1.0 - add_f32_register(xy) / (x_norm * add_f32_register(y_sq).sqrt())
// handle remaining elements
let mut dotprod = add_f32_register(xy);
dotprod += dot(&x_vector[len..], &y_vector[len..]);
let mut y_sq_sum = add_f32_register(y_sq);
y_sq_sum += norm_l2(&y_vector[len..]).powi(2);
1.0 - dotprod / (x_norm * y_sq_sum.sqrt())
}
}
}
Expand All @@ -121,13 +129,16 @@ mod x86_64 {
mod aarch64 {
use std::arch::aarch64::*;

use super::dot;
use super::norm_l2;

pub(crate) mod neon {
use super::*;

#[inline]
pub(crate) fn cosine_f32(x: &[f32], y: &[f32], x_norm: f32) -> f32 {
unsafe {
let len = x.len();
let len = x.len() / 4 * 4;
let buf = [0.0_f32; 4];
let mut xy = vld1q_f32(buf.as_ptr());
let mut y_sq = xy;
Expand All @@ -137,7 +148,12 @@ mod aarch64 {
xy = vfmaq_f32(xy, left, right);
y_sq = vfmaq_f32(y_sq, right, right);
}
1.0 - vaddvq_f32(xy) / (x_norm * vaddvq_f32(y_sq).sqrt())
// handle remaining elements
let mut dotprod = vaddvq_f32(xy);
dotprod += dot(&x[len..], &y[len..]);
let mut y_sq_sum = vaddvq_f32(y_sq);
y_sq_sum += norm_l2(&y[len..]).powi(2);
1.0 - dotprod / (x_norm * y_sq_sum.sqrt())
}
}
}
Expand All @@ -163,4 +179,13 @@ mod tests {
// from sklearn.metrics.pairwise import cosine_similarity
assert_relative_eq!(d.value(0), 1.0 - 0.8735806510613104);
}

#[test]
fn test_cosine_not_aligned() {
    // `x` is parallel to both rows of `y` ([16, 32] = 16 * [1, 2] = 4 * [4, 8]),
    // so the cosine distance to each row must be zero. The 2-element vectors
    // also exercise the scalar remainder path (fewer than 8 AVX / 4 NEON lanes).
    // Use float literals instead of `16 as f32` casts — the idiomatic form.
    let x: Float32Array = vec![16.0_f32, 32.0].into();
    let y: Float32Array = vec![1.0_f32, 2.0, 4.0, 8.0].into();
    let d = cosine_distance_batch(x.values(), y.values(), 2);
    assert_relative_eq!(d.value(0), 0.0);
    assert_relative_eq!(d.value(1), 0.0);
}
}
16 changes: 8 additions & 8 deletions rust/src/linalg/normalize.rs → rust/src/linalg/norm_l2.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,16 +17,16 @@
/// The parameters must be cache line aligned. For example, from
/// Arrow Arrays, i.e., Float32Array
#[inline]
pub fn normalize(vector: &[f32]) -> f32 {
pub fn norm_l2(vector: &[f32]) -> f32 {
#[cfg(target_arch = "aarch64")]
{
aarch64::neon::normalize_f32(vector)
aarch64::neon::norm_l2(vector)
}

#[cfg(target_arch = "x86_64")]
{
if is_x86_feature_detected!("fma") {
return x86_64::avx::normalize_f32(vector);
return x86_64::avx::norm_l2_f32(vector);
}
}

Expand All @@ -42,7 +42,7 @@ mod x86_64 {
use std::arch::x86_64::*;

#[inline]
pub(crate) fn normalize_f32(vector: &[f32]) -> f32 {
pub(crate) fn norm_l2_f32(vector: &[f32]) -> f32 {
let len = vector.len() / 8 * 8;
let mut sum = unsafe {
let mut sums = _mm256_setzero_ps();
Expand All @@ -64,7 +64,7 @@ mod aarch64 {
use std::arch::aarch64::*;

#[inline]
pub(crate) fn normalize_f32(vector: &[f32]) -> f32 {
pub(crate) fn norm_l2(vector: &[f32]) -> f32 {
let len = vector.len() / 4 * 4;
let mut sum = unsafe {
let buf = [0.0_f32; 4];
Expand All @@ -86,13 +86,13 @@ mod tests {
use super::*;

#[test]
fn test_normalize() {
fn test_norm_l2() {
let data = [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0];

let result = normalize(&data);
let result = norm_l2(&data);
assert_eq!(result, (1..=8).map(|v| (v * v) as f32).sum::<f32>().sqrt());

let not_aligned = normalize(&data[2..]);
let not_aligned = norm_l2(&data[2..]);
assert_eq!(
not_aligned,
(3..=8).map(|v| (v * v) as f32).sum::<f32>().sqrt()
Expand Down