diff --git a/python/python/lance/dataset.py b/python/python/lance/dataset.py
index 34232348c20..46fd16cf3aa 100644
--- a/python/python/lance/dataset.py
+++ b/python/python/lance/dataset.py
@@ -28,8 +28,8 @@
 from pyarrow import RecordBatch, Schema
 from pyarrow._compute import Expression
 
-from .lance import __version__, _Dataset, _Scanner, _write_dataset
 from .fragment import LanceFragment
+from .lance import __version__, _Dataset, _Scanner, _write_dataset
 
 
 class LanceDataset(pa.dataset.Dataset):
@@ -193,7 +193,9 @@ def replace_schema(self, schema: Schema):
         """
         raise NotImplementedError("not changing schemas yet")
 
-    def get_fragments(self, filter: Optional[Expression] = None) -> Iterator[pa.dataset.Fragment]:
+    def get_fragments(
+        self, filter: Optional[Expression] = None
+    ) -> Iterator[pa.dataset.Fragment]:
         """Get all fragments from the dataset.
 
         Note: filter is not supported yet.
diff --git a/python/python/lance/fragment.py b/python/python/lance/fragment.py
index 9fdeb703b8e..36bc31db362 100644
--- a/python/python/lance/fragment.py
+++ b/python/python/lance/fragment.py
@@ -16,7 +16,8 @@
 """Dataset Fragment"""
 
 from __future__ import annotations
-from typing import Union, Optional, Iterator
+
+from typing import Iterator, Optional, Union
 
 import pyarrow as pa
 
diff --git a/python/python/lance/vector.py b/python/python/lance/vector.py
index 7ffe3efd0b8..b3f700f7573 100644
--- a/python/python/lance/vector.py
+++ b/python/python/lance/vector.py
@@ -93,13 +93,6 @@ def vec_to_table(
         values = list(data.values())
         if check_ndim:
             ndim = _validate_ndim(values, ndim)
-            if ndim % 8 != 0:
-                raise ValueError(
-                    "Vector dimensions should be multiples of 8 "
-                    "for SIMD performance. To continue creating "
-                    f"vectors with {ndim}-dimensions, "
-                    "set `check_ndim=False"
-                )
         vectors = _normalize_vectors(values, ndim)
         ids = pa.array(data.keys())
         arrays = [ids, vectors]
@@ -112,13 +105,6 @@ def vec_to_table(
             raise ValueError(f"names cannot be more than 1 got {len(names)}")
         if check_ndim:
             ndim = _validate_ndim(data, ndim)
-            if ndim % 8 != 0:
-                raise ValueError(
-                    "Vector dimensions should be multiples of 8 "
-                    "for SIMD performance. To continue creating "
-                    f"vectors with {ndim}-dimensions, "
-                    "set `check_ndim=False"
-                )
         vectors = _normalize_vectors(data, ndim)
         arrays = [vectors]
     else:
diff --git a/python/python/tests/test_dataset.py b/python/python/tests/test_dataset.py
index 0cc62d73430..e0c3b11da6c 100644
--- a/python/python/tests/test_dataset.py
+++ b/python/python/tests/test_dataset.py
@@ -20,9 +20,8 @@
 from pathlib import Path
 from unittest import mock
 
-import pandas as pd
-
 import lance
+import pandas as pd
 import pandas.testing as tm
 import polars as pl
 import pyarrow as pa
diff --git a/python/python/tests/test_lance.py b/python/python/tests/test_lance.py
index 0c4fa84c01c..a9ba20ff34a 100644
--- a/python/python/tests/test_lance.py
+++ b/python/python/tests/test_lance.py
@@ -20,6 +20,7 @@
 import pyarrow as pa
 import pyarrow.dataset
 import pytest
+from lance.vector import vec_to_table
 
 
 def test_table_roundtrip(tmp_path):
@@ -104,6 +105,24 @@ def l2sq(vec, mat):
     return np.sum((mat - vec) ** 2, axis=1)
 
 
+def test_nearest_cosine(tmp_path):
+    tbl = vec_to_table([[2.5, 6], [7.4, 3.3]])
+    lance.write_dataset(tbl, tmp_path)
+
+    q = [483.5, 1384.5]
+    dataset = lance.dataset(tmp_path)
+    rs = dataset.to_table(
+        nearest={"column": "vector", "q": q, "k": 10, "metric": "cosine"}
+    ).to_pandas()
+    for i in range(len(rs)):
+        assert rs.score[i] == pytest.approx(cosine_distance(rs.vector[i], q), abs=1e-6)
+        assert 0 <= rs.score[i] <= 1
+
+
+def cosine_distance(vec1, vec2):
+    return 1 - np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2))
+
+
 def test_count_rows(tmp_path):
     df = pd.DataFrame({"values": range(100)})
     tbl = pa.Table.from_pandas(df)
diff --git a/rust/src/index/vector/pq.rs b/rust/src/index/vector/pq.rs
index f19bd475a45..e1e699e1835 100644
--- a/rust/src/index/vector/pq.rs
+++ b/rust/src/index/vector/pq.rs
@@ -34,7 +34,7 @@ use crate::arrow::*;
 use crate::dataset::ROW_ID;
 use crate::index::{pb, vector::kmeans::train_kmeans, vector::SCORE_COL};
 use crate::io::object_reader::{read_fixed_stride_array, ObjectReader};
-use crate::linalg::{l2::l2_distance_batch, normalize::normalize};
+use crate::linalg::{l2::l2_distance_batch, norm_l2::norm_l2};
 use crate::{Error, Result};
 
 /// Product Quantization Index.
@@ -133,7 +133,7 @@ impl PQIndex {
         let mut xy_table: Vec<f32> = vec![];
         let mut y_norm_table: Vec<f32> = vec![];
 
-        let x_norm = normalize(key.values()).powi(2);
+        let x_norm = norm_l2(key.values()).powi(2);
 
         let sub_vector_length = self.dimension / self.num_sub_vectors;
         for i in 0..self.num_sub_vectors {
diff --git a/rust/src/linalg.rs b/rust/src/linalg.rs
index 8b9708a65de..f8460fb7e05 100644
--- a/rust/src/linalg.rs
+++ b/rust/src/linalg.rs
@@ -15,7 +15,7 @@
 pub mod cosine;
 pub mod dot;
 pub mod l2;
-pub mod normalize;
+pub mod norm_l2;
 
 #[cfg(target_arch = "x86_64")]
 pub mod x86_64;
diff --git a/rust/src/linalg/cosine.rs b/rust/src/linalg/cosine.rs
index 9cf2bd649e0..f4be811712c 100644
--- a/rust/src/linalg/cosine.rs
+++ b/rust/src/linalg/cosine.rs
@@ -19,7 +19,7 @@ use arrow_array::Float32Array;
 use num_traits::real::Real;
 
 use super::dot::dot;
-use super::normalize::normalize;
+use super::norm_l2::norm_l2;
 
 /// Cosine Distance
 pub trait Cosine {
@@ -37,7 +37,7 @@ impl Cosine for [f32] {
 
     #[inline]
     fn cosine(&self, other: &[f32]) -> f32 {
-        let x_norm = normalize(self);
+        let x_norm = norm_l2(self);
         self.cosine_fast(x_norm, other)
     }
 
@@ -79,7 +79,7 @@ pub fn cosine_distance(from: &[f32], to: &[f32]) -> f32 {
 ///
 /// <https://en.wikipedia.org/wiki/Cosine_similarity>
 pub fn cosine_distance_batch(from: &[f32], to: &[f32], dimension: usize) -> Arc<Float32Array> {
-    let x_norm = normalize(from);
+    let x_norm = norm_l2(from);
 
     let dists = unsafe {
         Float32Array::from_trusted_len_iter(
@@ -94,6 +94,9 @@ pub fn cosine_distance_batch(from: &[f32], to: &[f32], dimension: usize) -> Arc<
 mod x86_64 {
     use std::arch::x86_64::*;
 
+    use super::dot;
+    use super::norm_l2;
+
     pub(crate) mod avx {
         use super::*;
 
@@ -102,7 +105,7 @@ mod x86_64 {
             unsafe {
                 use crate::linalg::x86_64::avx::add_f32_register;
 
-                let len = x_vector.len();
+                let len = x_vector.len() / 8 * 8;
                 let mut xy = _mm256_setzero_ps();
                 let mut y_sq = _mm256_setzero_ps();
                 for i in (0..len).step_by(8) {
@@ -111,7 +114,12 @@ mod x86_64 {
                     xy = _mm256_fmadd_ps(x, y, xy);
                     y_sq = _mm256_fmadd_ps(y, y, y_sq);
                 }
-                1.0 - add_f32_register(xy) / (x_norm * add_f32_register(y_sq).sqrt())
+                // handle remaining elements
+                let mut dotprod = add_f32_register(xy);
+                dotprod += dot(&x_vector[len..], &y_vector[len..]);
+                let mut y_sq_sum = add_f32_register(y_sq);
+                y_sq_sum += norm_l2(&y_vector[len..]).powi(2);
+                1.0 - dotprod / (x_norm * y_sq_sum.sqrt())
             }
         }
     }
@@ -121,13 +129,16 @@ mod x86_64 {
 mod aarch64 {
     use std::arch::aarch64::*;
 
+    use super::dot;
+    use super::norm_l2;
+
     pub(crate) mod neon {
         use super::*;
 
         #[inline]
         pub(crate) fn cosine_f32(x: &[f32], y: &[f32], x_norm: f32) -> f32 {
             unsafe {
-                let len = x.len();
+                let len = x.len() / 4 * 4;
                 let buf = [0.0_f32; 4];
                 let mut xy = vld1q_f32(buf.as_ptr());
                 let mut y_sq = xy;
@@ -137,7 +148,12 @@ mod aarch64 {
                     xy = vfmaq_f32(xy, left, right);
                     y_sq = vfmaq_f32(y_sq, right, right);
                 }
-                1.0 - vaddvq_f32(xy) / (x_norm * vaddvq_f32(y_sq).sqrt())
+                // handle remaining elements
+                let mut dotprod = vaddvq_f32(xy);
+                dotprod += dot(&x[len..], &y[len..]);
+                let mut y_sq_sum = vaddvq_f32(y_sq);
+                y_sq_sum += norm_l2(&y[len..]).powi(2);
+                1.0 - dotprod / (x_norm * y_sq_sum.sqrt())
             }
         }
     }
@@ -163,4 +179,13 @@ mod tests {
         // from sklearn.metrics.pairwise import cosine_similarity
         assert_relative_eq!(d.value(0), 1.0 - 0.8735806510613104);
     }
+
+    #[test]
+    fn test_cosine_not_aligned() {
+        let x: Float32Array = vec![16 as f32, 32 as f32].into();
+        let y: Float32Array = vec![1 as f32, 2 as f32, 4 as f32, 8 as f32].into();
+        let d = cosine_distance_batch(x.values(), y.values(), 2);
+        assert_relative_eq!(d.value(0), 0.0);
+        assert_relative_eq!(d.value(1), 0.0);
+    }
 }
diff --git a/rust/src/linalg/normalize.rs b/rust/src/linalg/norm_l2.rs
similarity index 87%
rename from rust/src/linalg/normalize.rs
rename to rust/src/linalg/norm_l2.rs
index 585198af38a..5b03ced9ae8 100644
--- a/rust/src/linalg/normalize.rs
+++ b/rust/src/linalg/norm_l2.rs
@@ -17,16 +17,16 @@
 /// The parameters must be cache line aligned. For example, from
 /// Arrow Arrays, i.e., Float32Array
 #[inline]
-pub fn normalize(vector: &[f32]) -> f32 {
+pub fn norm_l2(vector: &[f32]) -> f32 {
     #[cfg(target_arch = "aarch64")]
     {
-        aarch64::neon::normalize_f32(vector)
+        aarch64::neon::norm_l2(vector)
     }
 
     #[cfg(target_arch = "x86_64")]
     {
         if is_x86_feature_detected!("fma") {
-            return x86_64::avx::normalize_f32(vector);
+            return x86_64::avx::norm_l2_f32(vector);
         }
     }
 
@@ -42,7 +42,7 @@ mod x86_64 {
         use std::arch::x86_64::*;
 
         #[inline]
-        pub(crate) fn normalize_f32(vector: &[f32]) -> f32 {
+        pub(crate) fn norm_l2_f32(vector: &[f32]) -> f32 {
             let len = vector.len() / 8 * 8;
             let mut sum = unsafe {
                 let mut sums = _mm256_setzero_ps();
@@ -64,7 +64,7 @@ mod aarch64 {
         use std::arch::aarch64::*;
 
         #[inline]
-        pub(crate) fn normalize_f32(vector: &[f32]) -> f32 {
+        pub(crate) fn norm_l2(vector: &[f32]) -> f32 {
             let len = vector.len() / 4 * 4;
             let mut sum = unsafe {
                 let buf = [0.0_f32; 4];
@@ -86,13 +86,13 @@ mod tests {
     use super::*;
 
     #[test]
-    fn test_normalize() {
+    fn test_norm_l2() {
         let data = [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0];
 
-        let result = normalize(&data);
+        let result = norm_l2(&data);
         assert_eq!(result, (1..=8).map(|v| (v * v) as f32).sum::<f32>().sqrt());
 
-        let not_aligned = normalize(&data[2..]);
+        let not_aligned = norm_l2(&data[2..]);
         assert_eq!(
             not_aligned,
             (3..=8).map(|v| (v * v) as f32).sum::<f32>().sqrt()