Skip to content
This repository has been archived by the owner on Feb 18, 2024. It is now read-only.

Commit

Permalink
Optimized utf8 checking
Browse files (browse the repository at this point in the history)
  • Loading branch information
jorgecarleitao committed Dec 7, 2022
1 parent 68f0aff commit 3c9c081
Showing 1 changed file with 16 additions and 2 deletions.
18 changes: 16 additions & 2 deletions src/array/specification.rs
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,18 @@ pub(crate) fn try_check_utf8<O: Offset, C: OffsetsContainer<O>>(
} else {
simdutf8::basic::from_utf8(values)?;

for start in offsets.starts() {
// Offsets can be equal to `values.len()`.
// Scanning from the end, find the first offset that differs from `values.len()`.
let starts = offsets.starts();
let last = starts
.iter()
.rev()
.enumerate()
.find_map(|(i, offset)| (offset.to_usize() != values.len()).then(|| i + 1))
.unwrap_or(starts.len() - 1);

let mut any_invalid = false;
for start in &starts[..=last] {
let start = start.to_usize();

// Safety: `try_check_offsets_bounds` just checked for bounds
Expand All @@ -68,9 +79,12 @@ pub(crate) fn try_check_utf8<O: Offset, C: OffsetsContainer<O>>(
// A valid code-point iff it does not start with 0b10xxxxxx
// Bit-magic taken from `std::str::is_char_boundary`
if (b as i8) < -0x40 {
return Err(Error::oos("Non-valid char boundary detected"));
any_invalid = true
}
}
if any_invalid {
return Err(Error::oos("Non-valid char boundary detected"));
}
Ok(())
}
}
Expand Down

0 comments on commit 3c9c081

Please sign in to comment.