Skip to content

Commit

Permalink
Make UTS 46 normalization non-experimental
Browse files Browse the repository at this point in the history
 * Bake ignored/disallow data into the normalization data after all.
 * Make public operations available via a dedicated wrapper type
   instead of the main normalizer types.

Closes unicode-org#2850
  • Loading branch information
hsivonen committed Mar 20, 2024
1 parent 13d97ea commit d18887c
Show file tree
Hide file tree
Showing 10 changed files with 932 additions and 496 deletions.
148 changes: 91 additions & 57 deletions components/normalizer/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,7 @@ extern crate alloc;
mod error;
pub mod properties;
pub mod provider;
pub mod uts46;

pub use crate::error::NormalizerError;

Expand All @@ -79,7 +80,6 @@ pub use NormalizerError as Error;
use crate::provider::CanonicalDecompositionDataV1Marker;
use crate::provider::CompatibilityDecompositionSupplementV1Marker;
use crate::provider::DecompositionDataV1;
#[cfg(feature = "experimental")]
use crate::provider::Uts46DecompositionSupplementV1Marker;
use alloc::string::String;
use alloc::vec::Vec;
Expand All @@ -106,20 +106,30 @@ use zerovec::{zeroslice, ZeroSlice};
/// Holds the data payload of whichever decomposition supplement is in use.
#[derive(Debug)]
enum SupplementPayloadHolder {
/// Payload for the compatibility decomposition supplement.
Compatibility(DataPayload<CompatibilityDecompositionSupplementV1Marker>),
/// Payload for the UTS 46 decomposition supplement.
#[cfg(feature = "experimental")]
Uts46(DataPayload<Uts46DecompositionSupplementV1Marker>),
}

impl SupplementPayloadHolder {
/// Borrows the `DecompositionSupplementV1` data out of the held payload,
/// regardless of which variant holds it.
fn get(&self) -> &DecompositionSupplementV1 {
match self {
SupplementPayloadHolder::Compatibility(d) => d.get(),
#[cfg(feature = "experimental")]
SupplementPayloadHolder::Uts46(d) => d.get(),
}
}
}

/// Treatment of the ignorable marker (0xFFFFFFFF) when it shows up as a
/// trie value in the data.
///
/// This is a fieldless enum, so it is `Copy`: it can be read by value out
/// of a borrowed struct field and compared without moving it.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum IgnorableBehavior {
    /// 0xFFFFFFFF in data is not supported. Encountering the marker in
    /// this mode trips a debug assertion.
    Unsupported,
    /// Ignorables are ignored: the character is skipped and the next one
    /// is read from the delegate.
    Ignored,
    /// Ignorables are treated as singleton decompositions
    /// to the REPLACEMENT CHARACTER.
    ReplacementCharacter,
}

/// Number of iterations allowed on the fast path before flushing.
/// Since a typical UTF-16 iteration advances over a 2-byte BMP
/// character, this means two memory pages.
Expand All @@ -132,6 +142,9 @@ impl SupplementPayloadHolder {
/// passes an error through from `Write`.
const UTF16_FAST_PATH_FLUSH_THRESHOLD: usize = 4096;

/// Trie value that flags a character as a UTS 46 ignorable.
const IGNORABLE_MARKER: u32 = 0xFFFF_FFFF;

/// Marker for starters that decompose to themselves but may
/// combine backwards under canonical composition.
/// (Main trie only; not used in the supplementary trie.)
Expand Down Expand Up @@ -528,6 +541,7 @@ where
/// 1. Decomposes to self.
/// 2. Decomposition starts with a non-starter
decomposition_passthrough_bound: u32, // never above 0xC0
ignorable_behavior: IgnorableBehavior, // Arguably should be a type parameter
}

impl<'data, I> Decomposition<'data, I>
Expand All @@ -549,7 +563,15 @@ where
decompositions: &'data DecompositionDataV1,
tables: &'data DecompositionTablesV1,
) -> Self {
Self::new_with_supplements(delegate, decompositions, None, tables, None, 0xC0)
Self::new_with_supplements(
delegate,
decompositions,
None,
tables,
None,
0xC0,
IgnorableBehavior::Unsupported,
)
}

/// Constructs a decomposing iterator adapter from a delegate
Expand All @@ -565,6 +587,7 @@ where
tables: &'data DecompositionTablesV1,
supplementary_tables: Option<&'data DecompositionTablesV1>,
decomposition_passthrough_bound: u8,
ignorable_behavior: IgnorableBehavior,
) -> Self {
let half_width_voicing_marks_become_non_starters =
if let Some(supplementary) = supplementary_decompositions {
Expand Down Expand Up @@ -595,6 +618,7 @@ where
},
half_width_voicing_marks_become_non_starters,
decomposition_passthrough_bound: u32::from(decomposition_passthrough_bound),
ignorable_behavior,
};
let _ = ret.next(); // Remove the U+FFFF placeholder
ret
Expand Down Expand Up @@ -721,16 +745,42 @@ where

/// Reads the next character from the delegate and pairs it with its
/// trie value.
///
/// Must only be called when no character is pending (checked by a
/// debug assertion). Returns `None` once the delegate is exhausted.
fn delegate_next_no_pending(&mut self) -> Option<CharacterAndTrieValue> {
debug_assert!(self.pending.is_none());
let c = self.delegate.next()?;
// Loop so that an ignored character can be skipped by reading the next one.
loop {
let c = self.delegate.next()?;

// TODO(#2384): Measure if this check is actually an optimization even in the
// non-supplementary case or if this should go inside the supplementary
// `if` below.
if u32::from(c) < self.decomposition_passthrough_bound {
return Some(CharacterAndTrieValue::new(c, 0));
}
// TODO(#2384): Measure if this check is actually an optimization even in the
// non-supplementary case or if this should go inside the supplementary
// `if` below.
// Characters below the passthrough bound get trie value 0 without a trie lookup.
if u32::from(c) < self.decomposition_passthrough_bound {
return Some(CharacterAndTrieValue::new(c, 0));
}

Some(self.attach_trie_value(c))
// Consult the supplementary trie first, if there is one; when it yields
// no value, fall through to the main trie lookup below.
if let Some(supplementary) = self.supplementary_trie {
if let Some(value) = self.attach_supplementary_trie_value(c, supplementary) {
if value.trie_val == IGNORABLE_MARKER {
match self.ignorable_behavior {
IgnorableBehavior::Unsupported => {
// Flagged in debug builds only; afterwards the value is
// returned unchanged by the `return Some(value)` below.
debug_assert!(false);
}
IgnorableBehavior::ReplacementCharacter => {
// Keep `c` but give it REPLACEMENT CHARACTER as its trie value.
return Some(CharacterAndTrieValue::new(
c,
u32::from(REPLACEMENT_CHARACTER),
));
}
IgnorableBehavior::Ignored => {
// Else ignore this character by reading the next one from the delegate.
continue;
}
}
}
return Some(value);
}
}
let trie_val = self.trie.get(c);
// The main trie is not expected to contain the ignorable marker.
debug_assert_ne!(trie_val, IGNORABLE_MARKER);
return Some(CharacterAndTrieValue::new(c, trie_val));
}
}

fn delegate_next(&mut self) -> Option<CharacterAndTrieValue> {
Expand Down Expand Up @@ -1229,6 +1279,7 @@ macro_rules! composing_normalize_to {
) -> core::fmt::Result {
$prolog
let mut $composition = self.normalize_iter($text.chars());
debug_assert_eq!($composition.decomposition.ignorable_behavior, IgnorableBehavior::Unsupported);
for cc in $composition.decomposition.buffer.drain(..) {
$sink.write_char(cc.character())?;
}
Expand Down Expand Up @@ -1416,6 +1467,7 @@ macro_rules! decomposing_normalize_to {
$prolog

let mut $decomposition = self.normalize_iter($text.chars());
debug_assert_eq!($decomposition.ignorable_behavior, IgnorableBehavior::Unsupported);

// Try to get the compiler to hoist the bound to a register.
let $decomposition_passthrough_bound = $decomposition.decomposition_passthrough_bound;
Expand Down Expand Up @@ -1730,8 +1782,8 @@ impl DecomposingNormalizer {
}

#[doc(hidden)]
#[cfg(all(feature = "experimental", feature = "compiled_data"))]
pub const fn new_uts46_decomposed_without_ignored_and_disallowed() -> Self {
#[cfg(feature = "compiled_data")]
pub(crate) const fn new_uts46_decomposed() -> Self {
const _: () = assert!(
crate::provider::Baked::SINGLETON_NORMALIZER_NFDEX_V1
.scalars16
Expand Down Expand Up @@ -1807,8 +1859,7 @@ impl DecomposingNormalizer {
///
/// Public for testing only.
#[doc(hidden)]
#[cfg(feature = "experimental")]
pub fn try_new_uts46_decomposed_without_ignored_and_disallowed_unstable<D>(
pub(crate) fn try_new_uts46_decomposed_unstable<D>(
provider: &D,
) -> Result<Self, NormalizerError>
where
Expand Down Expand Up @@ -1872,6 +1923,7 @@ impl DecomposingNormalizer {
self.tables.get(),
self.supplementary_tables.as_ref().map(|s| s.get()),
self.decomposition_passthrough_bound,
IgnorableBehavior::Unsupported,
)
}

Expand Down Expand Up @@ -2241,52 +2293,27 @@ impl ComposingNormalizer {
})
}

/// Compiled-data constructor for the UTS 46 composing normalizer.
///
/// Pairs the UTS 46 decomposing normalizer with the baked canonical
/// compositions data. Only compiled when both the `experimental` and
/// `compiled_data` Cargo features are enabled.
///
/// See [`Self::try_new_uts46_without_ignored_and_disallowed_unstable`].
#[cfg(all(feature = "experimental", feature = "compiled_data"))]
pub const fn new_uts46_without_ignored_and_disallowed() -> Self {
ComposingNormalizer {
decomposing_normalizer:
DecomposingNormalizer::new_uts46_decomposed_without_ignored_and_disallowed(),
canonical_compositions: DataPayload::from_static_ref(
crate::provider::Baked::SINGLETON_NORMALIZER_COMP_V1,
),
}
}

/// 🚧 \[Experimental\] UTS 46 constructor
///
/// This is a special building block normalization for IDNA that implements parts of the Map
/// step and the following Normalize step. The caller is responsible for performing the
/// "disallowed", "ignored", and "deviation" parts of the Map step before passing data to
/// this normalizer such that disallowed and ignored characters aren't passed to this
/// normalizer.
///
/// This is ICU4C's UTS 46 normalization with two exceptions: characters that UTS 46 disallows
/// and ICU4C maps to U+FFFD and characters that UTS 46 maps to the empty string normalize as
/// in NFC in this normalization. Making the disallowed characters behave like this is beneficial
/// to data size, and this normalizer implementation cannot deal with a character normalizing
/// to the empty string, which doesn't happen in NFC or NFKC as of Unicode 14.
/// step and the following Normalize step.
///
/// Warning: In this normalization, U+0345 COMBINING GREEK YPOGEGRAMMENI exhibits a behavior
/// that no character in Unicode exhibits in NFD, NFKD, NFC, or NFKC: Case folding turns
/// U+0345 from a reordered character into a non-reordered character before reordering happens.
/// Therefore, the output of this normalization may differ for different inputs that are
/// canonically equivalent to each other if they differ by how U+0345 is ordered relative
/// to other reorderable characters.
///
/// NOTE: This method remains experimental until suitability of this feature as part of
/// IDNA processing has been demonstrated.
///
/// <div class="stab unstable">
/// 🚧 This code is experimental; it may change at any time, in breaking or non-breaking ways,
/// including in SemVer minor releases. It can be enabled with the "experimental" Cargo feature
/// of the icu meta-crate. Use with caution.
/// <a href="https://github.com/unicode-org/icu4x/issues/2614">#2614</a>
/// </div>
#[cfg(feature = "experimental")]
pub fn try_new_uts46_without_ignored_and_disallowed_unstable<D>(
provider: &D,
) -> Result<Self, NormalizerError>
/// Crate-internal compiled-data constructor for the UTS 46 composing
/// normalizer.
///
/// Pairs `DecomposingNormalizer::new_uts46_decomposed()` with the baked
/// canonical compositions data. Only compiled when the `compiled_data`
/// Cargo feature is enabled.
#[cfg(feature = "compiled_data")]
pub(crate) const fn new_uts46() -> Self {
ComposingNormalizer {
decomposing_normalizer: DecomposingNormalizer::new_uts46_decomposed(),
canonical_compositions: DataPayload::from_static_ref(
crate::provider::Baked::SINGLETON_NORMALIZER_COMP_V1,
),
}
}

#[doc = icu_provider::gen_any_buffer_unstable_docs!(UNSTABLE, Self::new_uts46)]
pub(crate) fn try_new_uts46_unstable<D>(provider: &D) -> Result<Self, NormalizerError>
where
D: DataProvider<CanonicalDecompositionDataV1Marker>
+ DataProvider<Uts46DecompositionSupplementV1Marker>
Expand All @@ -2297,9 +2324,7 @@ impl ComposingNormalizer {
+ ?Sized,
{
let decomposing_normalizer =
DecomposingNormalizer::try_new_uts46_decomposed_without_ignored_and_disallowed_unstable(
provider,
)?;
DecomposingNormalizer::try_new_uts46_decomposed_unstable(provider)?;

let canonical_compositions: DataPayload<CanonicalCompositionsV1Marker> =
provider.load(Default::default())?.take_payload()?;
Expand All @@ -2313,6 +2338,14 @@ impl ComposingNormalizer {
/// Returns a composing iterator adapter over `iter`, backed by the
/// data this normalizer already holds.
///
/// Forwards to `normalize_iter_private` with
/// `IgnorableBehavior::Unsupported`.
pub fn normalize_iter<I: Iterator<Item = char>>(&self, iter: I) -> Composition<I> {
    let ignorable_behavior = IgnorableBehavior::Unsupported;
    self.normalize_iter_private(iter, ignorable_behavior)
}

fn normalize_iter_private<I: Iterator<Item = char>>(
&self,
iter: I,
ignorable_behavior: IgnorableBehavior,
) -> Composition<I> {
Composition::new(
Decomposition::new_with_supplements(
iter,
Expand All @@ -2327,6 +2360,7 @@ impl ComposingNormalizer {
.as_ref()
.map(|s| s.get()),
self.decomposing_normalizer.decomposition_passthrough_bound,
ignorable_behavior,
),
ZeroFrom::zero_from(&self.canonical_compositions.get().canonical_compositions),
self.decomposing_normalizer.composition_passthrough_bound,
Expand Down

0 comments on commit d18887c

Please sign in to comment.