Skip to content

Commit

Permalink
Improve speed of duplicate attr test
Browse files Browse the repository at this point in the history
  • Loading branch information
kornelski committed Jul 2, 2023
1 parent de654fe commit 618b1a6
Show file tree
Hide file tree
Showing 4 changed files with 117 additions and 21 deletions.
7 changes: 4 additions & 3 deletions src/reader.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,17 +11,18 @@ use crate::common::{Position, TextPosition};

pub use self::config::ParserConfig;
pub use self::config::ParserConfig2;

pub use self::error::{Error, ErrorKind};
pub use self::events::XmlEvent;

use self::parser::PullParser;

mod config;
mod events;
mod lexer;
mod parser;

mod indexset;
mod error;
pub use self::error::{Error, ErrorKind};


/// A result type yielded by `XmlReader`.
pub type Result<T, E = Error> = result::Result<T, E>;
Expand Down
100 changes: 100 additions & 0 deletions src/reader/indexset.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
use crate::attribute::OwnedAttribute;
use crate::name::OwnedName;

use std::collections::hash_map::RandomState;
use std::collections::HashSet;
use std::hash::BuildHasher;
use std::hash::Hash;
use std::hash::Hasher;

/// An ordered set
pub(crate) struct AttributesSet {
vec: Vec<OwnedAttribute>,
/// Uses a no-op hasher, because these u64s are hashes already
may_contain: HashSet<u64, U64HasherBuilder>,
/// This is real hasher for the `OwnedName`
hasher: RandomState,
}

impl AttributesSet {
pub fn new() -> Self {
Self {
vec: Vec::new(),
hasher: RandomState::new(),
may_contain: HashSet::default(),
}
}

fn hash(&self, val: &OwnedName) -> u64 {
let mut h = self.hasher.build_hasher();
val.hash(&mut h);
h.finish()
}

pub fn contains(&self, name: &OwnedName) -> bool {
// fall back to linear search only on duplicate or hash collision
self.may_contain.contains(&self.hash(name)) &&
self.vec.iter().any(move |a| &a.name == name)
}

pub fn push(&mut self, attr: OwnedAttribute) {
self.may_contain.insert(self.hash(&attr.name));
self.vec.push(attr);
}

pub fn into_vec(self) -> Vec<OwnedAttribute> {
self.vec
}
}

#[test]
fn indexset() {
let mut s = AttributesSet::new();
let not_here = OwnedName {
local_name: "attr1000".into(),
namespace: Some("test".into()),
prefix: None,
};

// this test will take a lot of time if the `contains()` is linear, and the loop is quadratic
for i in 0..50000 {
let name = OwnedName {
local_name: format!("attr{i}"), namespace: None, prefix: None,
};
assert!(!s.contains(&name));

s.push(OwnedAttribute { name, value: String::new() });
assert!(!s.contains(&not_here));
}

assert!(s.contains(&OwnedName {
local_name: format!("attr1234"), namespace: None, prefix: None,
}));
assert!(s.contains(&OwnedName {
local_name: format!("attr0"), namespace: None, prefix: None,
}));
assert!(s.contains(&OwnedName {
local_name: format!("attr49999"), namespace: None, prefix: None,
}));
}

/// Hashser that does nothing except passing u64 through
struct U64Hasher(u64);

impl Hasher for U64Hasher {
fn finish(&self) -> u64 { self.0 }
fn write(&mut self, slice: &[u8]) {
for &v in slice { self.0 ^= v as u64 } // unused in practice
}
fn write_u64(&mut self, i: u64) {
self.0 ^= i;
}
}

#[derive(Default)]
struct U64HasherBuilder;

impl BuildHasher for U64HasherBuilder {
type Hasher = U64Hasher;
fn build_hasher(&self) -> U64Hasher { U64Hasher(0) }
}
29 changes: 12 additions & 17 deletions src/reader/parser.rs
Original file line number Diff line number Diff line change
@@ -1,24 +1,19 @@
//! Contains an implementation of pull-based XML parser.


use crate::common::is_xml11_char;
use crate::common::is_xml10_char;
use crate::common::is_xml11_char_not_restricted;
use crate::reader::error::SyntaxError;
use std::collections::HashMap;
use std::io::prelude::*;

use crate::attribute::OwnedAttribute;
use crate::common::{self, is_name_char, is_name_start_char, Position, TextPosition, XmlVersion, is_whitespace_char};
use crate::common::{is_xml10_char, is_xml11_char, is_xml11_char_not_restricted, is_name_char, is_name_start_char, is_whitespace_char};
use crate::common::{Position, TextPosition, XmlVersion};
use crate::name::OwnedName;
use crate::namespace::NamespaceStack;

use crate::reader::config::ParserConfig2;
use crate::reader::error::SyntaxError;
use crate::reader::events::XmlEvent;
use crate::reader::indexset::AttributesSet;
use crate::reader::lexer::{Lexer, Token};

use super::{Error, ErrorKind};

use std::collections::HashMap;
use std::io::Read;

macro_rules! gen_takes(
($($field:ident -> $method:ident, $t:ty, $def:expr);+) => (
$(
Expand All @@ -42,7 +37,7 @@ gen_takes!(
element_name -> take_element_name, Option<OwnedName>, None;

attr_name -> take_attr_name, Option<OwnedName>, None;
attributes -> take_attributes, Vec<OwnedAttribute>, vec!()
attributes -> take_attributes, AttributesSet, AttributesSet::new()
);

mod inside_cdata;
Expand Down Expand Up @@ -133,7 +128,7 @@ impl PullParser {
element_name: None,
quote: None,
attr_name: None,
attributes: Vec::new(),
attributes: AttributesSet::new(),
},
final_result: None,
next_event: None,
Expand Down Expand Up @@ -299,15 +294,15 @@ struct MarkupData {
name: String, // used for processing instruction name
ref_data: String, // used for reference content

version: Option<common::XmlVersion>, // used for XML declaration version
version: Option<XmlVersion>, // used for XML declaration version
encoding: Option<String>, // used for XML declaration encoding
standalone: Option<bool>, // used for XML declaration standalone parameter

element_name: Option<OwnedName>, // used for element name

quote: Option<QuoteToken>, // used to hold opening quote for attribute value
attr_name: Option<OwnedName>, // used to hold attribute name
attributes: Vec<OwnedAttribute> // used to hold all accumulated attributes
attributes: AttributesSet, // used to hold all accumulated attributes
}

impl PullParser {
Expand Down Expand Up @@ -576,7 +571,7 @@ impl PullParser {

fn emit_start_element(&mut self, emit_end_element: bool) -> Option<Result> {
let mut name = self.data.take_element_name()?;
let mut attributes = self.data.take_attributes();
let mut attributes = self.data.take_attributes().into_vec();

// check whether the name prefix is bound and fix its namespace
match self.nst.get(name.borrow().prefix_repr()) {
Expand Down
2 changes: 1 addition & 1 deletion src/reader/parser/inside_opening_tag.rs
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ impl PullParser {
OpeningTagSubstate::InsideAttributeName => self.read_qualified_name(t, QualifiedNameTarget::AttributeNameTarget, |this, token, name| {
// check that no attribute with such name is already present
// if there is one, XML is not well-formed
if this.data.attributes.iter().any(|a| a.name == name) {
if this.data.attributes.contains(&name) {
return Some(this.error(SyntaxError::RedefinedAttribute(name.to_string().into())))
}

Expand Down

0 comments on commit 618b1a6

Please sign in to comment.